public class AnalyzeSpark
extends java.lang.Object
AnalyzeSpark: static methods for analyzing and processing RDD<List<Writable>> and RDD<List<List<Writable>>> data.

| Modifier and Type | Field and Description |
|---|---|
static int |
DEFAULT_HISTOGRAM_BUCKETS |
| Constructor and Description |
|---|
AnalyzeSpark() |
| Modifier and Type | Method and Description |
|---|---|
static DataAnalysis |
analyze(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data) |
static DataAnalysis |
analyze(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data,
int maxHistogramBuckets) |
static DataQualityAnalysis |
analyzeQuality(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data) |
static DataQualityAnalysis |
analyzeQualitySequence(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data) |
static SequenceDataAnalysis |
analyzeSequence(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data) |
static SequenceDataAnalysis |
analyzeSequence(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data,
int maxHistogramBuckets) |
static java.util.List<Writable> |
getUnique(java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Get a list of unique values from the specified column.
|
static java.util.List<Writable> |
getUniqueSequence(java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
Get a list of unique values from the specified column of a sequence.
|
static java.util.List<java.util.List<Writable>> |
sample(int count,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Randomly sample a set of examples
|
static java.util.List<Writable> |
sampleFromColumn(int count,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Randomly sample values from a single column
|
static java.util.List<Writable> |
sampleFromColumnSequence(int count,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
Randomly sample values from a single column, in all sequences.
|
static java.util.List<Writable> |
sampleInvalidFromColumn(int numToSample,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Randomly sample a set of invalid values from a specified column.
|
static java.util.List<Writable> |
sampleInvalidFromColumn(int numToSample,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data,
boolean ignoreMissing)
Randomly sample a set of invalid values from a specified column.
|
static java.util.List<Writable> |
sampleInvalidFromColumnSequence(int numToSample,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
Randomly sample a set of invalid values from a specified column, for a sequence data set.
|
static java.util.Map<Writable,java.lang.Long> |
sampleMostFrequentFromColumn(int nMostFrequent,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Sample the N most frequently occurring values in the specified column
|
static java.util.List<java.util.List<java.util.List<Writable>>> |
sampleSequence(int count,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
Randomly sample a number of sequences from the data
|
public static final int DEFAULT_HISTOGRAM_BUCKETS
public static SequenceDataAnalysis analyzeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
public static SequenceDataAnalysis analyzeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, int maxHistogramBuckets)
schema - Schema of the data
data - Data to analyze
maxHistogramBuckets - Maximum number of histogram buckets

public static DataAnalysis analyze(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
public static DataAnalysis analyze(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, int maxHistogramBuckets)
public static java.util.List<Writable> sampleFromColumn(int count, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
count - Number of values to sample
columnName - Name of the column to sample from
schema - Schema
data - Data to sample from

public static java.util.List<Writable> sampleFromColumnSequence(int count, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
count - Number of values to sample
columnName - Name of the column to sample from
schema - Schema
sequenceData - Data to sample from

public static java.util.List<Writable> getUnique(java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
See also: getUniqueSequence(String, Schema, JavaRDD)
columnName - Name of the column to get unique values from
schema - Data schema
data - Data to get unique values from

public static java.util.List<Writable> getUniqueSequence(java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
columnName - Name of the column to get unique values from
schema - Data schema
sequenceData - Sequence data to get unique values from

public static java.util.List<java.util.List<Writable>> sample(int count, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
count - Number of samples to generate
data - Data to sample from

public static java.util.List<java.util.List<java.util.List<Writable>>> sampleSequence(int count, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
count - Number of sequences to sample
data - Data to sample from

public static DataQualityAnalysis analyzeQualitySequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
schema - Schema of the data
data - Data to analyze

public static DataQualityAnalysis analyzeQuality(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
schema - Schema of the data
data - Data to analyze

public static java.util.List<Writable> sampleInvalidFromColumn(int numToSample, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
numToSample - Maximum number of invalid values to sample
columnName - Name of the column from which to sample invalid values
schema - Data schema
data - Data

public static java.util.List<Writable> sampleInvalidFromColumn(int numToSample, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, boolean ignoreMissing)
numToSample - Maximum number of invalid values to sample
columnName - Name of the column from which to sample invalid values
schema - Data schema
data - Data
ignoreMissing - If true: ignore missing values (NullWritable or empty/null string) when sampling. If false: include missing values in sampling

public static java.util.List<Writable> sampleInvalidFromColumnSequence(int numToSample, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
numToSample - Maximum number of invalid values to sample
columnName - Name of the column from which to sample invalid values
schema - Data schema
data - Data

public static java.util.Map<Writable,java.lang.Long> sampleMostFrequentFromColumn(int nMostFrequent, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
nMostFrequent - Top N values to sample
columnName - Name of the column to sample from
schema - Schema of the data
data - RDD containing the data