public class AnalyzeSpark
extends java.lang.Object
RDD<List<Writable>>
and RDD<List<List<Writable>>>
Modifier and Type | Field and Description |
---|---|
static int |
DEFAULT_HISTOGRAM_BUCKETS |
Constructor and Description |
---|
AnalyzeSpark() |
Modifier and Type | Method and Description |
---|---|
static DataAnalysis |
analyze(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data) |
static DataAnalysis |
analyze(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data,
int maxHistogramBuckets) |
static DataQualityAnalysis |
analyzeQuality(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data) |
static DataQualityAnalysis |
analyzeQualitySequence(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data) |
static SequenceDataAnalysis |
analyzeSequence(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data) |
static SequenceDataAnalysis |
analyzeSequence(Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data,
int maxHistogramBuckets) |
static java.util.List<Writable> |
getUnique(java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Get a list of unique values from the specified column.
|
static java.util.List<Writable> |
getUniqueSequence(java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
Get a list of unique values from the specified column of a sequence
|
static java.util.List<java.util.List<Writable>> |
sample(int count,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Randomly sample a set of examples
|
static java.util.List<Writable> |
sampleFromColumn(int count,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Randomly sample values from a single column
|
static java.util.List<Writable> |
sampleFromColumnSequence(int count,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
Randomly sample values from a single column, in all sequences.
|
static java.util.List<Writable> |
sampleInvalidFromColumn(int numToSample,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Randomly sample a set of invalid values from a specified column.
|
static java.util.List<Writable> |
sampleInvalidFromColumn(int numToSample,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data,
boolean ignoreMissing)
Randomly sample a set of invalid values from a specified column.
|
static java.util.List<Writable> |
sampleInvalidFromColumnSequence(int numToSample,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
Randomly sample a set of invalid values from a specified column, for a sequence data set.
|
static java.util.Map<Writable,java.lang.Long> |
sampleMostFrequentFromColumn(int nMostFrequent,
java.lang.String columnName,
Schema schema,
org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
Sample the N most frequently occurring values in the specified column
|
static java.util.List<java.util.List<java.util.List<Writable>>> |
sampleSequence(int count,
org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
Randomly sample a number of sequences from the data
|
public static final int DEFAULT_HISTOGRAM_BUCKETS
public static SequenceDataAnalysis analyzeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
public static SequenceDataAnalysis analyzeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, int maxHistogramBuckets)
schema
- Schema of the data
data
- Data to analyze
maxHistogramBuckets
- Maximum number of histogram buckets
public static DataAnalysis analyze(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
public static DataAnalysis analyze(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, int maxHistogramBuckets)
public static java.util.List<Writable> sampleFromColumn(int count, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
count
- Number of values to sample
columnName
- Name of the column to sample from
schema
- Schema
data
- Data to sample from
public static java.util.List<Writable> sampleFromColumnSequence(int count, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
count
- Number of values to sample
columnName
- Name of the column to sample from
schema
- Schema
sequenceData
- Data to sample from
public static java.util.List<Writable> getUnique(java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
getUniqueSequence(String, Schema, JavaRDD)
columnName
- Name of the column to get unique values from
schema
- Data schema
data
- Data to get unique values from
public static java.util.List<Writable> getUniqueSequence(java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequenceData)
columnName
- Name of the column to get unique values from
schema
- Data schema
sequenceData
- Sequence data to get unique values from
public static java.util.List<java.util.List<Writable>> sample(int count, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
count
- Number of samples to generate
data
- Data to sample from
public static java.util.List<java.util.List<java.util.List<Writable>>> sampleSequence(int count, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
count
- Number of sequences to sample
data
- Data to sample from
public static DataQualityAnalysis analyzeQualitySequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
schema
- Schema of the data
data
- Data to analyze
public static DataQualityAnalysis analyzeQuality(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
schema
- Schema of the data
data
- Data to analyze
public static java.util.List<Writable> sampleInvalidFromColumn(int numToSample, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
numToSample
- Maximum number of invalid values to sample
columnName
- Name of the column from which to sample invalid values
schema
- Data schema
data
- Data
public static java.util.List<Writable> sampleInvalidFromColumn(int numToSample, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, boolean ignoreMissing)
numToSample
- Maximum number of invalid values to sample
columnName
- Name of the column from which to sample invalid values
schema
- Data schema
data
- Data
ignoreMissing
- If true: ignore missing values (NullWritable or empty/null string) when sampling. If false: include missing values in sampling
public static java.util.List<Writable> sampleInvalidFromColumnSequence(int numToSample, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)
numToSample
- Maximum number of invalid values to sample
columnName
- Name of the column from which to sample invalid values
schema
- Data schema
data
- Data
public static java.util.Map<Writable,java.lang.Long> sampleMostFrequentFromColumn(int nMostFrequent, java.lang.String columnName, Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)
nMostFrequent
- Top N values to sample
columnName
- Name of the column to sample from
schema
- Schema of the data
data
- RDD containing the data