public class Normalization
extends java.lang.Object
| Constructor and Description |
|---|
| Normalization() |
| Modifier and Type | Method and Description |
|---|---|
| static java.util.List<org.apache.spark.sql.Row> | aggregate(DataRowsFacade data, java.lang.String[] columns, java.lang.String[] functions) Aggregate based on an arbitrary list of aggregation and grouping functions |
| static java.util.List<org.apache.spark.sql.Row> | minMaxColumns(DataRowsFacade data, java.util.List<java.lang.String> columns) Returns the min and max of the given columns |
| static java.util.List<org.apache.spark.sql.Row> | minMaxColumns(DataRowsFacade data, java.lang.String... columns) Returns the min and max of the given columns |
| static DataRowsFacade | normalize(DataRowsFacade dataFrame) Scale based on min,max |
| static DataRowsFacade | normalize(DataRowsFacade dataFrame, double min, double max) Scale based on min,max |
| static DataRowsFacade | normalize(DataRowsFacade dataFrame, double min, double max, java.util.List<java.lang.String> skipColumns) Scale based on min,max |
| static DataRowsFacade | normalize(DataRowsFacade dataFrame, java.util.List<java.lang.String> skipColumns) Scale based on min,max |
| static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data) Scale all data 0 to 1 |
| static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max) Scale based on min,max |
| static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max, java.util.List<java.lang.String> skipColumns) Scale based on min,max |
| static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns) Scale all data 0 to 1 |
| static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data) |
| static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max) Normalize each column of a sequence, based on min/max |
| static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max, java.util.List<java.lang.String> excludeColumns) Normalize each column of a sequence, based on min/max |
| static java.util.List<org.apache.spark.sql.Row> | stdDevMeanColumns(DataRowsFacade data, java.util.List<java.lang.String> columns) Returns the standard deviation and mean of the given columns |
| static java.util.List<org.apache.spark.sql.Row> | stdDevMeanColumns(DataRowsFacade data, java.lang.String... columns) Returns the standard deviation and mean of the given columns; the returned list has size 2, where the rows hold the standard deviation and the mean of each column |
| static DataRowsFacade | zeromeanUnitVariance(DataRowsFacade frame) Normalize by zero mean unit variance |
| static DataRowsFacade | zeromeanUnitVariance(DataRowsFacade frame, java.util.List<java.lang.String> skipColumns) Normalize by zero mean unit variance |
| static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data) Normalize by zero mean unit variance |
| static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns) Normalize by zero mean unit variance |
| static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence) Normalize the sequence by zero mean unit variance |
| static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence, java.util.List<java.lang.String> excludeColumns) Normalize the sequence by zero mean unit variance |
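The sketch below shows the typical entry point: scaling a JavaRDD of Writable records into the default 0 to 1 range with normalize(Schema, JavaRDD). The import locations (org.datavec.api.transform.schema.Schema, org.datavec.api.writable.*, org.datavec.spark.transform.Normalization) and the column names are assumptions, since this page lists the types unqualified.

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
// Assumed DataVec package locations; this page shows the types unqualified.
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization;

public class NormalizeToUnitRange {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("normalize-example"));

        // The schema names the columns that normalization will operate on
        // ("height" and "weight" are hypothetical column names).
        Schema schema = new Schema.Builder()
                .addColumnDouble("height")
                .addColumnDouble("weight")
                .build();

        JavaRDD<List<Writable>> data = sc.parallelize(Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(150), new DoubleWritable(50)),
                Arrays.<Writable>asList(new DoubleWritable(200), new DoubleWritable(100))));

        // Scale every column into the range 0 to 1.
        JavaRDD<List<Writable>> scaled = Normalization.normalize(schema, data);
        scaled.collect().forEach(System.out::println);

        sc.stop();
    }
}
```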
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame)

Parameters:
frame - the data to normalize

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)

Parameters:
schema - the schema to use to create the data frame
data - the data to normalize
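Where min/max scaling is inappropriate, the zero-mean unit-variance overloads above standardize each column instead. A minimal sketch, with the same assumed DataVec import locations as the previous example:

```java
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
// Assumed DataVec package locations, as in the earlier sketch.
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization;

public class StandardizeExample {
    /** Shift every numeric column to mean 0 and scale it to unit variance. */
    public static JavaRDD<List<Writable>> standardize(Schema schema,
                                                      JavaRDD<List<Writable>> data) {
        return Normalization.zeromeanUnitVariance(schema, data);
    }
}
```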
public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max)

Parameters:
dataFrame - the dataframe to scale
min - the minimum value
max - the maximum value

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max)

Parameters:
schema - the schema of the data to scale
data - the data to scale
min - the minimum value
max - the maximum value

public static DataRowsFacade normalize(DataRowsFacade dataFrame)

Parameters:
dataFrame - the dataframe to scale
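The min/max overloads let you pick a target range other than the default 0 to 1. The sketch below rescales into -1 to 1, again assuming the DataVec import locations:

```java
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
// Assumed DataVec package locations, as in the earlier sketches.
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization;

public class RescaleExample {
    /** Rescale every numeric column into [-1, 1] instead of the default [0, 1]. */
    public static JavaRDD<List<Writable>> rescale(Schema schema,
                                                  JavaRDD<List<Writable>> data) {
        return Normalization.normalize(schema, data, -1.0, 1.0);
    }
}
```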
public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)

Parameters:
schema - the schema of the data to scale
data - the data to scale

public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, java.util.List<java.lang.String> skipColumns)

Parameters:
frame - the data to normalize
skipColumns - the columns to skip during normalization

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns)

Parameters:
schema - the schema to use to create the data frame
data - the data to normalize
skipColumns - the columns to skip during normalization
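The skipColumns overloads leave the named columns untouched, which is useful when a label or ID column shares the dataset with the features. A sketch, where "label" is a hypothetical column name:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
// Assumed DataVec package locations, as in the earlier sketches.
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization;

public class SkipColumnsExample {
    /** Standardize the features but leave the hypothetical "label" column untouched. */
    public static JavaRDD<List<Writable>> standardizeFeatures(Schema schema,
                                                              JavaRDD<List<Writable>> data) {
        return Normalization.zeromeanUnitVariance(schema, data, Arrays.asList("label"));
    }
}
```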
public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence)

Parameters:
schema - Schema of the data to normalize
sequence - Sequence data

public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence, java.util.List<java.lang.String> excludeColumns)

Parameters:
schema - Schema of the data to normalize
sequence - Sequence data
excludeColumns - List of columns to exclude from the normalization
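For sequence data, each record is a List of time steps and each time step is a List of column values; zeroMeanUnitVarianceSequence standardizes per column, presumably computing the statistics over all time steps of all sequences. A sketch with the same assumed imports:

```java
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
// Assumed DataVec package locations, as in the earlier sketches.
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization;

public class SequenceStandardizeExample {
    /** Each sequence is a List of time steps; each time step is a List of column values. */
    public static JavaRDD<List<List<Writable>>> standardizeSequences(
            Schema schema, JavaRDD<List<List<Writable>>> sequences) {
        return Normalization.zeroMeanUnitVarianceSequence(schema, sequences);
    }
}
```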
public static java.util.List<org.apache.spark.sql.Row> minMaxColumns(DataRowsFacade data, java.util.List<java.lang.String> columns)

Parameters:
data - the data to get the min and max for
columns - the columns to get the min and max for

public static java.util.List<org.apache.spark.sql.Row> minMaxColumns(DataRowsFacade data, java.lang.String... columns)

Parameters:
data - the data to get the min and max for
columns - the columns to get the min and max for

public static java.util.List<org.apache.spark.sql.Row> stdDevMeanColumns(DataRowsFacade data, java.util.List<java.lang.String> columns)

Parameters:
data - the data to get the standard deviation and mean for
columns - the columns to get the standard deviation and mean for

public static java.util.List<org.apache.spark.sql.Row> stdDevMeanColumns(DataRowsFacade data, java.lang.String... columns)

Parameters:
data - the data to get the standard deviation and mean for
columns - the columns to get the standard deviation and mean for
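minMaxColumns and stdDevMeanColumns return the statistics as a two-element List of Rows, per the stdDevMeanColumns description in the summary table. A sketch that prints per-column min and max; the row ordering (min first, max second) is an assumption:

```java
import java.util.List;

import org.apache.spark.sql.Row;
// Assumed DataVec package location, as in the earlier sketches.
import org.datavec.spark.transform.DataRowsFacade;
import org.datavec.spark.transform.Normalization;

public class ColumnStatsExample {
    /**
     * Prints per-column min and max. The returned list has size 2, one Row per
     * statistic; treating row 0 as the mins and row 1 as the maxes is an assumption.
     */
    public static void printStats(DataRowsFacade data, String... columns) {
        List<Row> minMax = Normalization.minMaxColumns(data, columns);
        for (int i = 0; i < columns.length; i++) {
            System.out.println(columns[i]
                    + " min=" + minMax.get(0).get(i)
                    + " max=" + minMax.get(1).get(i));
        }
    }
}
```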
public static java.util.List<org.apache.spark.sql.Row> aggregate(DataRowsFacade data, java.lang.String[] columns, java.lang.String[] functions)

Parameters:
data - the dataframe to aggregate
columns - the columns to aggregate
functions - the functions to use

public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, java.util.List<java.lang.String> skipColumns)

Parameters:
dataFrame - the dataframe to scale
min - the minimum value
max - the maximum value
skipColumns - the columns to skip during scaling

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max, java.util.List<java.lang.String> skipColumns)

Parameters:
schema - the schema of the data to scale
data - the data to scale
min - the minimum value
max - the maximum value
skipColumns - the columns to skip during scaling
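aggregate takes parallel arrays of column names and function names, presumably pairing functions[i] with columns[i]. A sketch; the column names are hypothetical and the function names ("min", "max") are assumed to be names the underlying Spark aggregation accepts:

```java
import java.util.List;

import org.apache.spark.sql.Row;
// Assumed DataVec package location, as in the earlier sketches.
import org.datavec.spark.transform.DataRowsFacade;
import org.datavec.spark.transform.Normalization;

public class AggregateExample {
    /** Applies one aggregation function per column, pairing functions[i] with columns[i]. */
    public static List<Row> summarize(DataRowsFacade data) {
        String[] columns   = {"height", "weight"};   // hypothetical column names
        String[] functions = {"min", "max"};         // assumed Spark SQL aggregate names
        return Normalization.aggregate(data, columns, functions);
    }
}
```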
public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)

Parameters:
schema - Schema of the data
data - Data to normalize

public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max)

Parameters:
schema - Schema of the data
data - Data to normalize
min - New minimum value
max - New maximum value

public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max, java.util.List<java.lang.String> excludeColumns)

Parameters:
schema - Schema of the data
data - Data to normalize
min - New minimum value
max - New maximum value
excludeColumns - List of columns to exclude

public static DataRowsFacade normalize(DataRowsFacade dataFrame, java.util.List<java.lang.String> skipColumns)

Parameters:
dataFrame - the dataframe to scale
skipColumns - the columns to skip during scaling

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns)

Parameters:
schema - the schema of the data to scale
data - the data to scale
skipColumns - the columns to skip during scaling
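Finally, the sequence min/max overload combines a custom target range with excluded columns. A sketch, where "timestamp" is a hypothetical column to leave unscaled:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
// Assumed DataVec package locations, as in the earlier sketches.
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization;

public class SequenceRescaleExample {
    /** Rescale every column of each sequence into [-1, 1], leaving "timestamp" as-is. */
    public static JavaRDD<List<List<Writable>>> rescaleSequences(
            Schema schema, JavaRDD<List<List<Writable>>> sequences) {
        return Normalization.normalizeSequence(schema, sequences, -1.0, 1.0,
                Arrays.asList("timestamp"));  // hypothetical excluded column
    }
}
```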