public class Normalization
extends java.lang.Object
Constructor and Description |
---|
Normalization() |
Modifier and Type | Method and Description |
---|---|
static java.util.List<org.apache.spark.sql.Row> | aggregate(DataRowsFacade data, java.lang.String[] columns, java.lang.String[] functions). Aggregate based on an arbitrary list of aggregation and grouping functions |
static java.util.List<org.apache.spark.sql.Row> | minMaxColumns(DataRowsFacade data, java.util.List<java.lang.String> columns). Returns the min and max of the given columns |
static java.util.List<org.apache.spark.sql.Row> | minMaxColumns(DataRowsFacade data, java.lang.String... columns). Returns the min and max of the given columns |
static DataRowsFacade | normalize(DataRowsFacade dataFrame). Scale based on min/max |
static DataRowsFacade | normalize(DataRowsFacade dataFrame, double min, double max). Scale based on min/max |
static DataRowsFacade | normalize(DataRowsFacade dataFrame, double min, double max, java.util.List<java.lang.String> skipColumns). Scale based on min/max |
static DataRowsFacade | normalize(DataRowsFacade dataFrame, java.util.List<java.lang.String> skipColumns). Scale based on min/max |
static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data). Scale all data 0 to 1 |
static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max). Scale based on min/max |
static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max, java.util.List<java.lang.String> skipColumns). Scale based on min/max |
static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns). Scale all data 0 to 1 |
static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data) |
static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max). Normalize each column of a sequence, based on min/max |
static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max, java.util.List<java.lang.String> excludeColumns). Normalize each column of a sequence, based on min/max |
static java.util.List<org.apache.spark.sql.Row> | stdDevMeanColumns(DataRowsFacade data, java.util.List<java.lang.String> columns). Returns the standard deviation and mean of the given columns |
static java.util.List<org.apache.spark.sql.Row> | stdDevMeanColumns(DataRowsFacade data, java.lang.String... columns). Returns the standard deviation and mean of the given columns; the list returned is a list of size 2 where each row represents the standard deviation of each column and the mean of each column |
static DataRowsFacade | zeromeanUnitVariance(DataRowsFacade frame). Normalize by zero mean unit variance |
static DataRowsFacade | zeromeanUnitVariance(DataRowsFacade frame, java.util.List<java.lang.String> skipColumns). Normalize by zero mean unit variance |
static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data). Normalize by zero mean unit variance |
static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> | zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns). Normalize by zero mean unit variance |
static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence). Normalize the sequence by zero mean unit variance |
static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> | zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence, java.util.List<java.lang.String> excludeColumns). Normalize the sequence by zero mean unit variance |
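The workflow these methods imply is: describe the columns with a Schema, load each record as a java.util.List<Writable> in a JavaRDD, then call one of the static methods above. The sketch below shows the 0-to-1 scaling path; it is a hedged illustration, assuming the usual DataVec package locations (org.datavec.api.transform.schema.Schema, org.datavec.api.writable.DoubleWritable, org.datavec.spark.transform.Normalization), which are not stated on this page.

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization;

public class NormalizationExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("normalization-example"));

        // Schema with two numeric columns; each record holds one Writable per column
        Schema schema = new Schema.Builder()
                .addColumnDouble("height")
                .addColumnDouble("weight")
                .build();

        JavaRDD<List<Writable>> data = sc.parallelize(Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(150), new DoubleWritable(50)),
                Arrays.<Writable>asList(new DoubleWritable(175), new DoubleWritable(75)),
                Arrays.<Writable>asList(new DoubleWritable(200), new DoubleWritable(100))));

        // Scale every column into the range [0, 1]
        JavaRDD<List<Writable>> scaled = Normalization.normalize(schema, data);
        System.out.println(scaled.collect());

        sc.stop();
    }
}
```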
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame)

Normalize by zero mean unit variance

Parameters:
frame - the data to normalize

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)

Normalize by zero mean unit variance

Parameters:
schema - the schema to use to create the data frame
data - the data to normalize
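Reusing the schema and data variables from the sketch after the summary table, the RDD overload above standardizes every column; a minimal, hedged one-liner:

```java
// Subtract each column's mean and divide by its standard deviation
JavaRDD<List<Writable>> standardized = Normalization.zeromeanUnitVariance(schema, data);
```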
public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max)

Scale based on min/max

Parameters:
dataFrame - the dataframe to scale
min - the minimum value
max - the maximum value

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max)

Scale based on min/max

Parameters:
schema - the schema of the data to scale
data - the data to scale
min - the minimum value
max - the maximum value

public static DataRowsFacade normalize(DataRowsFacade dataFrame)

Scale based on min/max

Parameters:
dataFrame - the dataframe to scale

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data)

Scale all data 0 to 1

Parameters:
schema - the schema of the data to scale
data - the data to scale
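With the same schema and data as before, the min/max overload rescales into an arbitrary target range, for example [-1, 1]:

```java
// Rescale every column into [-1, 1] instead of the default [0, 1]
JavaRDD<List<Writable>> rescaled = Normalization.normalize(schema, data, -1.0, 1.0);
```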
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, java.util.List<java.lang.String> skipColumns)

Normalize by zero mean unit variance

Parameters:
frame - the data to normalize
skipColumns - the columns to skip

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns)

Normalize by zero mean unit variance

Parameters:
schema - the schema to use to create the data frame
data - the data to normalize
skipColumns - the columns to skip

public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence)

Normalize the sequence by zero mean unit variance

Parameters:
schema - Schema of the data to normalize
sequence - Sequence data

public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> sequence, java.util.List<java.lang.String> excludeColumns)

Normalize the sequence by zero mean unit variance

Parameters:
schema - Schema of the data to normalize
sequence - Sequence data
excludeColumns - List of columns to exclude from the normalization
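The sequence variants operate on JavaRDD<java.util.List<java.util.List<Writable>>>: each element is one sequence of time steps, and each time step is one record. A sketch under the same package assumptions as the earlier examples, excluding the "weight" column from standardization:

```java
// Each sequence is a List of time steps; each time step is a List<Writable>
JavaRDD<List<List<Writable>>> sequences = sc.parallelize(Arrays.asList(
        Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(150), new DoubleWritable(50)),
                Arrays.<Writable>asList(new DoubleWritable(160), new DoubleWritable(60)))));

// Standardize each column over all time steps, leaving "weight" untouched
JavaRDD<List<List<Writable>>> standardizedSeq =
        Normalization.zeroMeanUnitVarianceSequence(schema, sequences, Arrays.asList("weight"));
```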
public static java.util.List<org.apache.spark.sql.Row> minMaxColumns(DataRowsFacade data, java.util.List<java.lang.String> columns)

Returns the min and max of the given columns

Parameters:
data - the data to get the min and max for
columns - the columns to get the min and max for

public static java.util.List<org.apache.spark.sql.Row> minMaxColumns(DataRowsFacade data, java.lang.String... columns)

Returns the min and max of the given columns

Parameters:
data - the data to get the min and max for
columns - the columns to get the min and max for

public static java.util.List<org.apache.spark.sql.Row> stdDevMeanColumns(DataRowsFacade data, java.util.List<java.lang.String> columns)

Returns the standard deviation and mean of the given columns

Parameters:
data - the data to get the standard deviation and mean for
columns - the columns to get the standard deviation and mean for

public static java.util.List<org.apache.spark.sql.Row> stdDevMeanColumns(DataRowsFacade data, java.lang.String... columns)

Returns the standard deviation and mean of the given columns. The list returned is a list of size 2 where each row represents the standard deviation of each column and the mean of each column

Parameters:
data - the data to get the standard deviation and mean for
columns - the columns to get the standard deviation and mean for

public static java.util.List<org.apache.spark.sql.Row> aggregate(DataRowsFacade data, java.lang.String[] columns, java.lang.String[] functions)

Aggregate based on an arbitrary list of aggregation and grouping functions

Parameters:
data - the dataframe to aggregate
columns - the columns to aggregate
functions - the functions to use
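The statistics and aggregation methods take the data in DataFrame form, wrapped in a DataRowsFacade. Assuming the companion DataFrames.toDataFrame(schema, data) helper from the same package produces that facade (an assumption worth verifying against your DataVec version), usage looks roughly like:

```java
// Assumption: DataFrames.toDataFrame wraps the RDD + schema as a DataRowsFacade
DataRowsFacade facade = DataFrames.toDataFrame(schema, data);

// Per-column min/max and stddev/mean, returned as lists of Spark SQL Rows
List<org.apache.spark.sql.Row> minMax = Normalization.minMaxColumns(facade, "height", "weight");
List<org.apache.spark.sql.Row> stats = Normalization.stdDevMeanColumns(facade, "height", "weight");

// Arbitrary aggregation: min of "height", max of "weight"
List<org.apache.spark.sql.Row> agg = Normalization.aggregate(
        facade,
        new String[] {"height", "weight"},
        new String[] {"min", "max"});
```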
public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, java.util.List<java.lang.String> skipColumns)

Scale based on min/max

Parameters:
dataFrame - the dataframe to scale
min - the minimum value
max - the maximum value
skipColumns - the columns to skip

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, double min, double max, java.util.List<java.lang.String> skipColumns)

Scale based on min/max

Parameters:
schema - the schema of the data to scale
data - the data to scale
min - the minimum value
max - the maximum value
skipColumns - the columns to skip

public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data)

Parameters:
schema - Schema of the data
data - Data to normalize

public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max)

Normalize each column of a sequence, based on min/max

Parameters:
schema - Schema of the data
data - Data to normalize
min - New minimum value
max - New maximum value
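The skipColumns overloads let label or categorical columns pass through unchanged while the rest are rescaled; reusing the earlier schema and data:

```java
// Scale "height" into [0, 1] while leaving "weight" as-is
JavaRDD<List<Writable>> partiallyScaled = Normalization.normalize(
        schema, data, 0.0, 1.0, Arrays.asList("weight"));
```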
public static org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<java.util.List<Writable>>> data, double min, double max, java.util.List<java.lang.String> excludeColumns)

Normalize each column of a sequence, based on min/max

Parameters:
schema - Schema of the data
data - Data to normalize
min - New minimum value
max - New maximum value
excludeColumns - List of columns to exclude

public static DataRowsFacade normalize(DataRowsFacade dataFrame, java.util.List<java.lang.String> skipColumns)

Scale based on min/max

Parameters:
dataFrame - the dataframe to scale
skipColumns - the columns to skip

public static org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<java.util.List<Writable>> data, java.util.List<java.lang.String> skipColumns)

Scale all data 0 to 1

Parameters:
schema - the schema of the data to scale
data - the data to scale
skipColumns - the columns to skip
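Finally, the sequence min/max overload with excludeColumns combines both ideas; with the sequences RDD from the earlier sequence sketch:

```java
// Scale each sequence column into [0, 1], excluding "weight" from the rescale
JavaRDD<List<List<Writable>>> seqScaled = Normalization.normalizeSequence(
        schema, sequences, 0.0, 1.0, Arrays.asList("weight"));
```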