public abstract class TextVectorizer<VECTOR_TYPE> extends java.lang.Object implements Vectorizer<VECTOR_TYPE>
Vectorizer.RecordCallBack
Modifier and Type | Field and Description |
---|---|
protected VocabCache |
cache |
static java.lang.String |
MIN_WORD_FREQUENCY |
protected int |
minWordFrequency |
static java.lang.String |
STOP_WORDS |
protected java.util.Collection<java.lang.String> |
stopWords |
static java.lang.String |
TOKENIZER |
protected TokenizerFactory |
tokenizerFactory |
static java.lang.String |
VOCAB_CACHE |
Constructor and Description |
---|
TextVectorizer() |
Modifier and Type | Method and Description |
---|---|
abstract TokenizerFactory |
createTokenizerFactory(Configuration conf)
Create tokenizer factory based on the configuration
|
abstract void |
doWithTokens(Tokenizer tokenizer)
Increment counts, add to collection,...
|
void |
fit(RecordReader reader)
Fit based on a record reader
|
void |
fit(RecordReader reader,
Vectorizer.RecordCallBack callBack)
Fit based on a record reader
|
void |
initialize(Configuration conf)
Initialize based on a configuration
|
protected java.lang.String |
toString(java.util.Collection<Writable> record) |
protected Counter<java.lang.String> |
wordFrequenciesForRecord(java.util.Collection<Writable> record) |
Methods inherited from class java.lang.Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
Methods inherited from interface Vectorizer: createVector, fitTransform, fitTransform, transform
protected TokenizerFactory tokenizerFactory
protected int minWordFrequency
public static final java.lang.String MIN_WORD_FREQUENCY
public static final java.lang.String STOP_WORDS
public static final java.lang.String TOKENIZER
public static final java.lang.String VOCAB_CACHE
protected java.util.Collection<java.lang.String> stopWords
protected VocabCache cache
public void initialize(Configuration conf)
Vectorizer
initialize
in interface Vectorizer<VECTOR_TYPE>
conf
- the configuration to use
public void fit(RecordReader reader)
Vectorizer
fit
in interface Vectorizer<VECTOR_TYPE>
public void fit(RecordReader reader, Vectorizer.RecordCallBack callBack)
Vectorizer
fit
in interface Vectorizer<VECTOR_TYPE>
protected Counter<java.lang.String> wordFrequenciesForRecord(java.util.Collection<Writable> record)
protected java.lang.String toString(java.util.Collection<Writable> record)
public abstract void doWithTokens(Tokenizer tokenizer)
tokenizer
- the tokenizer whose tokens are to be processed
public abstract TokenizerFactory createTokenizerFactory(Configuration conf)
conf
- the configuration to use