public abstract class TextVectorizer<VECTOR_TYPE> extends java.lang.Object implements Vectorizer<VECTOR_TYPE>
Vectorizer.RecordCallBack

| Modifier and Type | Field and Description |
|---|---|
| protected VocabCache | cache |
| static java.lang.String | MIN_WORD_FREQUENCY |
| protected int | minWordFrequency |
| static java.lang.String | STOP_WORDS |
| protected java.util.Collection<java.lang.String> | stopWords |
| static java.lang.String | TOKENIZER |
| protected TokenizerFactory | tokenizerFactory |
| static java.lang.String | VOCAB_CACHE |
| Constructor and Description |
|---|
| TextVectorizer() |
| Modifier and Type | Method and Description |
|---|---|
| abstract TokenizerFactory | createTokenizerFactory(Configuration conf)<br>Create tokenizer factory based on the configuration |
| abstract void | doWithTokens(Tokenizer tokenizer)<br>Increment counts, add to collection,... |
| void | fit(RecordReader reader)<br>Fit based on a record reader |
| void | fit(RecordReader reader, Vectorizer.RecordCallBack callBack)<br>Fit based on a record reader |
| void | initialize(Configuration conf)<br>Initialize based on a configuration |
| protected java.lang.String | toString(java.util.Collection<Writable> record) |
| protected Counter<java.lang.String> | wordFrequenciesForRecord(java.util.Collection<Writable> record) |
Methods inherited from class java.lang.Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait

Methods inherited from interface Vectorizer: createVector, fitTransform, fitTransform, transform

protected TokenizerFactory tokenizerFactory
protected int minWordFrequency
public static final java.lang.String MIN_WORD_FREQUENCY
public static final java.lang.String STOP_WORDS
public static final java.lang.String TOKENIZER
public static final java.lang.String VOCAB_CACHE
protected java.util.Collection<java.lang.String> stopWords
protected VocabCache cache
public void initialize(Configuration conf)
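Description copied from interface Vectorizer: Initialize based on a configuration
Specified by: initialize in interface Vectorizer<VECTOR_TYPE>
Parameters: conf - the configuration to use

public void fit(RecordReader reader)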
Description copied from interface Vectorizer: Fit based on a record reader
Specified by: fit in interface Vectorizer<VECTOR_TYPE>

public void fit(RecordReader reader, Vectorizer.RecordCallBack callBack)
Description copied from interface Vectorizer: Fit based on a record reader
Specified by: fit in interface Vectorizer<VECTOR_TYPE>

protected Counter<java.lang.String> wordFrequenciesForRecord(java.util.Collection<Writable> record)
protected java.lang.String toString(java.util.Collection<Writable> record)
public abstract void doWithTokens(Tokenizer tokenizer)
Parameters: tokenizer -

public abstract TokenizerFactory createTokenizerFactory(Configuration conf)
conf - the configuration to use