public abstract class BaseTextVectorizer extends java.lang.Object implements TextVectorizer
Modifier and Type | Field and Description |
---|---|
protected InvertedIndex<VocabWord> | index |
protected boolean | isParallel |
protected LabelAwareIterator | iterator |
protected LabelsSource | labelsSource |
protected int | minWordFrequency |
protected java.util.Collection<java.lang.String> | stopWords |
protected TokenizerFactory | tokenizerFactory |
protected VocabCache<VocabWord> | vocabCache |
Constructor and Description |
---|
BaseTextVectorizer() |
Modifier and Type | Method and Description |
---|---|
void | buildVocab() |
void | fit() - Train the model |
protected LabelsSource | getLabelsSource() |
long | numWordsEncountered() - Returns the number of words encountered so far |
Methods inherited from class java.lang.Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
Methods inherited from interface TextVectorizer: getIndex, getVocabCache, transform, transform, vectorize, vectorize, vectorize, vectorize
protected transient TokenizerFactory tokenizerFactory
protected transient LabelAwareIterator iterator
protected int minWordFrequency
protected VocabCache<VocabWord> vocabCache
protected LabelsSource labelsSource
protected java.util.Collection<java.lang.String> stopWords
protected transient InvertedIndex<VocabWord> index
protected boolean isParallel
protected LabelsSource getLabelsSource()
public void buildVocab()
public void fit()
Description copied from interface: TextVectorizer
Train the model
Specified by: fit in interface TextVectorizer
public long numWordsEncountered()
Specified by: numWordsEncountered in interface TextVectorizer