public class DefaultVocabCache extends java.lang.Object implements VocabCache
| Constructor and Description |
|---|
| DefaultVocabCache() |
| DefaultVocabCache(int minWordFrequency)<br>Instantiate with a given min word frequency |
| Modifier and Type | Method and Description |
|---|---|
| int | getMinWordFrequency() |
| double | idf(java.lang.String word)<br>Number of documents word has occurred in |
| void | incrementCount(java.lang.String word)<br>Increment a word count by 1 |
| void | incrementCount(java.lang.String word, double by)<br>Increment count for a word |
| void | incrementDocCount(java.lang.String word)<br>Increment the doc count for a word by 1 |
| void | incrementDocCount(java.lang.String word, double by)<br>Increment the document count for a particular word |
| void | incrementNumDocs(double by)<br>Increment the number of documents |
| void | initialize(Configuration conf)<br>Configuration for initializing |
| int | minWordFrequency()<br>The min word frequency needed to be included in the vocab (default 5) |
| double | numDocs()<br>Number of documents |
| void | setMinWordFrequency(int minWordFrequency) |
| double | tfidf(java.lang.String word, double frequency)<br>Calculate the tfidf of the word given the document frequency |
| Index | vocabWords()<br>All of the vocab words (ordered); note that these are not all the possible tokens |
| java.lang.String | wordAt(int i)<br>Returns a word in the vocab at a particular index |
| double | wordFrequency(java.lang.String word)<br>Get the word frequency for a word |
public DefaultVocabCache(int minWordFrequency)
Parameters: minWordFrequency - the min word frequency to instantiate with
public DefaultVocabCache()
public void incrementNumDocs(double by)
Description copied from interface: VocabCache
Specified by: incrementNumDocs in interface VocabCache
public double numDocs()
Description copied from interface: VocabCache
Specified by: numDocs in interface VocabCache
public java.lang.String wordAt(int i)
Description copied from interface: VocabCache
Specified by: wordAt in interface VocabCache
Parameters: i - the index to get
public void initialize(Configuration conf)
Description copied from interface: VocabCache
Specified by: initialize in interface VocabCache
Parameters: conf - the configuration to initialize with
public double wordFrequency(java.lang.String word)
Description copied from interface: VocabCache
Specified by: wordFrequency in interface VocabCache
Parameters: word - the word to get frequency for
public int minWordFrequency()
Description copied from interface: VocabCache
Specified by: minWordFrequency in interface VocabCache
public Index vocabWords()
Description copied from interface: VocabCache
Specified by: vocabWords in interface VocabCache
public void incrementDocCount(java.lang.String word)
Description copied from interface: VocabCache
Specified by: incrementDocCount in interface VocabCache
Parameters: word - the word to increment the count for
public void incrementDocCount(java.lang.String word,
double by)
Description copied from interface: VocabCache
Specified by: incrementDocCount in interface VocabCache
Parameters: word - the word to increment the count for
Parameters: by - the amount to increment by
public void incrementCount(java.lang.String word)
Description copied from interface: VocabCache
Specified by: incrementCount in interface VocabCache
Parameters: word - the word to increment the count for
public void incrementCount(java.lang.String word,
double by)
Description copied from interface: VocabCache
Specified by: incrementCount in interface VocabCache
Parameters: word - the word to increment the count for
Parameters: by - the amount to increment by
public double idf(java.lang.String word)
Description copied from interface: VocabCache
Specified by: idf in interface VocabCache
Parameters: word - the word to get the idf for
public double tfidf(java.lang.String word,
double frequency)
Description copied from interface: VocabCache
Specified by: tfidf in interface VocabCache
Parameters: word - the word to get frequency for
Parameters: frequency - the frequency
public int getMinWordFrequency()
public void setMinWordFrequency(int minWordFrequency)