public interface VocabCache<T extends SequenceElement>
extends java.io.Serializable
Modifier and Type | Method and Description |
---|---|
void |
addToken(T element)
Adds a token
to the cache
|
void |
addWordToIndex(int index,
long elementId) |
void |
addWordToIndex(int index,
java.lang.String word) |
boolean |
containsWord(java.lang.String word)
Returns true if the cache contains the given word
|
int |
docAppearedIn(java.lang.String word)
Count of documents a word appeared in
|
T |
elementAtIndex(int index)
Returns SequenceElement at the given index or null
|
boolean |
hasToken(java.lang.String token)
Returns whether the cache
contains this token or not
|
void |
importVocabulary(VocabCache<T> vocabCache)
Imports vocabulary from the given cache
|
void |
incrementDocCount(java.lang.String word,
long howMuch)
Increment the document count
|
void |
incrementTotalDocCount()
Increment the doc count
|
void |
incrementTotalDocCount(long by)
Increment the doc count
|
void |
incrementWordCount(java.lang.String word)
Increment the count for the given word
|
void |
incrementWordCount(java.lang.String word,
int increment)
Increment the count for the given word by
the amount increment
|
int |
indexOf(java.lang.String word)
Returns the index of a given word
|
void |
loadVocab()
Load vocab
|
int |
numWords()
Returns the number of words in the cache
|
void |
putVocabWord(java.lang.String word)
Deprecated.
|
void |
removeElement(java.lang.String label)
Removes element with specified label from vocabulary
Please note: Huffman index should be updated after element removal
|
void |
removeElement(T element)
Removes specified element from vocabulary
Please note: Huffman index should be updated after element removal
|
void |
saveVocab()
Saves the vocab: this allows for reuse of word frequencies
|
void |
setCountForDoc(java.lang.String word,
long count)
Set the count for the number of documents the word appears in
|
T |
tokenFor(long id) |
T |
tokenFor(java.lang.String word)
Returns the token (again not necessarily in the vocab)
for this word
|
java.util.Collection<T> |
tokens()
All of the tokens in the cache (not necessarily a part of the vocab)
|
long |
totalNumberOfDocs()
Returns the total number of documents encountered in the corpus
|
long |
totalWordOccurrences()
The total number of word occurrences
|
void |
updateWordsOccurencies()
Updates counters
|
boolean |
vocabExists()
Vocab exists already
|
java.util.Collection<T> |
vocabWords()
Returns all of the vocab word nodes
|
java.lang.String |
wordAtIndex(int index)
Returns the word contained at the given index or null
|
T |
wordFor(long id) |
T |
wordFor(java.lang.String word) |
int |
wordFrequency(java.lang.String word)
Returns the number of times the word has occurred
|
java.util.Collection<java.lang.String> |
words()
Returns all of the words in the vocab
|
void loadVocab()
boolean vocabExists()
void saveVocab()
java.util.Collection<java.lang.String> words()
void incrementWordCount(java.lang.String word)
word
- the word to increment the count for
void incrementWordCount(java.lang.String word, int increment)
word
- the word to increment the count for
increment
- the amount to increment by
int wordFrequency(java.lang.String word)
word
- the word to retrieve the occurrence frequency for
boolean containsWord(java.lang.String word)
word
- the word to check for
java.lang.String wordAtIndex(int index)
index
- the index of the word to get
T elementAtIndex(int index)
index
- the index of the element to get
int indexOf(java.lang.String word)
word
- the word to look up the index of
java.util.Collection<T> vocabWords()
long totalWordOccurrences()
T wordFor(java.lang.String word)
word
-
T wordFor(long id)
void addWordToIndex(int index, java.lang.String word)
index
-
word
-
void addWordToIndex(int index, long elementId)
@Deprecated void putVocabWord(java.lang.String word)
word
- the word to add to the vocab
int numWords()
int docAppearedIn(java.lang.String word)
word
- the word to get the document count for
void incrementDocCount(java.lang.String word, long howMuch)
word
- the word whose document count is incremented
howMuch
- the amount to increment the document count by
void setCountForDoc(java.lang.String word, long count)
word
- the word to set the count for
count
- the count of the word
long totalNumberOfDocs()
void incrementTotalDocCount()
void incrementTotalDocCount(long by)
by
- the number to increment by
java.util.Collection<T> tokens()
void addToken(T element)
element
- the word to add
T tokenFor(java.lang.String word)
word
- the word to get the token for
T tokenFor(long id)
boolean hasToken(java.lang.String token)
token
- the token to test
void importVocabulary(VocabCache<T> vocabCache)
vocabCache
- the vocabulary cache to import
void updateWordsOccurencies()
void removeElement(java.lang.String label)
label
- label of the element to be removed
void removeElement(T element)
element
- SequenceElement to be removed