public class TextPipeline
extends java.lang.Object
| Constructor and Description |
| --- |
| `TextPipeline()` |
| `TextPipeline(org.apache.spark.api.java.JavaRDD<java.lang.String> corpusRDD, org.apache.spark.broadcast.Broadcast<java.util.Map<java.lang.String,java.lang.Object>> broadcasTokenizerVarMap)` |
| Modifier and Type | Method and Description |
| --- | --- |
| `void` | `buildVocabCache()` |
| `void` | `buildVocabWordListRDD()` |
| `void` | `filterMinWordAddVocab(Counter<java.lang.String> wordFreq)` |
| `org.apache.spark.broadcast.Broadcast<VocabCache<VocabWord>>` | `getBroadCastVocabCache()` |
| `org.apache.spark.api.java.JavaRDD<java.util.concurrent.atomic.AtomicLong>` | `getSentenceCountRDD()` |
| `org.apache.spark.api.java.JavaRDD<Pair<java.util.List<java.lang.String>,java.util.concurrent.atomic.AtomicLong>>` | `getSentenceWordsCountRDD()` |
| `java.lang.Long` | `getTotalWordCount()` |
| `VocabCache<VocabWord>` | `getVocabCache()` |
| `org.apache.spark.api.java.JavaRDD<java.util.List<VocabWord>>` | `getVocabWordListRDD()` |
| `org.apache.spark.Accumulator<Counter<java.lang.String>>` | `getWordFreqAcc()` |
| `void` | `setRDDVarMap(org.apache.spark.api.java.JavaRDD<java.lang.String> corpusRDD, org.apache.spark.broadcast.Broadcast<java.util.Map<java.lang.String,java.lang.Object>> broadcasTokenizerVarMap)` |
| `org.apache.spark.api.java.JavaRDD<java.util.List<java.lang.String>>` | `tokenize()` |
| `org.apache.spark.api.java.JavaRDD<Pair<java.util.List<java.lang.String>,java.util.concurrent.atomic.AtomicLong>>` | `updateAndReturnAccumulatorVal(org.apache.spark.api.java.JavaRDD<java.util.List<java.lang.String>> tokenizedRDD)` |
public TextPipeline()
public TextPipeline(org.apache.spark.api.java.JavaRDD<java.lang.String> corpusRDD, org.apache.spark.broadcast.Broadcast<java.util.Map<java.lang.String,java.lang.Object>> broadcasTokenizerVarMap) throws java.lang.Exception
Throws: `java.lang.Exception`
public void setRDDVarMap(org.apache.spark.api.java.JavaRDD<java.lang.String> corpusRDD, org.apache.spark.broadcast.Broadcast<java.util.Map<java.lang.String,java.lang.Object>> broadcasTokenizerVarMap)
public org.apache.spark.api.java.JavaRDD<java.util.List<java.lang.String>> tokenize()
public org.apache.spark.api.java.JavaRDD<Pair<java.util.List<java.lang.String>,java.util.concurrent.atomic.AtomicLong>> updateAndReturnAccumulatorVal(org.apache.spark.api.java.JavaRDD<java.util.List<java.lang.String>> tokenizedRDD)
public void filterMinWordAddVocab(Counter<java.lang.String> wordFreq)
public void buildVocabCache()
public void buildVocabWordListRDD()
public org.apache.spark.Accumulator<Counter<java.lang.String>> getWordFreqAcc()
public org.apache.spark.broadcast.Broadcast<VocabCache<VocabWord>> getBroadCastVocabCache() throws java.lang.IllegalStateException
Throws: `java.lang.IllegalStateException`
public VocabCache<VocabWord> getVocabCache() throws java.lang.IllegalStateException
Throws: `java.lang.IllegalStateException`
public org.apache.spark.api.java.JavaRDD<Pair<java.util.List<java.lang.String>,java.util.concurrent.atomic.AtomicLong>> getSentenceWordsCountRDD()
public org.apache.spark.api.java.JavaRDD<java.util.List<VocabWord>> getVocabWordListRDD() throws java.lang.IllegalStateException
Throws: `java.lang.IllegalStateException`
public org.apache.spark.api.java.JavaRDD<java.util.concurrent.atomic.AtomicLong> getSentenceCountRDD() throws java.lang.IllegalStateException
Throws: `java.lang.IllegalStateException`
public java.lang.Long getTotalWordCount()