def generate_dict_corpus_all_review():
    """
    Generate the gensim dictionary and bag-of-words corpus over the whole
    Yelp review corpus.

    Reads one JSON review per line from FULL_YELP_REVIEW_PATH, lowercases and
    tokenizes each review's "text" field (stripping non-word characters,
    digits, and stop words), builds a gensim Dictionary (dropping tokens that
    appear in fewer than 5 documents), and serializes the dictionary and the
    doc2bow corpus to DICT_PATH / DICT_TXT_PATH / CORPUS_PATH.

    :return: None (all artifacts are written to disk as side effects)
    """
    print('Generating new dict and corpus on all Yelp reviews')
    stoplist = load_stopword(STOPWORD_PATH)
    # Strip non-word characters and digits from each whitespace token.
    # (To keep words that contain letters, use r'\d+' alone instead.)
    # Compiled once here rather than implicitly per token inside the loop.
    cleaner = re.compile(r'\W+|\d+')
    texts = []
    # 'with' guarantees the file handle is closed even if a line fails to parse.
    with open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8') as review_file:
        for count, line in enumerate(review_file, start=1):
            if count % 10000 == 0:
                print(count)  # progress indicator for long runs
            json_review = json.loads(line.strip())
            # json.loads already yields str on Python 3 -- the original
            # .decode('utf-8') would raise AttributeError here. Default to ""
            # so a review without a "text" field doesn't crash on .lower().
            text = json_review.get("text", "").lower()
            tokens = [cleaner.sub('', word) for word in text.split()]
            # drop empty leftovers from the cleaning pass, and stop words
            tokens = [t for t in tokens if t.strip() and t not in stoplist]
            # NOTE: stemming was tried and abandoned (no measurable benefit)
            texts.append(tokens)
    print('Corpus preprocessing and counting accomplished!')
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=5)  # drop tokens seen in < 5 documents
    dictionary.save(DICT_PATH)              # binary dictionary, for future reference
    dictionary.save_as_text(DICT_TXT_PATH)  # human-readable dump of the same dict
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)  # store to disk, for later use
    print('Generating dict and corpus accomplished!')
# NOTE(review): the two lines below were stray page-navigation text scraped in
# with this snippet ("评论列表" = "comment list", "文章目录" = "table of
# contents"); as bare identifiers they would raise NameError at import time,
# so they are preserved here as a comment only.