def generate_dict_corpus_all_review():
    """
    Generate the gensim dictionary and bag-of-words corpus over the whole
    Yelp review corpus.

    Reads one JSON review per line from FULL_YELP_REVIEW_PATH, lowercases and
    tokenizes each review's "text" field (stripping non-word characters,
    digits, and stop words), builds a gensim Dictionary (dropping tokens that
    appear in fewer than 5 documents), and serializes the dictionary and the
    doc2bow corpus to DICT_PATH / DICT_TXT_PATH / CORPUS_PATH.

    :return: None (all artifacts are written to disk as side effects)
    """
    print('Generating new dict and corpus on all Yelp reviews')
    stoplist = load_stopword(STOPWORD_PATH)
    # Strip non-word characters and digits from each whitespace token.
    # (To keep words that contain letters, use r'\d+' alone instead.)
    # Compiled once here rather than implicitly per token inside the loop.
    cleaner = re.compile(r'\W+|\d+')
    texts = []
    # 'with' guarantees the file handle is closed even if a line fails to parse.
    with open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8') as review_file:
        for count, line in enumerate(review_file, start=1):
            if count % 10000 == 0:
                print(count)  # progress indicator for long runs
            json_review = json.loads(line.strip())
            # json.loads already yields str on Python 3 -- the original
            # .decode('utf-8') would raise AttributeError here. Default to ""
            # so a review without a "text" field doesn't crash on .lower().
            text = json_review.get("text", "").lower()
            tokens = [cleaner.sub('', word) for word in text.split()]
            # drop empty leftovers from the cleaning pass, and stop words
            tokens = [t for t in tokens if t.strip() and t not in stoplist]
            # NOTE: stemming was tried and abandoned (no measurable benefit)
            texts.append(tokens)
    print('Corpus preprocessing and counting accomplished!')
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=5)  # drop tokens seen in < 5 documents
    dictionary.save(DICT_PATH)              # binary dictionary, for future reference
    dictionary.save_as_text(DICT_TXT_PATH)  # human-readable dump of the same dict
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)  # store to disk, for later use
    print('Generating dict and corpus accomplished!')
# NOTE(review): the two lines below were stray page-navigation text scraped in
# with this snippet ("评论列表" = "comment list", "文章目录" = "table of
# contents"); as bare identifiers they would raise NameError at import time,
# so they are preserved here as a comment only.