from nltk import FreqDist, pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

def analysis(reviews_collection_text):
    # Read the collection once: the raw text is used for corpus-level
    # statistics, the individual lines for per-comment lexical density.
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    comments = raw_data.splitlines()
    data_lower = raw_data.replace('\n', ' ').lower()
    # Tokenize twice: once keeping punctuation, once dropping it.
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    # Drop English stopwords before counting words, nouns and adjectives.
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if pos == 'NN']
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if pos == 'JJ']
    print("--- Most frequent adjectives ---\n", FreqDist(adjts).most_common(15))
    # Per-comment lexical density, averaged over every comment in the file
    # (empty comments contribute nothing to the sum but still count in the average).
    # lexical_density() is assumed to be defined elsewhere in the article.
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)