from nltk import FreqDist, pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

def analysis(reviews_collection_text):
    # Read the collection once: the raw text is used for corpus-level
    # statistics, the individual lines for per-comment lexical density.
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    comments = raw_data.splitlines()
    data_lower = raw_data.replace('\n', ' ').lower()
    # Tokenize twice: once keeping punctuation, once dropping it.
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    # Drop English stopwords before counting words, nouns and adjectives.
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if pos == 'NN']
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if pos == 'JJ']
    print("--- Most frequent adjectives ---\n", FreqDist(adjts).most_common(15))
    # Per-comment lexical density, averaged over every comment in the file
    # (empty comments contribute nothing to the sum but still count in the average).
    # lexical_density() is assumed to be defined elsewhere in the article.
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)