def process_text(self, text):
    """Tokenize ``text`` and return word counts computed from its tokens.

    Processing steps, in order:

    1. Extract tokens with ``self.regexp``; when that attribute is ``None``
       a default word pattern is used instead.
    2. Strip a trailing ``'s`` (matched case-insensitively) from each token.
    3. Drop tokens that consist solely of digits.
    4. Hand the remaining tokens to ``unigrams_and_bigrams`` when
       ``self.collocations`` is truthy, otherwise to ``process_tokens``.

    Parameters
    ----------
    text : string
        The text to tokenize.

    Returns
    -------
    word_counts
        The result of ``unigrams_and_bigrams(words, self.normalize_plurals)``
        or the first element of
        ``process_tokens(words, self.normalize_plurals)``.
        Presumably a mapping of word to count -- confirm against those
        helpers.
    """
    # Only Python 2 unicode input gets the UNICODE flag. The major version is
    # compared numerically: a string test such as ``sys.version < '3'`` is
    # lexicographic and would misclassify a version like "10.0".
    # ``unicode`` only exists on Python 2; the version check short-circuits
    # before the name is evaluated on Python 3.
    if sys.version_info[0] < 3 and isinstance(text, unicode):  # noqa: F821
        flags = UNICODE
    else:
        flags = 0

    pattern = self.regexp if self.regexp is not None else r"\w[\w']+"
    words = findall(pattern, text, flags)

    # NOTE(review): no stopword filtering happens in this method. The former
    # "remove stopwords" step was an identity copy of the list and has been
    # dropped; confirm whether stopwords are meant to be handled elsewhere.

    # Remove a trailing 's. This must run before the digit filter so that a
    # token like "1's" is reduced to "1" and then discarded.
    words = [word[:-2] if word.lower().endswith("'s") else word
             for word in words]

    # Remove tokens made up only of digits.
    words = [word for word in words if not word.isdigit()]

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)
    return word_counts
评论列表
文章目录