def bag_of_words(list_of_strings, remove_puncs=True, remove_digits=True, remove_alnums=True):
    """Build an ordered word-frequency table from a collection of strings.

    Each string is tokenized with ``custom_tokenizer``, tokens are optionally
    filtered (alphabetic-only, or non-numeric), stemmed and lemmatized, and
    English stop words are removed before counting.

    Args:
        list_of_strings: iterable of raw text strings.
        remove_puncs: forwarded to ``custom_tokenizer`` to strip punctuation.
        remove_digits: drop tokens that parse as numbers. Only consulted when
            ``remove_alnums`` is False, because alphabetic-only filtering
            already excludes digit tokens.
        remove_alnums: keep only purely alphabetic tokens (``str.isalpha``).

    Returns:
        OrderedDict mapping token -> frequency, sorted alphabetically by token.
    """
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()
    # Accumulate tokens under a name that does NOT shadow this function,
    # so the function object stays reachable inside its own body.
    tokens = []
    for string in tqdm(list_of_strings):
        string_tokens = custom_tokenizer(string, remove_puncs=remove_puncs, get_unique=True)
        tokens.extend(string_tokens)
    if remove_alnums:
        tokens = [tok for tok in tokens if tok.isalpha()]
    elif remove_digits:
        tokens = [tok for tok in tokens if not isNumber(tok)]
    tokens.sort()
    # Stem and lemmatize; fall back to the raw token if NLTK raises.
    # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
    stemmed = []
    for word in tokens:
        try:
            stemmed.append(porter.stem(lmtz.lemmatize(word)))
        except Exception:
            stemmed.append(word)
    # Remove stop words. NOTE(review): this runs AFTER stemming, so stemmed
    # forms that no longer match an English stop word survive — behavior
    # preserved from the original.
    stop = set(stopwords.words('english'))
    print('Removing Stop words...')
    # Normalize once per token instead of twice (the original called
    # ``strip().lower()`` in both the filter and the result expression).
    normalized = (tok.strip().lower() for tok in stemmed)
    cleaned = [tok for tok in normalized if tok not in stop]
    counts = Counter(cleaned)
    return OrderedDict(sorted(counts.items()))
# NOTE(review): removed scraped-page navigation residue that trailed this
# function ("评论列表" = comment list, "文章目录" = article table of contents) —
# blog-page artifacts, not program content; they were a SyntaxError as-is.