def _get_base_doge_words(self, eng_text):
    """
    Get all base words from text to make doge phrases from.

    Punctuation characters are removed, the remaining purely alphabetic
    tokens are lower-cased and part-of-speech tagged with ``nltk.pos_tag``,
    and only words whose tag starts with 'N' (noun), 'V' (verb) or
    'J' (adjective) are kept. Nouns and verbs are passed through
    ``self._lemmatizer``; adjectives are kept as tagged.

    Illustrative example (the exact result depends on the tagger and
    lemmatizer in use):
    'Hello there, I am happy' -> ['hello', 'be', 'happy']

    Args:
        eng_text (str): Text to get words from.
    Returns:
        list[str]: Unique lower case words to use from text, in order of
            first appearance. Empty if no usable word is found.
    """
    punctuation = set(string.punctuation)
    phrase_no_punct = "".join(ch for ch in eng_text if ch not in punctuation)

    # split() without an argument splits on any run of whitespace, so words
    # separated by newlines or tabs are not glued together and then dropped
    # by the isalpha() filter.
    tokens = [w.lower() for w in phrase_no_punct.split() if w.isalpha()]
    if not tokens:
        return []

    # first letter of the POS tag -> pos argument passed to the lemmatizer
    # (noun -> singular form, verb -> infinitive); adjectives have no entry.
    lemma_pos = {'N': 'n', 'V': 'v'}

    chosen_words = []
    seen = set()
    for word, tag in nltk.pos_tag(tokens):
        kind = tag[:1]  # slice instead of index: safe on an empty tag
        if kind not in ('N', 'V', 'J'):
            continue
        if kind in lemma_pos:
            word = self._lemmatizer.lemmatize(word, pos=lemma_pos[kind])
        # Drop non-ASCII characters and hand back a native str; encode()
        # alone would yield bytes on Python 3.
        word = str(word.encode('ascii', 'ignore').decode('ascii'))
        # Skip words that were entirely non-ASCII, and de-duplicate while
        # keeping a deterministic (first-seen) order.
        if word and word not in seen:
            seen.add(word)
            chosen_words.append(word)
    return chosen_words
# Comment list
# Article table of contents