def lemmatize_text(text, stop_words=STOPLIST, keep_pos=KEEP_POS):
    '''
    Function to lemmatize a single document of the corpus

    INPUT:
        text: string, text of review
        stop_words: collection of words to remove from text; each token's
            punctuation-stripped lemma is tested for membership in it
            (default: module-level STOPLIST)
        keep_pos: parts of speech to keep in text; a token is kept only if
            its pos_ is in this collection (default: module-level KEEP_POS)
    OUTPUT:
        lemmatized text: the kept lemmas joined by single spaces, followed
        by one 'boss' for every token whose lemma is 'bos'
    '''
    doc = nlp(text)

    words = []
    for tok in doc:
        if tok.pos_ not in keep_pos:
            continue
        # Strip once and reuse for both the stop-word test and the output.
        lemma = tok.lemma_.strip(punctuation)
        # Filter against the caller-supplied stop_words, not the global
        # STOPLIST, so that a custom stop list actually takes effect.
        if lemma not in stop_words:
            words.append(lemma)

    # NOTE(review): presumably compensates for 'boss' being lemmatized to
    # 'bos' -- confirm. These additions bypass the POS and stop-word filters.
    words.extend('boss' for tok in doc if tok.lemma_ == 'bos')

    return ' '.join(words)
# Comment list
# Table of contents