def paragraph_to_words(paragraph, remove_stopwords=False, lemmatize=True, stem=False):
    """Turn one review record into a ``LabelDoc`` of cleaned, lowercase tokens.

    The markup in ``paragraph["review"]`` is reduced to plain text, every
    character outside ``a-z``/``A-Z`` is replaced by a space (so digits,
    punctuation and non-ASCII letters act as token separators), and the
    result is lowercased and split on whitespace. The optional steps then
    run in a fixed order: stopword removal, lemmatization, stemming.

    Args:
        paragraph: Record indexable by the keys ``"review"`` (text that may
            contain HTML) and ``"id"``.
        remove_stopwords: When true, drop every token listed in
            ``stopwords.words("english")``.
        lemmatize: When true, map each token through the module-level
            ``lemmatizer``.
        stem: When true, map each token through the module-level
            ``stemmer``.

    Returns:
        ``LabelDoc(words, paragraph["id"])`` where ``words`` is the list of
        processed tokens. ``LabelDoc`` is defined outside this block;
        presumably it pairs a token list with a document label -- confirm
        against its definition.
    """
    text = BeautifulSoup(paragraph["review"], "html.parser").get_text()
    # Keep ASCII letters only; everything else becomes a separator.
    text = re.sub("[^a-zA-Z]", " ", text)
    words = text.lower().split()

    if remove_stopwords:
        # A set gives O(1) membership checks while filtering.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmatize:
        words = [lemmatizer.lemmatize(w) for w in words]
    if stem:
        words = [stemmer.stem(w) for w in words]

    return LabelDoc(words, paragraph["id"])
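# Comment list          (page-navigation label left over from the source web page)
# Table of contents     (page-navigation label left over from the source web page)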