from sklearn.feature_extraction.text import CountVectorizer


def textToTokens(text):
    """Converts an input string into a corpus of tokenized sentences.

    Assumes that sentences are separated by newlines (empty sentences are ignored).
    You can use this to try out your own datasets, but it is not needed for reading
    the homework data.
    """
    corpus = []
    sents = text.split("\n")
    count_vect = CountVectorizer()
    count_vect.fit(sents)
    # build_tokenizer() returns the regexp-based tokenizer CountVectorizer uses internally
    tokenizer = count_vect.build_tokenizer()
    for s in sents:
        toks = tokenizer(s)
        if len(toks) > 0:
            corpus.append(toks)
    return corpus
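
A minimal usage sketch, assuming scikit-learn is installed; sample_text below is just an illustrative two-sentence string (note that the default CountVectorizer tokenizer drops punctuation and single-character tokens):

# Hypothetical example input, one sentence per line
sample_text = "The quick brown fox jumps over the lazy dog.\nColorless green ideas sleep furiously.\n"
corpus = textToTokens(sample_text)
print(corpus)
# Expected output with scikit-learn's default token pattern:
# [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
#  ['Colorless', 'green', 'ideas', 'sleep', 'furiously']]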