def create_corpus(fileids, max_length=None):
    """Build a list of cleaned document strings from Reuters fileids.

    Each document is lowercased, tokenized into purely alphabetic words
    (the tokenizer pattern drops punctuation and digits), filtered of
    English stopwords, optionally truncated, and re-joined into a single
    space-separated string.

    Parameters
    ----------
    fileids : iterable of str
        Reuters corpus file identifiers accepted by ``reuters.raw``.
    max_length : int or None, optional
        If given, keep at most this many words per document.

    Returns
    -------
    list of str
        One cleaned string per input fileid, in input order.
    """
    stop_words = set(stopwords.words("english"))
    # r"[A-Za-z]+" keeps alphabetic runs only, so punctuation/numbers vanish.
    tokenizer = nltk.tokenize.RegexpTokenizer(r"[A-Za-z]+")
    corpus = []
    for doc in fileids:
        words = [
            w
            for w in (t.lower() for t in tokenizer.tokenize(reuters.raw(doc)))
            if w not in stop_words
        ]
        # Explicit None check (not truthiness) so max_length=0 is honored
        # as "truncate to empty" rather than being silently ignored.
        if max_length is not None:
            words = words[:max_length]
        corpus.append(" ".join(words))
    return corpus