def load_data():
    """Build one word-frequency vector per corpus file.

    A ``FreqDist`` is computed for every file id in ``corpus``. The
    vocabulary is every distinct word seen in any file that is neither in
    ``ENGLISH_STOP_WORDS`` nor in ``punctuation``.

    Side effects:
        Rebinds the module globals ``words`` (the vocabulary list; index
        ``i`` of every vector refers to ``words[i]``) and ``N``
        (``len(words)``).

    Returns:
        A list with one ``(V, V.w)`` tuple per corpus file, where ``V`` is
        ``Vol(1, 1, N, 0.0)`` and ``V.w[i]`` has been set to
        ``dist.freq(words[i])`` for that file's distribution.
    """
    global N, words

    freqs = [FreqDist(corpus.words(fileid)) for fileid in corpus.fileids()]

    # NOTE(review): if `punctuation` is a plain string (e.g.
    # string.punctuation), `word not in punctuation` is a substring test
    # rather than a per-character one -- confirm that is what is intended.
    vocabulary = set(
        word
        for dist in freqs
        for word in dist
        if word not in ENGLISH_STOP_WORDS and word not in punctuation
    )

    # Sort so the word -> index mapping is identical on every run; the
    # iteration order of a set of strings is not stable across interpreter
    # invocations when hash randomisation is active.
    words = sorted(vocabulary)
    N = len(words)

    data = []
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))
    return data
评论列表
文章目录