def load_data():
    """Build the labelled data set from at most the first 10000 corpus posts.

    Every post becomes a ``Vol(1, 1, N, 0.0)`` whose ``w[i]`` is set to
    ``FreqDist.freq(words[i])`` for that post, paired with the position of
    the post's ``class`` attribute inside ``labels``.

    Side effects:
        Rebinds the module-level globals ``words`` (the vocabulary),
        ``labels`` (the distinct ``class`` values) and ``N``
        (``len(words)``).

    Returns:
        A list of ``(Vol, label_index)`` tuples, one per post, in the same
        order as the posts.
    """
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    # NOTE(review): if ``post.text`` is a plain string, FreqDist iterates it
    # character by character, so the "words" below would be single
    # characters -- confirm whether tokenisation was intended.
    freqs = [FreqDist(post.text) for post in posts]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # feature index of each word (and the id of each label) is reproducible
    # from run to run; list(set(...)) ordering depends on per-process string
    # hash randomisation.
    words = list(dict.fromkeys(
        word
        for dist in freqs
        for word in dist.keys()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation
    ))
    labels = list(dict.fromkeys(post.get('class') for post in posts))
    N = len(words)

    # One-time lookup table instead of a linear labels.index() per post.
    label_index = {label: idx for idx, label in enumerate(labels)}

    data = []
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, label_index[post.get('class')]))
    return data
评论列表
文章目录