# NLTK supplies the tokenizer and n-gram helper; Vol is ConvNetPy's volume
# class (module path assumed). `corpus` is assumed to be an NLTK corpus
# reader bound elsewhere, e.g.: from nltk.corpus import brown as corpus
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from vol import Vol

def load_data():
    global N, words
    # Pull every word out of the corpus, re-tokenize on word characters,
    # and keep a small slice so training stays tractable
    raw = list(word
               for fileid in corpus.fileids()
               for word in corpus.words(fileid))
    words = RegexpTokenizer(r'\w+').tokenize(' '.join(raw))[100:1000]
    tokens = set(words)      # vocabulary
    tokens_l = list(tokens)  # fixed ordering for one-hot indexing
    N = len(tokens)
    print 'Vocabulary size: {} tokens'.format(N)
    step = 4  # 4-grams: three context words plus the target word
    index = {token: i for i, token in enumerate(tokens_l)}  # O(1) lookups
    data = []
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        # Multi-hot 1x1xN input volume with the three context words set
        V = Vol(1, 1, N, 0.0)
        V.w[index[w1]] = 1
        V.w[index[w2]] = 1
        V.w[index[w3]] = 1
        label = index[pred]
        data.append((V, label))
return data
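
The (Vol, label) pairs can then be fed straight into a ConvNetPy network. The sketch below assumes the ConvNetJS-style Net, Trainer, and layer-definition API that ConvNetPy mirrors; the module paths, hidden-layer size, and trainer options are illustrative assumptions, not taken from the original code.

# A minimal training loop over the data returned by load_data().
# Net, Trainer, and the layer-definition dicts follow ConvNetPy's
# ConvNetJS-style API; the hyperparameters here are assumptions.
from net import Net
from trainers import Trainer

data = load_data()

layers = [
    {'type': 'input', 'out_sx': 1, 'out_sy': 1, 'out_depth': N},
    {'type': 'fc', 'num_neurons': 50, 'activation': 'sigmoid'},
    {'type': 'softmax', 'num_classes': N},  # predict one of N vocabulary tokens
]
net = Net(layers)
trainer = Trainer(net, {'method': 'adadelta', 'batch_size': 10, 'l2_decay': 0.0001})

for V, label in data:
    trainer.train(V, label)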