import collections

import jieba
import numpy as np


def prepare_data(self):
    # tokenize each raw sentence with jieba
    corpus_cut = [jieba.lcut(s) for s in self.raw_corpus]
    # flatten the tokenized sentences into a single list of tokens
    # (equivalently: vocabs = reduce(lambda x, y: x + y, corpus_cut))
    vocabs = []
    for sentence in corpus_cut:
        for token in sentence:
            vocabs.append(token)
    # count the frequency of every token; for now we keep even the most
    # frequent tokens rather than filtering them out
    counter = collections.Counter(vocabs).most_common()
    vocabs_set, _ = zip(*counter)
    # map each token to an integer id, ordered by descending frequency
    vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}
    data_flatten = np.array([vocab_int_map[v] for v in vocabs])
    # slice the id sequence into windows of n_steps + 1 ids, moving 3 tokens at a time
    data = np.array([data_flatten[i: i + self.n_steps + 1]
                     for i in range(0, data_flatten.shape[0] - self.n_steps - 1, 3)])
    # shuffle the windows so batches are not ordered by corpus position
    np.random.shuffle(data)
    return len(vocabs_set), vocab_int_map, data
Source: shakespeare.py
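For context, here is a minimal sketch of how prepare_data might be driven. The ShakespeareModel wrapper, its raw_corpus and n_steps attributes, and the sample sentences are assumptions for illustration only and do not appear in shakespeare.py.

# Hypothetical wrapper, assumed for illustration; shakespeare.py may organize this differently.
class ShakespeareModel:
    def __init__(self, raw_corpus, n_steps):
        self.raw_corpus = raw_corpus   # assumed: list of raw strings to tokenize
        self.n_steps = n_steps         # assumed: each window holds n_steps + 1 token ids

    prepare_data = prepare_data        # reuse the method defined above


model = ShakespeareModel(raw_corpus=["生存还是毁灭", "这是一个值得考虑的问题"], n_steps=3)
vocab_size, vocab_int_map, data = model.prepare_data()
print(vocab_size, data.shape)          # data has shape (num_windows, n_steps + 1)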