def preprocess(self, input_file, vocab_file, tensor_file):
    """Tokenize a UTF-8 text file, build the vocabulary and encode each line.

    Every line of ``input_file`` is stripped and split on whitespace. The
    token lists are handed to ``self.build_vocab``, whose two return values
    are stored as ``self.vocab`` and ``self.words``. ``self.words`` is
    pickled to ``vocab_file``. Each line is then encoded as a list of ids
    and stored in ``self.raw_data``.

    Args:
        input_file: Path of the UTF-8 encoded text corpus.
        vocab_file: Path the pickled ``self.words`` is written to.
        tensor_file: Currently unused; saving ``self.raw_data`` to this
            path is disabled. Kept so existing callers keep working.

    Side effects:
        Sets ``self.vocab``, ``self.words``, ``self.vocab_size`` and
        ``self.raw_data``; writes ``vocab_file``.
    """
    with codecs.open(input_file, 'r', 'utf-8') as f:
        lines = f.readlines()

    # The text is already decoded, so a leading byte-order mark shows up as
    # the single character U+FEFF, not as the byte string codecs.BOM_UTF8.
    # U+FEFF is not whitespace, so strip() would leave it glued to the first
    # token. The ``lines`` guard keeps an empty file from raising IndexError.
    if lines and lines[0].startswith(u'\ufeff'):
        lines[0] = lines[0][1:]

    lines = [line.strip().split() for line in lines]

    self.vocab, self.words = self.build_vocab(lines)
    self.vocab_size = len(self.words)

    with open(vocab_file, 'wb') as f:
        cPickle.dump(self.words, f)

    # Each encoded line is framed by seq_length copies of id 0 in front and
    # seq_length copies of id 2 behind; tokens missing from the vocabulary
    # fall back to id 1.
    # NOTE(review): presumably 0/1/2 are the start-pad/unknown/end-pad ids
    # assigned by build_vocab -- confirm against that method.
    head_padding = [0] * self.seq_length
    tail_padding = [2] * self.seq_length
    self.raw_data = [
        head_padding + [self.vocab.get(w, 1) for w in line] + tail_padding
        for line in lines
    ]
input_data.py 文件源码
python
阅读 26
收藏 0
点赞 0
评论 0
评论列表
文章目录