def preprocess(self, input_file, vocab_file, tensor_file):
    """Read a raw text corpus, build the vocabulary, and encode the text as ids.

    Side effects:
      - sets self.vocab (word -> id) and self.words (id-ordered word list)
        via self.build_vocab
      - sets self.tensor to a 1-D numpy array of word ids
      - pickles self.words to vocab_file
      - saves self.tensor to tensor_file with np.save (.npy format)

    Args:
        input_file: path to the raw text file to read.
        vocab_file: path the word list is pickled to.
        tensor_file: path the encoded id array is saved to.
    """
    with open(input_file, "r") as f:
        data = f.read()
    # Optional text cleaning (lower-casing etc.) is delegated to clean_str.
    data = self.clean_str(data)
    x_text = data.split()
    self.vocab, self.words = self.build_vocab(x_text)
    with open(vocab_file, 'wb') as f:
        cPickle.dump(self.words, f)
    # Map each token to its id; out-of-vocabulary tokens fall back to 'UNK'.
    # dict.get replaces the Python-2-only dict.has_key (removed in Python 3).
    unk_id = self.vocab['UNK']
    self.tensor = np.asarray([self.vocab.get(word, unk_id) for word in x_text])
    # Persist the encoded corpus so later runs can skip preprocessing.
    np.save(tensor_file, self.tensor)
# Scraping artifact — blog-page navigation text, not part of the code:
# 评论列表 ("comment list")
# 文章目录 ("article table of contents")