def _build_vocab(self, filename):
    """Build ``self.word_to_id`` from the whitespace-separated tokens in a file.

    The file is read in large fixed-size chunks rather than line by line.
    Tokens are ranked by descending frequency (ties broken by the token
    itself, ascending) and numbered from 3 upwards; the EOS, BOS and PAD
    entries are then set to IEOS, IBOS and IPAD respectively.

    Args:
        filename: Path of the text file, opened through ``tf.gfile.GFile``.

    Side effects:
        Sets ``self.word_to_id`` (token -> integer id). Returns nothing.
    """
    chunk_size = 500000000 // 2
    counts = Counter()
    # Trailing partial token of the previous chunk. Without this, a word
    # that straddles a chunk boundary would be counted as two fragments.
    carry = ""
    with tf.gfile.GFile(filename, "r") as f:
        # NOTE(review): GFile.read(n) presumably counts bytes, not characters;
        # confirm a multi-byte character cannot be split at a chunk boundary.
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            text = carry + chunk
            # split() with no argument already splits on every kind of
            # whitespace, newlines included.
            tokens = text.split()
            if tokens and not text[-1].isspace():
                # The text ends mid-token: hold the last token back until
                # the next chunk (or end of file) completes it.
                carry = tokens.pop()
            else:
                carry = ""
            counts.update(tokens)
    if carry:
        # The file did not end with whitespace; count the final token.
        counts[carry] += 1

    sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    # Ids start at 3; presumably 0-2 are reserved for IEOS/IBOS/IPAD -- confirm
    # against the definitions of those constants.
    self.word_to_id = {word: idx for idx, (word, _) in enumerate(sorted_pairs, 3)}
    # Assigned last, so the special tokens override any identical token
    # found in the data.
    self.word_to_id[EOS] = IEOS
    self.word_to_id[BOS] = IBOS
    self.word_to_id[PAD] = IPAD
评论列表
文章目录