dataset.py 文件源码-python代码片段

dataset.py 文件源码

python

阅读 30 收藏 0 点赞 0 评论 0

def _build_vocab(self, filename, chunk_size=250000000):
    """Build ``self.word_to_id`` from the whitespace-separated tokens of a file.

    The file is consumed in chunks so that it never has to be held in memory
    all at once. Tokens are ranked by descending frequency (ties broken by
    ascending token order) and numbered from 3 upward. The special tokens
    EOS, BOS and PAD are then mapped to IEOS, IBOS and IPAD, overriding any
    id a same-named token from the corpus may have received.

    Args:
      filename: Path of the corpus, opened with ``tf.gfile.GFile`` in "r" mode.
      chunk_size: Size argument handed to each ``read()`` call. The default
        matches the previously hard-coded value.
    """
    counts = Counter()
    # A token cut in half by a chunk boundary is held back here and glued
    # onto the front of the next chunk, so it is counted once and intact
    # instead of as two bogus fragments.
    carry = ""
    with tf.gfile.GFile(filename, "r") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            chunk = carry + chunk
            # split() with no separator already treats "\n" (and any other
            # whitespace) as a delimiter, so no newline replacement is needed.
            words = chunk.split()
            if chunk[-1].isspace():
                carry = ""
            else:
                # The chunk ends mid-token; it is non-empty and its last
                # character is not whitespace, so `words` has at least one item.
                carry = words.pop()
            # update() adds in place rather than building a temporary Counter
            # and re-scanning the accumulated one on every chunk.
            counts.update(words)
    if carry:
        # The file did not end with whitespace: the held-back token is complete.
        counts[carry] += 1

    sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    # Ids start at 3, presumably because IEOS/IBOS/IPAD occupy 0-2 --
    # NOTE(review): confirm against their definitions.
    self.word_to_id = {e[0]: (i + 3) for (i, e) in enumerate(sorted_pairs)}
    self.word_to_id[EOS] = IEOS
    self.word_to_id[BOS] = IBOS
    self.word_to_id[PAD] = IPAD