import json
import os
from collections import Counter

from nltk.tokenize import word_tokenize


def create_vocab(self, dataset_path, vocab_path, max_vocab_size):
    print("generating vocab from dataset at {}".format(dataset_path))
    all_words = []
    # Collect lowercased tokens from all three SNLI splits.
    for dataset in ["snli_1.0_train.jsonl", "snli_1.0_dev.jsonl", "snli_1.0_test.jsonl"]:
        with open(os.path.join(dataset_path, dataset), "r") as f:
            for line in f:
                data = json.loads(line)
                all_words += word_tokenize(data["sentence1"].lower())
                all_words += word_tokenize(data["sentence2"].lower())
    # Sort by frequency (descending), breaking ties alphabetically.
    counter = Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    # Reserve id 0 for PAD and id 1 for UNK, then truncate to max_vocab_size.
    words = ["PAD", "UNK"] + list(words)
    word_to_id = dict(zip(words[:max_vocab_size], range(max_vocab_size)))
    with open(vocab_path, "w") as f:
        for word, idx in word_to_id.items():
            f.write("{}\t{}\n".format(word, idx))
    print("vocab of size {} written to {}, with PAD token == 0, UNK token == 1".format(
        len(word_to_id), vocab_path))