import json
import os
from collections import Counter

from nltk.tokenize import word_tokenize


def create_vocab(self, dataset_path, vocab_path, max_vocab_size):
    print("generating vocab from dataset at {}".format(dataset_path))
    all_words = []
    # Collect lowercased tokens from all three SNLI splits.
    for dataset in ["snli_1.0_train.jsonl", "snli_1.0_dev.jsonl", "snli_1.0_test.jsonl"]:
        with open(os.path.join(dataset_path, dataset), "r") as f:
            for line in f:
                data = json.loads(line)
                all_words += word_tokenize(data["sentence1"].lower())
                all_words += word_tokenize(data["sentence2"].lower())
    # Sort by frequency (descending), breaking ties alphabetically.
    counter = Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    # Reserve id 0 for PAD and id 1 for UNK, then truncate to max_vocab_size.
    words = ["PAD", "UNK"] + list(words)
    word_to_id = dict(zip(words[:max_vocab_size], range(max_vocab_size)))
    with open(vocab_path, "w") as f:
        for word, idx in word_to_id.items():
            f.write("{}\t{}\n".format(word, idx))
    print("vocab of size {} written to {}, with PAD token == 0, UNK token == 1".format(
        len(word_to_id), vocab_path))