def build_vocab(train_data, test_data):
    """Build word<->index vocabulary mappings over both datasets.

    Args:
        train_data: a ``(stories, questions, answers)`` triple, where
            ``stories`` is an iterable of stories (each a list of sentence
            strings) and ``questions``/``answers`` are iterables of strings.
        test_data: same structure as ``train_data``.

    Returns:
        Tuple ``(word2idx, idx2word)``. ``word2idx`` maps each lowercased
        token to an integer index, most frequent first starting at 1;
        index 0 is reserved for the ``"PAD"`` token. ``idx2word`` is the
        inverse mapping.
    """
    counter = collections.Counter()

    def _count_tokens(text):
        # Single place for the tokenize-lowercase-count step, instead of
        # repeating it for stories, questions and answers.
        for word in nltk.word_tokenize(text):
            counter[word.lower()] += 1

    # Vocabulary is built over train AND test so no test word is unseen.
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                _count_tokens(sent)
        for question in questions:
            _count_tokens(question)
        for answer in answers:
            _count_tokens(answer)

    # No OOV token here because there are not too many words in the dataset.
    word2idx = {w: (i + 1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0  # index 0 reserved for padding
    idx2word = {v: k for k, v in word2idx.items()}
    return word2idx, idx2word
# --- Scraped page metadata (translated from Chinese), kept as a comment so
# --- the file remains valid Python:
# Source file: mem-network.py
# Language: python
# Reads: 28 | Favorites: 0 | Likes: 0 | Comments: 0
# (Comment list / article table of contents omitted — page chrome only.)