def generate_squad_vocab(path, vocabulary_size=30000):
    import json
    import itertools
    import nltk  # needed for nltk.tokenize.word_tokenize below
    from nltk.probability import FreqDist

    with open(path) as f:
        d = json.load(f)
    # SQuAD JSON layout: 'data' is a list of articles, each with 'paragraphs';
    # every paragraph has a 'context' passage and 'qas' question/answer entries.
    tokenized_sentences = []
    for reading in d['data']:
        for paragraph in reading['paragraphs']:
            sentence = paragraph['context'].lower()
            tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))
            for question in paragraph['qas']:
                sentence = question['question'].lower()  # TODO: later, check whether to add the answer as well
                tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))
    word_freq = FreqDist(itertools.chain(*tokenized_sentences))
    print('total unique words:', len(word_freq))
    # Write the full vocabulary, sorted by descending frequency, as
    # tab-separated "token<TAB>count" lines.
    full_vocab = word_freq.most_common(len(word_freq))
    with open('vocab_full.txt', 'w') as vocab_file:
        for word, count in full_vocab:
            vocab_file.write(word + '\t' + str(count) + '\n')
    # Write the truncated vocabulary: vocabulary_size - 1 entries, presumably
    # leaving one slot free for an out-of-vocabulary (UNK) token.
    shortened_vocab = word_freq.most_common(vocabulary_size - 1)
    with open('vocab.txt', 'w') as vocab_file:
        for word, count in shortened_vocab:
            vocab_file.write(word + '\t' + str(count) + '\n')
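
# A minimal usage sketch (not part of the original snippet): the path
# 'train-v1.1.json' is a hypothetical local copy of the SQuAD train split,
# and word_tokenize requires NLTK's 'punkt' tokenizer data to be available.
if __name__ == '__main__':
    import nltk
    nltk.download('punkt')  # fetch the tokenizer models used by word_tokenize
    generate_squad_vocab('train-v1.1.json', vocabulary_size=30000)
    # Afterwards, vocab_full.txt lists every token with its count, and
    # vocab.txt lists the vocabulary_size - 1 most frequent tokens.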