def generate_squad_vocab(path, vocabulary_size=30000):
    import json
    import itertools
    import nltk  # needed for nltk.tokenize.word_tokenize below
    from nltk.probability import FreqDist

    with open(path) as f:
        d = json.load(f)
    # SQuAD JSON layout: 'data' is a list of articles, each with 'paragraphs';
    # every paragraph has a 'context' passage and 'qas' question/answer entries.
    tokenized_sentences = []
    for reading in d['data']:
        for paragraph in reading['paragraphs']:
            sentence = paragraph['context'].lower()
            tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))
            for question in paragraph['qas']:
                sentence = question['question'].lower()  # TODO: later, check whether to add the answer as well
                tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))
    word_freq = FreqDist(itertools.chain(*tokenized_sentences))
    print('total unique words:', len(word_freq))
    # Write the full vocabulary, sorted by descending frequency, as
    # tab-separated "token<TAB>count" lines.
    full_vocab = word_freq.most_common(len(word_freq))
    with open('vocab_full.txt', 'w') as vocab_file:
        for word, count in full_vocab:
            vocab_file.write(word + '\t' + str(count) + '\n')
    # Write the truncated vocabulary: vocabulary_size - 1 entries, presumably
    # leaving one slot free for an out-of-vocabulary (UNK) token.
    shortened_vocab = word_freq.most_common(vocabulary_size - 1)
    with open('vocab.txt', 'w') as vocab_file:
        for word, count in shortened_vocab:
            vocab_file.write(word + '\t' + str(count) + '\n')
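
# A minimal usage sketch (not part of the original snippet): the path
# 'train-v1.1.json' is a hypothetical local copy of the SQuAD train split,
# and word_tokenize requires NLTK's 'punkt' tokenizer data to be available.
if __name__ == '__main__':
    import nltk
    nltk.download('punkt')  # fetch the tokenizer models used by word_tokenize
    generate_squad_vocab('train-v1.1.json', vocabulary_size=30000)
    # Afterwards, vocab_full.txt lists every token with its count, and
    # vocab.txt lists the vocabulary_size - 1 most frequent tokens.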