Python: example source code using en()

create_spacy_paraphraser.py — project: dsr16_nlp, author: honnibal — reads: 17, favorites: 0, likes: 0, comments: 0
# Targets spaCy 1.x and the legacy sense2vec VectorMap API.
import io
import json
import os
import random

import numpy
import sense2vec.vectors
import spacy


def main(params):
    with open(params['input_train_json'], 'r') as f:
        input_train_json = json.load(f)
    print("Load spaCy with GloVe vectors")
    nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # build_vocab is assumed to be defined elsewhere in this file; it returns the
    # set of question words occurring at least word_count_threshold times.
    words_to_keep = build_vocab(
                        nlp.tokenizer,
                        [img['question'] for img in input_train_json],
                        int(params['word_count_threshold']))
    # Keep a vector for every retained word so that rarer words can be mapped
    # to their nearest retained neighbour below.
    vectors = sense2vec.vectors.VectorMap(nlp.vocab.vectors_length)
    for string in words_to_keep:
        word = nlp.vocab[string]
        vectors.borrow(word.orth_, 1, numpy.ascontiguousarray(word.vector))
    paraphrases = []
    for i, word in enumerate(nlp.vocab):
        if word.orth_ in words_to_keep:
            word.norm_ = word.orth_
        elif word.lower_ in words_to_keep:
            word.norm_ = word.lower_
        elif word.is_alpha and word.has_vector:
            # Word outside the kept vocabulary but with a vector: paraphrase it
            # as its nearest neighbour among the kept words.
            vector = numpy.ascontiguousarray(word.vector, dtype='float32')
            synonyms, scores = vectors.most_similar(vector, 1)
            word.norm_ = synonyms[0]
            paraphrases.append((word.orth_, word.norm_))
        else:
            # No usable vector: fall back to the word's shape (e.g. 'Xxxx').
            word.norm_ = word.shape_
        if i and i % 10000 == 0 and paraphrases:
            print(i, 'words processed. Example: %s --> %s' % random.choice(paraphrases))
    print('%d vector-based paraphrases' % len(paraphrases))
    if not os.path.exists(params['spacy_data']):
        os.mkdir(params['spacy_data'])
    if not os.path.exists(os.path.join(params['spacy_data'], 'vocab')):
        os.mkdir(os.path.join(params['spacy_data'], 'vocab'))
    if not os.path.exists(os.path.join(params['spacy_data'], 'tokenizer')):
        os.mkdir(os.path.join(params['spacy_data'], 'tokenizer'))

    nlp.vocab.dump(os.path.join(params['spacy_data'], 'vocab', 'lexemes.bin'))
    with io.open(os.path.join(params['spacy_data'], 'vocab', 'strings.json'), 'w',
            encoding='utf8') as file_:
        nlp.vocab.strings.dump(file_)
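
For context, a minimal sketch of how this entry point might be driven. The parameter names match the keys read inside main(), but the argparse wrapper and the concrete default paths are assumptions for illustration, not part of the original file:

import argparse

if __name__ == '__main__':
    # Hypothetical driver; the original script's own argument handling is not shown on this page.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_train_json', default='data/vqa_raw_train.json')  # assumed path
    parser.add_argument('--word_count_threshold', default=5, type=int)
    parser.add_argument('--spacy_data', default='spacy_data')  # output directory
    args = parser.parse_args()
    main(vars(args))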
eval_entity_coref.py — project: OKR, author: vered1986 — reads: 21, favorites: 0, likes: 0, comments: 0
# spaCy 1.x exposes the English stop-word list as spacy.en.STOP_WORDS.
import spacy.en


def is_stop(w):
    return w in spacy.en.STOP_WORDS
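
A quick usage check (the sample words are illustrative):

print(is_stop('the'))      # True: 'the' is an English stop word
print(is_stop('giraffe'))  # False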
demo.py — project: VQA-Demo-GUI, author: anujshah1003 — reads: 35, favorites: 0, likes: 0, comments: 0
import numpy as np
import spacy


def get_question_features(question):
    ''' For a given question, a unicode string, returns the time-series tensor
    with each word (token) transformed into a 300-dimensional representation
    calculated using GloVe vectors. '''
    # Loading the model on every call is slow; the original code does it here.
    word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
#    word_embeddings = spacy.load('en')#, vectors='en_glove_cc_300_1m_vectors')

#    Alternative loading approaches left commented out in the original file:
#    nlp = English()
#    n_dimensions = nlp.vocab.load_vectors('glove.840B.300d.txt.bz2')
#    print n_dimensions
#    tokens = n_dimensions

#    embeddings_index = {}
#    f = open('glove.6B.300d.txt')
#    for line in f:
#        values = line.split()
#        word = values[0]
#        coefs = np.asarray(values[1:], dtype='float32')
#        embeddings_index[word] = coefs
#    f.close()
#
#    print('Found %s word vectors.' % len(embeddings_index))
#
#    word_embeddings = spacy.load('en', vectors='glove.6B.30d.txt')

    tokens = word_embeddings(question)
    # Fixed-size output: up to 30 tokens, 300 dimensions each, zero-padded.
    question_tensor = np.zeros((1, 30, 300))
    for j in range(min(len(tokens), 30)):
        question_tensor[0, j, :] = tokens[j].vector
    return question_tensor
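
A brief usage sketch, assuming the spaCy 1.x 'en' model with the GloVe vectors named above is installed; the sample question is illustrative:

features = get_question_features(u'What color is the cat on the sofa?')
print(features.shape)  # (1, 30, 300): one question, up to 30 tokens, 300-dim vectors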

