import io
import json
import os
import random

import numpy
import sense2vec
import spacy


def main(params):
    input_train_json = json.load(open(params['input_train_json'], 'r'))
    print("Load spaCy with GloVe vectors")
    nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    words_to_keep = build_vocab(
        nlp.tokenizer,
        [img['question'] for img in input_train_json],
        int(params['word_count_threshold']))
    # Store a vector for every kept word, so that rare words can be mapped
    # onto their nearest kept neighbour below.
    vectors = sense2vec.vectors.VectorMap(nlp.vocab.vectors_length)
    for string in words_to_keep:
        word = nlp.vocab[string]
        vectors.borrow(word.orth_, 1, numpy.ascontiguousarray(word.vector))
    paraphrases = []
    for i, word in enumerate(nlp.vocab):
        if word.orth_ in words_to_keep:
            word.norm_ = word.orth_
        elif word.lower_ in words_to_keep:
            word.norm_ = word.lower_
        elif word.is_alpha and word.has_vector:
            # Out-of-vocabulary word with a vector: normalise it to the most
            # similar kept word.
            vector = numpy.ascontiguousarray(word.vector, dtype='float32')
            synonyms, scores = vectors.most_similar(vector, 1)
            word.norm_ = synonyms[0]
            paraphrases.append((word.orth_, word.norm_))
        else:
            # No usable vector: fall back to the word's shape, e.g. 'Xxxx'.
            word.norm_ = word.shape_
        if i and i % 10000 == 0 and paraphrases:
            print(i, 'words processed. Example: %s --> %s' % random.choice(paraphrases))
    print('%d vector-based paraphrases' % len(paraphrases))
    if not os.path.exists(params['spacy_data']):
        os.mkdir(params['spacy_data'])
    if not os.path.exists(os.path.join(params['spacy_data'], 'vocab')):
        os.mkdir(os.path.join(params['spacy_data'], 'vocab'))
    if not os.path.exists(os.path.join(params['spacy_data'], 'tokenizer')):
        os.mkdir(os.path.join(params['spacy_data'], 'tokenizer'))
    nlp.vocab.dump(os.path.join(params['spacy_data'], 'vocab', 'lexemes.bin'))
    with io.open(os.path.join(params['spacy_data'], 'vocab', 'strings.json'), 'w',
                 encoding='utf8') as file_:
        nlp.vocab.strings.dump(file_)
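# build_vocab is referenced above but not included in this snippet. A minimal
# sketch of what it plausibly does, assuming it counts token frequencies over
# the training questions and keeps words occurring at least `threshold` times
# (hypothetical reconstruction, not the original implementation):
def build_vocab(tokenizer, questions, threshold):
    counts = {}
    for question in questions:
        for token in tokenizer(question):
            counts[token.orth_] = counts.get(token.orth_, 0) + 1
    # Keep only words whose frequency meets the threshold.
    return set(word for word, count in counts.items() if count >= threshold)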
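# Illustrative invocation of main. The key names match those read by the
# function above; the concrete path and threshold are placeholders, not
# values from the original project:
if __name__ == '__main__':
    main({
        'input_train_json': 'data/vqa_raw_train.json',  # hypothetical path
        'word_count_threshold': 5,                      # hypothetical value
        'spacy_data': 'spacy_data',                     # output directory
    })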
import spacy.en


def is_stop(w):
    # True if the word is in spaCy's English stop-word list (spaCy 1.x API).
    return w in spacy.en.STOP_WORDS
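# Quick sanity check for is_stop: common function words such as 'the' are in
# spaCy's English stop list, while content words are not (illustrative):
assert is_stop('the')
assert not is_stop('giraffe')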
import numpy as np
import spacy


def get_question_features(question):
    '''For a given question, a unicode string, returns the time-series tensor
    with each word (token) transformed into its 300-dimensional GloVe vector
    representation.'''
    # Note: loading the model on every call is expensive; in practice the
    # model should be loaded once and reused.
    word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # Alternative loading strategies from earlier experiments, kept for
    # reference:
    # word_embeddings = spacy.load('en')  # without the GloVe vector package
    # nlp = English()
    # n_dimensions = nlp.vocab.load_vectors('glove.840B.300d.txt.bz2')
    # print(n_dimensions)
    # tokens = n_dimensions
    # embeddings_index = {}
    # f = open('glove.6B.300d.txt')
    # for line in f:
    #     values = line.split()
    #     word = values[0]
    #     coefs = np.asarray(values[1:], dtype='float32')
    #     embeddings_index[word] = coefs
    # f.close()
    # print('Found %s word vectors.' % len(embeddings_index))
    # word_embeddings = spacy.load('en', vectors='glove.6B.30d.txt')
    tokens = word_embeddings(question)
    # One sample, up to 30 tokens, 300 dimensions per token (zero-padded).
    question_tensor = np.zeros((1, 30, 300))
    # Cap at 30 tokens so longer questions do not overflow the tensor.
    for j in range(min(len(tokens), 30)):
        question_tensor[0, j, :] = tokens[j].vector
    return question_tensor
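# Illustrative call: the result is a (1, 30, 300) array, zero-padded after the
# last token, ready to feed a 30-step sequence model (the question text is a
# made-up example):
question_features = get_question_features(u'What color is the umbrella?')
print(question_features.shape)  # (1, 30, 300)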