import numpy as np
import spacy


def get_question_features(question):
    ''' For a given question (a unicode string), return the time-series tensor
    of shape (1, 30, 300) in which each word (token) is mapped to its
    300-dimensional GloVe vector. '''
    # spaCy 1.x style load: the English model together with its GloVe vectors.
    word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # Alternative: parse a raw GloVe text file (e.g. glove.6B.300d.txt) into a
    # word -> vector dictionary; a sketch of that approach follows this function.
    tokens = word_embeddings(question)
    question_tensor = np.zeros((1, 30, 300))
    # Cap at 30 tokens so the assignment never runs past the tensor's second axis.
    for j in xrange(min(len(tokens), 30)):
        question_tensor[0, j, :] = tokens[j].vector
    return question_tensor
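

# The commented-out lines in the original version built the embedding lookup by
# hand from a downloaded GloVe text file instead of going through spaCy. Below is
# a minimal sketch of that route, assuming a local copy of glove.6B.300d.txt (the
# file named in the original comments); the function name and its argument are
# illustrative, not from the source.
def load_glove_embeddings(path='glove.6B.300d.txt'):
    ''' Parse a GloVe text file into a word -> 300-dimensional vector dict. '''
    embeddings_index = {}
    with open(path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index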
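

# A quick usage check (a sketch, assuming the spaCy English model and the
# 'en_glove_cc_300_1m_vectors' package referenced above are installed); the
# example question string is illustrative, not from the source.
if __name__ == '__main__':
    features = get_question_features(u"What color is the car on the left?")
    print(features.shape)       # (1, 30, 300)
    print(features[0, 0, :5])   # first 5 GloVe dimensions of the first token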