def text_to_vector(sentence_list, MAX_SENTENCE=78, model=None):
    """Convert tokenized sentences into padded word-embedding arrays.

    Args:
        sentence_list: Sequence of sentences, each an iterable of word tokens.
        MAX_SENTENCE: Number of time steps per sentence; tokens beyond this
            position are dropped and shorter sentences stay zero-padded.
        model: Mapping-like word-vector model supporting ``word in model``
            and ``model[word]`` with 300-dimensional vectors. When ``None``,
            the word2vec binary under ``../storage/pos_tagger/`` is loaded
            on every call, so pass a preloaded model when calling repeatedly.

    Returns:
        A tuple ``(X, capitals, mask)``:
        X: array of shape ``(MAX_SENTENCE, len(sentence_list), 300)`` holding
            the word vectors; words missing from ``model`` stay all-zero.
        capitals: array of shape ``(MAX_SENTENCE, len(sentence_list), 3)``
            holding ``cap_vector(word)`` for each stored token (presumably a
            capitalization feature -- confirm against ``cap_vector``).
        mask: 1-D array with the number of tokens stored per sentence
            (0 for an empty sentence, at most ``MAX_SENTENCE``).
    """
    if model is None:
        # NOTE(review): newer gensim releases expose this loader on
        # KeyedVectors instead of Word2Vec -- confirm against the installed
        # gensim version before upgrading.
        model = models.Word2Vec.load_word2vec_format(
            local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'), binary=True)

    n_sentences = len(sentence_list)
    X = np.zeros((MAX_SENTENCE, n_sentences, 300))
    capitals = np.zeros((MAX_SENTENCE, n_sentences, 3))

    mask = []
    for i, sentence in enumerate(sentence_list):
        # Count stored tokens explicitly rather than reusing the loop index,
        # so an empty sentence yields 0 instead of a stale or unbound value.
        length = 0
        for word in sentence:
            if length == MAX_SENTENCE:
                break
            # X is zero-initialized, so out-of-vocabulary words need no write.
            if word in model:
                X[length, i] = model[word]
            capitals[length, i] = cap_vector(word)
            length += 1
        mask.append(length)

    return X, capitals, np.array(mask)
# Comment list
# Table of contents