def text2idx2(texts, vocab, dim, use_mask=False):
    """
    Convert a list of texts to their corresponding vocabulary indexes.

    Each text is tokenized with ``wordpunct_tokenize`` and truncated to at
    most ``dim`` tokens. Every kept token is replaced by ``vocab[token]``,
    or by ``-1`` when the token is not in ``vocab``.

    Args:
        texts: Sized iterable of texts to convert (``len(texts)`` rows).
        vocab: Mapping from token to integer index; only ``in`` and
            ``[]`` lookups are used.
        dim: Maximum number of tokens kept per text, and the width of the
            returned arrays.
        use_mask: When true, also return a mask marking real tokens.

    Returns:
        ``(out, out_lst)`` when ``use_mask`` is false, otherwise
        ``(out, mask, out_lst)``:

        * ``out`` -- int32 array of shape ``(len(texts), dim)``. Positions
          past the end of a text hold the padding value: ``-2`` without a
          mask, ``-1`` with a mask (the same code as an unknown word; the
          mask is what tells them apart).
        * ``mask`` -- float32 array of the same shape, ``1.0`` at every
          position holding a real token and ``0.0`` at padding.
        * ``out_lst`` -- list of the (truncated) token lists, one per text.
    """
    n_texts = len(texts)
    pad_value = -1 if use_mask else -2
    out = np.full((n_texts, dim), pad_value, dtype=np.int32)
    mask = np.zeros((n_texts, dim), dtype=np.float32) if use_mask else None

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]
        for j, word in enumerate(words):
            out[i, j] = vocab[word] if word in vocab else -1  # -1: unknown word
        out_lst.append(words)
        if use_mask:
            # Slice by token count, not by the last loop index: the stop of
            # a slice is exclusive, and the loop index is undefined (or
            # stale from the previous text) when a text has no tokens.
            mask[i, :len(words)] = 1.0

    if use_mask:
        return out, mask, out_lst
    return out, out_lst
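# Web-page residue from the source this snippet was copied from:
# "评论列表" (comment list) / "文章目录" (table of contents).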