def txt_to_sent(sentences, word_vec, tokenize=True):
    """Tokenize raw sentences, wrap them in <s>/</s> markers, and keep only
    the tokens that have an entry in word_vec.

    Args:
        sentences: iterable of raw sentence strings.
        word_vec: mapping from word to vector (e.g. GloVe); only membership
            tests are performed on it here.
        tokenize: when True, tokenize with nltk.word_tokenize; otherwise use
            plain str.split.

    Returns:
        List of token lists. A sentence in which no token has a vector is
        replaced by ['</s>'] (with a warning) so downstream code never
        receives an empty sequence.
    """
    # Hoisted out of the per-sentence loop where it was originally executed
    # on every warning-triggering iteration.
    import warnings

    if tokenize:
        sentences = [['<s>'] + nltk.word_tokenize(s) + ['</s>'] for s in sentences]
    else:
        sentences = [['<s>'] + s.split() + ['</s>'] for s in sentences]
    n_w = np.sum([len(x) for x in sentences])

    # filters words without glove vectors
    for i, sent in enumerate(sentences):
        s_f = [word for word in sent if word in word_vec]
        if not s_f:
            warnings.warn('No words in "{0}" (idx={1}) have glove vectors. '
                          'Replacing by "</s>"..'.format(sent, i))
            s_f = ['</s>']
        sentences[i] = s_f

    n_wk = np.sum([len(s) for s in sentences])
    # Guard: original code divided by n_w unconditionally and raised
    # ZeroDivisionError on empty input.
    pct = round((100.0 * n_wk) / n_w, 2) if n_w else 0.0
    print('Nb words kept : {0}/{1} ({2} %)'.format(n_wk, n_w, pct))
    return sentences
# NOTE(review): removed scraped-page residue that followed this function
# ("评论列表" / "文章目录" — blog navigation text, i.e. "comment list" /
# "article table of contents"); as bare text it was a Python SyntaxError.