def txt_to_sent(sentences, word_vec, tokenize=True):
    """Tokenize raw sentences, wrap them in <s>/</s> markers, and keep only
    the tokens that have an entry in word_vec.

    Args:
        sentences: iterable of raw sentence strings.
        word_vec: mapping from word to vector (e.g. GloVe); only membership
            tests are performed on it here.
        tokenize: when True, tokenize with nltk.word_tokenize; otherwise use
            plain str.split.

    Returns:
        List of token lists. A sentence in which no token has a vector is
        replaced by ['</s>'] (with a warning) so downstream code never
        receives an empty sequence.
    """
    # Hoisted out of the per-sentence loop where it was originally executed
    # on every warning-triggering iteration.
    import warnings

    if tokenize:
        sentences = [['<s>'] + nltk.word_tokenize(s) + ['</s>'] for s in sentences]
    else:
        sentences = [['<s>'] + s.split() + ['</s>'] for s in sentences]
    n_w = np.sum([len(x) for x in sentences])

    # filters words without glove vectors
    for i, sent in enumerate(sentences):
        s_f = [word for word in sent if word in word_vec]
        if not s_f:
            warnings.warn('No words in "{0}" (idx={1}) have glove vectors. '
                          'Replacing by "</s>"..'.format(sent, i))
            s_f = ['</s>']
        sentences[i] = s_f

    n_wk = np.sum([len(s) for s in sentences])
    # Guard: original code divided by n_w unconditionally and raised
    # ZeroDivisionError on empty input.
    pct = round((100.0 * n_wk) / n_w, 2) if n_w else 0.0
    print('Nb words kept : {0}/{1} ({2} %)'.format(n_wk, n_w, pct))
    return sentences
# NOTE(review): removed scraped-page residue that followed this function
# ("评论列表" / "文章目录" — blog navigation text, i.e. "comment list" /
# "article table of contents"); as bare text it was a Python SyntaxError.