from nltk.stem import PorterStemmer, WordNetLemmatizer
from keras.preprocessing.text import text_to_word_sequence  # tensorflow.keras.preprocessing.text on TF2 installs


def get_encoded_vector(list_of_words, new_string):
    """Encode new_string as a list of vocabulary indices, wrapped in START_SEQ/END_SEQ markers."""
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()
    # Make sure the special tokens are present in the vocabulary
    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')
    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')
    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')
    tokens = text_to_word_sequence(new_string, lower=True, split=" ")
    # Lemmatize, then stem each token; fall back to the raw token if either step fails
    token_stemmed = []
    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except Exception:
            token_stemmed.append(token)
    tokens = list(token_stemmed)
    out = []
    all_unknown_words = True
    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)
    # Wrap the encoded sentence with the start/end markers
    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
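
For reference, here is a minimal usage sketch with a hypothetical vocabulary (the word list and sentence below are made up for illustration; NLTK's WordNet data must be downloaded once for the lemmatizer to work):

import nltk
nltk.download('wordnet')  # one-time download needed by WordNetLemmatizer

vocab = ['i', 'love', 'nlp']
encoded = get_encoded_vector(vocab, "I love coding")
# vocab is extended in place with START_SEQ / UNKNOWN_WORDS / END_SEQ, so the
# result should be something like [3, 0, 1, 4, 5]:
# START_SEQ, 'i', 'love', UNKNOWN_WORDS (for the out-of-vocabulary 'coding'), END_SEQ
print(encoded)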