from nltk.stem import PorterStemmer, WordNetLemmatizer
from keras.preprocessing.text import text_to_word_sequence  # tensorflow.keras.preprocessing.text on TF2 installs


def get_encoded_vector(list_of_words, new_string):
    """Encode new_string as a list of vocabulary indices, wrapped in START_SEQ/END_SEQ markers."""
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()
    # Make sure the special tokens are present in the vocabulary
    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')
    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')
    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')
    tokens = text_to_word_sequence(new_string, lower=True, split=" ")
    # Lemmatize, then stem each token; fall back to the raw token if either step fails
    token_stemmed = []
    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except Exception:
            token_stemmed.append(token)
    tokens = list(token_stemmed)
    out = []
    all_unknown_words = True
    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)
    # Wrap the encoded sentence with the start/end markers
    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
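
For reference, here is a minimal usage sketch with a hypothetical vocabulary (the word list and sentence below are made up for illustration; NLTK's WordNet data must be downloaded once for the lemmatizer to work):

import nltk
nltk.download('wordnet')  # one-time download needed by WordNetLemmatizer

vocab = ['i', 'love', 'nlp']
encoded = get_encoded_vector(vocab, "I love coding")
# vocab is extended in place with START_SEQ / UNKNOWN_WORDS / END_SEQ, so the
# result should be something like [3, 0, 1, 4, 5]:
# START_SEQ, 'i', 'love', UNKNOWN_WORDS (for the out-of-vocabulary 'coding'), END_SEQ
print(encoded)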