import re

import nltk
import dhh_preprocess_tools  # project module providing HFST lemmatization


def token_func(input_string):
    tokens = nltk.word_tokenize(input_string)
    long_tokens = []
    refined_tokens = []
    stopwordlist = get_stopwordlist("../data/first_stopwordlist.txt")
    # Strip every character that is not an ASCII digit or letter.
    regex = re.compile('[^0-9a-zA-Z]')
    for token in tokens:
        token = regex.sub('', token)
        # Keep only tokens longer than three characters.
        if len(token) > 3:
            long_tokens.append(token)
    # Lemmatize and keep only verbs, nouns, adjectives, and proper nouns.
    lemmatized_tokens = dhh_preprocess_tools.hfst_words(long_tokens,
                                                        filter=('VERB',
                                                                'NOUN',
                                                                'ADJ',
                                                                'PROPN'))
    for token in lemmatized_tokens:
        token = token.lower()
        # Drop stopwords from the final token list.
        if token not in stopwordlist:
            refined_tokens.append(token)
    return refined_tokens
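# token_func depends on get_stopwordlist, which is not defined in this
# snippet. A minimal sketch under assumed conventions (a UTF-8 text file
# with one stopword per line) might look like this:
def get_stopwordlist(path):
    with open(path, encoding='utf-8') as f:
        return {line.strip().lower() for line in f if line.strip()}


# Example call (assumes NLTK's tokenizer data is installed, e.g. via
# nltk.download('punkt'), and that dhh_preprocess_tools is importable;
# the sample sentence is purely illustrative):
print(token_func("The quick brown foxes were jumping over the lazy dogs."))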