def extract_features(tweet, combinedKeywords, pronDict):
    """Build the feature list for a single tweet.

    Args:
        tweet: The raw tweet. It is passed to ``ut.preprocess``,
            ``ut.is_retweet`` and ``la.get_exclamation_marks``.
        combinedKeywords: Forwarded unchanged to ``get_keywords_count``
            (presumably the keywords to count -- confirm against that
            helper).
        pronDict: Forwarded unchanged to ``get_flesch_grade_level``
            (presumably a pronunciation dictionary -- confirm against
            that helper).

    Returns:
        An empty list when the preprocessed tweet is empty or
        ``ut.is_retweet(tweet)`` is true. Otherwise a list laid out as
        ``[sentence length, exclamation marks, grade level]`` followed by
        the values of the mapping returned by ``get_keywords_count``,
        ordered by sorted key.
    """
    preprocessed = ut.preprocess(tweet)
    # Ignore empty preprocessed tweets and retweets.
    if len(preprocessed) == 0 or ut.is_retweet(tweet):
        return []

    sentLength = la.get_sentence_length(preprocessed)
    # Exclamation marks are taken from the raw tweet, not the preprocessed text.
    exclMarks = la.get_exclamation_marks(tweet)
    gradeLvl = get_flesch_grade_level(preprocessed, pronDict)
    keyCount = get_keywords_count(preprocessed, combinedKeywords)

    features = [sentLength, exclMarks, gradeLvl]
    # Sorting the keys keeps the keyword columns in a deterministic order.
    features.extend(keyCount[key] for key in sorted(keyCount))
    return features
# Page-navigation residue from the source web page: "Comment list", "Table of contents".