dataset.py 文件源码-python代码片段

dataset.py 文件源码

python

阅读 44 收藏 0 点赞 0 评论 0

项目：LinguisticAnalysis 作者: DucAnhPhi 项目源码文件源码

def extract_features(tweet, combinedKeywords, pronDict):
    """Assemble the feature list for a single tweet.

    The tweet is first passed through ``ut.preprocess``. If the result has
    length zero, or ``ut.is_retweet`` reports the raw tweet as a retweet,
    no features are extracted and an empty list is returned.

    Args:
        tweet: The raw tweet; used for the retweet check and the
            exclamation-mark feature.
        combinedKeywords: Forwarded unchanged to ``get_keywords_count``.
        pronDict: Forwarded unchanged to ``get_flesch_grade_level``.

    Returns:
        ``[]`` for skipped tweets. Otherwise a list holding, in order: the
        result of ``la.get_sentence_length``, the result of
        ``la.get_exclamation_marks``, the result of
        ``get_flesch_grade_level``, followed by one value per key of the
        keyword-count mapping, taken in sorted key order.
    """
    cleaned = ut.preprocess(tweet)

    # Guard clause: skip tweets that preprocess to nothing, and retweets.
    if len(cleaned) == 0 or ut.is_retweet(tweet):
        return []

    # Fixed-position features come first.
    leading = [
        la.get_sentence_length(cleaned),
        la.get_exclamation_marks(tweet),
        get_flesch_grade_level(cleaned, pronDict),
    ]

    # Sorting the keys keeps the keyword columns in the same position for
    # every tweet, regardless of the mapping's own iteration order.
    counts = get_keywords_count(cleaned, combinedKeywords)
    keyword_values = [counts.get(name) for name in sorted(counts.keys())]

    return leading + keyword_values