load_test_data.py 文件源码-python代码片段

load_test_data.py 文件源码

python

阅读 18 收藏 0 点赞 0 评论 0

项目：BotBoosted 作者: brityboy 项目源码文件源码

def create_processed_dataframe_from_mongo(dbname):
    '''
    INPUT
         - dbname: name of the mongo database that the user and tweet
         information is read from
    OUTPUT
         - df: dataframe ready to be used for modelling

    Builds the modelling dataframe in a fixed sequence: load the user
    records (requested with 'topictweets') and the feature dictionary
    (requested with 'timelinetweets'), keep only the last row for each
    id, keep only the rows whose id is a key of the feature dictionary,
    merge the features in, process them for modelling, and drop the
    columns that are not needed for test data.
    '''
    user_df = extract_user_information_from_mongo(dbname, 'topictweets')
    feature_dict = extract_feature_information_from_mongo(
        dbname, 'timelinetweets')
    # NOTE: an earlier offline variant loaded these two inputs from
    # 'data/clinton_df.csv' (with id cast to str) and
    # 'data/clinton_tweets_dict.pkl' instead of querying mongo.

    # keep='last' retains the final occurrence of every duplicated id
    user_df = user_df.drop_duplicates(subset='id', keep='last')

    # only users that appear in the feature dictionary can be modelled
    ids_with_features = set(feature_dict.keys())
    has_features = user_df['id'].isin(ids_with_features)
    tweeting_users = user_df[has_features]

    combined = combine_user_info_with_feature_dict(tweeting_users,
                                                   feature_dict)
    processed = process_feature_information_for_modelling(combined,
                                                          feature_dict)
    return drop_unnecessary_columns_from_test_data(processed)