def create_processed_dataframe_from_mongo(dbname):
    '''
    Build the modelling dataframe for one mongo database.

    INPUT
         - dbname: name of the mongo database to pull the data from
    OUTPUT
         - df: the user dataframe, de-duplicated on `id`, limited to the
           users that have an entry in the feature dictionary, then run
           through the combine / process / drop-columns helpers

    Pipeline, in order:
        1. user info is extracted with the 'topictweets' argument
        2. feature info is extracted with the 'timelinetweets' argument
           (both strings are presumably collection names - confirm
           against the extract helpers)
        3. duplicate user ids are dropped, keeping the last occurrence
        4. only users whose id is a key of the feature dict are kept
        5. the remaining helpers shape the frame for modelling
    '''
    # NOTE: earlier revisions could load cached copies of both inputs from
    # 'data/clinton_df.csv' and 'data/clinton_tweets_dict.pkl' instead of
    # querying mongo; that path is no longer active.
    frame = extract_user_information_from_mongo(dbname, 'topictweets')
    timeline_features = extract_feature_information_from_mongo(
        dbname, 'timelinetweets')

    # One row per user id; when an id repeats, the last row wins.
    frame = frame.drop_duplicates(subset='id', keep='last')

    # Keep ONLY the users that have an entry in the feature dictionary.
    ids_with_features = set(timeline_features.keys())
    tweeting_users = frame[frame.id.isin(ids_with_features)]

    frame = combine_user_info_with_feature_dict(tweeting_users,
                                                timeline_features)
    frame = process_feature_information_for_modelling(frame,
                                                      timeline_features)
    return drop_unnecessary_columns_from_test_data(frame)
# (Web-page residue, not part of the program: "comment list" / "table of contents".)