def transform(self):
    """Flatten ``self.tweet`` into a DataFrame and store it on ``self.tweet_df``.

    Steps:
      1. Normalize the (nested) tweet in ``self.tweet`` with ``json_normalize``.
      2. Keep only the columns listed under the ``"fields"`` key of
         ``extract_fields.json``. Listed columns that are absent from this
         tweet are kept as all-NaN columns instead of raising.
      3. Add derived feature columns computed from the tweet text and
         normalize the ``source`` column.
      4. Rename columns (per the original author's note, to match the DB
         table layout).

    Returns:
        None. The result is assigned to ``self.tweet_df``.

    Raises:
        KeyError: if the JSON file has no ``"fields"`` entry.
    """
    tweet_df = json_normalize(self.tweet)

    # Drop all columns from tweet_df that we're not using in extract_fields.
    with open(self.loc.format('../../etl/extract/extract_fields.json')) as fp:
        # Subscript (not .get) so a malformed config fails loudly here
        # rather than silently keeping every column below.
        fields_subset = json.load(fp)['fields']

    # reindex() instead of .loc[:, fields_subset]: .loc raises KeyError on
    # current pandas when any listed label is missing from the frame, while
    # reindex fills such columns with NaN. It also returns a new frame, so
    # the column assignments below do not write into a slice.
    tweet_df = tweet_df.reindex(columns=fields_subset)

    # Perform transformations on the DF to get it into the same form as the
    # DB table.
    text = tweet_df['text']

    # Placeholder: the retweet_count / favorite_count ratio is not computed
    # because this feature isn't scaled properly when pulling from the stream.
    tweet_df['retweets_to_faves'] = 0
    tweet_df['num_characters'] = text.apply(len)
    tweet_df['num_exclamation_points'] = text.apply(lambda t: t.count('!'))
    tweet_df['is_tweetstorm'] = 0
    tweet_df['is_trump_retweet'] = text.apply(is_retweet)
    tweet_df['num_uppercase_strings'] = text.apply(count_uppercase_substrings)
    tweet_df['source'] = tweet_df['source'].apply(normalize_tweet_sources)

    self.tweet_df = tweet_df.rename(columns={
        'favorite_count': 'favorites',
        'quoted_status.text': 'quoted_status_text',
        'retweet_count': 'retweets',
        'source': 'tweet_source',
        'user.id_str': 'user_id_str',
        'user.name': 'user_name',
        'user.followers_count': 'followers',
        'user.screen_name': 'user_screen_name',
        'user.statuses_count': 'num_statuses',
    })
# [web page residue] Comment list
# [web page residue] Table of contents