# _stream_processor.py -- file source, Python code snippet

def transform(self):
    """Flatten ``self.tweet`` and reshape it to match the DB table layout.

    The tweet is flattened with ``json_normalize``, restricted to the
    columns listed under ``'fields'`` in ``extract_fields.json`` (path built
    from ``self.loc``), extended with derived feature columns, and renamed.
    The result is stored on ``self.tweet_df``; nothing is returned.

    Fields listed in the JSON file but absent from the flattened tweet
    become all-NaN columns instead of raising.

    Raises:
        KeyError: if the JSON file has no ``'fields'`` entry.
    """
    flattened = json_normalize(self.tweet)

    # Only the columns named in extract_fields.json are carried forward.
    with open(self.loc.format('../../etl/extract/extract_fields.json')) as fp:
        fields_subset = json.load(fp)['fields']

    # reindex (rather than .loc[:, list]) tolerates fields missing from this
    # particular tweet and returns a new frame, so the column assignments
    # below never write into a view of `flattened`.
    tweet_df = flattened.reindex(columns=fields_subset)

    # Derived columns, added in the same order as before so the resulting
    # column order is unchanged.

    # Fixed at 0 for streamed tweets: the ratio used elsewhere
    # (retweet_count / favorite_count) isn't scaled properly when pulling
    # from the stream.
    tweet_df['retweets_to_faves'] = 0

    text = tweet_df['text']
    tweet_df['num_characters'] = text.apply(len)
    tweet_df['num_exclamation_points'] = text.apply(lambda t: t.count('!'))
    # Hard-coded to 0 in this code path.
    tweet_df['is_tweetstorm'] = 0
    tweet_df['is_trump_retweet'] = text.apply(is_retweet)
    tweet_df['num_uppercase_strings'] = text.apply(count_uppercase_substrings)
    tweet_df['source'] = tweet_df['source'].apply(normalize_tweet_sources)

    # Map the flattened (dotted) API names onto the DB column names.
    db_column_names = {
        'favorite_count': 'favorites',
        'quoted_status.text': 'quoted_status_text',
        'retweet_count': 'retweets',
        'source': 'tweet_source',
        'user.id_str': 'user_id_str',
        'user.name': 'user_name',
        'user.followers_count': 'followers',
        'user.screen_name': 'user_screen_name',
        'user.statuses_count': 'num_statuses',
    }
    self.tweet_df = tweet_df.rename(columns=db_column_names)