# twitter_extractor.py -- source file, Python code snippet

def _get_user_tweets(self, screen_name):

        # TODO: Implement tweet limit

        # Twitter only allows access to a users most recent 3240 tweets with this method

        # initialize a list to hold all the tweepy Tweets
        alltweets = []

        # make initial request for most recent tweets (200 is the maximum allowed count)
        new_tweets = self._api.user_timeline(screen_name = screen_name,count=200)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # save the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        # keep grabbing tweets until there are no tweets left to grab
        while len(new_tweets) > 0:

            # all subsequent requests use the max_id param to prevent duplicates
            new_tweets = self._api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)

            # save most recent tweets
            alltweets.extend(new_tweets)

            # update the id of the oldest tweet less one
            oldest = alltweets[-1].id - 1

        # transform the tweepy tweets into a 2D array that will populate the csv
        outtweets = {tweet.id_str: {'created':tweet.created_at,'text':tweet.text} for tweet in alltweets}

        # Twitter-aware tokenizer
        tknzr = TweetTokenizer()

        # Extend data with linguistic processing
        for tweet_id in outtweets:

            # Get tweet data from dictionary
            tweet = outtweets[tweet_id]

            # Lowercase tokenized tweet text
            tweet_tokens = tknzr.tokenize(tweet['text'])

            # Parts-of-speech tags for tokenized text
            tweet_pos = nltk.pos_tag(tweet_tokens)

            # Is the tweet a rewteet?
            tweet['retweet'] = tweet_pos[0][0] == 'RT'

            # If retweeted, who was the original author?

            if tweet['retweet'] is True:
                tweet['rt_author'] = tweet_pos[1][0]
            else:
                tweet['rt_author'] = ''

        return outtweets

# TODO: Might have encoding issues. See: https://stackoverflow.com/questions/6539881/python-converting-from-iso-8859-1-latin1-to-utf-8