# summarize.py source file - Python code snippet

def run(self):
        """Write the most-retweeted tweets found in the input to a CSV file.

        Reads the handle returned by ``self.input().open('r')`` line by line,
        parsing each line as one JSON-encoded tweet. Tweets whose
        ``retweet_count`` is missing or 0 are skipped. For a retweet (a tweet
        carrying ``retweeted_status``) the id of the original tweet is used,
        otherwise the tweet's own ``id_str``.

        At most 100 entries are kept, ordered by retweet count in descending
        order; entries with equal counts stay in the order they were read.
        The result is written to ``self.output()`` as CSV with the columns
        ``tweet_id`` and ``count``.
        """

        class Retweet(object):
            """A tweet id paired with the retweet count it was seen with."""

            def __init__(self, tweet_id, count):
                self.id = tweet_id
                self.count = count

            def __lt__(self, other):
                # Deliberately inverted so that bisect keeps the list sorted
                # from the highest count down to the lowest.
                return self.count > other.count

            def __repr__(self):
                return "%s [%s]" % (self.id, self.count)

        max_retweets = 100
        # Ids currently present in `retweets`, for O(1) duplicate checks.
        retweet_ids = set()
        retweets = []

        # Open the input in a context manager so the handle is closed even if
        # a line fails to parse.
        with self.input().open('r') as tweets_fh:
            for tweet_str in tweets_fh:
                tweet = json.loads(tweet_str)
                retweet_count = tweet.get('retweet_count', 0)
                if retweet_count == 0:
                    continue

                if 'retweeted_status' in tweet:
                    tweet_id = tweet['retweeted_status']['id_str']
                else:
                    tweet_id = tweet['id_str']

                # ignore duplicate tweets
                # NOTE: this only works for search data!
                if tweet_id in retweet_ids:
                    continue

                bisect.insort_right(
                    retweets,
                    Retweet(tweet_id, retweet_count)
                )
                retweet_ids.add(tweet_id)

                if len(retweets) > max_retweets:
                    # Drop the entry with the lowest count and forget its id
                    # so the id set only mirrors what is still in the list.
                    dropped = retweets.pop()
                    retweet_ids.remove(dropped.id)

        with self.output().open('w') as fh:
            writer = csv.DictWriter(fh, delimiter=',',
                                    quoting=csv.QUOTE_MINIMAL,
                                    fieldnames=['tweet_id', 'count'])
            writer.writeheader()
            for rt in retweets:
                writer.writerow({'tweet_id': rt.id, 'count': rt.count})