json_collection.py 文件源码-python代码片段

def get_iterator(self):
    """Yield parsed tweets from a line-delimited JSON file.

    The file at ``self.filepath`` is opened according to
    ``self.compression`` (``'bz2'``, ``'gzip'`` or anything else for a plain
    file) and read line by line. Each line is decoded with
    ``json_util.loads`` and yielded if it passes both ``self.filter`` and
    ``self.custom_filters``. When ``self.should_strip`` is truthy the tweet
    is first reduced via ``TweetParser.strip_tweet`` using
    ``self.keep_fields``.

    ``self.limit`` bounds the number of *lines read* (not the number of
    tweets yielded); ``0`` means no limit. Reaching the limit ends the
    iteration without printing the verbose summary.

    Side effects:
        For compressed input, ``self.mode`` is overwritten with the result
        of ``binary_mode(self.mode)``.
        When ``self.verbose`` is truthy and the file is read to the end, a
        two-line summary of good/corrupt rows is printed.

    Raises:
        Whatever ``json_util.loads`` raises on an undecodable line, but only
        when ``self.throw_error`` is truthy. Otherwise such lines are
        counted as corrupt and skipped.
    """
    tweet_parser = TweetParser()
    # NOTE(review): the stdlib bz2.open/gzip.open reject ``encoding`` when
    # the mode is binary -- confirm what binary_mode() returns and whether
    # self.encoding is None for compressed input.
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        json_handle = open(self.filepath, self.mode, encoding=self.encoding)

    total_lines = 0
    bad_lines = 0
    # The context manager closes the handle on every exit path: normal
    # exhaustion, hitting the limit, an exception, or the consumer dropping
    # the generator before it is exhausted.
    with json_handle:
        for count, raw_line in enumerate(json_handle):
            # Check the limit before parsing so a line that would be
            # discarded anyway is never decoded (and cannot raise).
            if self.limit != 0 and self.limit <= count:
                return
            total_lines += 1
            try:
                tweet = json_util.loads(raw_line)
            except Exception:
                if self.throw_error:
                    raise
                # Corrupt line: count it and move on instead of feeding the
                # raw, unparsed text into the filters below.
                bad_lines += 1
                continue
            if tweet_parser.tweet_passes_filter(self.filter, tweet) \
                    and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet

    if self.verbose:
        # total_lines is a real line count, so this is correct for empty
        # files and is not off by one like the last enumerate() index.
        print("{} rows are ok.".format(total_lines - bad_lines))
        print("{} rows are corrupt.".format(bad_lines))