def get_iterator(self):
    """Yield tweets parsed from the line-delimited file at ``self.filepath``.

    The file is opened with ``bz2.open`` or ``gzip.open`` when
    ``self.compression`` is ``'bz2'`` or ``'gzip'`` respectively, and with the
    builtin ``open`` otherwise. Each line is decoded with ``json_util.loads``
    and yielded if it passes both ``self.filter`` and ``self.custom_filters``.
    When ``self.should_strip`` is set, the tweet is first reduced with
    ``TweetParser.strip_tweet`` using ``self.keep_fields``.

    ``self.limit`` caps the number of *lines read* (not tweets yielded);
    ``0`` means no limit. Reaching the limit ends iteration without printing
    the verbose summary.

    Side effects:
        For compressed input, ``self.mode`` is overwritten with
        ``binary_mode(self.mode)``. When ``self.verbose`` is set and the file
        is read to the end, a two-line summary is printed.

    Raises:
        Whatever ``json_util.loads`` raises on an unparsable line, but only
        when ``self.throw_error`` is true. Otherwise such lines are counted
        as corrupt and skipped.
    """
    tweet_parser = TweetParser()

    # NOTE(review): bz2.open/gzip.open reject ``encoding`` for binary modes,
    # so ``binary_mode`` presumably returns a mode these calls accept together
    # with an encoding -- confirm against its definition.
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        json_handle = open(self.filepath, self.mode, encoding=self.encoding)

    total_lines = 0
    bad_lines = 0
    # try/finally guarantees the handle is closed on every exit path: normal
    # exhaustion, the early return on ``limit``, a parse error, or the
    # consumer closing/abandoning the generator.
    try:
        for count, line in enumerate(json_handle):
            # Check the limit before parsing so the line just past the limit
            # is neither parsed nor counted.
            if self.limit != 0 and self.limit <= count:
                return
            total_lines += 1

            try:
                tweet = json_util.loads(line)
            except Exception:
                if self.throw_error:
                    raise
                # Best-effort mode: count the corrupt line and skip it, so
                # the raw unparsed text never reaches the filters or caller.
                bad_lines += 1
                continue

            if (tweet_parser.tweet_passes_filter(self.filter, tweet)
                    and tweet_parser.tweet_passes_custom_filter_list(
                        self.custom_filters, tweet)):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet

        if self.verbose:
            # total_lines is a real line count, so this is correct for empty
            # files too (an enumerate index would be unbound there).
            print("{} rows are ok.".format(total_lines - bad_lines))
            print("{} rows are corrupt.".format(bad_lines))
    finally:
        json_handle.close()
评论列表
文章目录