def articles(wiki_json_fn, limit=None):
    """Yield page articles from a line-oriented JSON dump.

    The file is consumed as consecutive pairs of lines: an "action" JSON
    object followed by a "source" JSON object, each decoded as UTF-8.
    Pairs accepted by ``is_page(action, source)`` are yielded; all other
    pairs are skipped. A trailing action line with no source line is
    ignored.

    Args:
        wiki_json_fn: Path of the dump file. A ``.gz`` or ``.bz2``
            extension selects the matching decompressor; any other
            extension is read as a plain binary file.
        limit: Maximum number of articles to yield. ``None`` (or any
            other falsy value such as ``0``) means no limit.

    Yields:
        dict: ``{'id': ..., 'title': ..., 'text': ...}`` where ``id`` is
        ``action['index']['_id']`` and ``title``/``text`` come from the
        source object.
    """
    _, ext = os.path.splitext(wiki_json_fn)
    if ext == '.gz':
        f = GzipFile(wiki_json_fn, mode='r')
    elif ext == '.bz2':
        f = BZ2File(wiki_json_fn, mode='r')
    else:
        f = io.open(wiki_json_fn, mode='rb')

    count = 0
    # ``with`` closes the file on every exit path: normal EOF, reaching
    # ``limit``, a JSON decoding error, or the caller dropping the
    # generator before it is exhausted.
    with f:
        # readline() returns b'' only at EOF, so this iterates raw lines
        # without loading the whole dump into memory.
        lines = iter(f.readline, b'')
        for action_line in lines:
            source_line = next(lines, None)
            if source_line is None:
                # Action line without a matching source line: stop.
                break
            action = json.loads(action_line.decode('utf-8'))
            source = json.loads(source_line.decode('utf-8'))
            if not is_page(action, source):
                continue
            yield {
                'id': action['index']['_id'],
                'title': source['title'],
                'text': source['text'],
            }
            count += 1
            if count % 10000 == 0:
                logging.info("read %d articles", count)
            # ``>=`` so that exactly ``limit`` articles are produced; the
            # check runs after the yield, so ``>`` would emit one extra.
            if limit and count >= limit:
                return
# 评论列表 (comment list)
# 文章目录 (article table of contents)