def _page_title_and_text(page, ns):
    """Return ``(title, text)`` for a finished ``<page>`` element, or ``None``.

    ``None`` is returned when the page has no ``<title>``, an empty title, a
    title containing ``':'``, no ``<revision>``, no ``<text>`` inside the
    revision, or an empty ``<text>``.
    """
    title_elem = page.find(ns + 'title')
    if title_elem is None:
        return None
    title = title_elem.text
    # NOTE(review): the ':' test presumably drops namespaced pages such as
    # "Talk:..." or "File:..."; it also drops any ordinary title that happens
    # to contain a colon -- confirm this is intended.
    if title is None or ':' in title:
        return None
    revision = page.find(ns + 'revision')
    if revision is None:
        return None
    text_elem = revision.find(ns + 'text')
    if text_elem is None:
        return None
    text = text_elem.text
    if text is None:
        return None
    return title, text


def articles(path="articles.xml.bz2"):
    """Lazily yield an ``Article`` for each accepted page of a MediaWiki dump.

    The bz2-compressed XML export at *path* is parsed incrementally. Each
    ``<page>`` that passes the filters in ``_page_title_and_text`` is yielded
    as ``Article(n, title, text)``, where ``n`` counts the yielded articles
    starting at 0.

    Args:
        path: Location of the ``.xml.bz2`` dump. Defaults to the file name
            that was previously hard-coded, so ``articles()`` keeps working.

    Yields:
        ``Article`` instances, in document order.
    """
    ns = '{http://www.mediawiki.org/xml/export-0.10/}'
    root_tag = ns + 'mediawiki'
    page_tag = ns + 'page'

    n = 0
    # Stays None if the document has no matching <mediawiki> start tag.
    root = None
    with bz2.BZ2File(path, 'r') as infile:
        for event, elem in iterparse(infile, events=("start", "end")):
            if event == 'start':
                if elem.tag == root_tag:
                    root = elem
            elif event == 'end':
                if elem.tag == page_tag:
                    fields = _page_title_and_text(elem, ns)
                    if fields is not None:
                        title, text = fields
                        yield Article(n, title, text)
                        n += 1
                    # Clear the root after EVERY finished page, including the
                    # rejected ones; otherwise skipped pages stay attached to
                    # the root and memory grows with the size of the dump.
                    if root is not None:
                        root.clear()
评论列表
文章目录