def get_title2id(self, dump_date):
    """Build a mapping from page title to page id from a page SQL dump.

    Reads the uncompressed ``enwiki-<dump_date>-page.sql`` file, scans every
    ``INSERT`` line for ``(<id>,0,'<title>','`` tuples and stores
    ``unescaped title -> int(id)``.

    Args:
        dump_date: Dump date string spliced into the file name
            (e.g. ``'20150304'``).

    Returns:
        dict mapping each title (after ``DataHandler.unescape_mysql``) to its
        integer page id. If a title occurs more than once, the last
        occurrence in the dump wins.

    Raises:
        IOError/OSError: if the dump file cannot be opened.
        UnicodeDecodeError: if a line is not valid UTF-8.
    """
    print('get_title2id...')
    title2id = {}
    # Only tuples whose second column is 0 are matched
    # (presumably the main namespace -- confirm against the dump schema).
    regex = re.compile(r"\((\d+),0,'(.+?)','")
    fname = '/home/ddimitrov/data/enwiki20150304_plus_clickstream/enwiki-' + dump_date + '-page.sql'
    # Open in binary mode and decode explicitly: this behaves the same on
    # Python 2 and Python 3, whereas decoding lines from a text-mode file
    # fails on Python 3 (str has no .decode()).
    with open(fname, 'rb') as f:
        # Stream line by line instead of readlines(): the dump's INSERT
        # lines can be very large and need not all be held in memory at once.
        for raw_line in f:
            line = raw_line.decode('utf-8')
            if not line.startswith('INSERT'):
                continue
            for pid, title in regex.findall(line):
                title2id[DataHandler.unescape_mysql(title)] = int(pid)
    return title2id
评论列表
文章目录