def get_rpid2pid(self, dump_date):
    """Map page IDs found in the redirect dump to resolved target IDs.

    Reads the gzipped ``enwiki-<dump_date>-redirect.sql.gz`` dump line by
    line, looks only at lines starting with ``INSERT`` and extracts every
    ``(<digits>,0,'<title>','`` tuple from them. Each extracted title is
    passed through ``DataHandler.unescape_mysql`` and looked up in the
    mapping returned by ``self.get_title2id(dump_date)``.

    Args:
        dump_date: Dump identifier that is concatenated into the file name
            and forwarded to ``self.get_title2id``.

    Returns:
        A dict keyed by the page ID exactly as captured by the regex (a
        string of digits), with the corresponding ``title2id`` entry as
        value. Tuples whose title is not present in ``title2id`` are
        printed and left out.
    """
    print('get_rpid2pid...')
    title_lookup = self.get_title2id(dump_date)
    # The literal ",0," restricts matches to rows whose second field is 0
    # (presumably the namespace column -- confirm against the dump schema).
    row_pattern = re.compile(r"\((\d+),0,'(.+?)','")
    dump_path = (
        '/home/ddimitrov/data/enwiki20150304_plus_clickstream/enwiki-'
        + dump_date
        + '-redirect.sql.gz'
    )

    redirects = {}
    with gzip.GzipFile(dump_path, 'rb') as dump:
        for raw in dump:
            statement = raw.decode('utf-8')
            if statement.startswith('INSERT'):
                # The pattern requires a quoted field right after the title,
                # so an unquoted NULL is rewritten as an empty quoted string.
                # NOTE(review): this is applied to the whole line, so any
                # literal "NULL" text inside a title is rewritten as well.
                statement = statement.replace('NULL', "''")
                for match in row_pattern.finditer(statement):
                    page_id, target = match.groups()
                    try:
                        redirects[page_id] = title_lookup[
                            DataHandler.unescape_mysql(target)
                        ]
                    except KeyError:
                        # Unresolvable target: report it and move on.
                        print(page_id, target)
    return redirects
# Comment list
# Table of contents