check.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:wikilinks 作者: trovdimi 项目源码 文件源码
def get_rpid2pid(self, dump_date):
        """Map redirect page ids to the ids of the pages they point at.

        Scans the gzipped ``enwiki-<dump_date>-redirect.sql.gz`` dump line by
        line, looks only at ``INSERT`` statements, and extracts every
        ``(id, 0, 'title', ...)`` tuple from them. Each extracted title is
        unescaped with ``DataHandler.unescape_mysql`` and resolved through
        the mapping returned by ``self.get_title2id(dump_date)``.

        Args:
            dump_date: Date component spliced into the dump file name; must
                be a string because it is joined into the path.

        Returns:
            dict: Keys are the redirect ids exactly as captured from the
            dump (strings of digits); values are whatever ``get_title2id``
            stores for the target title. Titles that cannot be resolved are
            printed together with their id and left out of the result.
        """
        print('get_rpid2pid...')
        title_to_id = self.get_title2id(dump_date)

        # Only tuples whose second field is literally 0 are matched
        # (presumably the main namespace -- confirm against the dump schema).
        row_pattern = re.compile(r"\((\d+),0,'(.+?)','")
        dump_path = ''.join([
            '/home/ddimitrov/data/enwiki20150304_plus_clickstream/enwiki-',
            dump_date,
            '-redirect.sql.gz',
        ])

        redirects = {}
        with gzip.GzipFile(dump_path, 'rb') as dump:
            for raw_line in dump:
                statement = raw_line.decode('utf-8')
                if statement.startswith('INSERT'):
                    # Turn bare NULL fields into empty quoted strings so the
                    # quote-delimited pattern above still lines up.
                    statement = statement.replace('NULL', "''")
                    for match in row_pattern.finditer(statement):
                        source_id, target_title = match.groups()
                        try:
                            redirects[source_id] = title_to_id[DataHandler.unescape_mysql(target_title)]
                        except KeyError:
                            # Unresolvable target: report it and move on.
                            print(source_id, target_title)
        return redirects
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号