classifier.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:quantulum 作者: marcolagi 项目源码 文件源码
def download_wiki():
    """Download WikiPedia pages of ambiguous units.

    An entry of ``l.UNITS`` or ``l.DERIVED_ENT`` is treated as ambiguous when
    its value holds more than one candidate. For every distinct
    ``(name, uri)`` pair among those candidates, the page content is fetched
    through ``wikipedia.page`` and stored as a record with the keys ``url``,
    ``_id``, ``clean``, ``text`` and ``unit``.

    The records are written as indented, key-sorted JSON to ``wiki.json``
    inside ``l.TOPDIR``, replacing any previous file. Progress is printed to
    standard output. Nothing is returned.
    """
    ambiguous = [i for i in l.UNITS.items() if len(i[1]) > 1]
    ambiguous += [i for i in l.DERIVED_ENT.items() if len(i[1]) > 1]
    # A set collapses candidates that are reachable from several entries.
    pages = {(j.name, j.uri) for i in ambiguous for j in i[1]}
    total = len(pages)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    print('')
    objs = []
    for num, (unit, url) in enumerate(pages, 1):

        obj = {'url': url}
        # The page identifier is the URL with the Wikipedia prefix removed.
        obj['_id'] = obj['url'].replace('https://en.wikipedia.org/wiki/', '')
        obj['clean'] = obj['_id'].replace('_', ' ')

        print('---> Downloading %s (%d of %d)' % (obj['clean'], num, total))

        obj['text'] = wikipedia.page(obj['clean']).content
        obj['unit'] = unit
        objs.append(obj)

    path = os.path.join(l.TOPDIR, 'wiki.json')
    # Only remove a previous dump when there is one; an unconditional remove
    # raises OSError on the first run, after all downloads have completed.
    if os.path.exists(path):
        os.remove(path)
    # The context manager guarantees the file is flushed and closed.
    with open(path, 'w') as wiki_file:
        json.dump(objs, wiki_file, indent=4, sort_keys=True)

    print('\n---> All done.\n')


###############################################################################
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号