export.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:scibot 作者: SciCrunch 项目源码 文件源码
def export_json_impl():
    h = HypothesisUtils(username=username, token=api_token, group=group, max_results=100000)
    params = {'group' : h.group }
    rows = h.search_all(params)
    annos = [HypothesisAnnotation(row) for row in rows]

    # clean up bugs from old curation workflow
    for anno in annos:
        if anno.tags:
            new_tags = []
            for tag in anno.tags:
                if tag in bad_tags:
                    new_tags.append(tag.replace('RRID:', 'RRIDCUR:'))  # scibot made a mistake early, might be able to correct tags in bulk someday
                else:
                    new_tags.append(tag)  # horribly inefficient...
            anno.tags = new_tags

        if anno.text.startswith('RRID:'):  # catch cases where the RRID was put in text instead of in tags
            if 'RRIDCUR:Missing' in anno.tags or 'RRIDCUR:Unrecognized' in anno.tags:
                rtag = anno.text.split(None,1)[0]  # trap for cases where there is more text after an RRID...
                if rtag not in anno.tags:
                    anno.tags.append(rtag)
                    print('TEXT ISSUE for %s at https://hyp.is/%s' % (anno.user, anno.id))
        elif anno.exact and anno.exact.startswith('RRID:'):  # this needs to go second in case of RRIDCUR:Incorrect
            if anno.exact.startswith('RRID: '):  # deal with nospace first
                rtag = anno.exact.replace('RRID: ', 'RRID:')
            else:
                rtag = anno.exact
            rtag = rtag.split(None,1)[0]  # trap more
            if rtag not in anno.tags:
                if anno.user == 'scibot' and len(anno.tags) == 1 and anno.tags[0].startswith('RRID:RRID:'):  # FIXME HACK
                    anno.tags = [rtag]
                else:
                    pass  # anything else we detect in the data doesn't need to be corrected or used to fix tags

    output_json = [anno.__dict__ for anno in annos]
    DATE = date.today().strftime('%Y-%m-%d')
    return output_json, DATE

### URG
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号