util.py 文件源码

python
阅读 29 收藏 0 点赞 0 评论 0

项目:code-uai16 作者: thanhan 项目源码 文件源码
def main(dataset = 'proton-beam-xml'):
    """Load a dataset, fit a TF-IDF matrix on its texts, and publish globals.

    Args:
        dataset: Dataset name. ``'proton-beam-xml'`` is loaded through
            ``get_pub_dic_xml()`` / ``get_relevant()``; any other value is
            forwarded to ``get_pub_dic_csv()`` and ``get_turk_data()``.

    Returns:
        A ``(pub_dic, texts)`` tuple: the record dictionary that was used and
        the list of its texts in the order they were fed to the vectorizer.

    Side effects:
        Raises the ``csv`` field size limit to 430000 and sets the module
        globals ``mat`` (TF-IDF matrix) and ``rel`` (relevance labels). For
        non-XML datasets it also sets the global ``turk_dic``.
    """
    global mat, rel, turk_dic

    csv.field_size_limit(430000)

    if dataset == 'proton-beam-xml':
        # Bind the result to pub_dic directly: the previous code stored it in
        # pub_dic_tmp and then read the never-assigned local pub_dic, which
        # raised UnboundLocalError on this branch.
        pub_dic = get_pub_dic_xml()
        # NOTE(review): the original comment claimed the items are already
        # sorted by key; that depends on what get_pub_dic_xml() returns and
        # on get_relevant() using the same order -- confirm.
        texts = list(pub_dic.values())
        rel = get_relevant()
    else:
        pub_dic_tmp = get_pub_dic_csv(dataset)
        (turk_dic_tmp, rel_dic_tmp) = get_turk_data(dataset)

        texts = []
        labels = []
        pub_dic = {}
        turk_dic = {}

        # Keep only records present in all three sources, walking the keys in
        # sorted order so texts, labels and the matrix rows stay aligned.
        for i in sorted(pub_dic_tmp):
            if i in turk_dic_tmp and i in rel_dic_tmp:
                texts.append(pub_dic_tmp[i])
                pub_dic[i] = pub_dic_tmp[i]
                turk_dic[i] = turk_dic_tmp[i]
                labels.append(int(rel_dic_tmp[i]))

        # A real list (not a lazy map object) so rel can be indexed and
        # iterated more than once on both Python 2 and Python 3.
        rel = labels

    vectorizer = TfidfVectorizer()
    mat = vectorizer.fit_transform(texts)
    return (pub_dic, texts)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号