score_process.py 文件源码-python代码片段

score_process.py 文件源码

python

阅读 23 收藏 0 点赞 0 评论 0

项目：Diggly-Back-End 作者: WikiDiggly 项目源码文件源码

def score_outlinks(main_text, title_list):
    """Score each outlink title by its TF-IDF similarity to the main text.

    Every title is lower-cased and has ``_`` and ``-`` replaced by spaces.
    The normalised titles and ``main_text`` are vectorised together with a
    ``TfidfVectorizer`` (English stop words, ``text_proc.tokenize`` as the
    tokenizer), and the cosine similarity between ``main_text`` and each
    title is computed.  The similarity is scaled by 100 and divided by the
    number of words in the title.

    Args:
        main_text: Text of the currently selected topic.
        title_list: Iterable of outlink title strings.

    Returns:
        dict mapping each original title to its score.  Duplicate titles
        collapse to a single entry.  A title literally equal to
        ``"current_selected_topic"`` collides with the internal key used for
        ``main_text`` and is not scored.  An empty ``title_list`` yields an
        empty dict.
    """
    # Reserved key under which the main text is stored next to the titles.
    main_title = "current_selected_topic"
    token_dict = {}
    len_titles = {}

    for title in title_list:
        lowers = title.lower().replace("_", " ").replace("-", " ")
        # split() with no argument collapses runs of whitespace, so
        # "a_-_b" counts as two words rather than four.  Clamp to 1 so the
        # division below can never be by zero (e.g. for an empty title).
        len_titles[title] = max(len(lowers.split()), 1)
        token_dict[title] = lowers

    if not token_dict:
        # Nothing to score; avoid fitting a vectorizer for no output.
        return {}

    token_dict[main_title] = main_text

    # Freeze one explicit ordering so that row i of the TF-IDF matrix is
    # guaranteed to correspond to labels[i].
    labels = list(token_dict)
    documents = [token_dict[label] for label in labels]
    main_index = labels.index(main_title)

    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(documents)
    # res has a single row: similarity of the main text to every document.
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    return {
        label: res[0][ind] * 100 / len_titles[label]
        for ind, label in enumerate(labels)
        if label != main_title
    }