score_process.py 文件源码-python代码片段

score_process.py 文件源码

python

阅读 22 收藏 0 点赞 0 评论 0

项目：Diggly-Back-End 作者: WikiDiggly 项目源码文件源码

def score_topics(source_id, topics_desc_dict):
    """Score every topic against the source topic by tf-idf cosine similarity.

    Each description is lower-cased and stripped of ASCII punctuation, the
    whole collection is vectorized with ``TfidfVectorizer`` (tokenized by
    ``text_proc.tokenize``, English stop words removed), and the row that
    belongs to ``source_id`` is compared against every row with
    ``cosine_similarity``.

    Args:
        source_id: Key in ``topics_desc_dict`` identifying the topic that
            all other topics are compared to.
        topics_desc_dict: Mapping of topic id -> description text.
            NOTE(review): ``translate`` is called with an ordinal -> None
            mapping, which presumably means the texts are unicode strings;
            confirm against the caller.

    Returns:
        dict mapping every topic id except ``source_id`` to its similarity
        score with the source topic.

    Raises:
        KeyError: if ``source_id`` is not a key of ``topics_desc_dict``.
    """
    # The punctuation table is the same for every document, so build it once.
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    # Fix one explicit ordering of the topics so that row i of the tf-idf
    # matrix is guaranteed to belong to topic_ids[i].
    topic_ids = []
    documents = []
    for tid, text in topics_desc_dict.items():
        topic_ids.append(tid)
        documents.append(text.lower().translate(remove_punctuation_map))

    indices = dict((tid, row) for row, tid in enumerate(topic_ids))
    main_index = indices[source_id]

    # this can take some time
    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(documents)
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    res_dict = {}
    for tid, row in indices.items():
        # Skip the source itself. Compare row indices: the earlier check
        # compared a topic id against the integer row index, so the source
        # was (almost) never filtered out of the result.
        if row == main_index:
            continue
        res_dict[tid] = res[0][row]

    return res_dict