def score_topics(source_id, topics_desc_dict):
    """Score every topic by TF-IDF cosine similarity to one source topic.

    Each description is lower-cased and stripped of the characters in
    ``string.punctuation``; the cleaned texts are vectorised together with
    ``TfidfVectorizer`` and the row belonging to ``source_id`` is compared
    against every row with ``cosine_similarity``.

    Args:
        source_id: Key of the reference topic in ``topics_desc_dict``.
        topics_desc_dict: Mapping of topic id to its description text.

    Returns:
        A dict mapping every topic id except ``source_id`` to its cosine
        similarity with the source topic.

    Raises:
        KeyError: If ``source_id`` is not a key of ``topics_desc_dict``.
    """
    # Built once: the table does not depend on the text being cleaned.
    remove_punctuation_map = {ord(char): None for char in string.punctuation}

    # Parallel lists guarantee that an id's position equals its matrix row,
    # instead of relying on keys()/values() iterating in the same order.
    topic_ids = []
    documents = []
    for tid, text in topics_desc_dict.items():
        topic_ids.append(tid)
        documents.append(text.lower().translate(remove_punctuation_map))

    indices = {tid: row for row, tid in enumerate(topic_ids)}
    main_index = indices[source_id]

    # this can take some time
    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(documents)
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    # res has a single row: the source topic scored against every topic.
    scores = res[0]
    # Skip the source itself by id; comparing an id with the integer row
    # index never matched, so the source leaked into the result.
    return {
        tid: scores[row]
        for tid, row in indices.items()
        if tid != source_id
    }
评论列表
文章目录