def calculate_tf_idf(sentence, global_count_of_papers_words_occur_in, paper_bag_of_words):
    """
    Calculates the tf-idf score of every word in a sentence, based on all of the papers.

    For each word, tf is the count of the word in the paper and idf is
    log(NUMBER_OF_PAPERS / (number of papers containing the word + 1)).

    :param sentence: the sentence to calculate the scores for, as a list of words
    :param global_count_of_papers_words_occur_in: a dictionary of the form (word: number of papers the word occurs in)
    :param paper_bag_of_words: the bag of words representation for a paper, of the form (word: count of the word
                               in the paper)
    :return: a list of (word, tf-idf score) tuples, one per word of the sentence and in sentence order
    """
    word_scores = []
    for word in sentence:
        # Number of documents containing this word - the idf denominator (1 is added to prevent division by 0).
        # .get() is used so that a word missing from the dictionary counts as 0 instead of raising a KeyError
        # (and so that a defaultdict is not grown as a side effect of the lookup).
        docs_containing_word = global_count_of_papers_words_occur_in.get(word, 0) + 1
        # Count of word in this paper - the tf score (0 if the word does not occur in the paper)
        count_word = paper_bag_of_words.get(word, 0)
        # NOTE: because of the +1 above, the idf is negative for a word that occurs in every paper
        idf = np.log(NUMBER_OF_PAPERS / docs_containing_word)
        word_scores.append((word, count_word * idf))
    return word_scores
tf_idf_visualiser.py 文件源码
python
阅读 19
收藏 0
点赞 0
评论 0
评论列表
文章目录