def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
""" Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.
:param str verb_token: Surface form of a verb, e.g., *born*
:param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
used to transform verbs into vectors
:return: cosine similarity score
:rtype: ndarray
"""
verb_token_vector = vectorizer.transform([verb_token])
# Here the linear kernel is the same as the cosine similarity, but faster
# cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
scores = linear_kernel(verb_token_vector, tf_idf_matrix)
logger.debug("Corpus-wide TF/IDF scores for '%s': %s" % (verb_token, scores))
logger.debug("Average TF/IDF score for '%s': %f" % (verb_token, average(scores)))
return scores
评论列表
文章目录