def cosine_knn(corpus_vector, queries_vector, k=10):
    """
    Find the k nearest corpus documents for each query using cosine distance.

    :param corpus_vector: vectorized document text (the index that is searched)
    :param queries_vector: vectorized query text (must be in the same feature
        space as ``corpus_vector``)
    :param k: number of neighbours to return per query
    :return: (distances, indices) of knn, as produced by
        ``NearestNeighbors.kneighbors``; the distances are cosine *distances*
        (smaller means more similar), not similarities
    """
    # based on
    # http://scikit-learn.org/stable/modules/neighbors.html
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    # Cosine distance is used to account for document length; the tree-based
    # indexes do not support it, so brute-force search is required.
    # n_jobs=-1 parallelizes the neighbour search over all available cores.
    nbrs = NearestNeighbors(
        n_neighbors=k,
        algorithm='brute',
        metric='cosine',
        n_jobs=-1,
    )
    nbrs.fit(corpus_vector)
    distances, indices = nbrs.kneighbors(queries_vector)
    return distances, indices
job_description_feature_extraction.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录