import scipy.sparse
from scipy.stats import entropy

# isbow and sparse2full are gensim helpers (this function originates in
# gensim.matutils); they detect and densify bag-of-words vectors.
from gensim.matutils import isbow, sparse2full


def kullback_leibler(vec1, vec2, num_features=None):
"""
A distance metric between two probability distributions.
Returns a distance value in range <0,1> where values closer to 0 mean less distance (and a higher similarity)
Uses the scipy.stats.entropy method to identify kullback_leibler convergence value.
If the distribution draws from a certain number of docs, that value must be passed.
"""
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()  # convert either vector to dense if it arrived as a sparse matrix
    if isbow(vec1) and isbow(vec2):  # both in bag-of-words format: densify before comparing
        if num_features is not None:  # expand to the full vocabulary size when it is known
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # If the vectors are not in BoW format, an earlier conversion may have left
        # each one as a list nested inside a list; scipy's entropy fails on that
        # shape, so unwrap the single nested list before computing the divergence.
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return entropy(vec1, vec2)
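

# A minimal usage sketch, not part of the original source: the vectors and
# num_features value below are illustrative assumptions. scipy's entropy
# normalizes its inputs, so weights need not sum to 1; the result is inf
# whenever the second distribution assigns zero mass where the first does not.
if __name__ == "__main__":
    # Bag-of-words path: (term_id, weight) pairs, densified via sparse2full.
    bow1 = [(0, 0.5), (1, 0.5)]
    bow2 = [(0, 0.25), (1, 0.75)]
    print(kullback_leibler(bow1, bow2, num_features=2))  # ~0.1438 (natural log)

    # Plain dense path: isbow() rejects these, so entropy() receives them directly.
    print(kullback_leibler([0.5, 0.5], [0.25, 0.75]))    # same value, ~0.1438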