def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    """Drop generated sentences that are near-duplicates of any original sentence."""
    docs = []
    for sent, sim in generatedSentences:  # each item is a (sentence, score) pair
        docs.append(sent)
    docs.extend(originalSentences)
    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    # Rows [0, len(generatedSentences)) hold the generated sentences,
    # the remaining rows hold the originals.
    sim_indices = []
    for i in range(len(generatedSentences)):
        simGeneratedScores = linear_kernel(
            normalized[i], normalized[len(generatedSentences):]).flatten()
        if max(simGeneratedScores) >= threshold:
            sim_indices.append(i)
    finalGen = [sentence for k, sentence in enumerate(generatedSentences)
                if k not in sim_indices]
    return finalGen
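StemmedTfidfVectorizer above is a project-specific class that is not defined in the snippet. A minimal sketch of how such a vectorizer is commonly built, assuming an NLTK Snowball stemmer (the original implementation may differ):

import nltk.stem
from sklearn.feature_extraction.text import TfidfVectorizer

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        # Wrap the default analyzer so every token is stemmed before TF-IDF weighting.
        analyzer = super().build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))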
def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
    """Compute the cosine similarity scores of a given verb token against the input corpus TF/IDF matrix.

    :param str verb_token: Surface form of a verb, e.g., *born*
    :param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
        used to transform verbs into vectors
    :param tf_idf_matrix: TF/IDF matrix of the input corpus
    :return: cosine similarity scores
    :rtype: ndarray
    """
    verb_token_vector = vectorizer.transform([verb_token])
    # Here the linear kernel is the same as the cosine similarity, but faster
    # cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    scores = linear_kernel(verb_token_vector, tf_idf_matrix)
    logger.debug("Corpus-wide TF/IDF scores for '%s': %s", verb_token, scores)
    logger.debug("Average TF/IDF score for '%s': %f", verb_token, average(scores))
    return scores
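The equivalence mentioned in the comment holds because TfidfVectorizer l2-normalizes its rows by default, so the plain dot product already is the cosine. A self-contained check (the toy corpus and query are made up for illustration):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

corpus = ["she was born in 1970", "he died in Rome", "they were born abroad"]
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(corpus)  # rows are l2-normalized by default

query = vectorizer.transform(["born"])
# On unit-length vectors, the linear kernel and the cosine similarity coincide.
assert np.allclose(linear_kernel(query, tf_idf_matrix),
                   cosine_similarity(query, tf_idf_matrix))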
def __init__(self):
    start = time.time()
    self.item_service = ItemService()
    self.data = pd.DataFrame(list(self.item_service.get_rec_data()))
    self.tfidf = TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 3),
        min_df=0,
        smooth_idf=False,
        stop_words='english')
    self.tfidf_matrix = self.tfidf.fit_transform(
        self.data['concated_attrs'])
    # Pairwise similarity of every item against every other item; the TF-IDF
    # rows are l2-normalized, so the linear kernel equals the cosine similarity.
    self.cosine_similarities = linear_kernel(
        self.tfidf_matrix, self.tfidf_matrix)
    info("Training data ingested in %s seconds." % (time.time() - start))
def query(self, query, k=None, indices=None):
    if indices is not None:
        dvs = self.inferred_docvecs[indices]
    else:
        dvs = self.inferred_docvecs
    analyzed_query = self.analyzer(query)
    qv = self.model.infer_vector(analyzed_query).reshape(1, -1)
    qv = normalize(qv, copy=False)
    # Dot-product similarity of the query against the selected document vectors.
    dists = linear_kernel(qv, dvs)[0]
    ind = argtopk(dists, k)
    return ind
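The query methods in these snippets rely on an argtopk helper that is not shown. A minimal sketch of what such a helper might look like, assuming it returns the indices of the k largest values, best first, and all of them when k is None:

import numpy as np

def argtopk(scores, k=None):
    ind = np.argsort(scores)[::-1]  # indices sorted by descending score
    return ind if k is None else ind[:k]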
def query(self, query, k=None, indices=None):
    if self._fit_X is None:
        raise NotFittedError
    q = super().transform([query])
    if indices is not None:
        fit_X = self._fit_X[indices]
    else:
        fit_X = self._fit_X
    # Both fit_X and q are l2-normalized, so the linear kernel is the cosine.
    D = linear_kernel(q, fit_X)
    ind = argtopk(D[0], k)
    return ind
def query(self, query, k=None, indices=None):
    centroids = self.centroids
    if centroids is None:
        raise NotFittedError
    if indices is not None:
        centroids = centroids[indices]
    q = self.vect.transform([query])
    q = normalize(q, copy=False)
    D = linear_kernel(q, centroids)  # l2-normalized, so linear kernel == cosine
    # argtopk replaces the manual idiom np.argsort(D[0, :])[::-1][:k], which
    # reverses the ascending sort because this is a similarity metric.
    ret = argtopk(D[0], k=k)
    return ret
def __kernel_definition__(self):
    if self.Kf == 'rbf':
        return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
    if self.Kf == 'poly':
        return lambda X, Y: polynomial_kernel(X, Y, degree=self.poly_deg,
                                              gamma=None, coef0=self.poly_coeff)
    if self.Kf is None or self.Kf == 'linear':
        return lambda X, Y: linear_kernel(X, Y)
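The factory returns a plain callable, so the chosen kernel can be applied to any pair of sample matrices. A quick illustration of the linear branch (standalone, outside the class):

import numpy as np
from sklearn.metrics.pairwise import linear_kernel

X = np.random.rand(4, 3)
Y = np.random.rand(2, 3)

kernel = lambda X, Y: linear_kernel(X, Y)  # what the 'linear' branch returns
# For the linear kernel, K[i, j] is just the dot product of X[i] and Y[j].
assert np.allclose(kernel(X, Y), X @ Y.T)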
def cluster_texts(textdict, eps=0.45, min_samples=3):
    """
    Cluster the given texts.

    Input:
        textdict: dictionary with {docid: text}
    Returns:
        doccats: dictionary with {docid: cluster_id}
    """
    doc_ids = list(textdict.keys())
    # transform texts into length-normalized KPCA features
    ft = FeatureTransform(norm='max', weight=True, renorm='length', norm_num=False)
    docfeats = ft.texts2features(textdict)
    X, featurenames = features2mat(docfeats, doc_ids)
    e_lkpca = KernelPCA(n_components=250, kernel='linear')
    X = e_lkpca.fit_transform(X)
    # l2-normalize the rows so that dot products become cosine similarities
    xnorm = np.linalg.norm(X, axis=1)
    X = X / xnorm.reshape(X.shape[0], 1)
    # cosine distance = 1 - cosine similarity
    D = 1. - linear_kernel(X)
    # cluster on the precomputed distance matrix with DBSCAN
    clst = DBSCAN(eps=eps, metric='precomputed', min_samples=min_samples)
    y_pred = clst.fit_predict(D)
    return {did: y_pred[i] for i, did in enumerate(doc_ids)}
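FeatureTransform and features2mat come from the surrounding project and are not shown. The core idea, cosine-distance DBSCAN via linear_kernel on unit-length rows, can be sketched with stock scikit-learn pieces (a simplification, not the original feature pipeline):

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def cluster_texts_simple(textdict, eps=0.45, min_samples=3):
    doc_ids = list(textdict.keys())
    # TF-IDF rows are l2-normalized, so 1 - linear kernel is the cosine distance.
    X = TfidfVectorizer().fit_transform([textdict[d] for d in doc_ids])
    D = (1.0 - linear_kernel(X)).clip(min=0.0)  # clip away float round-off
    y_pred = DBSCAN(eps=eps, metric='precomputed',
                    min_samples=min_samples).fit_predict(D)
    return {did: y_pred[i] for i, did in enumerate(doc_ids)}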
def inner(self, x, y):
    # linear_kernel expects 2-D inputs, hence the to2d reshape helper.
    return linear_kernel(to2d(x), to2d(y))
def test_kernel_symmetry():
    # Valid kernels should be symmetric
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for kernel in (linear_kernel, polynomial_kernel, rbf_kernel,
                   laplacian_kernel, sigmoid_kernel, cosine_similarity):
        K = kernel(X, X)
        assert_array_almost_equal(K, K.T, 15)
def test_kernel_sparse():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    X_sparse = csr_matrix(X)
    for kernel in (linear_kernel, polynomial_kernel, rbf_kernel,
                   laplacian_kernel, sigmoid_kernel, cosine_similarity):
        K = kernel(X, X)
        K2 = kernel(X_sparse, X_sparse)
        assert_array_almost_equal(K, K2)
def test_linear_kernel():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    K = linear_kernel(X, X)
    # K is 5x5, so K.flat[::6] walks its diagonal; the diagonal elements of a
    # linear kernel are the squared norms of the corresponding rows.
    assert_array_almost_equal(K.flat[::6], [linalg.norm(x) ** 2 for x in X])
def query(self, keyword, limit):
    vec_keyword = self.vec.transform([keyword])
    cosine_sim = linear_kernel(vec_keyword, self.vec_train).flatten()
    # Indices of the `limit` most similar emails, best match first.
    related_email_indices = cosine_sim.argsort()[::-1][:limit]
    print(related_email_indices)
    return related_email_indices
def similarity_function(vec1, vec2, similarity):
    # Compute cosine similarity or one of the other supported similarities.
    v1 = np.array(vec1)
    v2 = np.array(vec2)
    if len(v1) * len(v2) == 0:  # at least one of the two vectors is empty
        global count
        count += 1
        return 0
    else:
        if similarity == 'cosine':
            # sklearn's pairwise functions expect 2-D inputs and return [[sim]]
            return cosine_similarity([v1], [v2])[0][0]
        elif similarity == 'softmax':
            return np.exp(np.dot(v1, v2))  # normalization is unnecessary for relative comparisons
        elif similarity == 'linear_kernel':
            return linear_kernel([v1], [v2])[0][0]
        elif similarity == 'euclidean':
            return euclidean_distances([v1], [v2])[0][0]
        else:
            raise NameError('Choose a valid similarity function')
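A quick usage sketch (the global count counter must exist before the first call; the vectors are made up):

import numpy as np

count = 0  # incremented by the function whenever an empty vector is passed

print(similarity_function([1.0, 0.0], [1.0, 1.0], 'cosine'))         # ~0.707
print(similarity_function([1.0, 0.0], [1.0, 1.0], 'linear_kernel'))  # 1.0
print(similarity_function([], [1.0, 1.0], 'euclidean'))              # 0, empty input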