def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    """Drop generated sentences that are too similar to any original sentence.

    Every generated sentence and every original sentence is vectorised
    together, and each generated sentence is compared against all original
    sentences.  A generated sentence is discarded when its highest similarity
    score against the originals is greater than or equal to ``threshold``.

    Args:
        generatedSentences: sequence of ``(sentence, score)`` pairs; only the
            sentence text is used for the comparison, the pair itself is what
            gets returned.
        originalSentences: iterable of sentence strings to compare against.
        stopwords: passed through as ``stop_words`` to ``StemmedTfidfVectorizer``.
        threshold: similarity at or above which a generated sentence is
            dropped (default ``0.80``).

    Returns:
        A new list holding the entries of ``generatedSentences`` that were
        kept, in their original order.
    """
    numGenerated = len(generatedSentences)
    if numGenerated == 0:
        # Nothing to filter; skip fitting a vectorizer for no result.
        return []

    originalSentences = list(originalSentences)
    if not originalSentences:
        # With nothing to compare against, no sentence can be "too similar".
        # (Taking a max over an empty score row would raise otherwise.)
        return list(generatedSentences)

    # Generated sentences occupy rows [0, numGenerated); originals follow.
    docs = [sent for sent, _ in generatedSentences]
    docs.extend(originalSentences)

    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    # NOTE(review): if StemmedTfidfVectorizer already applies TF-IDF weighting,
    # this second pass re-weights its output -- kept as-is so that existing
    # similarity scores (and therefore the threshold's meaning) do not change.
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # One (generated x original) matrix of dot products between the rows;
    # with TfidfTransformer's default L2 normalisation this is cosine similarity.
    similarities = linear_kernel(normalized[:numGenerated], normalized[numGenerated:])
    bestScores = similarities.max(axis=1)

    return [
        entry
        for entry, best in zip(generatedSentences, bestScores)
        if best < threshold
    ]
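# Comment list (web-page navigation text, not part of the code)
# Article table of contents (web-page navigation text, not part of the code)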