def function_2(text):
    """Return an extractive summary built from the top-ranked paragraphs.

    The text is split into paragraphs on blank lines (two consecutive
    newlines).  Each paragraph becomes a TF-IDF vector, the dot products
    between those vectors form a weighted similarity graph, and PageRank
    over that graph (the TextRank approach) assigns every paragraph a score.
    The highest-scoring 10% of paragraphs are returned, and at least one
    paragraph is always kept so short documents still get a summary.

    Args:
        text: The document to summarise, with paragraphs separated by
            blank lines.

    Returns:
        The selected paragraphs joined by a blank line, ordered from the
        highest score to the lowest.  An empty string is returned when
        ``text`` is empty or contains only whitespace.
    """
    # Nothing to vectorise: without this guard the vectoriser would be
    # fitted on documents that contain no tokens at all.
    if not text or not text.strip():
        return ""

    paragraphs = text.split('\n\n')
    bow_matrix = CountVectorizer().fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)

    # Paragraph-by-paragraph similarity.  ``.dot`` is an explicit matrix
    # product; ``*`` only means that for scipy's sparse *matrix* classes.
    # The result stays sparse: no dense copy is needed to build the graph.
    similarity_graph = normalized_matrix.dot(normalized_matrix.T)

    # NetworkX 3.0 removed ``from_scipy_sparse_matrix``; prefer its
    # replacement and fall back to the old name on older releases.
    build_graph = getattr(nx, 'from_scipy_sparse_array', None)
    if build_graph is None:
        build_graph = nx.from_scipy_sparse_matrix
    nx_graph = build_graph(similarity_graph)
    scores = nx.pagerank(nx_graph)  # TextRank applied

    # Highest score first; equal scores fall back to comparing the text.
    ranked = sorted(
        ((scores[i], s) for i, s in enumerate(paragraphs)),
        reverse=True,
    )

    # Top 10% of paragraphs, but never fewer than one: rounding alone gives
    # zero for short documents and would yield an empty summary.
    top_count = max(1, int(round(0.10 * len(ranked))))
    return "\n\n".join(paragraph for _, paragraph in ranked[:top_count])
#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
papyrus_summary_extraction_tool.py 文件源码
python
阅读 29
收藏 0
点赞 0
评论 0
评论列表
文章目录