def rank(nodes, edges):
'''Creates the graph from the given nodes (sentences) and their weighted edges.'''
graph = nx.DiGraph()
graph.add_nodes_from(nodes)
graph.add_weighted_edges_from(edges)
'''Uses Google's PageRank algorithm to find the most important sentences.'''
return nx.pagerank(graph)
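A minimal usage sketch (made-up data; rank() expects node ids plus (u, v, weight) 3-tuples, the format add_weighted_edges_from accepts):
nodes = [0, 1, 2]                                  # e.g. sentence indices
edges = [(0, 1, 0.6), (1, 2, 0.3), (0, 2, 0.1)]    # (u, v, weight) 3-tuples
scores = rank(nodes, edges)                        # dict: node -> PageRank score
best = max(scores, key=scores.get)                 # id of the top-ranked sentence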
Python pagerank() example source code
File: papyrus_summary_extraction_tool.py
Project: Papyrus--simple-but-effective-text-summarization-tool
Author: RebeccaMerrett
def function_2(text):
paragraphs = text.split('\n\n')
count_vect = CountVectorizer()
bow_matrix = count_vect.fit_transform(paragraphs)
normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
similarity_graph = normalized_matrix * normalized_matrix.T #pairwise paragraph similarity from the tf-idf vectors
similarity_graph.toarray() #no-op: the dense copy is discarded; the sparse matrix is used directly below
nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
scores = nx.pagerank(nx_graph) #TextRank applied
ranked = sorted(((scores[i],s) for i,s in enumerate(paragraphs)), reverse=True) #Sorts all paragraphs from highest to lowest scores
ten_percent = int(round(10.00/100.00 * len(ranked)))
ten_percent_high_scores = ranked[0:ten_percent]
summary = [x[1] for x in ten_percent_high_scores] #Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
return "\n\n".join(summary)
#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
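A quick sketch of calling function_2 (made-up input; note that very short texts, four paragraphs or fewer, yield an empty summary because the top-10% cut rounds to zero):
paragraphs = ["Paragraph %d talks about topic %d." % (i, i % 3) for i in range(20)]
print(function_2("\n\n".join(paragraphs)))   # the two highest-ranked paragraphs (10% of 20)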
def extractSentences(document):
# sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# sentenceTokens = sent_detector.tokenize(text.strip())
sentenceTokens = document.sentences()
graph = buildGraph(sentenceTokens)
calculated_page_rank = nx.pagerank(graph, weight='weight')
#most important sentences first (descending order of importance)
sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
#return a 100 word summary
summary = ' '.join(sentences)
summaryWords = summary.split()
summaryWords = summaryWords[0:100]
summary = ' '.join(summaryWords)
return summary
def extractSentences(text): # text should contain multiple sentences, not just one
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sentenceTokens = sent_detector.tokenize(text.strip())
graph = buildGraph(sentenceTokens)
calculated_page_rank = nx.pagerank(graph, weight='weight') # implemented weight graph here
#most important sentences first (descending order of importance)
sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
#return a 100 word summary
summary = ' '.join(sentences)
summaryWords = summary.split()
summaryWords = summaryWords[0:100]
summary = ' '.join(summaryWords)
return summary
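buildGraph itself is not shown in these excerpts; below is a minimal sketch of a compatible implementation, using a stand-in word-overlap weight rather than whatever similarity measure the original projects use:
import itertools
import networkx as nx

def buildGraph(sentences):
    # Complete graph over sentences; edge weight = shared-word count (stand-in measure).
    g = nx.Graph()
    g.add_nodes_from(sentences)
    for a, b in itertools.combinations(sentences, 2):
        g.add_edge(a, b, weight=len(set(a.split()) & set(b.split())))
    return g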
def cosine_similarity_self(A):
similarity = np.dot(A, A.T)
square_mag = np.diag(similarity)
inv_square_mag = 1 / square_mag
inv_square_mag[np.isinf(inv_square_mag)] = 0
inv_mag = np.sqrt(inv_square_mag)
cosine = similarity * inv_mag
cosine = cosine.T * inv_mag
return cosine
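A small sanity check of cosine_similarity_self (np is numpy; the rows are made-up vectors, and the off-diagonal value should be cos 45° ≈ 0.7071):
import numpy as np

A = np.array([[1.0, 0.0], [1.0, 1.0]])
print(np.round(cosine_similarity_self(A), 4))   # [[1. 0.7071] [0.7071 1.]]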
# document should be a list of sentences
# method = "word2vec", "lda", "tfidf"
# def extraction(document, method="rawText"):
#
# # graph = build_graph(document, method) # document is a list of sentences
#
# calculated_page_rank = networkx.pagerank(graph, weight="weight")
#
# # most important sentences in descending order of importance
# sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
#
# return sentences[0:4]
def __init__(self):
self.index = Index(config.INDEX_PATH)
# Checks if the full graph for this dataset was already ranked.
# If not, run page rank and store the results
pr_file_path = "%s/page_rank/%s.p" % (config.DATA, config.DATASET)
if not os.path.exists(pr_file_path):
g = nx.DiGraph()
g.add_edges_from(model.get_all_edges())
print "Running pageRank with %d nodes." % g.number_of_nodes()
self.pr = nx.pagerank(g)
cPickle.dump(self.pr, open(pr_file_path, "w"))
# Else, just loads it
else:
self.pr = cPickle.load(open(pr_file_path, 'r'))
File: document_summarization.py
Project: text-analytics-with-python
Author: dipanjanS
def textrank_text_summarizer(documents, num_sentences=2,
feature_type='frequency'):
vec, dt_matrix = build_feature_matrix(documents,
feature_type='tfidf')
similarity_matrix = (dt_matrix * dt_matrix.T)
similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
scores = networkx.pagerank(similarity_graph)
ranked_sentences = sorted(((score, index)
for index, score
in scores.items()),
reverse=True)
top_sentence_indices = [ranked_sentences[index][1]
for index in range(num_sentences)]
top_sentence_indices.sort()
for index in top_sentence_indices:
print documents[index]
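build_feature_matrix comes from elsewhere in the book's code; here is a self-contained sketch of the same pipeline using scikit-learn and a recent NetworkX directly (the names are illustrative, not the book's):
import networkx
from sklearn.feature_extraction.text import TfidfVectorizer

def textrank_summary(sentences, num_sentences=2):
    dt_matrix = TfidfVectorizer().fit_transform(sentences)      # sentence x term tf-idf
    similarity_matrix = (dt_matrix * dt_matrix.T).toarray()     # sentence x sentence similarity
    scores = networkx.pagerank(networkx.from_numpy_array(similarity_matrix))
    top = sorted(sorted(scores, key=scores.get, reverse=True)[:num_sentences])
    return [sentences[i] for i in top]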
def extractSentences(text):
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sentenceTokens = sent_detector.tokenize(text.strip())
graph = buildGraph(sentenceTokens)
calculated_page_rank = nx.pagerank(graph, weight='weight')
# most important sentences first (descending order of importance)
sentences = sorted(calculated_page_rank, key=calculated_page_rank.get,
reverse=True)
# return a 100 word summary
summary = ' '.join(sentences)
summaryWords = summary.split()
summaryWords = summaryWords[0:100]
summary = ' '.join(summaryWords)
return summary
def add_sentences(self, sentences):
"""
@type sentences: list[Sentence]
:param sentences:
:return:
"""
counter = self.counter
G = self.G
for sentence in sentences:
G.add_nodes_from(sentence.concepts)
counter.update(ngrams(sentence.concepts, self.N))
for (keys, value) in counter.items():
for i in range(0, len(keys) - 1):
for j in range(1, len(keys)):
G.add_edge(keys[i], keys[j], weight=value)
# counter.update((keys[i], keys[j]))
# for (key, value) in counter.items():
# G.add_edge(key[0], key[1], attr={"weight": value})
print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))
self.pr = nx.pagerank(G)
def incorporate_feedback(self, flightrecorder):
"""
:param flightrecorder:
:return:
@type flightrecorder: FlightRecorder
"""
G = self.G
print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))
# use the pagerank personalization feature to incorporate flightrecorder feedback
union = flightrecorder.union()
for rejected in union.reject:
if(G.has_node(rejected)):
G.remove_node(rejected)
print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))
self.pr = nx.pagerank(G)
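The comment above mentions PageRank's personalization feature, but this excerpt only removes rejected nodes. For reference, a hedged sketch of how accepted concepts could be up-weighted instead (it assumes the FlightRecorder union also exposes an accept set, mirroring reject):
# Hypothetical variant: bias the random walk toward accepted concepts.
personalization = {n: (10.0 if n in union.accept else 1.0) for n in G.nodes()}
self.pr = nx.pagerank(G, personalization=personalization)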
def sort_sentences(sentences, words, model, pagerank_config = {'alpha': 0.85,}):
"""Sort sentences from most to least important.
Keyword arguments:
sentences -- list of sentences
words -- two-dimensional list; each sublist holds the words of the corresponding sentence in `sentences`
model -- word-vector model passed to get_similarity when scoring sentence pairs
pagerank_config -- configuration dict passed to nx.pagerank
"""
sorted_sentences = []
_source = words
sentences_num = len(_source)
graph = np.zeros((sentences_num, sentences_num))
for x in xrange(sentences_num):
for y in xrange(x, sentences_num):
similarity = get_similarity( _source[x], _source[y], model)
graph[x, y] = similarity
graph[y, x] = similarity
nx_graph = nx.from_numpy_matrix(graph)
scores = nx.pagerank(nx_graph, **pagerank_config) # this is a dict
sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
for index, score in sorted_scores:
item = AttrDict(index=index, sentence=sentences[index], weight=score)
sorted_sentences.append(item)
return sorted_sentences
def sort_sentences(sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
"""???????????????
Keyword arguments:
sentences -- ????????
words -- ?????????sentences???????????????
sim_func -- ????????????????????????
pagerank_config -- pagerank???
"""
sorted_sentences = []
_source = words
sentences_num = len(_source)
graph = np.zeros((sentences_num, sentences_num))
for x in xrange(sentences_num):
for y in xrange(x, sentences_num):
similarity = sim_func( _source[x], _source[y] )
graph[x, y] = similarity
graph[y, x] = similarity
nx_graph = nx.from_numpy_matrix(graph)
scores = nx.pagerank(nx_graph, **pagerank_config) # this is a dict
sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
for index, score in sorted_scores:
item = AttrDict(index=index, sentence=sentences[index], weight=score)
sorted_sentences.append(item)
return sorted_sentences
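A toy call of sort_sentences, with a stand-in word-overlap similarity in place of the project's get_similarity (AttrDict is assumed to expose its fields as attributes):
sentences = ["the cat sat on the mat", "a dog sat on the mat", "stocks fell sharply today"]
words = [s.split() for s in sentences]
overlap = lambda a, b: len(set(a) & set(b)) / float(len(set(a) | set(b)))
ranked = sort_sentences(sentences, words, sim_func=overlap)
print(ranked[0].sentence)   # the sentence most central to the group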
def __init__(self, text):
self.sentences = get_sentences(text)
self.graph = build_graph(self.sentences)
self.pagerank = networkx.pagerank(self.graph, weight='weight')
self.reordered = sorted(self.pagerank, key=self.pagerank.get, reverse=True)
self.nouns = []
for sentence in self.sentences:
self.nouns += sentence.nouns
self.bow = collections.Counter(self.nouns)
def Page_Rank(G):
PageRank_Centrality = nx.pagerank(G, alpha=0.85)
#print "PageRank_Centrality:", sorted(PageRank_Centrality.iteritems(), key=lambda d:d[1], reverse = True)
return PageRank_Centrality
def __init__(self, edges, measure='pagerank'):
'''
Class for analyzing a graph.
:param edges: weighted edges; each edge must be given as a 3-tuple (u, v, weight)
:param measure: which measure to rank and filter by;
must be one of 'degree', 'pagerank', or 'clustering'
'''
self.measures = ['degree', 'pagerank', 'clustering']
self.measure = measure
self.ranks = {}
self.G = nx.Graph()
self.import_data(edges)
def get_pageranks(self):
pageranks = nx.pagerank(self.G)
max_pagerank = max(pageranks.values())
return pageranks, max_pagerank
def extractSentences(self, text):
'''
Extracts sentences from the graph using pagerank
Arguments:
text: input textual data
Returns:
summary: the selected summary sentences
Raises:
None
'''
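# NOTE: the method body is not included in this excerpt. Based on the standalone
# extractSentences functions earlier on this page, a plausible sketch (not
# necessarily this project's actual implementation) would be:
#     sentenceTokens = nltk.data.load('tokenizers/punkt/english.pickle').tokenize(text.strip())
#     graph = buildGraph(sentenceTokens)
#     calculated_page_rank = nx.pagerank(graph, weight='weight')
#     ranked = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
#     return ' '.join(' '.join(ranked).split()[:100])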
def search(self, query, exclude=[], force=False, limit=20):
# Fetches all document that have at least one of the terms
pubs = self.index.search(query,
search_fields=["title", "abstract"],
return_fields=["id"],
ignore=exclude)
# Unpack and convert to a set for fast lookup
pubs = set([pub_id for (pub_id,) in pubs])
# index_ids, _scores = self.index.search(query, ["title", "abstract"], limit=limit, mode="ALL")
# docs = set(self.index.get_documents(index_ids, "id"))
g = nx.DiGraph()
for u, v in self.edges:
if (u in pubs) and (v in pubs):
g.add_edge(u, v)
# print "PageRank with %d nodes." % g.number_of_nodes()
r = nx.pagerank(g, alpha=0.7)
if len(r) == 0:
return []
ids, _pg = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))
return ids[:limit]
def search(self, query, exclude=[], limit=50, force=False):
graph = build_graph(query,
self.params['K'],
self.params['H'],
self.params['min_topic_lift'],
self.params['min_ngram_lift'],
exclude, force, load=True)
# Simple method to check if node is a document node.
is_doc = lambda node: node["type"] == "paper"
# Builds a new unweighted graph with only the documents as nodes
docs_graph = nx.DiGraph()
# Removes all non doc nodes
for u, v in graph.edges():
u = graph.node[u]
v = graph.node[v]
if is_doc(u) and is_doc(v):
docs_graph.add_edge(u["entity_id"], v["entity_id"])
r = nx.pagerank(docs_graph, alpha=0.7)
if len(r) == 0:
return []
ids, _pg = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))
return ids[:limit]
def venueNet_feature():
# output: a centrality score for each venue (venue centrality dict)
CSpaper = pickle.load(open(cspath+"CSvenuePaper","rb")) # all papers in CS venues
CSvenue_paper = pickle.load(open(cspath+"CSvenue_paper","rb")) #data type, dict, key, value: list
Citations = pickle.load(open(cspath+"Citations","rb"))
CSPV = pickle.load( open(cspath+"CSvenuePaper_Venue","rb")) #data type, dict, key, value: list
nodeSet = set()
edgeSet = set()
for key,val in CSvenue_paper.iteritems():
nodeSet.add(key)
temp = defaultdict(int)
for p in val:
for citing in Citations[p]:
if citing in CSpaper:
temp[(CSPV[citing],key)] +=1
edges = [(key[0],key[1],val) for key,val in temp.iteritems()]
edgeSet.update(edges)
g = nx.DiGraph()
g.add_nodes_from(nodeSet)
g.add_weighted_edges_from(edgeSet)
pr = defaultdict(int)
for node in g.nodes():
pr[node]=1
#DG.add_weighted_edges_from([(1,2,0.5), (3,1,0.75)])
#pr = nx.pagerank(g)
#PageRank is time-consuming, so a uniform placeholder of 1 is used above; re-enable nx.pagerank for a real run
pickle.dump(pr,open(cspath+"venue_cen","wb"))
print 'venueNet_feature finish'
def pagerank(self):
"""Return the PageRank of the nodes in the graph.
Returns
-------
pagerank : dictionary
Dictionary of nodes with PageRank as value
Examples
--------
>>> g.pagerank()
"""
return nx.pagerank(self._graph, weight=self._weight_field)
def __init__(self, stemmer, language, N=2, G=nx.DiGraph()):
self.G = G
self.stemmer = stemmer
self.language = language
self.N = N
self.counter = Counter()
self.pr = nx.pagerank(G)
def sort_words(vertex_source, edge_source, model, window = 2, pagerank_config = {'alpha': 0.85,}):
"""??????????????
Keyword arguments:
vertex_source -- ???????????????????????????????pagerank????
edge_source -- ?????????????????????????????????pagerank???
window -- ????????window????????????
pagerank_config -- pagerank???
"""
#build the word/index lookup tables
sorted_words = []
word_index = {}
index_word = {}
_vertex_source = vertex_source
_edge_source = edge_source
words_number = 0
for word_list in _vertex_source:
for word in word_list:
if not word in word_index:
word_index[word] = words_number
index_word[words_number] = word
words_number += 1
graph = np.zeros((words_number, words_number))
#build the edges
for word_list in _edge_source:
for w1, w2 in combine(word_list, window):
if w1 in word_index and w2 in word_index:
index1 = word_index[w1]
index2 = word_index[w2]
try:
similarity = model.similarity(w1,w2)
if similarity<0:
similarity = 0
#print similarity
except Exception: # e.g. a word missing from the model's vocabulary
similarity = 0
graph[index1][index2] = similarity
graph[index2][index1] = similarity
# graph[index1][index2] = 1.0
# graph[index2][index1] = 1.0
nx_graph = nx.from_numpy_matrix(graph)
scores = nx.pagerank(nx_graph, max_iter=100,**pagerank_config) # this is a dict
sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
for index, score in sorted_scores:
item = AttrDict(word=index_word[index], weight=score)
sorted_words.append(item)
return sorted_words
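combine is defined elsewhere in the project; here is a minimal sketch of a compatible generator, yielding pairs of words that appear within `window` positions of each other (illustrative, not necessarily the original):
def combine(word_list, window=2):
    # Pair each word with the words up to window-1 positions to its right.
    window = max(window, 2)
    for offset in range(1, window):
        for w1, w2 in zip(word_list, word_list[offset:]):
            yield (w1, w2)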
def sort_words(vertex_source, edge_source, window = 2, pagerank_config = {'alpha': 0.85,}):
"""??????????????
Keyword arguments:
vertex_source -- ???????????????????????????????pagerank????
edge_source -- ?????????????????????????????????pagerank???
window -- ????????window????????????
pagerank_config -- pagerank???
"""
sorted_words = []
word_index = {}
index_word = {}
_vertex_source = vertex_source
_edge_source = edge_source
words_number = 0
for word_list in _vertex_source:
for word in word_list:
if not word in word_index:
word_index[word] = words_number
index_word[words_number] = word
words_number += 1
graph = np.zeros((words_number, words_number))
for word_list in _edge_source:
for w1, w2 in combine(word_list, window):
if w1 in word_index and w2 in word_index:
index1 = word_index[w1]
index2 = word_index[w2]
graph[index1][index2] = 1.0
graph[index2][index1] = 1.0
debug('graph:\n', graph)
nx_graph = nx.from_numpy_matrix(graph)
scores = nx.pagerank(nx_graph, **pagerank_config) # this is a dict
sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
for index, score in sorted_scores:
item = AttrDict(word=index_word[index], weight=score)
sorted_words.append(item)
return sorted_words
def extractKeyphrases(text):
#tokenize the text using nltk
wordTokens = nltk.word_tokenize(text)
#assign POS tags to the words in the text
tagged = nltk.pos_tag(wordTokens)
textlist = [x[0] for x in tagged]
tagged = filter_for_tags(tagged)
tagged = normalize(tagged)
unique_word_set = unique_everseen([x[0] for x in tagged])
word_set_list = list(unique_word_set)
#this will be used to determine adjacent words in order to construct keyphrases with two words
graph = buildGraph(word_set_list)
#PageRank - initial value of 1.0, error tolerance of 0.0001
calculated_page_rank = nx.pagerank(graph, weight='weight')
#most important words first (descending order of importance)
keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
#the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
aThird = int(len(word_set_list) / 3)
keyphrases = keyphrases[0:aThird+1]
#take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
#together
modifiedKeyphrases = set([])
dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
i = 0
j = 1
while j < len(textlist):
firstWord = textlist[i]
secondWord = textlist[j]
if firstWord in keyphrases and secondWord in keyphrases:
keyphrase = firstWord + ' ' + secondWord
modifiedKeyphrases.add(keyphrase)
dealtWith.add(firstWord)
dealtWith.add(secondWord)
else:
if firstWord in keyphrases and firstWord not in dealtWith:
modifiedKeyphrases.add(firstWord)
#if this is the last word in the text, and it is a keyword,
#it definitely has no chance of being a keyphrase at this point
if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
modifiedKeyphrases.add(secondWord)
i = i + 1
j = j + 1
return modifiedKeyphrases
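A quick call of extractKeyphrases (filter_for_tags, normalize, unique_everseen and buildGraph come from the same source file; the input text is made up):
text = ("Graph-based ranking algorithms such as PageRank decide the importance of a "
        "vertex within a graph. TextRank applies the same idea to text, ranking words "
        "and sentences instead of web pages.")
print(extractKeyphrases(text))   # e.g. a set like {'PageRank', 'TextRank', 'graph', ...}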
def extractKeyphrases(text):
#tokenize the text using nltk
wordTokens = nltk.word_tokenize(text)
#assign POS tags to the words in the text
tagged = nltk.pos_tag(wordTokens)
textlist = [x[0] for x in tagged]
tagged = filter_for_tags(tagged)
tagged = normalize(tagged)
unique_word_set = unique_everseen([x[0] for x in tagged])
word_set_list = list(unique_word_set)
#this will be used to determine adjacent words in order to construct keyphrases with two words
graph = buildGraph(word_set_list)
#PageRank - initial value of 1.0, error tolerance of 0.0001
calculated_page_rank = nx.pagerank(graph, weight='weight')
#most important words first (descending order of importance)
keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
#the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
aThird = len(word_set_list) // 3 #integer division so the slice below gets an int
keyphrases = keyphrases[0:aThird+1]
#take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
#together
modifiedKeyphrases = set([])
dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
i = 0
j = 1
while j < len(textlist):
firstWord = textlist[i]
secondWord = textlist[j]
if firstWord in keyphrases and secondWord in keyphrases:
keyphrase = firstWord + ' ' + secondWord
modifiedKeyphrases.add(keyphrase)
dealtWith.add(firstWord)
dealtWith.add(secondWord)
else:
if firstWord in keyphrases and firstWord not in dealtWith:
modifiedKeyphrases.add(firstWord)
#if this is the last word in the text, and it is a keyword,
#it definitely has no chance of being a keyphrase at this point
if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
modifiedKeyphrases.add(secondWord)
i = i + 1
j = j + 1
return modifiedKeyphrases