def score_keyphrases_by_textrank(text, n_keywords=0.05):
    """Score keyphrases in ``text`` by running PageRank over a word graph.

    Candidate words are linked when they are adjacent in the candidate
    sequence, the resulting undirected graph is scored with
    ``networkx.pagerank``, the top-scoring words are kept as keywords, and
    runs of consecutive keywords in the tokenized text are merged into
    keyphrases.

    Args:
        text: The document to analyse, as a string.
        n_keywords: How many top-ranked words to keep. A value strictly
            between 0 and 1 is treated as a fraction of ``len(candidates)``
            (candidate occurrences, not unique words) and rounded to an
            integer. Any other value is used directly as a slice bound, so
            it should be an integer.

    Returns:
        A list of ``(keyphrase, score)`` tuples sorted by descending score,
        where ``score`` is the mean PageRank of the words in the keyphrase.
    """
    from itertools import takewhile, tee
    try:
        from itertools import izip  # Python 2
    except ImportError:
        izip = zip  # Python 3: the builtin zip is already lazy

    import networkx
    import nltk

    # Tokenize every word (lowercased); these are scanned later to merge
    # adjacent keywords into phrases.
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    # NOTE(review): extract_candidate_words is defined elsewhere; it is
    # presumably expected to return lowercased tokens so that they can match
    # entries of `words` below -- confirm against its definition.
    candidates = extract_candidate_words(text)

    # Build the graph: each node is a unique candidate word.
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))

    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return izip(a, b)

    # Add an unweighted edge between each pair of neighbouring candidates.
    for w1, w2 in pairwise(candidates):
        if w2:  # skip pairs whose second element is falsy (e.g. '')
            graph.add_edge(*sorted([w1, w2]))

    # Score nodes with the default PageRank settings and keep the top
    # n_keywords of them.
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    top_ranked = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
    word_ranks = {word: rank for word, rank in top_ranked[:n_keywords]}
    keywords = set(word_ranks)

    # Merge runs of consecutive keywords into keyphrases.
    keyphrases = {}
    j = 0  # index of the first word not yet consumed by a keyphrase
    for i, word in enumerate(words):
        if i < j:
            # Already part of the previous keyphrase; skipping keeps the
            # merged keyphrases non-overlapping.
            continue
        if word in keywords:
            # A keyphrase is at most 10 consecutive keyword tokens long.
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            j = i + len(kp_words)

    return sorted(keyphrases.items(), key=lambda x: x[1], reverse=True)
AKE.py 文件源码
python
阅读 25
收藏 0
点赞 0
评论 0
评论列表
文章目录