def sort_words(vertex_source, edge_source, window=2, pagerank_config={'alpha': 0.85}):
    """Sort words by importance, most important first.

    Keyword arguments:
    vertex_source   -- list of word lists (one per sentence); its words become the PageRank nodes
    edge_source     -- list of word lists (one per sentence); word co-occurrence in them defines the PageRank edges
    window          -- words at most `window` positions apart in a sentence are treated as connected
    pagerank_config -- configuration dict passed through to nx.pagerank
    """
    sorted_words = []
    word_index = {}
    index_word = {}
    _vertex_source = vertex_source
    _edge_source = edge_source
    words_number = 0
    # assign a stable integer index to every distinct word
    for word_list in _vertex_source:
        for word in word_list:
            if word not in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1
    # symmetric adjacency matrix: 1.0 wherever two words co-occur within a window
    graph = np.zeros((words_number, words_number))
    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):  # `combine` yields word pairs within the window
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] = 1.0
                graph[index2][index1] = 1.0
    debug('graph:\n', graph)
    nx_graph = nx.from_numpy_matrix(graph)  # removed in NetworkX 3.0; see the note below
    scores = nx.pagerank(nx_graph, **pagerank_config)  # dict: node index -> PageRank score
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)
        sorted_words.append(item)
    return sorted_words
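Note that nx.from_numpy_matrix, used here and in build_matrix below, was removed in NetworkX 3.0; from_numpy_array has been the equivalent entry point since NetworkX 2.0. A small compatibility shim:

    # prefer from_numpy_array (NetworkX >= 2.0), fall back on old 1.x installs
    try:
        nx_graph = nx.from_numpy_array(graph)
    except AttributeError:
        nx_graph = nx.from_numpy_matrix(graph)

A hypothetical call, assuming the module-level helpers combine, AttrDict, and debug that sort_words references are importable from the same utility module:

    sentences = [['graph', 'ranking', 'algorithm'], ['ranking', 'words', 'graph']]
    for item in sort_words(sentences, sentences, window=2)[:3]:
        print(item.word, item.weight)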
AKE.py source (project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi)
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import networkx, nltk
    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=lambda x: x[1], reverse=True)[:n_keywords]}
    keywords = set(word_ranks.keys())
    # merge adjacent keywords into keyphrases (up to 10 words long)
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)
    return sorted(keyphrases.items(), key=lambda x: x[1], reverse=True)
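score_keyphrases_by_textrank depends on an extract_candidate_words helper defined elsewhere in AKE.py. A minimal sketch under the usual TextRank assumption (keep adjectives and nouns, drop stopwords and punctuation); the tag set and filters here are assumptions, not necessarily the repo's exact helper:

    import itertools, string
    import nltk

    def extract_candidate_words(text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
        # hypothetical sketch: keep lowercased adjectives/nouns that are
        # neither stopwords nor pure punctuation
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                          for sent in nltk.sent_tokenize(text))
        return [word.lower()
                for word, tag in itertools.chain.from_iterable(tagged_sents)
                if tag in good_tags
                and word.lower() not in stop_words
                and not all(ch in punct for ch in word)]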
def build_matrix():
    ###### build the word <-> index mappings ######
    word_index = {}  # word -> index
    index_word = {}  # index -> word
    weibo_data = handel_weibo_data()  # load the preprocessed Weibo data (one word list per sentence)
    index = 0
    for sent in weibo_data:      # each sentence
        for word in sent:        # each word in the sentence
            if word not in word_index:
                word_index[word] = index
                index_word[index] = word
                index += 1
    words_number = index
    #print("words_number", words_number)
    ####### build the co-occurrence matrix #######
    graph = np.zeros((words_number, words_number))  # words_number x words_number matrix
    for word_list in weibo_data:
        # every pair of words in the same sentence counts as a co-occurrence;
        # note that range(i, ...) also pairs each word with itself, adding self-loops
        for i in range(len(word_list)):
            for j in range(i, len(word_list)):
                w1 = word_list[i]
                w2 = word_list[j]
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] += 1  # increment the co-occurrence count
                graph[index2][index1] += 1  # keep the matrix symmetric
    ###### run PageRank via networkx and sort words by score ######
    nx_graph = nx.from_numpy_matrix(graph)  # see the NetworkX 3.0 note after sort_words above
    scores = nx.pagerank(nx_graph, alpha=0.85)  # compute PageRank scores
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)  # sort descending
    key_words = []  # collect (word, score) pairs
    for index, score in sorted_scores:
        # skip two specific stopwords (their Chinese literals were lost to encoding) and single characters
        if index_word[index] == u'??' or index_word[index] == u'??' or len(index_word[index]) == 1:
            continue
        key_words.append((index_word[index], score))
    ######## write the top 100 keywords to the result file ########
    fp_textrank_result = open('f://emotion/mysite/Label_extract/result_textrank.txt', 'w+')
    for i in range(100):
        fp_textrank_result.write(key_words[i][0] + ' ' + str(round(key_words[i][1], 10)))
        fp_textrank_result.write('\n')
    fp_textrank_result.close()
    """
    fp_test = open('f://emotion/mysite/Label_extract/test.txt', 'w+')
    for i in range(100):
        fp_test.write(key_words[i][0] + '?')
    fp_test.close()
    """
    print("textrank keyword calculation succeeded...")
    return key_words
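Unlike sort_words, which writes binary 1.0 edges, build_matrix accumulates co-occurrence counts. Those counts survive as the 'weight' edge attribute when the matrix is loaded into networkx, and nx.pagerank uses that attribute by default, so frequent pairs pull proportionally more rank. A tiny self-contained illustration (from_numpy_array is the NetworkX 2.0+ name):

    import numpy as np
    import networkx as nx

    m = np.array([[0., 3., 1.],
                  [3., 0., 0.],
                  [1., 0., 0.]])
    g = nx.from_numpy_array(m)         # edge (0, 1) gets weight=3.0, edge (0, 2) weight=1.0
    print(nx.pagerank(g, alpha=0.85))  # node 1 outranks node 2 thanks to the heavier edge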
remove_cycle_edges_by_hierarchy.py source (project: breaking_cycles_in_noisy_hierarchies, author: zhenv5)
def computing_hierarchy(graph_file, players_score_func_name):
    import os.path
    if players_score_func_name == "socialagony":
        from helper_funs import dir_tail_name
        dir_name, tail = dir_tail_name(graph_file)
        agony_file = os.path.join(dir_name, tail.split(".")[0] + "_socialagony.txt")
        #agony_file = graph_file[:len(graph_file)-6] + "_socialagony.txt"
        #from compute_social_agony import compute_social_agony
        #players = compute_social_agony(graph_file, agony_path="agony/agony ")
        if False:
        #if os.path.isfile(agony_file):
            print("load pre-computed socialagony from: %s" % agony_file)
            players = read_dict_from_file(agony_file)
        else:
            print("start computing socialagony...")
            from compute_social_agony import compute_social_agony
            players = compute_social_agony(graph_file, agony_path="agony/agony ")
            print("write socialagony to file: %s" % agony_file)
        return players
    g = nx.read_edgelist(graph_file, create_using=nx.DiGraph(), nodetype=int)
    if players_score_func_name == "pagerank":
        #print("computing pagerank...")
        players = nx.pagerank(g, alpha=0.85)
        return players
    elif players_score_func_name == "trueskill":
        output_file = graph_file[:len(graph_file)-6] + "_trueskill.txt"
        output_file_2 = graph_file[:len(graph_file)-6] + "_trueskill.pkl"
        #from true_skill import graphbased_trueskill
        #players = graphbased_trueskill(g)
        #from file_io import write_dict_to_file
        #write_dict_to_file(players, output_file)
        '''
        if os.path.isfile(output_file):
            print("load pre-computed trueskill from: %s" % output_file)
            players = read_dict_from_file(output_file, key_type=int, value_type=float)
        elif os.path.isfile(output_file_2):
            print("load pre-computed trueskill from: %s" % output_file_2)
            players = read_from_pickle(output_file_2)
        '''
        if True:
            print("start computing trueskill...")
            from true_skill import graphbased_trueskill
            players = graphbased_trueskill(g)
            from file_io import write_dict_to_file
            print("write trueskill to file: %s" % output_file)
            write_dict_to_file(players, output_file)
        return players
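A hypothetical invocation; the graph_file[:len(graph_file)-6] slicing above suggests file names ending in ".edges", and the file name here is an assumption:

    players = computing_hierarchy('dag.edges', 'pagerank')
    top = max(players, key=players.get)
    print('top-ranked node: %d (score %.4f)' % (top, players[top]))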
remove_cycle_edges_by_hierarchy.py source (project: breaking_cycles_in_noisy_hierarchies, author: zhenv5)
def breaking_cycles_by_hierarchy_performance(graph_file, gt_file, players_score_name):
    from measures import report_performance
    if players_score_name != "ensembling":
        players_score_dict = computing_hierarchy(graph_file, players_score_name)
        e1, e2, e3, e4 = remove_cycle_edges_by_hierarchy(graph_file, players_score_dict, players_score_name)
        if players_score_name == "pagerank":
            report_performance(gt_file, e1, "PR")
            return
        if players_score_name == "socialagony":
            note = "SA_"
        elif players_score_name == "trueskill":
            note = "TS_"
        report_performance(gt_file, e1, note+"G")
        report_performance(gt_file, e2, note+"F")
        report_performance(gt_file, e3, note+"B")
        report_performance(gt_file, e4, note+"Voting")
    else:
        players_score_dict = computing_hierarchy(graph_file, "socialagony")
        e1, e2, e3, e4 = remove_cycle_edges_by_hierarchy(graph_file, players_score_dict, "socialagony")
        report_performance(gt_file, e1, "SA_G")
        write_pairs_to_file(e1, graph_file[:len(graph_file)-6] + "_removed_by_SA-G.edges")
        report_performance(gt_file, e2, "SA_F")
        write_pairs_to_file(e2, graph_file[:len(graph_file)-6] + "_removed_by_SA-F.edges")
        report_performance(gt_file, e3, "SA_B")
        write_pairs_to_file(e3, graph_file[:len(graph_file)-6] + "_removed_by_SA-B.edges")
        report_performance(gt_file, e4, "SA_Voting")
        write_pairs_to_file(e4, graph_file[:len(graph_file)-6] + "_removed_by_SA-Voting.edges")
        players_score_dict = computing_hierarchy(graph_file, "trueskill")
        e5, e6, e7, e8 = remove_cycle_edges_by_hierarchy(graph_file, players_score_dict, "trueskill")
        report_performance(gt_file, e5, "TS_G")
        write_pairs_to_file(e5, graph_file[:len(graph_file)-6] + "_removed_by_TS-G.edges")
        report_performance(gt_file, e6, "TS_F")
        write_pairs_to_file(e6, graph_file[:len(graph_file)-6] + "_removed_by_TS-F.edges")
        report_performance(gt_file, e7, "TS_B")
        write_pairs_to_file(e7, graph_file[:len(graph_file)-6] + "_removed_by_TS-B.edges")
        report_performance(gt_file, e8, "TS_Voting")
        write_pairs_to_file(e8, graph_file[:len(graph_file)-6] + "_removed_by_TS-Voting.edges")  # was e7, likely a copy-paste slip
        e9 = remove_cycle_edges_by_voting(graph_file, [set(e1), set(e2), set(e3), set(e5), set(e6), set(e7)])
        report_performance(gt_file, e9, "H_Voting")
        write_pairs_to_file(e9, graph_file[:len(graph_file)-6] + "_removed_by_H-Voting.edges")
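A sketch of how this driver might be called; the file names are hypothetical, and gt_file is the ground-truth edge list consumed by report_performance:

    # run a single scorer, then the full ensemble
    breaking_cycles_by_hierarchy_performance('graph.edges', 'graph_gt.edges', 'pagerank')
    breaking_cycles_by_hierarchy_performance('graph.edges', 'graph_gt.edges', 'ensembling')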