def sort_words(vertex_source, edge_source, model, window = 2, pagerank_config = {'alpha': 0.85,}):
    """Rank words by PageRank score over a similarity-weighted word graph.

    Keyword arguments:
    vertex_source   -- iterable of word lists; every distinct word becomes a
                       graph node
    edge_source     -- iterable of word lists; each pair yielded by
                       ``combine(word_list, window)`` whose two words are both
                       nodes becomes an undirected edge
    model           -- object exposing ``similarity(w1, w2)``; its result,
                       clamped to be non-negative, is used as the edge weight
    window          -- forwarded unchanged to ``combine`` (presumably the
                       co-occurrence window size -- confirm against ``combine``)
    pagerank_config -- keyword arguments forwarded to ``nx.pagerank``;
                       ``max_iter`` defaults to 100 unless supplied here.
                       The default dict is only read, never mutated.

    Returns a list of ``AttrDict(word=..., weight=...)`` items ordered by
    descending PageRank score.
    """
    # Assign each distinct vertex word a dense integer id, in first-seen order.
    word_index = {}
    words = []
    for word_list in vertex_source:
        for word in word_list:
            if word not in word_index:
                word_index[word] = len(words)
                words.append(word)

    words_number = len(words)
    graph = np.zeros((words_number, words_number))

    # Fill the symmetric adjacency matrix with similarity weights.
    for word_list in edge_source:
        for w1, w2 in combine(word_list, window):
            if w1 not in word_index or w2 not in word_index:
                continue
            try:
                similarity = model.similarity(w1, w2)
            except Exception:
                # Best effort: a pair the model cannot score (e.g. a word it
                # does not know) contributes no weight.  Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                similarity = 0
            else:
                # Negative similarities are treated as "no relation".
                if similarity < 0:
                    similarity = 0
            index1 = word_index[w1]
            index2 = word_index[w2]
            graph[index1, index2] = similarity
            graph[index2, index1] = similarity

    # ``from_numpy_matrix`` was removed in NetworkX 3.0; prefer its
    # replacement and fall back to the old name on older installations.
    build_graph = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
    nx_graph = build_graph(graph)

    # Merge instead of passing ``max_iter`` alongside ``**pagerank_config`` so
    # that a caller-supplied ``max_iter`` overrides the default rather than
    # raising "got multiple values for keyword argument".
    pagerank_kwargs = {'max_iter': 100}
    pagerank_kwargs.update(pagerank_config)
    scores = nx.pagerank(nx_graph, **pagerank_kwargs)  # dict: node id -> score

    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [AttrDict(word=words[index], weight=score)
            for index, score in sorted_scores]
# Comment list
# Article table of contents