Python pagerank(): example source code

util.py (project: AIZooService, author: zhanglbjames)
import numpy as np
import networkx as nx

def sort_words(vertex_source, edge_source, window=2, pagerank_config={'alpha': 0.85}):
    """Sort words by importance, from most to least important.

    Keyword arguments:
    vertex_source   --  list of word lists, one per sentence; these words become the PageRank nodes
    edge_source     --  list of word lists, one per sentence; word co-occurrence within them builds the PageRank edges
    window          --  words at most `window` positions apart in a sentence are considered connected by an edge
    pagerank_config --  configuration dict passed through to nx.pagerank
    """
    sorted_words   = []
    word_index     = {}
    index_word     = {}
    _vertex_source = vertex_source
    _edge_source   = edge_source
    words_number   = 0
    for word_list in _vertex_source:
        for word in word_list:
            if word not in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1

    graph = np.zeros((words_number, words_number))

    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):  # combine is a project helper yielding word pairs within the window
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] = 1.0
                graph[index2][index1] = 1.0

    debug('graph:\n', graph)

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)          # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)  # AttrDict: project helper, a dict with attribute access
        sorted_words.append(item)

    return sorted_words
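
The function above leans on project helpers (combine, AttrDict, debug) that are defined elsewhere in util.py. A minimal, self-contained sketch of the same pipeline, with plain tuples instead of AttrDict and a hand-rolled window loop instead of combine, might look like this:

import numpy as np
import networkx as nx

# Toy input: one tokenized "sentence"; vertex and edge sources coincide,
# as in a typical call to sort_words.
sentences = [["pagerank", "ranks", "graph", "nodes", "via", "links"]]
window = 2

vocab = []
for sent in sentences:
    for w in sent:
        if w not in vocab:
            vocab.append(w)
word_index = {w: i for i, w in enumerate(vocab)}

graph = np.zeros((len(vocab), len(vocab)))
for sent in sentences:
    for i, w1 in enumerate(sent):
        for w2 in sent[i + 1:i + window]:  # neighbors within the window
            a, b = word_index[w1], word_index[w2]
            graph[a][b] = graph[b][a] = 1.0

nx_graph = nx.from_numpy_array(graph)  # from_numpy_matrix on older networkx
scores = nx.pagerank(nx_graph, alpha=0.85)
print(sorted(((vocab[i], s) for i, s in scores.items()),
             key=lambda t: t[1], reverse=True))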
AKE.py (project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi)
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import networkx, nltk

    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)  # helper defined elsewhere; a plausible version is sketched after this function
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=lambda x: x[1], reverse=True)[:n_keywords]}
    keywords = set(word_ranks.keys())
    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)

    return sorted(keyphrases.items(), key=lambda x: x[1], reverse=True)
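
score_keyphrases_by_textrank depends on an extract_candidate_words helper that is not shown. A plausible minimal version filters tokens to nouns and adjectives with NLTK's POS tagger; the tag set and stopword filtering below are assumptions, not the project's exact code:

import string
from itertools import chain

import nltk

def extract_candidate_words(text, good_tags={'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS'}):
    """Return lower-cased words whose POS tag marks them as noun/adjective candidates."""
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tagged_words = chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    return [word.lower() for word, tag in tagged_words
            if tag in good_tags
            and word.lower() not in stop_words
            and not all(ch in punct for ch in word)]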
Textrank_count.py (project: Graduation-design, author: Baichenjia)
def build_matrix():
    ###### build word-to-index and index-to-word mappings
    word_index = {}  # word -> index
    index_word = {}  # index -> word
    weibo_data = handel_weibo_data()  # load the preprocessed weibo sentences
    index = 0
    for sent in weibo_data:  # iterate over sentences
        for word in sent:  # iterate over the words of each sentence
            if word not in word_index:
                word_index[word] = index
                index_word[index] = word
                index += 1
    words_number = index
    # print("words_number", words_number)
    ###### build the co-occurrence matrix
    graph = np.zeros((words_number, words_number))  # adjacency (weight) matrix
    for word_list in weibo_data:  # for every sentence
        for i in range(len(word_list)):  # every pair of words co-occurring in a sentence gets an edge
            for j in range(i + 1, len(word_list)):  # start at i + 1 so a word is not paired with itself
                w1 = word_list[i]
                w2 = word_list[j]  # the two co-occurring words
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] += 1  # increment the co-occurrence weight
                graph[index2][index1] += 1  # keep the matrix symmetric
    ###### feed the matrix to networkx and rank the words with PageRank
    nx_graph = nx.from_numpy_matrix(graph)  # build a networkx graph from the matrix
    scores = nx.pagerank(nx_graph, alpha=0.85)  # run the PageRank algorithm
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)  # sort by score, descending
    key_words = []  # collect (word, score) pairs
    for index, score in sorted_scores:
        if index_word[index] == u'??' or index_word[index] == u'??' or len(index_word[index]) == 1:
            continue  # skip two specific stop words and any single-character word
        key_words.append((index_word[index], score))
    ###### write the top 100 key words to the result file
    fp_textrank_result = open('f://emotion/mysite/Label_extract/result_textrank.txt', 'w+')
    for i in range(100):
        fp_textrank_result.write(key_words[i][0] + ' ' + str(round(key_words[i][1], 10)))
        fp_textrank_result.write('\n')
    fp_textrank_result.close()
    """
    fp_test = open('f://emotion/mysite/Label_extract/test.txt', 'w+')
    for i in range(100):
        fp_test.write(key_words[i][0] + '?')
    fp_test.close()
    """
    print "textrank key word calculate is success..."
    return key_words
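
Both sort_words and build_matrix call nx.from_numpy_matrix, which was deprecated in networkx 2.x and removed in networkx 3.0. On current releases the drop-in replacement is nx.from_numpy_array; a small compatibility sketch:

import numpy as np
import networkx as nx

m = np.zeros((2, 2))
m[0][1] = m[1][0] = 1.0
# Prefer from_numpy_array when it exists; fall back for old networkx versions.
from_matrix = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
nx_graph = from_matrix(m)
print(nx.pagerank(nx_graph, alpha=0.85))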
remove_cycle_edges_by_hierarchy.py (project: breaking_cycles_in_noisy_hierarchies, author: zhenv5)
def computing_hierarchy(graph_file, players_score_func_name):
    import os.path
    if players_score_func_name == "socialagony":
        from helper_funs import dir_tail_name
        dir_name, tail = dir_tail_name(graph_file)
        agony_file = os.path.join(dir_name, tail.split(".")[0] + "_socialagony.txt")
        # agony_file = graph_file[:len(graph_file)-6] + "_socialagony.txt"
        # from compute_social_agony import compute_social_agony
        # players = compute_social_agony(graph_file, agony_path="agony/agony ")
        if False:  # toggled off: set to os.path.isfile(agony_file) to reuse a pre-computed agony file
            print("load pre-computed socialagony from: %s" % agony_file)
            players = read_dict_from_file(agony_file)
        else:
            print("start computing socialagony...")
            from compute_social_agony import compute_social_agony
            players = compute_social_agony(graph_file, agony_path="agony/agony ")
            print("write socialagony to file: %s" % agony_file)
        return players
    g = nx.read_edgelist(graph_file, create_using=nx.DiGraph(), nodetype=int)
    if players_score_func_name == "pagerank":
        # print("computing pagerank...")
        players = nx.pagerank(g, alpha=0.85)
        return players
    elif players_score_func_name == "trueskill":
        output_file = graph_file[:len(graph_file)-6] + "_trueskill.txt"
        output_file_2 = graph_file[:len(graph_file)-6] + "_trueskill.pkl"
        #from true_skill import graphbased_trueskill
        #players = graphbased_trueskill(g)
        #from file_io import write_dict_to_file
        #write_dict_to_file(players,output_file)

        '''
        if os.path.isfile(output_file):
            print("load pre-computed trueskill from: %s" % output_file)
            players = read_dict_from_file(output_file,key_type = int, value_type = float)
        elif os.path.isfile(output_file_2):
            print("load pre-computed trueskill from: %s" % output_file_2)
            players = read_from_pickle(output_file_2)           
        '''
        if True:
            print("start computing trueskill...")
            from true_skill import graphbased_trueskill
            players = graphbased_trueskill(g)
            from file_io import write_dict_to_file
            print("write trueskill to file: %s" % output_file)
            write_dict_to_file(players,output_file)

        return players
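
The "pagerank" branch above is self-contained apart from the input file. A minimal sketch of the expected edge-list format and the call (toy.edges is a hypothetical file name):

import networkx as nx

# nx.read_edgelist expects one "u v" pair per line; nodetype=int parses the ids.
with open("toy.edges", "w") as f:
    f.write("1 2\n1 3\n2 3\n3 1\n")

g = nx.read_edgelist("toy.edges", create_using=nx.DiGraph(), nodetype=int)
players = nx.pagerank(g, alpha=0.85)  # dict: node id -> PageRank score
print(players)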
remove_cycle_edges_by_hierarchy.py (project: breaking_cycles_in_noisy_hierarchies, author: zhenv5)
def breaking_cycles_by_hierarchy_performance(graph_file, gt_file, players_score_name):

    from measures import report_performance
    if players_score_name != "ensembling":
        players_score_dict = computing_hierarchy(graph_file, players_score_name)
        e1, e2, e3, e4 = remove_cycle_edges_by_hierarchy(graph_file, players_score_dict, players_score_name)

        if players_score_name == "pagerank":
            report_performance(gt_file, e1, "PR")
            return

        if players_score_name == "socialagony":
            note = "SA_"
        elif players_score_name == "trueskill":
            note = "TS_"

        report_performance(gt_file, e1, note + "G")
        report_performance(gt_file, e2, note + "F")
        report_performance(gt_file, e3, note + "B")
        report_performance(gt_file, e4, note + "Voting")
    else:
        players_score_dict = computing_hierarchy(graph_file, "socialagony")
        e1, e2, e3, e4 = remove_cycle_edges_by_hierarchy(graph_file, players_score_dict, "socialagony")
        report_performance(gt_file, e1, "SA_G")
        write_pairs_to_file(e1, graph_file[:len(graph_file)-6] + "_removed_by_SA-G.edges")
        report_performance(gt_file, e2, "SA_F")
        write_pairs_to_file(e2, graph_file[:len(graph_file)-6] + "_removed_by_SA-F.edges")
        report_performance(gt_file, e3, "SA_B")
        write_pairs_to_file(e3, graph_file[:len(graph_file)-6] + "_removed_by_SA-B.edges")
        report_performance(gt_file, e4, "SA_Voting")
        write_pairs_to_file(e4, graph_file[:len(graph_file)-6] + "_removed_by_SA-Voting.edges")

        players_score_dict = computing_hierarchy(graph_file, "trueskill")
        e5, e6, e7, e8 = remove_cycle_edges_by_hierarchy(graph_file, players_score_dict, "trueskill")
        report_performance(gt_file, e5, "TS_G")
        write_pairs_to_file(e5, graph_file[:len(graph_file)-6] + "_removed_by_TS-G.edges")
        report_performance(gt_file, e6, "TS_F")
        write_pairs_to_file(e6, graph_file[:len(graph_file)-6] + "_removed_by_TS-F.edges")
        report_performance(gt_file, e7, "TS_B")
        write_pairs_to_file(e7, graph_file[:len(graph_file)-6] + "_removed_by_TS-B.edges")
        report_performance(gt_file, e8, "TS_Voting")
        write_pairs_to_file(e8, graph_file[:len(graph_file)-6] + "_removed_by_TS-Voting.edges")  # e8, not e7: write the set that was just reported

        e9 = remove_cycle_edges_by_voting(graph_file, [set(e1), set(e2), set(e3), set(e5), set(e6), set(e7)])
        report_performance(gt_file, e9, "H_Voting")
        write_pairs_to_file(e9, graph_file[:len(graph_file)-6] + "_removed_by_H-Voting.edges")
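
The ensembling branch ends by voting over the six edge sets removed by the individual SA and TS strategies. remove_cycle_edges_by_voting is a project function (it takes the graph file so it can, presumably, verify the result is acyclic); a simplified majority vote over edge sets, shown only to illustrate the idea, might look like:

from collections import Counter

def vote_edges(edge_sets, threshold=None):
    """Keep every edge proposed by at least `threshold` of the strategies
    (default: a strict majority). Hypothetical helper, not the project's code."""
    if threshold is None:
        threshold = len(edge_sets) // 2 + 1
    counts = Counter(e for s in edge_sets for e in s)
    return [e for e, c in counts.items() if c >= threshold]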

