Example source code for Python's pagerank()
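All of the snippets below ultimately call networkx's pagerank(). For orientation, a minimal self-contained sketch of the call itself (the graph and node names are purely illustrative, not taken from any of the projects):

import networkx as nx

# tiny example graph; the projects below use sentences, words, papers or venues as nodes
g = nx.DiGraph()
g.add_weighted_edges_from([('a', 'b', 1.0), ('b', 'c', 0.5), ('c', 'a', 2.0)])

scores = nx.pagerank(g, alpha=0.85, weight='weight')   # dict: node -> PageRank score
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)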

eagle.py (project: stock-eagle, author: mtusman)
def rank(nodes, edges):
    ''' Creates the graph from the given nodes (sentences) and their weighted edges '''
    graph = nx.DiGraph()
    graph.add_nodes_from(nodes)
    graph.add_weighted_edges_from(edges)
    ''' Uses Google's PageRank formula to find the most important sentences '''
    return nx.pagerank(graph)
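A hedged sketch of how rank() above might be called; the sentences and edge weights are invented for illustration (import networkx as nx is assumed):

nodes = ['Sentence one.', 'Sentence two.', 'Sentence three.']
edges = [('Sentence one.', 'Sentence two.', 0.4),
         ('Sentence two.', 'Sentence three.', 0.7)]
scores = rank(nodes, edges)              # dict mapping each sentence to its PageRank score
best_sentence = max(scores, key=scores.get)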
papyrus_summary_extraction_tool.py (project: Papyrus--simple-but-effective-text-summarization-tool, author: RebeccaMerrett)
def function_2(text):
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized_matrix * normalized_matrix.T #pairwise similarity between the tf-idf paragraph vectors
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph) #TextRank applied
    ranked = sorted(((scores[i],s) for i,s in enumerate(paragraphs)), reverse=True) #Sorts all paragraphs from highest to lowest scores
    ten_percent = int(round(10.00/100.00 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    summary = [x[1] for x in ten_percent_high_scores] #Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
    return "\n\n".join(summary)

#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
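A hedged usage sketch for function_2 above; the input text is invented, and the CountVectorizer, TfidfTransformer and networkx (nx) imports are assumed to be present at module level. Because the function keeps roughly the top 10% of paragraphs, very short inputs round down to an empty summary:

paragraphs = ['Paragraph %d discussing topic %d in some detail.' % (i, i % 3)
              for i in range(20)]
summary = function_2('\n\n'.join(paragraphs))   # the two highest-scoring of the twenty paragraphs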
textrank.py (project: YelpDataChallenge, author: fujunswufe)
def extractSentences(document):
    # sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # sentenceTokens = sent_detector.tokenize(text.strip())
    sentenceTokens = document.sentences()
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important sentences in descending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:100]
    summary = ' '.join(summaryWords)

    return summary
textrank_orignal.py (project: YelpDataChallenge, author: fujunswufe)
def extractSentences(text):  # this should be a bunch of sentences, not just one sentence 
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')  # implemented weight graph here 

    #most important sentences in descending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:100]
    summary = ' '.join(summaryWords)

    return summary
textrank_fujun.py (project: YelpDataChallenge, author: fujunswufe)
def cosine_similarity_self(A):
    similarity = np.dot(A, A.T)
    square_mag = np.diag(similarity)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = similarity * inv_mag
    cosine = cosine.T * inv_mag
    return cosine
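A small sanity-check sketch for cosine_similarity_self above, comparing it against an explicitly row-normalized computation; the matrix values are arbitrary, and rows must be non-zero for the two results to agree:

import numpy as np

A = np.array([[1.0, 0.0, 1.0],
              [0.0, 2.0, 0.0],
              [1.0, 1.0, 0.0]])
row_norms = np.sqrt((A ** 2).sum(axis=1, keepdims=True))
expected = np.dot(A / row_norms, (A / row_norms).T)
assert np.allclose(cosine_similarity_self(A), expected)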

# document should be a list of sentences
# method = "word2vec", "lda", "tfidf"
# def extraction(document, method="rawText"):
#
#     # graph = build_graph(document, method)  # document is a list of sentences
#
#     calculated_page_rank = networkx.pagerank(graph, weight="weight")
#
#     # most important sentences in descending order of importance
#     sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=False)
#
#     return sentences[0:4]
searchers.py (project: KDDCUP2016, author: hugochan)
def __init__(self):
        self.index = Index(config.INDEX_PATH)

        # Checks if the full graph for this dataset was already ranked.
        # If not, run page rank and store the results
        pr_file_path = "%s/page_rank/%s.p" % (config.DATA, config.DATASET)
        if not os.path.exists(pr_file_path):
            g = nx.DiGraph()
            g.add_edges_from(model.get_all_edges())

            print "Running pageRank with %d nodes." % g.number_of_nodes()
            self.pr = nx.pagerank(g)

            cPickle.dump(self.pr, open(pr_file_path, "w"))

        # Else, just loads it
        else:
            self.pr = cPickle.load(open(pr_file_path, 'r'))
document_summarization.py (project: text-analytics-with-python, author: dipanjanS)
def textrank_text_summarizer(documents, num_sentences=2,
                             feature_type='frequency'):

    # norm_sentences and build_feature_matrix are assumed to be defined
    # elsewhere in this module (the documents argument is not used here)
    vec, dt_matrix = build_feature_matrix(norm_sentences,
                                          feature_type='tfidf')
    similarity_matrix = (dt_matrix * dt_matrix.T)

    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)   

    ranked_sentences = sorted(((score, index) 
                                for index, score 
                                in scores.items()), 
                              reverse=True)

    top_sentence_indices = [ranked_sentences[index][1] 
                            for index in range(num_sentences)]
    top_sentence_indices.sort()

    # sentences (the original sentence list) is likewise assumed to be defined in the enclosing module
    for index in top_sentence_indices:
        print sentences[index]
textrank.py (project: bookmark_analysis, author: tarwn)
def extractSentences(text):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')

    # most important sentences in descending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get,
                       reverse=True)

    # return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:100]
    summary = ' '.join(summaryWords)

    return summary
feedback_graph.py (project: acl2017-interactive_summarizer, author: UKPLab)
def add_sentences(self, sentences):
        """
        @type sentences: list[Sentence]
        :param sentences:
        :return:
        """
        counter = self.counter
        G = self.G
        for sentence in sentences:
            G.add_nodes_from(sentence.concepts)
            counter.update(ngrams(sentence.concepts, self.N))

        for (keys, value) in counter.items():
            for i in range(0, len(keys) - 1):
                for j in range(1, len(keys)):
                    G.add_edge(keys[i], keys[j], weight=value)
                    # counter.update((keys[i], keys[j]))

        # for (key, value) in counter.items():
        #     G.add_edge(key[0], key[1], attr={"weight": value})

        print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))

        self.pr = nx.pagerank(G)
feedback_graph.py (project: acl2017-interactive_summarizer, author: UKPLab)
def incorporate_feedback(self, flightrecorder):
        """

        :param flightrecorder:
        :return:
         @type flightrecorder: FlightRecorder
        """
        G = self.G
        print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))

        # use the pagerank personalization feature to incorporate flightrecorder feedback

        union = flightrecorder.union()

        for rejected in union.reject:
            if(G.has_node(rejected)):
                G.remove_node(rejected)

        print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))

        self.pr = nx.pagerank(G)
TR4Sentence.py (project: TextRankPlus, author: zuoxiaolei)
def sort_sentences(sentences, words, model, pagerank_config = {'alpha': 0.85,}):
    """Sort sentences by importance, from most to least important.

    Keyword arguments:
    sentences         --  list of sentences
    words             --  two-dimensional list; each sub-list holds the words of the corresponding sentence in sentences
    model             --  word-vector model used by get_similarity to score pairs of sentences
    pagerank_config   --  configuration dict passed to nx.pagerank
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = get_similarity( _source[x], _source[y], model)
            graph[x, y] = similarity
            graph[y, x] = similarity
    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
main.py (project: TextRankPlus, author: zuoxiaolei)
def sort_sentences(sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
    """???????????????

    Keyword arguments:
    sentences         --  ????????
    words             --  ?????????sentences???????????????
    sim_func          --  ????????????????????????
    pagerank_config   --  pagerank???
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = sim_func( _source[x], _source[y] )
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
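A hedged sketch of the inputs this sort_sentences expects; get_similarity, AttrDict, np and nx are assumed to come from the same module, and the sentences and word lists are invented:

sentences = ['The cat sat on the mat.',
             'A dog barked at the cat.',
             'It rained all day.']
words = [['cat', 'sat', 'mat'],
         ['dog', 'barked', 'cat'],
         ['rained', 'day']]
ranked = sort_sentences(sentences, words)
top = ranked[0]   # AttrDict with .index, .sentence and .weight; highest weight first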
text_rank.py (project: Naver-News-Summarizer, author: devFallingstar)
def __init__(self, text):
        self.sentences = get_sentences(text)
        self.graph = build_graph(self.sentences)
        self.pagerank = networkx.pagerank(self.graph, weight='weight')
        self.reordered = sorted(self.pagerank, key=self.pagerank.get, reverse=True)
        self.nouns = []
        for sentence in self.sentences:
            self.nouns += sentence.nouns
        self.bow = collections.Counter(self.nouns)
util.py (project: JustCopy, author: exe1023)
def sort_sentences(sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
    """???????????????

    Keyword arguments:
    sentences         --  ????????
    words             --  ?????????sentences???????????????
    sim_func          --  ????????????????????????
    pagerank_config   --  pagerank???
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)        
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = sim_func( _source[x], _source[y] )
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
Centrality.py (project: PhD, author: wutaoadeny)
def Page_Rank(G):   
    PageRank_Centrality = nx.pagerank(G, alpha=0.85)
    #print "PageRank_Centrality:", sorted(PageRank_Centrality.iteritems(), key=lambda d:d[1], reverse = True)
    return PageRank_Centrality
Centrality.py (project: PhD, author: wutaoadeny)
def Page_Rank(G):
    PageRank_Centrality = nx.pagerank(G, alpha=0.85)
    #print "PageRank_Centrality:", sorted(PageRank_Centrality.iteritems(), key=lambda d:d[1], reverse = True)
    return PageRank_Centrality
net_distribution.py (project: tweetopo, author: zthxxx)
def __init__(self, edges, measure='pagerank'):
        '''
        Class for analyzing a graph.
        :param edges: weighted edges; each edge must be given as a 3-tuple (u, v, weight)
        :param measure: which measure to use for analysis and filtering;
                        must be one of 'degree', 'pagerank' or 'clustering'
        '''
        self.measures = ['degree', 'pagerank', 'clustering']
        self.measure = measure
        self.ranks = {}
        self.G = nx.Graph()
        self.import_data(edges)
net_distribution.py (project: tweetopo, author: zthxxx)
def get_pageranks(self):
        pageranks = nx.pagerank(self.G)
        max_pagerank = max(pageranks.values())
        return pageranks, max_pagerank
clusterrank.py (project: senpai, author: AdiChat)
def extractSentences(self, text):
        '''
        Extracts sentences from the graph using pagerank
        Arguments:
            text: input textual data
        Returns:
            summary: a bunch of sentences
        Raises:
            None
        '''
textrank.py (project: senpai, author: AdiChat)
def extractSentences(self, text):
        '''
        Extracts sentences from the graph using pagerank
        Arguments:
            text: input textual data
        Returns:
            summary: a bunch of sentences
        Raises:
            None
        '''
util.py (project: AIZooService, author: zhanglbjames)
def sort_sentences(sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
    """???????????????

    Keyword arguments:
    sentences         --  ????????
    words             --  ?????????sentences???????????????
    sim_func          --  ????????????????????????
    pagerank_config   --  pagerank???
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)        
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = sim_func( _source[x], _source[y] )
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
searchers.py (project: KDDCUP2016, author: hugochan)
def search(self, query, exclude=[], force=False, limit=20):

        # Fetches all document that have at least one of the terms
        pubs = self.index.search(query,
                                 search_fields=["title", "abstract"],
                                 return_fields=["id"],
                                 ignore=exclude)

        # Unpack and convert to a set for fast lookup
        pubs = set([pub_id for (pub_id,) in pubs])

        # index_ids, _scores = self.index.search(query, ["title", "abstract"], limit=limit, mode="ALL")
        # docs = set(self.index.get_documents(index_ids, "id"))

        g = nx.DiGraph()
        for u, v in self.edges:
            if (u in pubs) and (v in pubs):
                g.add_edge(u, v)

            #       print "PageRank with %d nodes." % g.number_of_nodes()
        r = nx.pagerank(g, alpha=0.7)

        if len(r) == 0:
            return []

        ids, _pg = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))
        return ids[:limit]
searchers.py (project: KDDCUP2016, author: hugochan)
def search(self, query, exclude=[], limit=50, force=False):

        graph = build_graph(query,
                            self.params['K'],
                            self.params['H'],
                            self.params['min_topic_lift'],
                            self.params['min_ngram_lift'],
                            exclude, force, load=True)


        # Simple method to check if node is a document node.
        is_doc = lambda node: node["type"] == "paper"

        # Builds a new unweighted graph with only the documents as nodes
        docs_graph = nx.DiGraph()

        # Removes all non doc nodes
        for u, v in graph.edges():
            u = graph.node[u]
            v = graph.node[v]
            if is_doc(u) and is_doc(v):
                docs_graph.add_edge(u["entity_id"], v["entity_id"])

        r = nx.pagerank(docs_graph, alpha=0.7)
        if len(r) == 0:
            return []

        ids, _pg = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))
        return ids[:limit]
ldp_fea.py (project: academic, author: xinchrome)
def venueNet_feature():
    # output: compute the author centrality for each venue
    #         venue centrality dict

    CSpaper = pickle.load(open(cspath+"CSvenuePaper","rb")) # all papers in CS venues
    CSvenue_paper = pickle.load(open(cspath+"CSvenue_paper","rb")) #data type, dict, key, value: list
    Citations = pickle.load(open(cspath+"Citations","rb"))
    CSPV = pickle.load( open(cspath+"CSvenuePaper_Venue","rb")) #data type, dict, key, value: list
    nodeSet = set()
    edgeSet = set()
    for key,val in CSvenue_paper.iteritems():
        nodeSet.add(key)
        temp = defaultdict(int)
        for p in val:
            for citing in Citations[p]:
                if citing in CSpaper:
                    temp[(CSPV[citing],key)] +=1
        edges = [(key[0],key[1],val) for key,val in temp.iteritems()]
        edgeSet.update(edges)
    g = nx.DiGraph()
    g.add_nodes_from(nodeSet)
    g.add_weighted_edges_from(edgeSet)

    pr = defaultdict(int)
    for node in g.nodes():
        pr[node]=1

    #DG.add_weighted_edges_from([(1,2,0.5), (3,1,0.75)])
    #pr = nx.pagerank(g) 
    #PageRank is time-consuming; in a real run, replace the constant scores above with nx.pagerank(g)
    pickle.dump(pr,open(cspath+"venue_cen","wb"))
    print 'venueNet_feature finish'
nxgraph.py (project: anomalous-vertices-detection, author: Kagandi)
def pagerank(self):
        """Return the PageRank of the nodes in the graph.

        Returns
        -------
        pagerank : dictionary
            Dictionary of nodes with PageRank as value

        Examples
        --------
        >>> g.pagerank()
         """
        return nx.pagerank(self._graph, weight=self._weight_field)
feedback_graph.py (project: acl2017-interactive_summarizer, author: UKPLab)
def __init__(self, stemmer, language, N=2, G=nx.DiGraph()):
        self.G = G
        self.stemmer = stemmer
        self.language = language
        self.N = N

        self.counter = Counter()
        self.pr = nx.pagerank(G)
TR4KW.py (project: TextRankPlus, author: zuoxiaolei)
def sort_words(vertex_source, edge_source, model, window = 2, pagerank_config = {'alpha': 0.85,}):
    """??????????????

    Keyword arguments:
    vertex_source   --  ???????????????????????????????pagerank????
    edge_source     --  ?????????????????????????????????pagerank???
    window          --  ????????window????????????
    pagerank_config --  pagerank???
    """

    #build the word <-> index mappings
    sorted_words   = []
    word_index     = {}
    index_word     = {}
    _vertex_source = vertex_source
    _edge_source   = edge_source
    words_number   = 0
    for word_list in _vertex_source:
        for word in word_list:
            if not word in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1

    graph = np.zeros((words_number, words_number))

    #build the graph edges
    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                try:
                    similarity = model.similarity(w1,w2)
                    if similarity<0:
                        similarity = 0
                    #print similarity
                except:
                    similarity = 0
                graph[index1][index2] = similarity
                graph[index2][index1] = similarity
#                graph[index1][index2] = 1.0
#                graph[index2][index1] = 1.0

    nx_graph = nx.from_numpy_matrix(graph)

    scores = nx.pagerank(nx_graph, max_iter=100,**pagerank_config)          # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)
        sorted_words.append(item)
    return sorted_words
util.py (project: JustCopy, author: exe1023)
def sort_words(vertex_source, edge_source, window = 2, pagerank_config = {'alpha': 0.85,}):
    """??????????????

    Keyword arguments:
    vertex_source   --  ???????????????????????????????pagerank????
    edge_source     --  ?????????????????????????????????pagerank???
    window          --  ????????window????????????
    pagerank_config --  pagerank???
    """
    sorted_words   = []
    word_index     = {}
    index_word     = {}
    _vertex_source = vertex_source
    _edge_source   = edge_source
    words_number   = 0
    for word_list in _vertex_source:
        for word in word_list:
            if not word in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1

    graph = np.zeros((words_number, words_number))

    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] = 1.0
                graph[index2][index1] = 1.0

    debug('graph:\n', graph)

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)          # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)
        sorted_words.append(item)

    return sorted_words
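Similarly, a hedged sketch of calling this sort_words variant on pre-tokenized sentences; combine, AttrDict, debug, np and nx are assumed to come from the same module, and the token lists are invented:

tokenized = [['textrank', 'ranks', 'words', 'in', 'a', 'graph'],
             ['pagerank', 'ranks', 'pages', 'in', 'a', 'graph']]
keywords = sort_words(tokenized, tokenized, window=2)
top_words = [item.word for item in keywords[:5]]   # highest-weight words first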
textrank.py (project: YelpDataChallenge, author: fujunswufe)
def extractKeyphrases(text):
    #tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)

    #assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]

    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    #this will be used to determine adjacent words in order to construct keyphrases with two words

    graph = buildGraph(word_set_list)

    #pageRank - initial value of 1.0, error tolerance of 0.0001
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = int(len(word_set_list) / 3)
    keyphrases = keyphrases[0:aThird+1]

    #take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
    #together
    modifiedKeyphrases = set([])
    dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith: 
                modifiedKeyphrases.add(firstWord)

            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point    
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)

        i = i + 1
        j = j + 1

    return modifiedKeyphrases
textrank_orignal.py (project: YelpDataChallenge, author: fujunswufe)
def extractKeyphrases(text):
    #tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)

    #assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]

    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    #this will be used to determine adjacent words in order to construct keyphrases with two words

    graph = buildGraph(word_set_list)

    #pageRank - initial value of 1.0, error tolerance of 0.0001
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = int(len(word_set_list) / 3)
    keyphrases = keyphrases[0:aThird+1]

    #take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
    #together
    modifiedKeyphrases = set([])
    dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith: 
                modifiedKeyphrases.add(firstWord)

            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point    
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)

        i = i + 1
        j = j + 1

    return modifiedKeyphrases

