Usage examples of Python's nlargest()
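The snippets on this page all revolve around heapq.nlargest(n, iterable, key=None) from the standard library, which returns the n largest elements of an iterable as a list in descending order. A minimal standalone sketch of the two usual calling patterns (the numbers are made up for illustration):

import heapq

prices = {'AAPL': 182.3, 'MSFT': 402.1, 'IBM': 168.9, 'GOOG': 141.2}

# the two largest values, in descending order
print(heapq.nlargest(2, prices.values()))         # [402.1, 182.3]

# the two keys ranked by their value, via the key= callable
print(heapq.nlargest(2, prices, key=prices.get))  # ['MSFT', 'AAPL']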

NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def extractFeatures(self, article, n, customStopWords=None):
        # the article is passed in as a (text, title) tuple
        text = article[0]
        # extract the text
        title = article[1]
        # extract the title
        sentences = sent_tokenize(text)
        # split text into sentences
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        # split sentences into words
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        # calculate word freq using member func created above
        if n < 0:
            # a negative n means no feature (word) selection:
            # just return all features
            return nlargest(len(self._freq),
                            self._freq, key=self._freq.get)
        else:
            # if the caller asked for a subset, return only the 'n'
            # largest features, i.e. the most important words
            # (important == frequent, after stopword removal)
            return nlargest(n, self._freq, key=self._freq.get)
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def summarize(self, article, n):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n, ranking, key=ranking.get)
        return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
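A minimal driver sketch for the two methods above. The enclosing class and its _compute_frequencies helper are not shown in this snippet, so the class name FrequencySummarizer below is a guess, and NLTK plus its 'punkt' tokenizer data are assumed to be installed:

# hypothetical test driver; FrequencySummarizer is an assumed class name
article = (
    "Heapq keeps the n best items without a full sort. "
    "A full sort costs more when n is small. "
    "Summarizers rank sentences by word frequency.",
    "Ranking with heapq",
)
fs = FrequencySummarizer()
print(fs.extractFeatures(article, 3))  # the 3 most frequent non-stopword words
print(fs.summarize(article, 1))        # the single highest-scoring sentence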
ranking.py (project: ltls, author: kjasinska)
def create_ranking2(edge_weight, k, adj, num):
    sink = len(adj)
    heaps = [[] for i in xrange(sink + 1)]
    heaps[0] = [(0, [])]

    for current in xrange(sink):
        for child in adj[current]:
            for length, path in heaps[current]:
                new_path = list(path)
                new_path.append(current)
                # prune to the k best paths; heapq.heappushpop would be
                # cheaper than push-then-nlargest
                ew = edge_weight[0, num[(current, child)]]
                heapq.heappush(heaps[child], (length + ew, new_path))
                heaps[child] = heapq.nlargest(k, heaps[child])
                # TODO what about equal-length paths?
    # result: heaps[sink]
    return [(length, tuple(zip(nodes, nodes[1:] + [sink]))) for length, nodes in heaps[sink]]
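A tiny worked example on a diamond-shaped DAG. The num edge-index mapping and the edge_weight row layout below are assumptions made to satisfy the function's indexing, and the snippet itself targets Python 2 (it uses xrange):

import numpy as np

# diamond DAG: 0 -> 1 -> 3 and 0 -> 2 -> 3; sink == len(adj) == 3
adj = [[1, 2], [3], [3]]
num = {(0, 1): 0, (0, 2): 1, (1, 3): 2, (2, 3): 3}  # edge -> weight column
edge_weight = np.array([[1.0, 2.0, 3.0, 4.0]])

# both paths, heaviest first:
# [(6.0, ((0, 2), (2, 3))), (4.0, ((0, 1), (1, 3)))]
print(create_ranking2(edge_weight, k=2, adj=adj, num=num))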
rdd.py (project: MIT-Thesis, author: alec-heif)
def top(self, num, key=None):
        """
        Get the top N elements from an RDD.

        .. note:: This method should only be used if the resulting array is expected
            to be small, as all the data is loaded into the driver's memory.

        .. note:: It returns the list sorted in descending order.

        >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
        [12]
        >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
        [6, 5]
        >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
        [4, 3, 2]
        """
        def topIterator(iterator):
            yield heapq.nlargest(num, iterator, key=key)

        def merge(a, b):
            return heapq.nlargest(num, a + b, key=key)

        return self.mapPartitions(topIterator).reduce(merge)
desert_mirage_lib.py (project: desert-mirage, author: valentour)
def nth_largest(n, iter_list):
    """``O(nlogn)`` time if ``n`` is median. 
    Better if largest or smallest.

    Notes
    -----
    Adopted and/or modified from reference(s):
    FogleBird on stackoverflow.com/questions/1034846/
    """
    length = len(iter_list)
    if n >= length:
        return heapq.nlargest(length, iter_list)[-1]
    return heapq.nlargest(n, iter_list)[-1]
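For example:

data = [5, 1, 8, 3, 9]
print(nth_largest(2, data))  # 8 (the second largest)
print(nth_largest(9, data))  # 1 (n beyond the list clamps to the minimum)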

utilities.py (project: pandachaika, author: pandabuilder)
def get_scored_matches(word: str, possibilities: List[str], n: int=3, cutoff: float=0.6) -> List[Tuple[float, str]]:
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not (0.0 <= cutoff <= 1.0):
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s: SequenceMatcher = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff:
            result.append((s.ratio(), x))

    # Keep only the n best scorers, highest score first
    result = heapq.nlargest(n, result)
    # Return the best n matches along with their scores
    return result
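This mirrors difflib.get_close_matches but keeps the scores. For example, with the classic difflib inputs:

matches = get_scored_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
print(matches)  # [(0.8, 'apple'), (0.75, 'ape')]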
preprocessing.py (project: dl4nlp, author: yohokuno)
def build_dictionary(sentences, size):
    """
    Create dictionary containing most frequent words in the sentences
    :param sentences: sequence of sentence that contains words
        Caution: the sequence might be exhausted after calling this function!
    :param size: size of dictionary you want
    :return: dictionary that maps word to index (starting from 1)
    """
    dictionary = defaultdict(int)
    for sentence in sentences:
        for token in sentence:
            dictionary[token] += 1
    frequent_pairs = nlargest(size, dictionary.items(), key=itemgetter(1))
    words, frequencies = zip(*frequent_pairs)
    result = {word: index + 1 for index, word in enumerate(words)}
    return result
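For example (indices start at 1, presumably reserving 0 for padding or unknown tokens):

sentences = [['a', 'a', 'a'], ['b', 'b', 'c']]
print(build_dictionary(sentences, 2))  # {'a': 1, 'b': 2}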
keyphrase.py (project: minke, author: DistrictDataLabs)
def keyphrases(self, N=20, fileids=None, categories=None):
        """
        Returns the top N keyphrases grouped by document id.
        TODO: this currently ignores fileids/categories.
        """
        if not self.tfidfs or not self.lexicon or not self.fileids:
            raise ValueError("Must call the score method first!")

        for idx, doc in enumerate(self.tfidfs):
            fileid = self.fileids[idx]

            # Get the top N terms by TF-IDF score
            scores = [
                (self.lexicon[wid], score)
                for wid, score in heapq.nlargest(N, doc, key=itemgetter(1))
            ]

            yield fileid, scores
indexation_generator.py (project: pyxem, author: pyxem)
def correlate_library(image, library, n_largest):
    """Correlates all simulated diffraction templates in a DiffractionLibrary
    with a particular experimental diffraction pattern (image) stored as a
    numpy array.
    """
    i = 0
    out_arr = np.zeros((n_largest * len(library), 5))
    for key in library.keys():
        if not n_largest:
            n_largest = len(library[key])
        correlations = dict()
        for orientation, diffraction_pattern in library[key].items():
            correlation = correlate(image, diffraction_pattern)
            correlations[orientation] = correlation
        res = nlargest(n_largest, correlations.items(), key=itemgetter(1))
        for j in np.arange(n_largest):
            out_arr[j + i*n_largest][0] = i
            out_arr[j + i*n_largest][1] = res[j][0][0]
            out_arr[j + i*n_largest][2] = res[j][0][1]
            out_arr[j + i*n_largest][3] = res[j][0][2]
            out_arr[j + i*n_largest][4] = res[j][1]
        i = i + 1
    return out_arr
engine.py (project: respeaker_virtualenv, author: respeaker)
def determine_intent(self, utterance, num_results=1):
        """
        Given an utterance, provide a valid intent.

        :param utterance: an ascii or unicode string representing natural language speech

        :param num_results: a maximum number of results to be returned.

        :return: A generator that yields dictionaries.
        """
        intents = []
        for domain in self.domains:
            gen = self.domains[domain].determine_intent(utterance=utterance,
                                                        num_results=1)
            for intent in gen:
                intents.append(intent)

        # keep only the num_results most confident intents
        intents = heapq.nlargest(
            num_results, intents, key=lambda domain: domain['confidence'])
        for intent in intents:
            yield intent
centrality.py (project: atap, author: foxbook)
def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, attr, scores)

    # Filter scores (do not include in book)
    ntypes = nx.get_node_attributes(G, 'type')
    phrases = [
        item for item in scores.items()
        if ntypes.get(item[0], None) == "keyphrase"
    ]

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, phrases, key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx+1, *item))

    return G
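A usage sketch; note that the snippet calls nx.set_node_attributes(G, attr, scores) in the networkx 1.x argument order (name before values), so this example assumes networkx 1.x (under 2.x or later, the call inside the function would need to become nx.set_node_attributes(G, scores, attr)):

import networkx as nx

G = nx.Graph()
G.add_node('machine learning', type='keyphrase')
G.add_node('python', type='keyphrase')
G.add_node('doc1.txt', type='document')
G.add_edges_from([('doc1.txt', 'machine learning'), ('doc1.txt', 'python')])

# print the top keyphrases by degree centrality
nbest_centrality(G, nx.degree_centrality, n=2)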
moderator.py (project: Chiaki-Nanami, author: Ikusaba-san)
def newusers(self, ctx, *, count=5):
        """Tells you the newest members of the server.

        This is useful to check if any suspicious members have joined.

        The minimum is 3 members. If no number is given I'll show the last 5 members.
        """
        human_delta = time.human_timedelta
        count = max(count, 3)
        members = heapq.nlargest(count, ctx.guild.members, key=attrgetter('joined_at'))

        names = map(str, members)
        values = (
            (f'**Joined:** {human_delta(member.joined_at)}\n'
             f'**Created:** {human_delta(member.created_at)}\n{"-" * 40}')
            for member in members
        )
        entries = zip(names, values)

        title = f'The {formats.pluralize(**{"newest members": len(members)})}'
        pages = EmbedFieldPages(ctx, entries, lines_per_page=5, colour=0x00FF00, title=title)
        await pages.interact()
gc.py (project: lsdc, author: febert)
def largest_export_versions(n):
  """Creates a filter that keeps the largest n export versions.

  Args:
    n: number of versions to keep.

  Returns:
    A filter function that keeps the n largest paths.
  """
  def keep(paths):
    heap = []
    for idx, path in enumerate(paths):
      if path.export_version is not None:
        heapq.heappush(heap, (path.export_version, idx))
    keepers = [paths[i] for _, i in heapq.nlargest(n, heap)]
    return sorted(keepers)

  return keep
knns.py (project: Surprise, author: NicolasHug)
def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        x, y = self.switch(u, i)

        neighbors = [(self.sim[x, x2], r) for (x2, r) in self.yr[y]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

        # compute weighted average
        sum_sim = sum_ratings = actual_k = 0
        for (sim, r) in k_neighbors:
            if sim > 0:
                sum_sim += sim
                sum_ratings += sim * r
                actual_k += 1

        if actual_k < self.min_k:
            raise PredictionImpossible('Not enough neighbors.')

        est = sum_ratings / sum_sim

        details = {'actual_k': actual_k}
        return est, details
最小的k个数.py (project: AlgorithmsByPython, author: Jack-Lee-Hiter)
def GetLeastNumbers(self, tinput, k):
        import heapq
        if tinput is None or k <= 0 or len(tinput) < k:
            return []
        output = []
        for number in tinput:
            if len(output) < k:
                output.append(number)
            else:
                # alternative: keep output ascending with nsmallest
                # output = heapq.nsmallest(k, output)
                # if number >= output[-1]:
                #     continue
                # else:
                #     output[-1] = number
                # keep output descending with nlargest; output[0] is the max
                output = heapq.nlargest(k, output)
                if number >= output[0]:
                    continue
                else:
                    output[0] = number
        return output[::-1]     # ascending order; plain "return output" also works
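A quick check, assuming the method lives on a Solution-style wrapper class (the class name is hypothetical):

# Solution is an assumed wrapper class name
print(Solution().GetLeastNumbers([4, 5, 1, 6, 2, 7, 3, 8], 4))  # [1, 2, 3, 4]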
IDtesseract.py (project: pytesseractID, author: iChenwin)
def findIDcnt(countours):
    # collect the bounding-box width of every contour
    widths = []
    for idx, cnt in enumerate(countours):
        x, y, width, height = cv2.boundingRect(cnt)
        widths.insert(idx, width)

    # take the three largest widths (the ID-number region is widest)
    IDList = heapq.nlargest(3, widths)
    # map those widths back to their contours
    IDcnts = []
    for idx, item in enumerate(IDList):
        index = widths.index(item)
        IDcnts.insert(idx, countours[index])
    # print IDcnts

    return IDcnts
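A hedged driver sketch. cv2.findContours returns two values in OpenCV 2.x and 4.x but three in 3.x, so the [-2] index below selects the contour list under either API; the thresholding step is only illustrative:

import cv2

img = cv2.imread('id_card.jpg', cv2.IMREAD_GRAYSCALE)
_, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
contours = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                            cv2.CHAIN_APPROX_SIMPLE)[-2]
id_contours = findIDcnt(contours)  # the three widest contours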

zhishi_eval.py (project: knowledge-graph-keras, author: eshijia)
def make_submit(self, model, submit_file):
        data = self.eval_sets().values()[0]
        target_lines = list()
        answers = np.asarray([[idx] for idx in self.entity.keys()])
        for i, d in enumerate(data):
            num_candidate = len(self.entity)
            index_entities = xrange(num_candidate)

            terms = d.split('\t')
            subjects = np.asarray([[terms[0]]] * num_candidate)
            relations = np.asarray([[terms[1]]] * num_candidate)

            sims = model.predict([subjects, relations, answers], batch_size=num_candidate).flatten()
            print(i)
            r = rankdata(sims, method='ordinal')
            index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
            one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
            target_lines.append(one_line + '\n')
        submit_file.writelines(target_lines)
zhishi_eval.py (project: knowledge-graph-keras, author: eshijia)
def make_submit_rt(self, model, submit_file):
        data = self.eval_sets_rt().values()[0]
        target_lines = list()
        answers = np.asarray([[idx] for idx in self.entity.keys()])
        for i, d in enumerate(data):
            num_candidate = len(self.entity)
            index_entities = xrange(num_candidate)

            terms = d.split('\t')
            relations = np.asarray([[terms[0]]] * num_candidate)
            objects = np.asarray([[terms[1]]] * num_candidate)

            sims = model.predict_rt([answers, relations, objects], batch_size=num_candidate).flatten()
            print(i)
            r = rankdata(sims, method='ordinal')
            index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
            one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
            target_lines.append(one_line + '\n')
        submit_file.writelines(target_lines)
process.py (project: pipenv, author: pypa)
def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5):
    """Get a list of the best matches to a collection of choices.

    Convenience function for getting the choices with best scores.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with
            extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.
        limit: Optional maximum for the number of elements returned. Defaults
            to 5.

    Returns: A list of (match, score) tuples.
    """

    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
        sorted(best_list, key=lambda i: i[1], reverse=True)
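Typical usage, with the fuzzywuzzy README's classic example (the scores shown are roughly what the default WRatio scorer produces):

choices = ['Atlanta Falcons', 'New York Jets', 'New York Giants', 'Dallas Cowboys']
print(extractBests('new york jets', choices, limit=2))
# [('New York Jets', 100), ('New York Giants', 79)]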
rdd.py (project: pyspark, author: v-v-vishnevskiy)
def top(self, num, key=None):
        """
        Get the top N elements from an RDD.

        Note: It returns the list sorted in descending order.

        >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
        [12]
        >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
        [6, 5]
        >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
        [4, 3, 2]
        """
        def topIterator(iterator):
            yield heapq.nlargest(num, iterator, key=key)

        def merge(a, b):
            return heapq.nlargest(num, a + b, key=key)

        return self.mapPartitions(topIterator).reduce(merge)
person.py (project: talktown, author: james-owen-ryan)
def _choose_vacant_home_or_vacant_lot(self):
        """Choose a vacant home to move into or a vacant lot to build on.

        Currently, a person scores all the vacant homes/lots in town and then selects
        one of the top three. TODO: Probabilistically select from all homes/lots using the
        scores to derive likelihoods of selecting each.
        """
        home_and_lot_scores = self._rate_all_vacant_homes_and_vacant_lots()
        if len(home_and_lot_scores) >= 3:
            # Pick from top three
            top_three_choices = heapq.nlargest(3, home_and_lot_scores, key=home_and_lot_scores.get)
            if random.random() < 0.6:
                choice = top_three_choices[0]
            elif random.random() < 0.9:
                choice = top_three_choices[1]
            else:
                choice = top_three_choices[2]
        elif home_and_lot_scores:
            choice = list(home_and_lot_scores)[0]
        else:
            choice = None
        return choice
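The probabilistic selection mentioned in the TODO could look something like this sketch (not part of the original project; random.choices needs Python 3.6+):

import random

def choose_weighted(scores):
    """Pick one key with probability proportional to its score."""
    if not scores:
        return None
    options = list(scores)
    weights = [max(scores[o], 0.0) for o in options]  # clamp negative scores
    if not any(weights):
        return random.choice(options)                 # fall back to uniform
    return random.choices(options, weights=weights, k=1)[0]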
business.py (project: talktown, author: james-owen-ryan)
def _init_acquire_currently_occupied_lot(self):
        """If there are no vacant lots in town, acquire a lot and demolish the home currently on it."""
        lot_scores = self._rate_all_occupied_lots()
        if len(lot_scores) >= 3:
            # Pick from top three
            top_three_choices = heapq.nlargest(3, lot_scores, key=lot_scores.get)
            if random.random() < 0.6:
                choice = top_three_choices[0]
            elif random.random() < 0.9:
                choice = top_three_choices[1]
            else:
                choice = top_three_choices[2]
        elif lot_scores:
            choice = max(lot_scores)
        else:
            raise Exception("A company attempted to secure an *occupied* lot in town but somehow could not.")
        return choice
process.py (project: watcher, author: nosmokingbandit)
def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
    """Get a list of the best matches to a collection of choices.

    Convenience function for getting the choices with best scores.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with
            extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.
        limit: Optional maximum for the number of elements returned. Defaults
            to 5.

    Returns: A list of (match, score) tuples.
    """

    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
        sorted(best_list, key=lambda i: i[1], reverse=True)
business.py (project: dissertation, author: james-owen-ryan)
def _init_acquire_currently_occupied_lot(self):
        """If there are no vacant lots in town, acquire a lot and demolish the home currently on it."""
        lot_scores = self._rate_all_occupied_lots()
        if len(lot_scores) >= 3:
            # Pick from top three
            top_three_choices = heapq.nlargest(3, lot_scores, key=lot_scores.get)
            if random.random() < 0.6:
                choice = top_three_choices[0]
            elif random.random() < 0.9:
                choice = top_three_choices[1]
            else:
                choice = top_three_choices[2]
        elif lot_scores:
            choice = max(lot_scores)
        else:
            raise Exception("A company attempted to secure an *occupied* lot in town but somehow could not.")
        return choice
__init__.py (project: python-, author: secondtonone1)
def most_common(self, n=None):
        '''List the n most common elements and their counts from the most
        common to the least.  If n is None, then list all element counts.

        >>> Counter('abcdeabcdabcaba').most_common(3)
        [('a', 5), ('b', 4), ('c', 3)]

        '''
        # Emulate Bag.sortedByCount from Smalltalk
        if n is None:
            return sorted(self.items(), key=_itemgetter(1), reverse=True)
        return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
collections.py (project: kinect-2-libras, author: inessadl)
def most_common(self, n=None):
        '''List the n most common elements and their counts from the most
        common to the least.  If n is None, then list all element counts.

        >>> Counter('abcdeabcdabcaba').most_common(3)
        [('a', 5), ('b', 4), ('c', 3)]

        '''
        # Emulate Bag.sortedByCount from Smalltalk
        if n is None:
            return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
        return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
model.py (project: health-mosconi, author: GNUHealth-Mosconi)
def global_search(cls, text, limit, menu='ir.ui.menu'):
        """
        Search the models for text, including the menu.
        Returns a list of (ratio, model, model_name, id, name, icon) tuples.
        The size of the list is limited to limit.
        """
        pool = Pool()
        ModelAccess = pool.get('ir.model.access')

        if not limit > 0:
            raise ValueError('limit must be > 0: %r' % (limit,))

        models = cls.search(['OR',
                ('global_search_p', '=', True),
                ('model', '=', menu),
                ])
        access = ModelAccess.get_access([m.model for m in models])
        s = StringMatcher()
        if isinstance(text, str):
            text = text.decode('utf-8')
        s.set_seq2(text)

        def generate():
            for model in models:
                if not access[model.model]['read']:
                    continue
                Model = pool.get(model.model)
                if not hasattr(Model, 'search_global'):
                    continue
                for record, name, icon in Model.search_global(text):
                    if isinstance(name, str):
                        name = name.decode('utf-8')
                    s.set_seq1(name)
                    yield (s.ratio(), model.model, model.rec_name,
                        record.id, name, icon)
        return heapq.nlargest(int(limit), generate())
embedding.py (project: histwords, author: williamleif)
def closest(self, w, n=10):
        """
        Assumes the vectors have been normalized.
        """
        scores = self.m.dot(self.represent(w))
        return heapq.nlargest(n, zip(scores, self.iw))
embedding.py (project: histwords, author: williamleif)
def closest_contexts(self, w, n=10):
        scores = self.ec.m.dot(self.ew.represent(w))
        pairs = zip(scores, self.ec.iw)[1:]
        return heapq.nlargest(n, pairs)

