Example source code for Python's Counter() class

value_counter.py (project: Eskapade, author: KaveIO)
def fill_histogram(self, idf, columns):
        """Fill input histogram with column(s) of input dataframe

        :param idf: input data frame used for filling histogram
        :param list columns: histogram column(s)
        """

        name = ':'.join(columns)
        if name not in self._counts:
            # create an (empty) value counts dict
            self._counts[name] = Counter()
        # value_counts() is faster than groupby().size(), but only works for a
        # series (1d); for multiple columns, fall back to groupby()
        g = idf.groupby(by=columns).size() if len(columns) > 1 else idf[columns[0]].value_counts()
        counts = Counter(g.to_dict())
        # remove specific keys from histogram before merging, if so requested
        counts = self.drop_requested_keys(name, counts)
        self._counts[name].update(counts)
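The merge step is the idiom to note here: each new batch of pandas value counts is folded into the running Counter with update(), which adds to existing keys rather than overwriting them. A minimal standalone sketch of the same pattern (the toy dataframes are illustrative, not from Eskapade):

import pandas as pd
from collections import Counter

counts = Counter()
for chunk in (pd.DataFrame({'x': ['a', 'b', 'a']}),
              pd.DataFrame({'x': ['b', 'b', 'c']})):
    # value_counts() gives per-chunk frequencies; update() adds them in place
    counts.update(Counter(chunk['x'].value_counts().to_dict()))
print(counts)  # Counter({'b': 3, 'a': 2, 'c': 1})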
test_histogram.py (project: Eskapade, author: KaveIO)
def test_bin_edges(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = {'bin_width': 1, 'bin_offset': 0}

        h = Histogram(vc, variable='x', bin_specs=bin_specs)

        # uniform
        bin_edges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        self.assertListEqual(h.get_uniform_bin_edges(), bin_edges)

        # truncated uniform bin edges
        truncated_bin_edges = [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
        self.assertListEqual(h.truncated_bin_edges([5.5, 12.5]), truncated_bin_edges)

        h_bin_edges = h.bin_edges()
        self.assertIsInstance(h_bin_edges, np.ndarray)
        self.assertListEqual(h_bin_edges.tolist(), bin_edges)
test_histogram.py (project: Eskapade, author: KaveIO)
def test_bin_centers(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = {'bin_width': 1, 'bin_offset': 0}

        h = Histogram(vc, variable='x', bin_specs=bin_specs)

        bin_centers = [0.5, 2.5, 4.5, 6.5, 8.5, 10.5, 12.5, 14.5, 16.5, 18.5]
        h_bin_centers = h.bin_centers()
        self.assertIsInstance(h_bin_centers, np.ndarray)
        self.assertListEqual(h_bin_centers.tolist(), bin_centers)
test_histogram.py (project: Eskapade, author: KaveIO)
def test_bin_entries(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = {'bin_width': 1, 'bin_offset': 0}

        h = Histogram(vc, variable='x', bin_specs=bin_specs)

        bin_entries = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        h_bin_entries = h.bin_entries()
        self.assertIsInstance(h_bin_entries, np.ndarray)
        self.assertListEqual(h_bin_entries.tolist(), bin_entries)
test_histogram.py (project: Eskapade, author: KaveIO)
def test_bin_labels(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = {'bin_width': 1, 'bin_offset': 0}

        h = Histogram(vc, variable='x', bin_specs=bin_specs)

        bin_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
        h_bin_labels = h.bin_labels()
        self.assertIsInstance(h_bin_labels, np.ndarray)
        self.assertListEqual(h_bin_labels.tolist(), bin_labels)
preprocess_data.py (project: identifiera-sarkasm, author: risnejunior)
def build_vocabulary( words, max_size ):
    vocab_instances = 0
    unique_counts = Counter(words)
    d = dict(unique_counts.most_common(cfg.vocabulary_size - 2))
    vocabulary = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=True))

    pb = Progress_bar(len(d) - 1)
    # ids start at 2 to leave room for the padding and unknown tokens
    for i, (key, value) in enumerate(vocabulary.items(), start=2):
        vocab_instances += value
        vocabulary[key] = i
        pb.tick()

    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1
    # reverse the vocabulary (for reverse lookup)
    rev_vocabulary = {v: k for k, v in vocabulary.items()}  
    vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)

    return vocab
counts2vocab.py (project: histwords, author: williamleif)
def main():
    args = docopt("""
    Usage:
        counts2pmi.py <counts>
    """)

    counts_path = args['<counts>']

    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            count = int(count)
            words[word] += count
            contexts[context] += count

    # tuple-unpacking lambdas are Python 2 only; index into the pair instead
    words = sorted(words.items(), key=lambda item: item[1], reverse=True)
    contexts = sorted(contexts.items(), key=lambda item: item[1], reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
dsm_module.py (project: MetaphoricChange, author: Garrafao)
def build_frequency_file(dtatcfdir, freq_file, MIN_FREQ, join_sign):
    """
    Builds a file with all lemma + POS pairs above a certain frequency threshold.
    :param dtatcfdir: path to directory with dta tcf files
    :param freq_file: path to frequency file
    :param MIN_FREQ: frequency threshold
    :param join_sign: sign joining lemma and first char of POS
    """

    # build frequency file from lemmas
    outputpath = freq_file
    print('Building frequency file to ' + outputpath + '...')
    lemma_count = Counter(build_lemma_list(dtatcfdir, join_sign))
    frequent_lemmas = [lemma for lemma in lemma_count if lemma_count[lemma] >= MIN_FREQ]
    with open(outputpath, 'w', encoding='utf-8') as f_out:
        for lemma in frequent_lemmas:
            print(lemma, file=f_out)
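Filtering a Counter by a minimum count, as frequent_lemmas does above, is a recurring idiom. In isolation, with toy lemma+POS strings for illustration:

from collections import Counter

lemma_count = Counter(['haus#N', 'gehen#V', 'haus#N', 'baum#N'])
frequent = [lemma for lemma, n in lemma_count.items() if n >= 2]
print(frequent)  # ['haus#N']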
geoparse.py (project: mordecai, author: openeventdata)
def _feature_most_common(self, results):
        """
        Find the most common country name in ES/Geonames results

        Parameters
        ----------
        results: dict
            output of `query_geonames`

        Returns
        -------
        most_common: str
            ISO code of most common country, or empty string if none
        """
        try:
            country_count = Counter([i['country_code3'] for i in results['hits']['hits']])
            most_common = country_count.most_common()[0][0]
            return most_common
        except (IndexError, TypeError):
            return ""
bleu.py (project: atma, author: AtmaHou)
def MP(candidate, references, n):
    """
    calculate modified precision
    """
    counts = Counter(ngrams(candidate, n))
    if not counts:
        return 0

    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    clipped_counts = {ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()}

    return sum(clipped_counts.values()) / sum(counts.values())
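MP calls an ngrams helper that this snippet does not define (NLTK provides one with the same name). A minimal sketch of what it is assumed to do, yielding the n-gram tuples that Counter then tallies:

def ngrams(tokens, n):
    # successive n-gram tuples over a token sequence, e.g. n=2 gives bigrams
    return zip(*(tokens[i:] for i in range(n)))

# Counter(ngrams(['the', 'cat', 'sat'], 2))
# -> Counter({('the', 'cat'): 1, ('cat', 'sat'): 1})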
problem2c.py (project: STA141C, author: clarkfitzg)
def overlap_score(q1, q2):
    """
    >>> overlap_score("a b c", "a b")
    0.8

    >>> overlap_score("   ", " ")
    0
    """

    c1 = Counter(q1.split())
    c2 = Counter(q2.split())

    numerator = 0
    for word in c1:
        if word in c2:
            numerator += c1[word]
    for word in c2:
        if word in c1:
            numerator += c2[word]

    m = sum(c1.values())
    n = sum(c2.values())

    try:
        score = numerator / (m + n)
    except ZeroDivisionError:
        score = 0
    return score
checker.py (project: ThreatPrep, author: ThreatResponse)
def get_category_stats(self):
        """Get a count of CheckState results for each category of checks.
        Ignore collection counts to avoid duplications"""
        flat_results = self.get_flattened_results()
        categories = list(set([x.category for x in flat_results]))
        metrics = {}
        for category in categories:
            metrics[category] = collections.Counter([
                x.status for x in filter(
                    lambda y: len(y.subchecks) == 0 and y.category==category,
                    flat_results
                )
            ])
        return metrics
test_data_driven_specs.py (project: monasca-transform, author: openstack)
def check_list_field_for_row(
            self, row=None, field_name=None, expected_list=None):
        found_list = getattr(row, field_name)
        self.assertEqual(Counter(expected_list), Counter(found_list))
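Comparing two Counters is an order-insensitive, multiplicity-aware way to assert that two lists contain the same elements, which is exactly what this test relies on:

from collections import Counter

assert Counter([1, 2, 2]) == Counter([2, 1, 2])   # same multiset, any order
assert Counter([1, 2]) != Counter([1, 2, 2])      # multiplicities differ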
issue62_b.py (project: python-driver, author: bblfsh)
def convert_uasts(self, file_uast_generator):
        for file_uast in file_uast_generator:
            print("-" * 20 + " " + str(file_uast.filepath))
            id_cnt = Counter()
            self.collect_id_cnt(file_uast.response.uast, id_cnt)
            print(id_cnt)
companycase.py (project: companycase, author: duedil-ltd)
def fetch_all_transitions(self, language, ngram_length):
        """ Generate a dict of counts for transitions for all n-grams in the language word list """
        wordlist = os.path.join(os.path.dirname(__file__), "wordlists/{0}.txt".format(language))
        if not os.path.exists(wordlist):
            raise SystemError("Language '{0}' does not exist".format(language))

        all_grams = []
        with codecs.open(wordlist, 'r', encoding='utf-8') as f:
            for line in f:
                words = line.strip('\n').lower().split()
                # flatten per-word n-gram lists (reduce would need a functools import on Python 3)
                ngrams = [gram for word in words for gram in self.find_ngrams(word, ngram_length)]
                all_grams += ngrams
        return dict(Counter(all_grams))
reader.py (project: variational-text-tensorflow, author: carpedm20)
def _build_vocab(self, file_path, vocab_path):
    counter = Counter(self._read_text(file_path).split())

    # sort by count (descending), then alphabetically, so word ids are deterministic
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    self.vocab = dict(zip(words, range(len(words))))

    save_pkl(vocab_path, self.vocab)
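The composite key (-count, word) sorts by descending frequency and breaks ties alphabetically, so the word-to-id mapping is reproducible across runs. For example:

from collections import Counter

counter = Counter('to be or not to be'.split())
pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
print(pairs)  # [('be', 2), ('to', 2), ('not', 1), ('or', 1)]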
util.py (project: treecat, author: posterior)
def log_profiling_stats():
    logger.info('-----------------------------------------------------------')
    logger.info('Series:')
    for name, series in sorted(SERIES.items()):
        logger.info('  {}: {}'.format(name, ' '.join(map(str, series))))

    logger.info('-----------------------------------------------------------')
    logger.info('Histograms:')
    for name, histogram in sorted(HISTOGRAMS.items()):
        logger.info('{: >10s} {}'.format('Count', name))
        for value, count in sorted(histogram.items()):
            logger.info('{: >10d} {}'.format(count, value))

    logger.info('-----------------------------------------------------------')
    logger.info('Counters:')
    logger.info('{: >10s} {}'.format('Count', 'Counter'))
    for name, count in sorted(COUNTERS.items()):
        logger.info('{: >10d} {}'.format(count, name))

    logger.info('-----------------------------------------------------------')
    logger.info('Timers:')
    times = [(t.elapsed, t.count, f) for (f, t) in TIMERS.items()]
    times.sort(reverse=True, key=lambda x: x[0])
    logger.info('{: >10} {: >10} {}'.format('Seconds', 'Calls', 'Function'))
    for time, count, name in times:
        logger.info('{: >10.3f} {: >10} {}'.format(time, count, name))
discover.py (project: IgDiscover, author: NBISweden)
def _guess_cdr3_start(group):
        """
        Return a guess for the CDR3 start within sequences in the given group
        """
        return Counter(group.V_CDR3_start).most_common()[0][0]
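most_common() with no argument sorts all entries by count, and [0][0] then picks the modal value; most_common(1) extracts only the top entry and is equivalent here:

from collections import Counter

starts = [96, 96, 99, 96, 99]
assert Counter(starts).most_common()[0][0] == 96
assert Counter(starts).most_common(1)[0][0] == 96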
analyser.py (project: trf, author: aistairc)
def calc_rs_pos(self) -> Dict[str, float]:
        """Calculate the ratio of each pos of words in input text
        Returns:
            float: the ratio of each pos of words in input text
        """
        pos = []
        # TODO: It may take a long time when the number of sentences are large
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            pos += [mrph.hinsi for mrph in juman_result.mrph_list()]
        pos_counter = Counter(pos)
        total = sum(pos_counter.values())
        return {name: float(num) / total for name, num in pos_counter.items()}
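The closing expression is a common Counter normalization: dividing each count by the total turns raw tallies into a distribution. In isolation, with toy POS tags:

from collections import Counter

pos_counter = Counter(['noun', 'verb', 'noun', 'particle'])
total = sum(pos_counter.values())
print({name: num / total for name, num in pos_counter.items()})
# {'noun': 0.5, 'verb': 0.25, 'particle': 0.25}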
report.py (project: cellranger, author: 10XGenomics)
def __init__(self, **kwargs):
        Metric.__init__(self, **kwargs)
        self.d = collections.Counter()

