Usage examples of Python's Counter() class

post.py (project: redberry, author: michaelcho)
def keywords(self, num=5):

        words_only = self.strip_tags(self.content, strip_punctuation=True)
        words = words_only.split()

        counter = collections.Counter(words)
        common = counter.most_common()

        keywords = []

        INSIGNIFICANT_WORDS = ('should', 'which', 'therefore')

        for word, _count in common:
            lower_word = word.lower()
            if len(lower_word) > 4 and lower_word not in INSIGNIFICANT_WORDS:
                keywords.append(lower_word)

            if len(keywords) >= num:
                break

        return ", ".join(keywords)
problem2.py (project: STA141C, author: clarkfitzg)
def overlap_score(q1, q2):
    """
    q1, q2 are preprocessed sentences (strings)

    >>> overlap_score("a b", "a")
    0.6666666666666666

    """

    c1 = Counter(q1.split())
    c2 = Counter(q2.split())
    c1c2 = c1 + c2

    both = set(c1.keys())
    both = both.intersection(c2.keys())

    bothscore = float(sum(c1c2[x] for x in both))
    mplusn = float(sum(c1c2.values()))

    score = bothscore / mplusn

    return score
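The doctest value can be reproduced by hand: Counter supports + for multiset addition, and iterating a Counter yields its keys, so plain set intersection finds the shared vocabulary. A quick sketch:

from collections import Counter

c1 = Counter("a b".split())   # Counter({'a': 1, 'b': 1})
c2 = Counter("a".split())     # Counter({'a': 1})
combined = c1 + c2            # Counter({'a': 2, 'b': 1})
shared = set(c1) & set(c2)    # {'a'}
print(sum(combined[w] for w in shared) / sum(combined.values()))  # 0.666...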
problem2_backup.py (project: STA141C, author: clarkfitzg)
def overlap_score(q1, q2):
    """
    >>> overlap_score("fun", "real fun")
    0.6666666666666666
    >>> overlap_score("  ", "   ")
    0
    """

    q1count = Counter(q1.split())
    q2count = Counter(q2.split())

    both = set(q1count.keys())
    both = both.intersection(q2count.keys())
    combined = q1count + q2count

    mplusn = float(sum(combined.values()))
    overlap = float(sum(combined[x] for x in both))

    try:
        return overlap / mplusn
    except ZeroDivisionError:
        return 0
decision_tree.py (project: Modeling_Preparation, author: Yangruipis)
def vote(df, columns_name, value):
        label_data = df.loc[df[columns_name] == value, 'label'].values
        return Counter(label_data).most_common()[0][0]
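Counter(...).most_common(1)[0][0] is the usual one-liner for majority voting over labels; a sketch with hypothetical labels:

from collections import Counter

labels = ['spam', 'ham', 'spam', 'spam', 'ham']
# most_common(1) returns [(label, count)] for the single most frequent label
print(Counter(labels).most_common(1)[0][0])  # spam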
state.py (project: NeoVintageous, author: NeoVintageous)
def update_xpos(self, force=False):
        if self.must_update_xpos or force:
            try:
                # TODO: we should check the current mode instead. ============
                sel = self.view.sel()[0]
                pos = sel.b
                if not sel.empty():
                    if sel.a < sel.b:
                        pos -= 1
                # ============================================================
                r = sublime.Region(self.view.line(pos).a, pos)
                counter = Counter(self.view.substr(r))
                tab_size = self.view.settings().get('tab_size')
                xpos = (self.view.rowcol(pos)[1] +
                        ((counter['\t'] * tab_size) - counter['\t']))
            except Exception as e:
                nvim.console_message(e)
                _logger.exception('error setting xpos; default to 0')
                self.xpos = 0
                return
            else:
                self.xpos = xpos
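Counter also counts the characters of a string directly, which is how the snippet tallies tabs; missing keys read as 0 instead of raising KeyError. A sketch with a hypothetical line of text:

from collections import Counter

line = "\tif x:\n\t\treturn x"
counter = Counter(line)
tab_size = 4
# each tab occupies (tab_size - 1) extra visual columns beyond its one character
print(counter['\t'], counter['\t'] * (tab_size - 1))  # 3 9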
multidiscover.py (project: IgDiscover, author: NBISweden)
def main(args):
    if args.minimum_frequency is None:
        minimum_frequency = max((len(args.tables) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.tables:
        table = pd.read_csv(path, sep='\t')
        table = table[table.database_diff >= args.minimum_db_diff]
        table = table.dropna()
        tables.append(table)
        if len(table) == 0:
            logger.warning('Table read from %r is empty after filtering out sequences with database diff < %s.', path, args.minimum_db_diff)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.consensus))

    # Find most frequent occurrences and print result
    print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.consensus == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row.gene
                database_diff = row.database_diff
                #shm = row['V_SHM']
        print(frequency, gene, database_diff, sequence, *names, sep='\t')
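Wrapping each table's column in set() before Counter.update() makes the counter track in how many tables a sequence occurs rather than how many rows mention it. A sketch with hypothetical sequence lists:

from collections import Counter

tables = [['ACGT', 'TTAA', 'ACGT'], ['ACGT'], ['TTAA']]
counter = Counter()
for rows in tables:
    # set() collapses within-table duplicates, so each table adds at most 1
    counter.update(set(rows))
print(sorted(counter.items()))  # [('ACGT', 2), ('TTAA', 2)]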
commonv.py (project: IgDiscover, author: NBISweden)
def main(args):
    if args.minimum_frequency is None:
        # args.table is a list of file names
        minimum_frequency = max((len(args.table) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.table:
        table = read_table(path)
        table = table.loc[:,['V_gene', 'V_SHM', 'V_nt', 'name']]
        tables.append(table)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.V_nt))

    # Find most frequent occurrences and print result
    print('Frequency', 'Gene', '%SHM', 'Sequence', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.V_nt == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row['V_gene']
                shm = row['V_SHM']
        print(frequency, gene, shm, sequence, *names, sep='\t')
bag_of_features_transformer.py (project: xpandas, author: alan-turing-institute)
def __init__(self, dictionary=None, **kwargs):
        '''
        :param dictionary: custom dictionary to count against.
            If None, the dictionary is computed from the dataset.
        '''
        self.dictionary = dictionary

        accepted_types = [
            pd.Series, list, np.ndarray, tuple
        ]

        def bag_of_words_transform_function(corpus):
            counter = Counter(corpus)
            for el in self.dictionary:
                if counter.get(el) is None:
                    counter[el] = 0
            return counter

        super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
                                                    columns=None,
                                                    transform_function=bag_of_words_transform_function)
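The transform pins every dictionary word into the counter so downstream vectorization sees an explicit 0 instead of a missing key. An equivalent sketch with a hypothetical vocabulary:

from collections import Counter

dictionary = ['cat', 'dog', 'fish']
corpus = ['cat', 'cat', 'dog']
counter = Counter(corpus)
for word in dictionary:
    counter.setdefault(word, 0)  # explicit zero for absent vocabulary words
print(counter)  # Counter({'cat': 2, 'dog': 1, 'fish': 0})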
test_term.py (project: zipline-chinese, author: zhanghan1990)
def assertDifferentObjects(self, *objs):
        id_counts = Counter(map(id, objs))
        ((most_common_id, count),) = id_counts.most_common(1)
        if count > 1:
            dupe = [o for o in objs if id(o) == most_common_id][0]
            self.fail("%s appeared %d times in %s" % (dupe, count, objs))
analyser.py (project: trf, author: aistairc)
def calc_n_types(self) -> int:
        """Calculate the number of types of input text
        Returns:
            int: the number of types of input text
        """
        surfaces = []
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            surfaces += [mrph.midasi for mrph in juman_result.mrph_list()]
        word_type_counter = Counter(surfaces)
        return len(word_type_counter)
analyser.py (project: trf, author: aistairc)
def calc_rs_modality(self) -> Dict[str, float]:

        modality_counter = Counter()
        for i, s in enumerate(self.sentences):
            chunks = []
            for bnst in self.knp.parse(s).bnst_list():
                chunk = Chunk(chunk_id=bnst.bnst_id,
                              link=bnst.parent,
                              description=bnst.fstring)
                chunks.append(chunk)

            s = "".join([chunk.description for chunk in chunks])
            ms = set(re.findall("<?????-(.+?)>", s))
            modality_counter += Counter(ms)

            n = len(self.sentences)

        return dict([(k, float(c) / n)
                     for k, c in modality_counter.items()])
sqlite_type_helper.py (project: PlasoScaffolder, author: ClaudiaSaxer)
def GetDuplicateColumnNames(
      self, columns: [sql_query_column_model.SQLColumnModel]) -> [str]:
    """Determine whether the query's columns contain duplicate names and
        an alias is therefore needed.

    Args:
      columns ([sql_query_column_model.SQLColumnModel]): all columns parsed
          from the cursor
    Returns:
      [str]: a list of all the duplicate column names; if it is empty, the
          column names are distinct
    """
    single_column_name_list = [column.sql_column for column in columns]
    duplicate_list = [column for column, count in
                      collections.Counter(single_column_name_list).items() if
                      count > 1]
    return sorted(duplicate_list)
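Filtering Counter(...).items() on count > 1 is the standard idiom for finding duplicates in a list; a sketch with hypothetical column names:

import collections

names = ['id', 'name', 'id', 'timestamp', 'name']
duplicates = sorted(name for name, count
                    in collections.Counter(names).items() if count > 1)
print(duplicates)  # ['id', 'name']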
dataset.py (project: tf_rnnlm, author: Ubiqus)
def _build_vocab(self, filename):
    counts = Counter()
    with tf.gfile.GFile(filename, "r") as f:
      # Read in ~250 MB chunks to bound memory; note that a word spanning
      # a chunk boundary gets split in two.
      while True:
        chunk = f.read(250000000)
        if not chunk:
          break
        counts += Counter(chunk.replace("\n", " ").split())

    sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    self.word_to_id = {e[0]: (i+3) for (i, e) in enumerate(sorted_pairs)}
    self.word_to_id[EOS] = IEOS
    self.word_to_id[BOS] = IBOS
    self.word_to_id[PAD] = IPAD
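The (-count, word) sort key orders words by descending frequency with an alphabetical tie-break, so vocabulary ids come out deterministic across runs. A sketch:

from collections import Counter

counts = Counter()
for chunk in ["to be or", "not to be"]:  # stands in for the file chunks
    counts += Counter(chunk.split())

sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
print(sorted_pairs)  # [('be', 2), ('to', 2), ('not', 1), ('or', 1)]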
models.py (project: dl4mt-multi, author: nyu-dl)
def print_params(self, cgs):
        """
        cgs : list of computational graph names
        """
        for name, cg in cgs.items():
            shapes = [param.get_value().shape for param in cg.parameters]
            logger.info(
                "Parameter shapes for computation graph[{}]".format(name))
            for shape, count in Counter(shapes).most_common():
                logger.info('    {!s:15}: {}'.format(shape, count))
            logger.info(
                "Total number of parameters for computation graph[{}]: {}"
                .format(name, len(shapes)))

            logger.info(
                "Parameter names for computation graph[{}]:".format(name))
            for item in cg.parameters:
                logger.info(
                    "    {!s:15}: {}".format(item.get_value().shape, item.name))
            logger.info(
                "Total number of parameters for computation graph[{}]: {}"
                .format(name, len(cg.parameters)))
manuscript.py (project: manubot, author: greenelab)
def get_manuscript_stats(text, citation_df):
    """
    Compute manuscript statistics.
    """
    stats = collections.OrderedDict()

    # Number of distinct references by type
    ref_counts = (
        citation_df
        .standard_citation
        .drop_duplicates()
        .map(lambda x: x.split(':')[0])
        .pipe(collections.Counter)
    )
    ref_counts['total'] = sum(ref_counts.values())
    stats['reference_counts'] = ref_counts
    stats['word_count'] = len(text.split())
    logging.info(f"Generated manscript stats:\n{json.dumps(stats, indent=2)}")
    return stats
__init__.py (project: otRebuilder, author: Pal3love)
def subset_glyphs(self, s):
    table = self.table.Baseline
    if table.Format in (1, 3):
        baselines = {glyph: table.BaselineValues.get(glyph, table.DefaultBaseline)
                     for glyph in s.glyphs}
        if len(baselines) > 0:
            mostCommon, _cnt = Counter(baselines.values()).most_common(1)[0]
            table.DefaultBaseline = mostCommon
            baselines = {glyph: b for glyph, b in baselines.items()
                         if b != mostCommon}
        if len(baselines) > 0:
            table.BaselineValues = baselines
        else:
            table.Format = {1: 0, 3: 2}[table.Format]
            del table.BaselineValues
    return True
__init__.py (project: otRebuilder, author: Pal3love)
def subset_glyphs(self, s):
    prop = self.table.GlyphProperties
    if prop.Format == 0:
        return prop.DefaultProperties != 0
    elif prop.Format == 1:
        prop.Properties = {g: prop.Properties.get(g, prop.DefaultProperties)
                           for g in s.glyphs}
        mostCommon, _cnt = Counter(prop.Properties.values()).most_common(1)[0]
        prop.DefaultProperties = mostCommon
        prop.Properties = {g: v for g, v in prop.Properties.items()
                           if v != mostCommon}
        if len(prop.Properties) == 0:
            del prop.Properties
            prop.Format = 0
            return prop.DefaultProperties != 0
        return True
    else:
        assert False, "unknown 'prop' format %s" % prop.Format
mem-network.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV here because there are not too many words in dataset
    word2idx = {w:(i+1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word
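The same pattern without the nltk dependency (whitespace split stands in for word_tokenize; ids start at 1 because 0 is reserved for PAD):

import collections

counter = collections.Counter()
for sent in ["The cat sat", "the mat"]:
    for word in sent.split():
        counter[word.lower()] += 1

word2idx = {w: i + 1 for i, (w, _) in enumerate(counter.most_common())}
word2idx["PAD"] = 0
print(word2idx)  # {'the': 1, 'cat': 2, 'sat': 3, 'mat': 4, 'PAD': 0}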
create_kmer_freq_vectors.py (project: mbin, author: fanglab)
def kmer_freq ( ref_str, k ):
    """
    Walk through sequence and return k-mer counts plus
    a pseudocount of 1.
    """
    ref_str = ref_str.upper()
    kmers = []
    for seq in product("ATGC",repeat=k):
        kmers.append( "".join(seq) )

    kmer_counts = Counter()
    for j in range( len(ref_str)-(k-1) ):
        motif    = ref_str[j:j+k]
        kmer_counts[motif] += 1

    # Combine forward and reverse complement motifs into one count
    combined_kmer = Counter()
    for kmer in kmers:
        kmer_rc = rev_comp_motif(kmer)
        if not combined_kmer.get(kmer_rc):
            combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

    return combined_kmer
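A motif and its reverse complement describe the same double-stranded site, so their counts are merged under whichever orientation is seen first; a palindromic k-mer such as AT ends up counted from both strands. A sketch with hypothetical counts and a stand-in for rev_comp_motif():

from collections import Counter
from itertools import product

def rev_comp(kmer):
    # hypothetical stand-in for the project's rev_comp_motif() helper
    return kmer.translate(str.maketrans("ATGC", "TACG"))[::-1]

kmer_counts = Counter({"AT": 4, "GC": 1, "AA": 2, "TT": 3})
combined = Counter()
for kmer in ("".join(p) for p in product("ATGC", repeat=2)):
    if rev_comp(kmer) not in combined:
        # pseudocount of 1 keeps every k-mer frequency nonzero
        combined[kmer] = kmer_counts[kmer] + kmer_counts[rev_comp(kmer)] + 1
print(combined["AA"], combined["AT"])  # 6 9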
read_scanner.py (project: mbin, author: fanglab)
def kmer_freq ( mode, ref_str, strand, opts ):
    ref_str = ref_str.upper()
    if strand==1:
        ref_str = ref_str[::-1]
    k = opts.comp_kmer
    kmers = []
    for seq in product("ATGC",repeat=k):
        kmers.append( "".join(seq) )

    kmer_counts = Counter()
    for j in range( len(ref_str)-(k-1) ):
        motif    = ref_str[j:j+k]
        kmer_counts[motif] += 1

    # Combine forward and reverse complement motifs into one count
    combined_kmer = Counter()
    for kmer in kmers:
        kmer_rc = motif_tools.rev_comp_motif(kmer)
        if not combined_kmer.get(kmer_rc):
            combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

    return combined_kmer
data_preparation.py (project: keras-utilities, author: cbaziotis)
def get_class_weights2(y, smooth_factor=0):
    """
    Returns the normalized weights for each class based on the frequencies of the samples
    :param smooth_factor: factor that smooths extremely uneven weights
    :param y: list of true labels (the labels must be hashable)
    :return: dictionary with the weight for each class
    """
    counter = Counter(y)

    if smooth_factor > 0:
        p = max(counter.values()) * smooth_factor
        for k in counter.keys():
            counter[k] += p

    majority = max(counter.values())

    return {cls: float(majority) / count for cls, count in counter.items()}
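The weighting divides the majority-class count by each class's own count, so rarer classes get proportionally larger weights. A sketch with a hypothetical 8:2 label split:

from collections import Counter

y = ['a'] * 8 + ['b'] * 2
counter = Counter(y)              # Counter({'a': 8, 'b': 2})
majority = max(counter.values())
print({cls: majority / count for cls, count in counter.items()})
# {'a': 1.0, 'b': 4.0}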
utils.py (project: DeepPath, author: xwhan)
def path_clean(path):
    rel_ents = path.split(' -> ')
    relations = []
    entities = []
    for idx, item in enumerate(rel_ents):
        if idx%2 == 0:
            relations.append(item)
        else:
            entities.append(item)
    entity_stats = Counter(entities).items()
    duplicate_ents = [item for item in entity_stats if item[1]!=1]
    duplicate_ents.sort(key = lambda x:x[1], reverse=True)
    for item in duplicate_ents:
        ent = item[0]
        ent_idx = [i for i, x in enumerate(rel_ents) if x == ent]
        if len(ent_idx)!=0:
            min_idx = min(ent_idx)
            max_idx = max(ent_idx)
            if min_idx!=max_idx:
                rel_ents = rel_ents[:min_idx] + rel_ents[max_idx:]
    return ' -> '.join(rel_ents)
dactyl_style_checker.py (project: dactyl, author: ripple)
def main(cli_args):
    if len(config["targets"]) == 0:
        exit("No target found; maybe you need to specify a Dactyl config file?")

    issues = check_all_pages(target=cli_args.target)
    if issues:
        num_issues = sum(len(p[1]) for p in issues)
        print("Found %d issues:" % num_issues)
        for pagename,issuelist in issues:
            print("Page: %s" % pagename)
            c = collections.Counter(issuelist)
            for i, count_i in c.items():
                if i[0]=="Unplain Phrase":
                    print("   Discouraged phrase: %s (%d instances); suggest '%s' instead." %
                                    ( i[1], count_i, config["disallowed_phrases"][i[1].lower()] ))
                elif i[0]=="Unplain Word":
                    print("   Discouraged word: %s (%d instances); suggest '%s' instead." %
                                    ( i[1], count_i, config["disallowed_words"][i[1].lower()] ))
                else:
                    print("   %s: %s (%d instances)" % (i[0], i[1], count_i))
        exit(1)
    else:
        print("Style check passed with flying colors!")
        exit(0)
sampling.py (project: evaluation_tools, author: JSALT-Rosetta)
def get_nb_caption_per_img(n, selected_captions):
    """
    Get image ids from the audio caption file names that were selected
    by their speakers, keeping only images with exactly n captions.
    ----------
    n : int,
        desired number of captions per image
    selected_captions : list of string,
        list of caption file names selected by their speakers
    """

    counter_nb_caption = Counter()

    for cap in selected_captions:
        # get the image id from the file name prefix
        ImgID = cap.split('_')[0]
        # add a count
        counter_nb_caption[ImgID] += 1

    # keep image ids whose caption count equals n
    d = dict((k, v) for k, v in counter_nb_caption.items() if v == n)

    ImgID_selected = d.keys()

    return ImgID_selected
utils.py (project: deeppavlov, author: deepmipt)
def _f1_score(pred, answers):
    """Compute the F1 score."""

    def _score(g_tokens, a_tokens):
        common = Counter(g_tokens) & Counter(a_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1. * num_same / len(g_tokens)
        recall = 1. * num_same / len(a_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    if pred is None or answers is None:
        return 0
    g_tokens = _normalize_answer(pred).split()
    scores = [_score(g_tokens, _normalize_answer(a).split()) for a in answers]
    return max(scores)
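The & operator on two Counters takes the element-wise minimum, which is exactly the multiset of overlapping tokens that F1 needs. A sketch with hypothetical token lists:

from collections import Counter

pred = Counter("the cat sat".split())
gold = Counter("the cat slept".split())
common = pred & gold                         # Counter({'the': 1, 'cat': 1})
num_same = sum(common.values())              # 2
precision = num_same / sum(pred.values())    # 2/3
recall = num_same / sum(gold.values())       # 2/3
print(2 * precision * recall / (precision + recall))  # 0.666...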
test_data.py (project: dsb3, author: EliasVansteenkiste)
def test2():
    patient_data_paths = utils_lung.get_patient_data_paths(pathfinder.DATA_PATH)
    print(len(patient_data_paths))
    pixel_spacings_xy = []
    n_slices = []

    for k, p in enumerate(patient_data_paths):
        pid = utils_lung.extract_pid_dir(p)
        sid2data, sid2metadata = utils_lung.get_patient_data(p)
        mtd = next(iter(sid2metadata.values()))

        assert mtd['PixelSpacing'][0] == mtd['PixelSpacing'][1]
        pixel_spacings_xy.append(mtd['PixelSpacing'][0])
        n_slices.append(len(sid2metadata))
        print(pid, pixel_spacings_xy[-1], n_slices[-1])

    print('nslices', np.max(n_slices), np.min(n_slices), np.mean(n_slices))
    counts = collections.Counter(pixel_spacings_xy)
    new_list = sorted(pixel_spacings_xy, key=counts.get, reverse=True)
    print('spacing', new_list)
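Passing counts.get as the sort key orders the raw list by how frequent each value is, a quick way to surface the dominant pixel spacing. A sketch with hypothetical spacings:

import collections

spacings = [0.7, 0.9, 0.7, 0.6, 0.7, 0.9]
counts = collections.Counter(spacings)
# counts.get maps each value to its frequency; ties keep their original order
print(sorted(spacings, key=counts.get, reverse=True))
# [0.7, 0.7, 0.7, 0.9, 0.9, 0.6]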
retrieval.py (project: KATE, author: hugochan)
def retrieval_perlabel(X_train, Y_train, X_test, Y_test, fractions=[0.01, 0.5, 1.0]):
    X_train = unitmatrix(X_train) # normalize
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    precisions = defaultdict(dict)
    label_counter = Counter(Y_test.tolist())

    for idx in range(len(X_test)):
        retrieval_idx = score[idx].argsort()[::-1]
        for fr in fractions:
            ntop = int(fr * len(X_train))
            pr = float(len([i for i in retrieval_idx[:ntop] if Y_train[i] == Y_test[idx]])) / ntop
            try:
                precisions[fr][Y_test[idx]] += pr
            except KeyError:
                precisions[fr][Y_test[idx]] = pr
    new_pr = {}
    for fr, val in precisions.items():
        avg_pr = 0.
        for label, pr in val.items():
            avg_pr += pr / label_counter[label]
        new_pr[fr] = avg_pr / len(label_counter)

    return sorted(new_pr.items(), key=lambda d: d[0])
baseline_PPMI1.py (project: EventStoryLine, author: tommasoc80)
def cross_sentence(event_lemma_dict):
    """
    function to create all possible pairs between event mentions in a file
    :param event_lemma_dict: dictionary of event lemmas in file
    :return: counter dictionary of event pairs in a file
    """

    # all event lemmas in the file
    full_event_file = list(event_lemma_dict.keys())
    pairs_circumstantial_corpus = Counter()

    # all ordered pairs of event mentions, including self-pairs
    event_pairs_full = list(product(full_event_file, repeat=2))

    for pair in event_pairs_full:
        pairs_circumstantial_corpus.update([pair])

    return pairs_circumstantial_corpus
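Counter keys can be any hashable object, so tuples of event mentions work directly; update([pair]) increments one pair at a time. A sketch with hypothetical lemmas:

from collections import Counter
from itertools import product

events = ['say', 'run', 'fall']
pair_counter = Counter()
for pair in product(events, repeat=2):  # all ordered pairs, incl. self-pairs
    pair_counter.update([pair])
print(pair_counter[('say', 'run')], len(pair_counter))  # 1 9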
group.py (project: sentrycli, author: operasoftware)
def print_grouping(attributes, grouping, top):
    """
    Print computed groups.

    :param attributes: list of grouped attributes
    :type: list(str)
    :param grouping: counter for each combination of attributes' values
    :type: Counter
    :type top: int
    """
    total = sum(grouping.values())

    table = Table(attributes + ['count', '%'])
    table.add_rows(total, grouping.most_common(top))

    print('\n' + table.by_count())
    print('Total:', total)
gmlan_gw.py (project: gmlan_gw, author: tmkdev)
def __init__(self):
        self.handlers = {
            0x001: self._power,
            0x186: self._text,
            0x185: self._textparam,
            0x061: self._exttemp,
            0x005: self._tpms,
            #0x18e: self._textparam,
            0x026: self._fuel,
            0x053: self._gpsdate,
            0x055: self._gps,
        }

        self.counter = Counter() 
        self.locations = []
        self.fuel = [0,0]

