def fill_histogram(self, idf, columns):
"""Fill input histogram with column(s) of input dataframe
:param idf: input data frame used for filling histogram
:param list columns: histogram column(s)
"""
name = ':'.join(columns)
if name not in self._counts:
# create an (empty) value counts dict
self._counts[name] = Counter()
# value_counts() is faster than groupby().size(), but only works for series (1d).
# else use groupby() for multi-dimensions
g = idf.groupby(by=columns).size() if len(columns) > 1 else idf[columns[0]].value_counts()
counts = Counter(g.to_dict())
# remove specific keys from histogram before merging, if so requested
counts = self.drop_requested_keys(name, counts)
self._counts[name].update(counts)
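# A minimal, self-contained sketch (not from the snippet above) of the same filling
# pattern: value_counts() for a single column, groupby().size() for several columns,
# merged into a running Counter. The frame and column names here are illustrative.
import pandas as pd
from collections import Counter

running_counts = Counter()
df = pd.DataFrame({'x': ['a', 'a', 'b'], 'y': [1, 2, 2]})
columns = ['x', 'y']
g = df.groupby(by=columns).size() if len(columns) > 1 else df[columns[0]].value_counts()
running_counts.update(Counter(g.to_dict()))
# running_counts is now Counter({('a', 1): 1, ('a', 2): 1, ('b', 2): 1})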
def test_bin_edges(self):
# constructor
cnt = Counter()
for i in range(10):
cnt[i*2] = i
vc = ValueCounts(key='x', counts=cnt)
bin_specs = {'bin_width': 1, 'bin_offset': 0}
h = Histogram(vc, variable='x', bin_specs=bin_specs)
# uniform
bin_edges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
self.assertListEqual(h.get_uniform_bin_edges(), bin_edges)
# truncated uniform bin edges
truncated_bin_edges = [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
self.assertListEqual(h.truncated_bin_edges([5.5, 12.5]), truncated_bin_edges)
h_bin_edges = h.bin_edges()
self.assertIsInstance(h_bin_edges, np.ndarray)
self.assertListEqual(h_bin_edges.tolist(), bin_edges)
def test_bin_centers(self):
# constructor
cnt = Counter()
for i in range(10):
cnt[i*2] = i
vc = ValueCounts(key='x', counts=cnt)
bin_specs = {'bin_width': 1, 'bin_offset': 0}
h = Histogram(vc, variable='x', bin_specs=bin_specs)
bin_centers = [0.5, 2.5, 4.5, 6.5, 8.5, 10.5, 12.5, 14.5, 16.5, 18.5]
h_bin_centers = h.bin_centers()
self.assertIsInstance(h_bin_centers, np.ndarray)
self.assertListEqual(h_bin_centers.tolist(), bin_centers)
def test_bin_entries(self):
# constructor
cnt = Counter()
for i in range(10):
cnt[i*2] = i
vc = ValueCounts(key='x', counts=cnt)
bin_specs = {'bin_width': 1, 'bin_offset': 0}
h = Histogram(vc, variable='x', bin_specs=bin_specs)
bin_entries = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
h_bin_entries = h.bin_entries()
self.assertIsInstance(h_bin_entries, np.ndarray)
self.assertListEqual(h_bin_entries.tolist(), bin_entries)
def test_bin_labels(self):
# constructor
cnt = Counter()
for i in range(10):
cnt[i*2] = i
vc = ValueCounts(key='x', counts=cnt)
bin_specs = {'bin_width': 1, 'bin_offset': 0}
h = Histogram(vc, variable='x', bin_specs=bin_specs)
bin_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
h_bin_labels = h.bin_labels()
self.assertIsInstance(h_bin_labels, np.ndarray)
self.assertListEqual(h_bin_labels.tolist(), bin_labels)
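# The expected values in the four tests above follow directly from the Counter and the
# bin_specs (width 1, offset 0); a numpy-only sketch of that arithmetic, independent of
# the Histogram class and shown purely for illustration:
import numpy as np
from collections import Counter

cnt = Counter({i * 2: i for i in range(10)})                          # keys 0, 2, ..., 18
bin_width = 1
labels = sorted(cnt)                                                  # bin labels 0, 2, ..., 18
edges = np.arange(labels[0], labels[-1] + 2 * bin_width, bin_width)   # 0, 1, ..., 19
centers = np.array(labels) + 0.5 * bin_width                          # 0.5, 2.5, ..., 18.5
entries = np.array([cnt[k] for k in labels])                          # 0, 1, ..., 9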
def build_vocabulary(words, max_size):
vocab_instances = 0
unique_counts = Counter(words)
d = dict(unique_counts.most_common(cfg.vocabulary_size - 2))
vocabulary = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=True))
# start at 2 to leave room for padding & unknown
pb = Progress_bar(len(d) - 1)
for i, (key, value) in enumerate(vocabulary.items(), start=2):
vocab_instances += value
vocabulary[key] = i
pb.tick()
vocabulary[cfg.padding_char] = 0
vocabulary[cfg.placeholder_char] = 1
# reverse the vocabulary (for reverse lookup)
rev_vocabulary = {v: k for k, v in vocabulary.items()}
vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
return vocab
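# A dependency-free sketch of the same idea (cfg and Progress_bar come from outside this
# snippet); the padding/unknown markers and the vocabulary size below are illustrative
# assumptions, not the original configuration.
from collections import Counter, OrderedDict

def build_vocabulary_sketch(words, max_size, pad='<pad>', unk='<unk>'):
    counts = Counter(words)
    # ids 0 and 1 are reserved for the padding and unknown tokens
    vocabulary = OrderedDict(((pad, 0), (unk, 1)))
    for i, (word, _) in enumerate(counts.most_common(max_size - 2), start=2):
        vocabulary[word] = i
    rev_vocabulary = {v: k for k, v in vocabulary.items()}   # reverse lookup
    return vocabulary, rev_vocabulary

# e.g. build_vocabulary_sketch('the cat sat on the mat'.split(), max_size=5)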
def main():
args = docopt("""
Usage:
counts2pmi.py <counts>
""")
counts_path = args['<counts>']
words = Counter()
contexts = Counter()
with open(counts_path) as f:
for line in f:
count, word, context = line.strip().split()
count = int(count)
words[word] += count
contexts[context] += count
words = sorted(words.items(), key=lambda x: x[1], reverse=True)
contexts = sorted(contexts.items(), key=lambda x: x[1], reverse=True)
save_count_vocabulary(counts_path + '.words.vocab', words)
save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
def build_frequency_file(dtatcfdir, freq_file, MIN_FREQ, join_sign):
"""
Builds file with all lemma + POS pairs above certain frequency threshold.
:param dtatcfdir: path to directory with dta tcf files
:param freq_file: path to frequency file
:param MIN_FREQ: frequency threshold
:param join_sign: sign to join lemma + first char of POS
"""
# build frequency file from lemmas
outputpath = freq_file
print('Building frequency file to ' + outputpath + '...')
lemma_count = Counter(build_lemma_list(dtatcfdir, join_sign))
frequent_lemmas = filter(lambda x: lemma_count[x] >= MIN_FREQ, lemma_count)
with open(outputpath, 'w', encoding='utf-8') as f_out:
    for lemma in frequent_lemmas:
        print(lemma, file=f_out)
def _feature_most_common(self, results):
"""
Find the most common country name in ES/Geonames results
Parameters
----------
results: dict
output of `query_geonames`
Returns
-------
most_common: str
ISO code of most common country, or empty string if none
"""
try:
country_count = Counter([i['country_code3'] for i in results['hits']['hits']])
most_common = country_count.most_common()[0][0]
return most_common
except IndexError:
return ""
except TypeError:
return ""
def MP(candidate, references, n):
"""
Calculate the modified (clipped) n-gram precision of a candidate against the references.
"""
counts = Counter(ngrams(candidate, n))
if not counts:
return 0
max_counts = {}
for reference in references:
reference_counts = Counter(ngrams(reference, n))
for ngram in counts:
max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())
return sum(clipped_counts.values()) / sum(counts.values())
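# A worked toy example of the clipping step; ngrams() itself is not shown above, so a
# minimal stand-in returning n-gram tuples is assumed here.
from collections import Counter

def _ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

candidate = 'the the the the'.split()
reference = 'the cat is on the mat'.split()
counts = Counter(_ngrams(candidate, 1))                    # ('the',) occurs 4 times
ref_counts = Counter(_ngrams(reference, 1))                # ('the',) occurs 2 times
clipped = {g: min(c, ref_counts[g]) for g, c in counts.items()}
print(sum(clipped.values()) / sum(counts.values()))        # 2 / 4 = 0.5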
def overlap_score(q1, q2):
"""
>>> overlap_score("a b c", "a b")
0.8
>>> overlap_score(" ", " ")
0
"""
c1 = Counter(q1.split())
c2 = Counter(q2.split())
numerator = 0
for word in c1:
if word in c2:
numerator += c1[word]
for word in c2:
if word in c1:
numerator += c2[word]
m = sum(c1.values())
n = sum(c2.values())
try:
score = numerator / (m + n)
except ZeroDivisionError:
score = 0
return score
def get_category_stats(self):
"""Get a count of CheckState results for each category of checks.
Ignore collection counts to avoid duplication."""
flat_results = self.get_flattened_results()
categories = list(set([x.category for x in flat_results]))
metrics = {}
for category in categories:
metrics[category] = collections.Counter([
x.status for x in filter(
lambda y: len(y.subchecks) == 0 and y.category == category,
flat_results
)
])
return metrics
def check_list_field_for_row(
self, row=None, field_name=None, expected_list=None):
found_list = getattr(row, field_name)
self.assertEqual(Counter(expected_list), Counter(found_list))
def convert_uasts(self, file_uast_generator):
for file_uast in file_uast_generator:
print("-" * 20 + " " + str(file_uast.filepath))
id_cnt = Counter()
self.collect_id_cnt(file_uast.response.uast, id_cnt)
print(id_cnt)
def fetch_all_transitions(self, language, ngram_length):
""" Generate a dict of counts for transitions for all n-grams in the language word list """
wordlist = os.path.join(os.path.dirname(__file__), "wordlists/{0}.txt".format(language))
if not os.path.exists(wordlist):
raise SystemError("Language '{0}' does not exist".format(language))
all_grams = []
with codecs.open(wordlist, 'r', encoding='utf-8') as f:
for line in f:
words = line.strip('\n').lower().split()
# an empty-list initialiser keeps reduce() from failing on blank lines
ngrams = reduce(lambda x, y: x + y, map(lambda word: self.find_ngrams(word, ngram_length), words), [])
all_grams += ngrams
return dict(Counter(all_grams))
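# The n-gram counting in isolation, without the word-list file; find_ngrams() is not
# shown above, so a simple character n-gram stand-in is assumed here.
from collections import Counter

def _find_ngrams(word, n):
    return [word[i:i + n] for i in range(len(word) - n + 1)]

all_grams = []
for word in ['hello', 'help']:
    all_grams += _find_ngrams(word, 2)
print(dict(Counter(all_grams)))  # {'he': 2, 'el': 2, 'll': 1, 'lo': 1, 'lp': 1}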
def _build_vocab(self, file_path, vocab_path):
counter = Counter(self._read_text(file_path).split())
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
self.vocab = dict(zip(words, range(len(words))))
save_pkl(vocab_path, self.vocab)
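# Small illustration of the deterministic ordering above (count descending, then
# alphabetical) on made-up data:
from collections import Counter

counter = Counter('to be or not to be'.split())
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
vocab = dict(zip(words, range(len(words))))
# vocab == {'be': 0, 'to': 1, 'not': 2, 'or': 3}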
def log_profiling_stats():
logger.info('-----------------------------------------------------------')
logger.info('Series:')
for name, series in sorted(SERIES.items()):
logger.info(' {}: {}'.format(name, ' '.join(map(str, series))))
logger.info('-----------------------------------------------------------')
logger.info('Histograms:')
for name, histogram in sorted(HISTOGRAMS.items()):
logger.info('{: >10s} {}'.format('Count', name))
for value, count in sorted(histogram.items()):
logger.info('{: >10d} {}'.format(count, value))
logger.info('-----------------------------------------------------------')
logger.info('Counters:')
logger.info('{: >10s} {}'.format('Count', 'Counter'))
for name, count in sorted(COUNTERS.items()):
logger.info('{: >10d} {}'.format(count, name))
logger.info('-----------------------------------------------------------')
logger.info('Timers:')
times = [(t.elapsed, t.count, f) for (f, t) in TIMERS.items()]
times.sort(reverse=True, key=lambda x: x[0])
logger.info('{: >10} {: >10} {}'.format('Seconds', 'Calls', 'Function'))
for time, count, name in times:
logger.info('{: >10.3f} {: >10} {}'.format(time, count, name))
def _guess_cdr3_start(group):
"""
Return a guess for the CDR3 start within sequences in the given group
"""
return Counter(group.V_CDR3_start).most_common()[0][0]
def calc_rs_pos(self) -> Dict[str, float]:
"""Calculate the ratio of each pos of words in input text
Returns:
float: the ratio of each pos of words in input text
"""
pos = []
# TODO: This may take a long time when the number of sentences is large
for sentence in self.sentences:
juman_result = self.juman.analysis(sentence)
pos += [mrph.hinsi for mrph in juman_result.mrph_list()]
pos_counter = Counter(pos)
total = sum(pos_counter.values())
return {name: float(num) / total for name, num in pos_counter.items()}
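# The ratio computation in isolation (no Juman dependency), on a made-up tag list:
from collections import Counter

pos = ['noun', 'verb', 'noun', 'particle']
pos_counter = Counter(pos)
total = sum(pos_counter.values())
print({name: num / total for name, num in pos_counter.items()})
# {'noun': 0.5, 'verb': 0.25, 'particle': 0.25}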
def __init__(self, **kwargs):
Metric.__init__(self, **kwargs)
self.d = collections.Counter()