def keywords(self, num=5):
words_only = self.strip_tags(self.content, strip_punctuation=True)
words = words_only.split()
counter = collections.Counter(words)
common = counter.most_common()
keywords = []
INSIGNIFICANT_WORDS = ('should', 'which', 'therefore')
for word in common:
lower_word = word[0].lower()
if len(lower_word) > 4 and lower_word not in INSIGNIFICANT_WORDS:
keywords.append(lower_word)
if len(keywords) >= num:
break
return ", ".join(keywords)
Example source code using Python's Counter() class
def overlap_score(q1, q2):
"""
q1, q2 are preprocessed sentences (strings)
>>> overlap_score("a b", "a")
0.6666666666666666
"""
c1 = Counter(q1.split())
c2 = Counter(q2.split())
c1c2 = c1 + c2
both = set(c1.keys())
both = both.intersection(c2.keys())
bothscore = float(sum(c1c2[x] for x in both))
mplusn = float(sum(c1c2.values()))
score = bothscore / mplusn
return score
def overlap_score(q1, q2):
"""
>>> overlap_score("fun", "real fun")
0.6666666666666666
>>> overlap_score(" ", " ")
0
"""
q1count = Counter(q1.split())
q2count = Counter(q2.split())
both = set(q1count.keys())
both = both.intersection(q2count.keys())
combined = q1count + q2count
mplusn = float(sum(combined.values()))
overlap = float(sum(combined[x] for x in both))
try:
return overlap / mplusn
except ZeroDivisionError:
return 0
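A quick sanity check of the overlap metric, written as a minimal sketch that assumes the second overlap_score definition above is in scope (the Counter import, omitted in the excerpt, is included here):

from collections import Counter  # required by overlap_score above

# "fun" and "real fun" share the token "fun": overlap 2 out of 3 total tokens.
assert abs(overlap_score("fun", "real fun") - 2 / 3.0) < 1e-9
# Disjoint sentences share nothing, so the score is 0.0.
assert overlap_score("cats purr", "dogs bark") == 0.0
# Blank strings tokenize to nothing; the ZeroDivisionError branch returns 0.
assert overlap_score(" ", " ") == 0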
def vote(df, columns_name, value):
label_data = df.loc[df[columns_name] == value, 'label'].values
return Counter(label_data).most_common()[0][0]
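To see what vote computes, here is a minimal sketch with a made-up DataFrame (the column name 'city' is purely illustrative): among the rows where the given column equals the given value, it returns the most frequent label.

import pandas as pd
from collections import Counter  # required by vote above

df = pd.DataFrame({
    'city':  ['a', 'a', 'a', 'b'],
    'label': [1, 1, 0, 0],
})
print(vote(df, 'city', 'a'))  # 1, the majority label among rows with city == 'a'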
def update_xpos(self, force=False):
if self.must_update_xpos or force:
try:
# TODO: we should check the current mode instead. ============
sel = self.view.sel()[0]
pos = sel.b
if not sel.empty():
if sel.a < sel.b:
pos -= 1
# ============================================================
r = sublime.Region(self.view.line(pos).a, pos)
counter = Counter(self.view.substr(r))
tab_size = self.view.settings().get('tab_size')
xpos = (self.view.rowcol(pos)[1] +
((counter['\t'] * tab_size) - counter['\t']))
except Exception as e:
nvim.console_message(e)
_logger.exception('error setting xpos; default to 0')
self.xpos = 0
return
else:
self.xpos = xpos
def main(args):
if args.minimum_frequency is None:
minimum_frequency = max((len(args.tables) + 1) // 2, 2)
else:
minimum_frequency = args.minimum_frequency
logger.info('Minimum frequency set to %s', minimum_frequency)
# Read in tables
tables = []
for path in args.tables:
table = pd.read_csv(path, sep='\t')
table = table[table.database_diff >= args.minimum_db_diff]
table = table.dropna()
tables.append(table)
        if len(table) == 0:
            logger.warning(
                'Table read from %r is empty after filtering (rows with database diff below %s or missing values were dropped).',
                path, args.minimum_db_diff)
# Count V sequence occurrences
counter = Counter()
for table in tables:
counter.update(set(table.consensus))
# Find most frequent occurrences and print result
print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
for sequence, frequency in counter.most_common():
if frequency < minimum_frequency:
break
names = []
gene = None
for table in tables:
matching_rows = table[table.consensus == sequence]
if matching_rows.empty:
continue
names.extend(matching_rows.name)
if gene is None:
row = matching_rows.iloc[0]
gene = row.gene
database_diff = row.database_diff
#shm = row['V_SHM']
print(frequency, gene, database_diff, sequence, *names, sep='\t')
def main(args):
if args.minimum_frequency is None:
# args.table is a list of file names
minimum_frequency = max((len(args.table) + 1) // 2, 2)
else:
minimum_frequency = args.minimum_frequency
logger.info('Minimum frequency set to %s', minimum_frequency)
# Read in tables
tables = []
for path in args.table:
table = read_table(path)
table = table.loc[:,['V_gene', 'V_SHM', 'V_nt', 'name']]
tables.append(table)
# Count V sequence occurrences
counter = Counter()
for table in tables:
counter.update(set(table.V_nt))
# Find most frequent occurrences and print result
print('Frequency', 'Gene', '%SHM', 'Sequence', sep='\t')
for sequence, frequency in counter.most_common():
if frequency < minimum_frequency:
break
names = []
gene = None
for table in tables:
matching_rows = table[table.V_nt == sequence]
if matching_rows.empty:
continue
names.extend(matching_rows.name)
if gene is None:
row = matching_rows.iloc[0]
gene = row['V_gene']
shm = row['V_SHM']
print(frequency, gene, shm, sequence, *names, sep='\t')
Source: bag_of_features_transformer.py, from the xpandas project (author: alan-turing-institute)
def __init__(self, dictionary=None, **kwargs):
'''
        :param dictionary: custom dictionary to count against. If None, the dictionary is calculated from the dataset
'''
self.dictionary = dictionary
accepted_types = [
pd.Series, list, np.array, tuple
]
def bag_of_words_transform_function(corpus):
counter = Counter(corpus)
for el in self.dictionary:
if counter.get(el) is None:
counter[el] = 0
return counter
super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
columns=None,
transform_function=bag_of_words_transform_function)
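The inner bag_of_words_transform_function counts the tokens of a corpus and zero-fills every dictionary entry that never occurs. A standalone sketch of that behaviour, independent of the xpandas transformer class:

from collections import Counter

def bag_of_words(corpus, dictionary):
    counter = Counter(corpus)
    for el in dictionary:
        if counter.get(el) is None:
            counter[el] = 0  # ensure every known token has an entry
    return counter

print(bag_of_words(['a', 'b', 'a'], dictionary=['a', 'b', 'c']))
# Counter({'a': 2, 'b': 1, 'c': 0})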
def assertDifferentObjects(self, *objs):
id_counts = Counter(map(id, objs))
((most_common_id, count),) = id_counts.most_common(1)
if count > 1:
dupe = [o for o in objs if id(o) == most_common_id][0]
self.fail("%s appeared %d times in %s" % (dupe, count, objs))
def calc_n_types(self) -> int:
"""Calculate the number of types of input text
Returns:
int: the number of types of input text
"""
surfaces = []
for sentence in self.sentences:
juman_result = self.juman.analysis(sentence)
surfaces += [mrph.midasi for mrph in juman_result.mrph_list()]
word_type_counter = Counter(surfaces)
return len(word_type_counter)
def calc_rs_modality(self) -> Dict[str, float]:
modality_counter = Counter()
for i, s in enumerate(self.sentences):
chunks = []
for bnst in self.knp.parse(s).bnst_list():
chunk = Chunk(chunk_id=bnst.bnst_id,
link=bnst.parent,
description=bnst.fstring)
chunks.append(chunk)
s = "".join([chunk.description for chunk in chunks])
        ms = set(re.findall("<モダリティ-(.+?)>", s))  # KNP modality feature tags in the bunsetsu feature string
modality_counter += Counter(ms)
n = len(self.sentences)
return dict([(k, float(c) / n)
for k, c in modality_counter.items()])
def GetDuplicateColumnNames(
self, columns: sql_query_column_model.SQLColumnModel) -> [str]:
"""Find out if the query has duplicate column names and if a alias is
needed.
Args:
columns (sql_query_column_model.SQLColumnModel): all columns parsed
from the cursor
Returns:
[str]: a list of all the duplicate column names, if its empty it means it
is a distinct list of columns
"""
single_column_name_list = [column.sql_column for column in columns]
duplicate_list = [column for column, count in
collections.Counter(single_column_name_list).items() if
count > 1]
return sorted(duplicate_list)
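The core idiom, counting names and keeping those seen more than once, works on any list of strings; a minimal illustration outside the SQL context:

import collections

names = ['id', 'name', 'timestamp', 'name', 'id']
duplicates = sorted(
    name for name, count in collections.Counter(names).items() if count > 1)
print(duplicates)  # ['id', 'name']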
def _build_vocab(self, filename):
counts = Counter()
with tf.gfile.GFile(filename, "r") as f:
#for line in f:
# words = line.replace("\n"," ").split()
# counts += Counter(words)
while True:
chunk = f.read(int(500000000/2))
if not chunk:
break
counts += Counter(chunk.replace("\n", " ").split())
sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
self.word_to_id = {e[0]: (i+3) for (i, e) in enumerate(sorted_pairs)}
self.word_to_id[EOS] = IEOS
self.word_to_id[BOS] = IBOS
self.word_to_id[PAD] = IPAD
def print_params(self, cgs):
"""
    cgs : dict mapping computational graph names to computation graphs
"""
for name, cg in cgs.iteritems():
shapes = [param.get_value().shape for param in cg.parameters]
logger.info(
"Parameter shapes for computation graph[{}]".format(name))
for shape, count in Counter(shapes).most_common():
logger.info(' {:15}: {}'.format(shape, count))
logger.info(
"Total number of parameters for computation graph[{}]: {}"
.format(name, len(shapes)))
logger.info(
"Parameter names for computation graph[{}]: ".format(name))
for item in cg.parameters:
logger.info(
" {:15}: {}".format(item.get_value().shape, item.name))
logger.info(
"Total number of parameters for computation graph[{}]: {}"
.format(name, len(cg.parameters)))
def get_manuscript_stats(text, citation_df):
"""
Compute manuscript statistics.
"""
stats = collections.OrderedDict()
# Number of distinct references by type
ref_counts = (
citation_df
.standard_citation
.drop_duplicates()
.map(lambda x: x.split(':')[0])
.pipe(collections.Counter)
)
ref_counts['total'] = sum(ref_counts.values())
stats['reference_counts'] = ref_counts
stats['word_count'] = len(text.split())
logging.info(f"Generated manscript stats:\n{json.dumps(stats, indent=2)}")
return stats
def subset_glyphs(self, s):
table = self.table.Baseline
if table.Format in (1, 3):
baselines = {glyph: table.BaselineValues.get(glyph, table.DefaultBaseline)
for glyph in s.glyphs}
if len(baselines) > 0:
mostCommon, _cnt = Counter(baselines.values()).most_common(1)[0]
table.DefaultBaseline = mostCommon
baselines = {glyph: b for glyph, b in baselines.items()
if b != mostCommon}
if len(baselines) > 0:
table.BaselineValues = baselines
else:
table.Format = {1: 0, 3: 2}[table.Format]
del table.BaselineValues
return True
def subset_glyphs(self, s):
prop = self.table.GlyphProperties
if prop.Format == 0:
return prop.DefaultProperties != 0
elif prop.Format == 1:
prop.Properties = {g: prop.Properties.get(g, prop.DefaultProperties)
for g in s.glyphs}
mostCommon, _cnt = Counter(prop.Properties.values()).most_common(1)[0]
prop.DefaultProperties = mostCommon
prop.Properties = {g: prop for g, prop in prop.Properties.items()
if prop != mostCommon}
if len(prop.Properties) == 0:
del prop.Properties
prop.Format = 0
return prop.DefaultProperties != 0
return True
else:
assert False, "unknown 'prop' format %s" % prop.Format
def build_vocab(train_data, test_data):
counter = collections.Counter()
for stories, questions, answers in [train_data, test_data]:
for story in stories:
for sent in story:
for word in nltk.word_tokenize(sent):
counter[word.lower()] += 1
for question in questions:
for word in nltk.word_tokenize(question):
counter[word.lower()] += 1
for answer in answers:
for word in nltk.word_tokenize(answer):
counter[word.lower()] += 1
# no OOV here because there are not too many words in dataset
word2idx = {w:(i+1) for i, (w, _) in enumerate(counter.most_common())}
word2idx["PAD"] = 0
idx2word = {v:k for k, v in word2idx.items()}
return word2idx, idx2word
def kmer_freq ( ref_str, k ):
"""
Walk through sequence and return k-mer counts plus
a pseudocount of 1.
"""
ref_str = ref_str.upper()
kmers = []
for seq in product("ATGC",repeat=k):
kmers.append( "".join(seq) )
kmer_counts = Counter()
for j in range( len(ref_str)-(k-1) ):
motif = ref_str[j:j+k]
kmer_counts[motif] += 1
# Combine forward and reverse complement motifs into one count
combined_kmer = Counter()
for kmer in kmers:
kmer_rc = rev_comp_motif(kmer)
if not combined_kmer.get(kmer_rc):
combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1
return combined_kmer
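A small usage sketch for kmer_freq. The reverse-complement helper below is a stand-in for the module's rev_comp_motif, which is not part of this excerpt:

from collections import Counter
from itertools import product

def rev_comp_motif(kmer):
    # stand-in helper: reverse complement of an ACGT string
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return "".join(complement[base] for base in reversed(kmer))

counts = kmer_freq("AACCGG", 2)
# "GG" occurs once and its reverse complement "CC" occurs once; with the
# pseudocount of 1 the combined entry is 3, stored under a single orientation.
print(counts["GG"])  # 3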
def kmer_freq ( mode, ref_str, strand, opts ):
ref_str = ref_str.upper()
if strand==1:
ref_str = ref_str[::-1]
k = opts.comp_kmer
kmers = []
for seq in product("ATGC",repeat=k):
kmers.append( "".join(seq) )
kmer_counts = Counter()
for j in range( len(ref_str)-(k-1) ):
motif = ref_str[j:j+k]
kmer_counts[motif] += 1
# Combine forward and reverse complement motifs into one count
combined_kmer = Counter()
for kmer in kmers:
kmer_rc = motif_tools.rev_comp_motif(kmer)
if not combined_kmer.get(kmer_rc):
combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1
return combined_kmer
def get_class_weights2(y, smooth_factor=0):
"""
Returns the normalized weights for each class based on the frequencies of the samples
:param smooth_factor: factor that smooths extremely uneven weights
:param y: list of true labels (the labels must be hashable)
:return: dictionary with the weight for each class
"""
counter = Counter(y)
if smooth_factor > 0:
p = max(counter.values()) * smooth_factor
for k in counter.keys():
counter[k] += p
majority = max(counter.values())
    return {cls: majority / float(count) for cls, count in counter.items()}  # true division even on Python 2
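As a quick check of the function above (made-up labels): with no smoothing, each weight is the majority count divided by that class's count.

from collections import Counter

y = ['a'] * 8 + ['b'] * 2
print(get_class_weights2(y))  # {'a': 1.0, 'b': 4.0}
print(get_class_weights2(y, smooth_factor=0.1))
# smoothing adds max_count * 0.1 = 0.8 to every class, so 'b' becomes 8.8 / 2.8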
def path_clean(path):
rel_ents = path.split(' -> ')
relations = []
entities = []
for idx, item in enumerate(rel_ents):
if idx%2 == 0:
relations.append(item)
else:
entities.append(item)
entity_stats = Counter(entities).items()
duplicate_ents = [item for item in entity_stats if item[1]!=1]
duplicate_ents.sort(key = lambda x:x[1], reverse=True)
for item in duplicate_ents:
ent = item[0]
ent_idx = [i for i, x in enumerate(rel_ents) if x == ent]
if len(ent_idx)!=0:
min_idx = min(ent_idx)
max_idx = max(ent_idx)
if min_idx!=max_idx:
rel_ents = rel_ents[:min_idx] + rel_ents[max_idx:]
return ' -> '.join(rel_ents)
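path_clean collapses the cycle between the first and last occurrence of the most-repeated entity. A worked, made-up example (relations in the even positions, entities in the odd ones, as in the code above; Counter must be imported for path_clean to run):

path = "r1 -> e1 -> r2 -> e2 -> r3 -> e1 -> r4 -> e3"
print(path_clean(path))
# 'r1 -> e1 -> r4 -> e3'  -- the loop e1 -> r2 -> e2 -> r3 -> e1 is removed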
def main(cli_args):
if len(config["targets"]) == 0:
exit("No target found; maybe you need to specify a Dactyl config file?")
issues = check_all_pages(target=cli_args.target)
if issues:
num_issues = sum(len(p[1]) for p in issues)
print("Found %d issues:" % num_issues)
for pagename,issuelist in issues:
print("Page: %s" % pagename)
c = collections.Counter(issuelist)
for i, count_i in c.items():
if i[0]=="Unplain Phrase":
print(" Discouraged phrase: %s (%d instances); suggest '%s' instead." %
( i[1], count_i, config["disallowed_phrases"][i[1].lower()] ))
elif i[0]=="Unplain Word":
print(" Discouraged word: %s (%d instances); suggest '%s' instead." %
( i[1], count_i, config["disallowed_words"][i[1].lower()] ))
else:
print(" %s: %s (%d instances)" % (i[0], i[1], count_i))
exit(1)
else:
print("Style check passed with flying colors!")
exit(0)
def get_nb_caption_per_img(n, selected_captions):
"""
    Get image ids from audio caption file names that were selected by their speakers,
    keeping only images that have exactly n captions.

    Parameters
    ----------
    n : int,
        desired number of captions per image
    selected_captions : list of string,
        list of caption file names selected by their speakers
"""
counter_nb_caption=Counter()
for cap in selected_captions:
#get image id
        ImgID = cap.split('_')[0]
# add a count
counter_nb_caption[ImgID]+=1
#choose img_id that have a count of n
d=dict((k, v) for k, v in counter_nb_caption.items() if v == n)
ImgID_selected=d.keys()
return(ImgID_selected)
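A minimal sketch of get_nb_caption_per_img with made-up caption file names, where the image id is the prefix before the first underscore:

from collections import Counter  # required by get_nb_caption_per_img above

selected = ['1007_0.wav', '1007_1.wav', '2040_0.wav']  # made-up file names
print(sorted(get_nb_caption_per_img(2, selected)))  # ['1007']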
def _f1_score(pred, answers):
"""Compute the F1 score."""
def _score(g_tokens, a_tokens):
common = Counter(g_tokens) & Counter(a_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1. * num_same / len(g_tokens)
recall = 1. * num_same / len(a_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
if pred is None or answers is None:
return 0
g_tokens = _normalize_answer(pred).split()
scores = [_score(g_tokens, _normalize_answer(a).split()) for a in answers]
return max(scores)
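The key trick above is Counter(g_tokens) & Counter(a_tokens): the multiset intersection that keeps the per-token minimum counts. A self-contained sketch of the same token-level F1, skipping _normalize_answer (which is not shown in this excerpt):

from collections import Counter

def token_f1(pred_tokens, gold_tokens):
    common = Counter(pred_tokens) & Counter(gold_tokens)  # per-token minimum counts
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

print(token_f1("the cat sat".split(), "a cat sat down".split()))  # 4/7 ~= 0.571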
def test2():
patient_data_paths = utils_lung.get_patient_data_paths(pathfinder.DATA_PATH)
print len(patient_data_paths)
pixel_spacings_xy = []
n_slices = []
for k, p in enumerate(patient_data_paths):
pid = utils_lung.extract_pid_dir(p)
sid2data, sid2metadata = utils_lung.get_patient_data(p)
mtd = sid2metadata.itervalues().next()
assert mtd['PixelSpacing'][0] == mtd['PixelSpacing'][1]
pixel_spacings_xy.append(mtd['PixelSpacing'][0])
n_slices.append(len(sid2metadata))
print pid, pixel_spacings_xy[-1], n_slices[-1]
print 'nslices', np.max(n_slices), np.min(n_slices), np.mean(n_slices)
counts = collections.Counter(pixel_spacings_xy)
new_list = sorted(pixel_spacings_xy, key=counts.get, reverse=True)
print 'spacing', new_list
def retrieval_perlabel(X_train, Y_train, X_test, Y_test, fractions=[0.01, 0.5, 1.0]):
X_train = unitmatrix(X_train) # normalize
X_test = unitmatrix(X_test)
score = X_test.dot(X_train.T)
precisions = defaultdict(dict)
label_counter = Counter(Y_test.tolist())
for idx in range(len(X_test)):
retrieval_idx = score[idx].argsort()[::-1]
for fr in fractions:
ntop = int(fr * len(X_train))
pr = float(len([i for i in retrieval_idx[:ntop] if Y_train[i] == Y_test[idx]])) / ntop
            try:
                precisions[fr][Y_test[idx]] += pr
            except KeyError:
                precisions[fr][Y_test[idx]] = pr
new_pr = {}
for fr, val in precisions.iteritems():
avg_pr = 0.
for label, pr in val.iteritems():
avg_pr += pr / label_counter[label]
new_pr[fr] = avg_pr / len(label_counter)
return sorted(new_pr.items(), key=lambda d:d[0])
def cross_sentence(event_lemma_dict):
"""
function to create all possible pairs between event mentions in a file
:param event_lemma_dict: dictionary of event lemmas in file
:return: counter dictionary of event pairs in a file
"""
full_event_file = []
pairs_circumstantial_corpus = Counter([])
for k, v in event_lemma_dict.items():
full_event_file.append(k)
event_pairs_full = list(product(full_event_file, repeat=2))
for i in event_pairs_full:
pairs_circumstantial_corpus.update([i])
return pairs_circumstantial_corpus
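A worked example of cross_sentence with a made-up lemma-to-frequency dict (the Counter and product imports, omitted in the excerpt, are added here):

from collections import Counter
from itertools import product

event_lemma_dict = {'say': 3, 'run': 1}  # made-up event lemmas and frequencies
pairs = cross_sentence(event_lemma_dict)
print(pairs)
# Counter({('say', 'say'): 1, ('say', 'run'): 1, ('run', 'say'): 1, ('run', 'run'): 1})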
def print_grouping(attributes, grouping, top):
"""
Print computed groups.
    :param attributes: list of grouped attributes
    :type attributes: list(str)
    :param grouping: counter for each combination of attributes' values
    :type grouping: Counter
    :param top: number of most common combinations to print
    :type top: int
"""
total = sum(grouping.values())
table = Table(attributes + ['count', '%'])
table.add_rows(total, grouping.most_common(top))
print '\n' + table.by_count()
print 'Total:', total
def __init__(self):
self.handlers = {
0x001: self._power,
0x186: self._text,
0x185: self._textparam,
0x061: self._exttemp,
0x005: self._tpms,
#0x18e: self._textparam,
0x026: self._fuel,
0x053: self._gpsdate,
0x055: self._gps,
}
self.counter = Counter()
self.locations = []
self.fuel = [0,0]