def norm_post_sim(modes, cov_matrix):
    # Draw samples from a multivariate normal approximation of the posterior,
    # centred on the posterior modes.
    post = multivariate_normal(modes, cov_matrix)
    nsims = 30000
    phi = np.zeros([nsims, len(modes)])
    for i in range(0, nsims):
        phi[i] = post.rvs()
    # Stack the simulated draws into one chain per parameter.
    chain = np.array([phi[i][0] for i in range(len(phi))])
    for m in range(1, len(modes)):
        chain = np.vstack((chain, [phi[i][m] for i in range(len(phi))]))
    # Point estimates plus 5th/95th percentile bounds for each parameter.
    mean_est = [np.mean(np.array([phi[i][j] for i in range(len(phi))])) for j in range(len(modes))]
    median_est = [np.median(np.array([phi[i][j] for i in range(len(phi))])) for j in range(len(modes))]
    upper_95_est = [np.percentile(np.array([phi[i][j] for i in range(len(phi))]), 95) for j in range(len(modes))]
    lower_95_est = [np.percentile(np.array([phi[i][j] for i in range(len(phi))]), 5) for j in range(len(modes))]
    return chain, mean_est, median_est, upper_95_est, lower_95_est
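A minimal usage sketch (the modes and covariance below are made-up values; it assumes numpy and scipy.stats.multivariate_normal are imported, as the function requires):

import numpy as np
from scipy.stats import multivariate_normal

modes = np.array([0.5, -1.2])        # hypothetical posterior modes
cov_matrix = np.diag([0.04, 0.09])   # hypothetical posterior covariance
chain, mean_est, median_est, upper_95, lower_95 = norm_post_sim(modes, cov_matrix)
print(mean_est, lower_95, upper_95)  # point estimates with 5th/95th percentile bounds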
Python numpy.percentile() usage examples
Source file: calculate_aggregate_statistics.py, from the project tbp-next-basket by GiulioRossetti
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values)
    }
    return agg_measures
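interquartile_range_mean, mean_absolute_deviation and gini_coefficient are helpers defined elsewhere in that project; the percentile-based entries can be reproduced with numpy alone, as in this sketch on made-up sample values:

import numpy as np

values = np.random.default_rng(0).exponential(size=1000)  # made-up sample values
q10, q25, q50, q75, q90 = np.percentile(values, [10, 25, 50, 75, 90])
print({'10p': q10, '25p': q25, '50p': q50, '75p': q75, '90p': q90, 'iqr': q75 - q25})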
def get_pr(reference_frames, output_frames, mode='type', pr_resolution=100):
    # filter output by confidence
    confidence = collect_confidence(output_frames)
    conf_order = []
    step = 100 / float(pr_resolution)
    for j in range(1, pr_resolution + 1):
        conf_order.append(np.percentile(confidence, j * step))
    conf_order = [-1] + conf_order + [2]
    # get curve
    params = []
    for threshold in conf_order:
        params.append([reference_frames, output_frames, confidence, threshold, mode])
    all_tp, all_fp, all_fn, all_prec, all_rec = zip(*pool.map(single_point, params))
    all_prec = list(all_prec)  # + [0]
    all_rec = list(all_rec)  # + [1]
    all_rec, all_prec = zip(*sorted(zip(all_rec, all_prec)))
    AUC = metrics.auc(all_rec, all_prec)
    return all_rec, all_prec, AUC
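collect_confidence, single_point, pool and metrics are project-specific; the percentile-spaced threshold grid itself can be illustrated standalone:

import numpy as np

confidence = np.random.default_rng(1).uniform(size=500)  # stand-in confidence scores
pr_resolution = 100
step = 100 / float(pr_resolution)
thresholds = [np.percentile(confidence, j * step) for j in range(1, pr_resolution + 1)]
thresholds = [-1] + thresholds + [2]  # sentinels below and above the valid score range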
# create complete output for 1 mode --------------------------------------------
def results_stat():
    for file in os.listdir('./results'):
        if file.endswith('.txt'):
            file_path = os.path.join('./results', file)
            data = [[], [], [], []]
            with open(file_path, 'r') as f:
                lines = f.read().split('\n')
                for line in lines:
                    if len(line) > 0:
                        # list() so the mapped values can be indexed (Python 3).
                        line_data = list(map(eval, line.split()))
                        for i in range(4):
                            data[i].append(line_data[i])
            lines = [[], [], [], []]
            for i in range(4):
                lines[i].append(np.mean(data[i]))
                lines[i].append(np.std(data[i]))
                lines[i].append(np.min(data[i]))
                lines[i].append(np.percentile(data[i], 25))
                lines[i].append(np.percentile(data[i], 50))
                lines[i].append(np.percentile(data[i], 75))
                lines[i].append(np.max(data[i]))
            output_path = os.path.join('./stat', file)
            with open(output_path, 'w') as f:
                for line in lines:
                    f.write('\t'.join(map(str, line)) + '\n')
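The same seven-number summary (mean, std, min, quartiles, max) for a single column, as a standalone sketch on synthetic values:

import numpy as np

col = np.random.default_rng(2).normal(size=200)  # stand-in for one result column
row = [np.mean(col), np.std(col), np.min(col),
       np.percentile(col, 25), np.percentile(col, 50), np.percentile(col, 75),
       np.max(col)]
print('\t'.join(map(str, row)))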
def splitclusters(datapointwts, seeds, theta):
    std = {}
    seeds1 = []
    seedweight = []
    cluster, p2cluster = point2cluster(datapointwts, seeds, theta)
    for cl in cluster:
        mang = seeds[cl][-1]
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            clockwise = [xx for xx in cluster[cl] if greaterthanangle(xx[2], mang)]
            if std[cl] > 20 and len(clockwise) > 0 and len(clockwise) < len(cluster[cl]):
                seeds1.append(avgpoint(clockwise))
                seeds1.append(avgpoint([xx for xx in cluster[cl] if not greaterthanangle(xx[2], mang)]))
                seedweight.append(len(clockwise))
                seedweight.append(len(cluster[cl]) - len(clockwise))
            else:
                seeds1.append(seeds[cl])
                seedweight.append(len(cluster[cl]))
        else:
            seeds1.append(seeds[cl])
            seedweight.append(len(cluster[cl]))
    return seeds1, seedweight
def splitclustersparallel(datapointwts, seeds):
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}
    p2cluster = []
    gedges = {}
    gedges1 = {}
    nedges = {}
    std = {}
    seeds1 = []
    seedweight = []
    roadwidth = []
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []
        roadwidth.append(0)
    for ii, ll in enumerate(indices):
        # theta is expected to come from the enclosing module scope in the original project.
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1], theta) for xx in ll]
        cd = ll[dd.index(min(dd))]
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for cl in cluster:
        mang = seeds[cl][-1]
        scl = seeds[cl]
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            roadwidth[cl] = 1 + 5 * np.std([geodist(scl, xx) * np.sin(anglebetweentwopoints(scl, xx) - scl[-1]) for xx in cluster[cl]])
            print(cl, scl, [(anglebetweentwopoints(scl, xx), scl[-1]) for xx in cluster[cl]])
def _write_config(self, fasta_filename):
    """Write daligner sensitive config to fasta_filename.sensitive.config."""
    lens = [len(r.sequence) for r in ContigSetReaderWrapper(fasta_filename)]
    self.low_cDNA_size, self.high_cDNA_size = 0, 0
    if len(lens) == 1:
        self.low_cDNA_size, self.high_cDNA_size = lens[0], lens[0]
    if len(lens) >= 2:
        self.low_cDNA_size = int(np.percentile(lens, 10))
        self.high_cDNA_size = int(np.percentile(lens, 90))
    try:
        with open(fasta_filename + '.sensitive.config', 'w') as f:
            f.write("sensitive={s}\n".format(s=self.sensitive_mode))
            f.write("low={l}\n".format(l=self.low_cDNA_size))
            f.write("high={h}\n".format(h=self.high_cDNA_size))
    except IOError:
        pass  # it's OK not to have write permission
def summary(self):
    """Produce a summary table of the Mack Chainladder model.
    Returns:
        A summary DataFrame; the calculation is consistent with the R
        BootChainLadder$summary output.
    """
    IBNR = self.IBNR
    tri = self.tri
    summary = pd.DataFrame()
    summary['Latest'] = tri.get_latest_diagonal()
    summary['Mean Ultimate'] = summary['Latest'] + pd.Series([np.mean(np.array(IBNR)[:, num]) for num in range(len(tri.data))], index=summary.index)
    summary['Mean IBNR'] = summary['Mean Ultimate'] - summary['Latest']
    summary['SD IBNR'] = pd.Series([np.std(np.array(IBNR)[:, num]) for num in range(len(tri.data))], index=summary.index)
    summary['IBNR 75%'] = pd.Series([np.percentile(np.array(IBNR)[:, num], q=75) for num in range(len(tri.data))], index=summary.index)
    summary['IBNR 95%'] = pd.Series([np.percentile(np.array(IBNR)[:, num], q=95) for num in range(len(tri.data))], index=summary.index)
    return summary
def run_ltsd_example():
    fs, d = fetch_sample_speech_tapestry()
    winsize = 1024
    d = d.astype("float32") / 2 ** 15
    d -= d.mean()
    pad = 3 * fs
    noise_pwr = np.percentile(d, 1) ** 2
    noise_pwr = max(1E-9, noise_pwr)
    d = np.concatenate((np.zeros((pad,)) + noise_pwr * np.random.randn(pad), d))
    _, vad_segments = ltsd_vad(d, fs, winsize=winsize)
    v_up = np.where(vad_segments == True)[0]
    s = v_up[0]
    st = v_up[-1] + int(.5 * fs)
    d = d[s:st]
    bname = "tapestry.wav".split(".")[0]
    wavfile.write("%s_out.wav" % bname, fs, soundsc(d))
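fetch_sample_speech_tapestry, ltsd_vad and soundsc come from the surrounding module; the noise-floor trick (squaring a low percentile of the zero-mean signal and using it to scale the prepended noise) can be sketched with numpy alone:

import numpy as np

fs = 16000
d = np.sin(2 * np.pi * 440 * np.arange(fs) / fs).astype("float32")  # made-up 1 s tone
d -= d.mean()
pad = 3 * fs
noise_pwr = max(1E-9, np.percentile(d, 1) ** 2)
d_padded = np.concatenate((np.zeros((pad,)) + noise_pwr * np.random.randn(pad), d))
print(d_padded.shape)  # original samples preceded by 3 s of low-level noise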
def _compute_data_weights_topk(self, opts, density_ratios):
    """Put a uniform distribution on the K points with the largest probability of being real data.
    This is a naive heuristic which makes the next GAN concentrate on those
    points of the training set which were classified correctly with the
    largest margins, i.e. the points our current mixture model is not
    capable of generating convincingly.
    """
    threshold = np.percentile(density_ratios,
                              opts["topk_constant"] * 100.0)
    # Note that largest prob_real_data corresponds to smallest density ratios.
    mask = density_ratios <= threshold
    data_weights = np.zeros(self._data_num)
    data_weights[mask] = 1.0 / np.sum(mask)
    return data_weights
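A self-contained sketch of the same weighting on random stand-in density ratios (topk_constant is a hypothetical value for opts["topk_constant"]):

import numpy as np

density_ratios = np.random.default_rng(3).uniform(size=1000)  # stand-in discriminator ratios
topk_constant = 0.1                                           # hypothetical opts["topk_constant"]
threshold = np.percentile(density_ratios, topk_constant * 100.0)
mask = density_ratios <= threshold
data_weights = np.zeros(len(density_ratios))
data_weights[mask] = 1.0 / np.sum(mask)
print(mask.sum(), data_weights.sum())  # ~100 selected points, weights summing to 1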
def quantile_binning(data=None, bins=10, qrange=(0.0, 1.0), **kwargs):
    """Binning schema based on quantile ranges.
    This binning finds equally spaced quantiles, which should lead to
    all bins having roughly the same frequencies.
    Note: weights are not (yet) taken into account when calculating
    quantiles.
    Parameters
    ----------
    bins: sequence or Optional[int]
        Number of bins
    qrange: Optional[tuple]
        Two floats as minimum and maximum quantile (default: 0.0, 1.0)
    Returns
    -------
    StaticBinning
    """
    if np.isscalar(bins):
        bins = np.linspace(qrange[0] * 100, qrange[1] * 100, bins + 1)
    bins = np.percentile(data, bins)
    return static_binning(bins=make_bin_array(bins), includes_right_edge=True)
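make_bin_array and static_binning are physt internals; the percentile-based edge construction itself is plain numpy and can be checked directly:

import numpy as np

data = np.random.default_rng(4).normal(size=10000)
bins, qrange = 10, (0.0, 1.0)
edges = np.percentile(data, np.linspace(qrange[0] * 100, qrange[1] * 100, bins + 1))
counts, _ = np.histogram(data, bins=edges)
print(counts)  # roughly 1000 observations per bin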
def estimate_bayes_factor(traces, logp, r=0.05):
    """
    Estimate odds ratios in a random subsample of the MCMC chains,
    following AstroML (see Eqn 5.127, pg 237).
    """
    ndim, nsteps = traces.shape  # [ndim, number of steps in chain]
    # compute volume of an n-dimensional (ndim) sphere of radius r
    Vr = np.pi ** (0.5 * ndim) / gamma(0.5 * ndim + 1) * (r ** ndim)
    # use neighbor count within r as a density estimator
    bt = BallTree(traces.T)
    count = bt.query_radius(traces.T, r=r, count_only=True)
    # BF = N*p/rho
    bf = logp + np.log(nsteps) + np.log(Vr) - np.log(count)  # log10(bf)
    p25, p50, p75 = np.percentile(bf, [25, 50, 75])
    return p50, 0.7413 * (p75 - p25)
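The returned spread uses the normal-consistent IQR scaling (0.7413 ≈ 1/1.349), which recovers the standard deviation for Gaussian samples; a quick numpy check on synthetic log-Bayes-factor values:

import numpy as np

bf = np.random.default_rng(5).normal(loc=3.0, scale=0.5, size=5000)  # stand-in log-BF samples
p25, p50, p75 = np.percentile(bf, [25, 50, 75])
print(p50, 0.7413 * (p75 - p25))  # close to (3.0, 0.5)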
########################################################################
def reducer(self, key, values):
    if key == 'latencies':
        values = list(values)
        yield 'average_latency', numpy.average(values)
        yield 'median_latency', numpy.median(values)
        yield '95th_percentile_latency', numpy.percentile(values, 95)
    elif key == 'sent':
        first, last, count = minmax(values)
        yield 'count', count
        yield 'sending_first', first
        yield 'sending_last', last
        yield 'sending_overhead', (last - first) / (count - 1)
        yield 'sending_throughput', (count - 1) / (last - first)
    elif key == 'received':
        first, last, count = minmax(values)
        yield 'receiving_first', first
        yield 'receiving_last', last
        yield 'receiving_overhead', (last - first) / (count - 1)
        yield 'receiving_throughput', (count - 1) / (last - first)
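The latency branch on its own, run against synthetic timings outside the MapReduce job (mirroring the reducer's numpy calls):

import numpy

latencies = numpy.random.default_rng(6).gamma(shape=2.0, scale=5.0, size=10000)  # made-up ms values
print('average_latency', numpy.average(latencies))
print('median_latency', numpy.median(latencies))
print('95th_percentile_latency', numpy.percentile(latencies, 95))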
def calculate_batchsize_maxlen(texts):
    """ Calculates the maximum length in the provided texts and a suitable
        batch size. Rounds up maxlen to the nearest multiple of ten.
    # Arguments:
        texts: List of inputs.
    # Returns:
        Batch size,
        max length
    """
    def roundup(x):
        return int(math.ceil(x / 10.0)) * 10

    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    lengths = [len(tokenize(t)) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))
    batch_size = 250 if maxlen <= 100 else 50
    return batch_size, maxlen
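tokenize is the project's own tokenizer; with a plain whitespace split as a stand-in, the same 80th-percentile rule looks like this:

import math
import numpy as np

texts = ["a short example", "a somewhat longer example sentence for the batch", "ok then"]
lengths = [len(t.split()) for t in texts]  # whitespace split as a stand-in tokenizer
maxlen = int(math.ceil(np.percentile(lengths, 80.0) / 10.0)) * 10
batch_size = 250 if maxlen <= 100 else 50
print(batch_size, maxlen)  # (250, 10) for these toy inputs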
def ss_octile(y):
    """Obtain the octile summary statistic.
    The statistic reaches optimal performance only for a large number of
    observations. According to Allingham et al. (2009), it is more stable than ss_robust.
    Parameters
    ----------
    y : array_like
        Yielded points.
    Returns
    -------
    array_like of the shape (batch_size, dim_ss=8, dim_ss_point)
    """
    octiles = np.linspace(12.5, 87.5, 7)
    E1, E2, E3, E4, E5, E6, E7 = np.percentile(y, octiles, axis=1)
    # Combining the summary statistics.
    ss_octile = np.hstack((E1, E2, E3, E4, E5, E6, E7))
    ss_octile = ss_octile[:, :, np.newaxis]
    return ss_octile
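The indexing suggests y is expected to have shape (batch_size, n_obs, 1), so that np.percentile over axis 1 yields one set of octiles per batch item; a hedged usage sketch on random data:

import numpy as np

y = np.random.default_rng(7).normal(size=(4, 500, 1))  # (batch_size, n_obs, dim_point)
s = ss_octile(y)
print(s.shape)  # (4, 7, 1): seven octiles per batch item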
def flag_outliers(series, iqr_multiplier=1.5):
    """Use Tukey's boxplot criterion for outlier identification."""
    # Series.get_values() comes from older pandas; .to_numpy() is the modern equivalent.
    top_quartile_cutoff = np.percentile(series.get_values(), 75)
    bottom_quartile_cutoff = np.percentile(series.get_values(), 25)
    # Compute interquartile range
    iqr = top_quartile_cutoff - bottom_quartile_cutoff
    top_outlier_cutoff = top_quartile_cutoff + iqr * iqr_multiplier
    bottom_outlier_cutoff = bottom_quartile_cutoff - iqr * iqr_multiplier
    return series[(series < bottom_outlier_cutoff) | (series > top_outlier_cutoff)]
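A quick check of the same Tukey fences on synthetic data, computed directly with numpy (avoiding the legacy get_values() call):

import numpy as np
import pandas as pd

series = pd.Series(np.concatenate([np.random.default_rng(8).normal(size=200), [8.0, -7.5]]))
q75, q25 = np.percentile(series.to_numpy(), [75, 25])
iqr = q75 - q25
print(series[(series < q25 - 1.5 * iqr) | (series > q75 + 1.5 * iqr)])  # flags the two injected extremes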
def truncate_and_scale(data, percMin=0.01, percMax=99.9, zeroTo=1.0):
    """Truncate and scale the data as a preprocessing step.
    Parameters
    ----------
    data : nd numpy array
        Data/image to be truncated and scaled.
    percMin : float, positive
        Minimum percentile to be truncated.
    percMax : float, positive
        Maximum percentile to be truncated.
    zeroTo : float
        Data will be returned in the range from 0 to this number.
    Returns
    -------
    data : nd numpy array
        Truncated and scaled data/image.
    """
    # adjust minimum
    percDataMin = np.percentile(data, percMin)
    data[np.where(data < percDataMin)] = percDataMin
    data = data - data.min()
    # adjust maximum
    percDataMax = np.percentile(data, percMax)
    data[np.where(data > percDataMax)] = percDataMax
    data = 1. / data.max() * data
    return data * zeroTo
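A usage sketch on random image-like data; the function clips in place, so a copy is passed:

import numpy as np

data = np.random.default_rng(9).normal(loc=100.0, scale=15.0, size=(64, 64))
scaled = truncate_and_scale(data.copy(), percMin=1, percMax=99, zeroTo=255.0)
print(scaled.min(), scaled.max())  # approximately 0 and 255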
def preprocess_bitmap(bitmap):
    # contrast stretching
    p2, p98 = np.percentile(bitmap, (2, 98))
    assert abs(p2 - p98) > 10
    bitmap = skimage.exposure.rescale_intensity(bitmap, in_range=(p2, p98))
    # from skimage.filters import threshold_otsu
    # thresh = threshold_otsu(bitmap)
    # bitmap = bitmap > thresh
    return bitmap
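A usage sketch, assuming scikit-image is installed and imported as skimage.exposure (as the snippet above requires); the input bitmap is random stand-in data:

import numpy as np
import skimage.exposure

bitmap = np.random.default_rng(10).integers(30, 220, size=(64, 64)).astype(np.uint8)
stretched = preprocess_bitmap(bitmap)
print(bitmap.min(), bitmap.max(), stretched.min(), stretched.max())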
def plot_tsne_totalcounts(chart, sample_properties, sample_data):
    """ Plot cells colored by total counts """
    analysis = sample_data.analysis
    if not analysis or len(sample_properties['genomes']) > 1:
        return None
    reads_per_bc = analysis.matrix.get_reads_per_bc()
    vmin, vmax = np.percentile(reads_per_bc, ws_gex_constants.TSNE_TOTALCOUNTS_PRCT_CLIP)
    return plot_dimensions_color(chart, analysis.get_tsne().transformed_tsne_matrix,
                                 reads_per_bc,
                                 ws_gex_constants.TSNE_TOTALCOUNTS_DESCRIPTION,
                                 vmin, vmax,
                                 1, 2)