def traffic_districution(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    df = self.load_trafficdf(data_dir)
    print(df['traffic'].describe())
    # sns.distplot(self.gapdf['gap'], kde=False, bins=100)
    df['traffic'].plot(kind='hist', bins=100)
    plt.xlabel('Traffic')
    plt.title('Histogram of Traffic')
    return
# def disp_gap_bydistrict(self, disp_ids=np.arange(34, 67, 1), cls1='start_district_id', cls2='time_id'):
#     # disp_ids = np.arange(1, 34, 1)
#     plt.figure()
#     by_district = self.gapdf.groupby(cls1)
#     size = len(disp_ids)
#     # size = len(by_district)
#     col_len = row_len = math.ceil(math.sqrt(size))
#     count = 1
#     for name, group in by_district:
#         if not name in disp_ids:
#             continue
#         plt.subplot(row_len, col_len, count)
#         group.groupby(cls2)['gap'].mean().plot()
#         count += 1
#     return
def disp_gap_bydate(self):
    gaps_mean = self.gapdf.groupby('time_date')['gap'].mean()
    gaps_mean.plot(kind='bar')
    plt.ylabel('Mean of gap')
    plt.title('Date/Gap Correlation')
    # for i in gaps_mean.index:
    #     plt.plot([i, i], [0, gaps_mean[i]], 'k-')
    plt.show()
    return
# def drawGapDistribution(self):
#     self.gapdf[self.gapdf['gapdf'] < 10]['gapdf'].hist(bins=50)
#     # sns.distplot(self.gapdf['gapdf'])
#     # sns.distplot(self.gapdf['gapdf'], hist=True, kde=False, rug=False)
#     # plt.hist(self.gapdf['gapdf'])
#     plt.show()
#     return
# def drawGapCorrelation(self):
#     _, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
#     res = self.gapdf.groupby('start_district_id')['gapdf'].sum()
#     ax1.bar(res.index, res.values)
#     res = self.gapdf.groupby('time_slotid')['gapdf'].sum()
#     ax2.bar(res.index.map(lambda x: x[11:]), res.values)
#     plt.show()
#     return
def __init__(self, path, games, logger, suffix):
    super(QuestionVsDialogue, self).__init__(path, self.__class__.__name__, suffix)
    q_by_d = []
    for game in games:
        q_by_d.append(len(game.questions))
    sns.set_style("whitegrid", {"axes.grid": False})
    # ratio questions/dialogues
    f = sns.distplot(q_by_d, norm_hist=True, kde=False, bins=np.arange(0.5, 25.5, 1))
    f.set_xlim(0.5, 25.5)
    f.set_ylim(bottom=0)
    f.set_xlabel("Number of questions", {'size': '14'})
    f.set_ylabel("Ratio of dialogues", {'size': '14'})
def __init__(self, path, games, logger, suffix):
    super(WordVsQuestion, self).__init__(path, self.__class__.__name__, suffix)
    w_by_q = []
    for game in games:
        for q in game.questions:
            q = re.sub('[?]', '', q)
            words = re.findall(r'\w+', q)
            w_by_q.append(len(words))
    sns.set_style("whitegrid", {"axes.grid": False})
    # ratio words/question
    f = sns.distplot(w_by_q, norm_hist=True, kde=False, bins=np.arange(2.5, 15.5, 1), color="g")
    f.set_xlabel("Number of words", {'size': '14'})
    f.set_ylabel("Ratio of questions", {'size': '14'})
    f.set_xlim(2.5, 14.5)
    f.set_ylim(bottom=0)
def overlap_visualize():
    train, test, dev = load("nlpcc", filter=True)
    test = test.reindex(np.random.permutation(test.index))
    df = test
    df['qlen'] = df['question'].str.len()
    df['alen'] = df['answer'].str.len()
    df['q_n_words'] = df['question'].apply(lambda row: len(row.split(' ')))
    df['a_n_words'] = df['answer'].apply(lambda row: len(row.split(' ')))

    def normalized_word_share(row):
        w1 = set(map(lambda word: word.lower().strip(), row['question'].split(" ")))
        w2 = set(map(lambda word: word.lower().strip(), row['answer'].split(" ")))
        return 1.0 * len(w1 & w2) / (len(w1) + len(w2))

    df['word_share'] = df.apply(normalized_word_share, axis=1)
    plt.figure(figsize=(12, 8))
    plt.subplot(1, 2, 1)
    sns.violinplot(x='flag', y='word_share', data=df[0:50000])
    plt.subplot(1, 2, 2)
    sns.distplot(df[df['flag'] == 1.0]['word_share'][0:10000], color='green')
    sns.distplot(df[df['flag'] == 0.0]['word_share'][0:10000], color='red')
    print(evaluation.evaluationBypandas(test, df['word_share']))
    plt.show()
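The word_share feature above is the size of the question/answer word overlap divided by the combined set sizes, so it always falls between 0 and 0.5. A minimal worked instance on a made-up row (the column names follow the snippet above; the sentences are invented):

# Hypothetical illustration of the word_share computation used above.
row = {'question': 'what is the capital of france',
       'answer': 'the capital of france is paris'}
w1 = set(w.lower().strip() for w in row['question'].split(' '))
w2 = set(w.lower().strip() for w in row['answer'].split(' '))
share = 1.0 * len(w1 & w2) / (len(w1) + len(w2))
print(share)  # 5 shared words / (6 + 6) words ≈ 0.42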
def follower_botness(username):
    """Given a username, create a histogram of the botness of the followers
    and save it in plots (for now). Also return the probable percentage of
    follower bots (cutoff needs to be defined, for now it is 0.7)."""
    cutoff = 0.7
    scorelist = []
    followers = db.getFollowers(toName=username)
    for f in followers:
        follower = f['_from'].split('/')[1]
        score = db.getUser(follower)['botness']['score']
        scorelist.append(score)
    if scorelist:
        scores = pd.Series(scorelist, name='probability of follower bot')
        ax = sns.distplot(scores)
        fig = ax.get_figure()
        fig.savefig('testfig.png')
        botpercent = sum(np.array(scorelist) > cutoff) / len(scorelist)
        return botpercent
    else:
        return None
def plot_word_frequency(df, words, category):
    plt.figure()
    allwords = df['lines'].str.cat(sep=' ')
    allwords = allwords.split()
    allwords = [word.lower() for word in allwords]
    allwords = np.array(allwords)
    indices = []
    for word in words:
        new_indices = np.where(allwords == word)
        for index in new_indices[0]:
            indices.append(index)
    sns.distplot(indices, rug=True, hist=False)
    plt.xlim(0, len(allwords))
    plt.ylabel("{category} word frequency".format(category=category))
    plt.xlabel("Time (words)")
    plt.title("{category} words over time".format(category=category), loc='left')
    plt.savefig("../plots/{category}_frequency".format(category=category))
def _barplot(self, first: RunData, second: RunData, property: str, size: int,
             filename: str = None, show_ticks: bool = True) -> str:
    import matplotlib.pyplot as plt
    import seaborn as sns
    filename = filename or self._get_new_figure_filename()
    self._set_fig_size(size)
    length = min(len(first[property]), len(second[property]))
    first_prop = first[property][0:length]
    second_prop = second[property][0:length]
    min_xval = min(first_prop + second_prop)
    max_xval = max(first_prop + second_prop)
    bins = np.linspace(min_xval, max_xval, math.floor(math.sqrt(length) * size))
    sns.distplot(first_prop, bins=bins, label=first.description(), kde=False)
    sns.distplot(second_prop, bins=bins, label=second.description(), kde=False)
    if not show_ticks:
        plt.xticks([])
        plt.yticks([])
    plt.xlim(min_xval, max_xval)
    plt.legend()
    plt.savefig(filename)
    plt.close()
    return filename
def plotHistogram(series, x_label, scale="linear", normed=False, name=None):
    figure_name = "histogram"
    if name:
        figure_name = name + "_" + figure_name
    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.distplot(series, kde=False, norm_hist=normed, ax=axis)
    axis.set_yscale(scale)
    axis.set_xlabel(x_label)
    # axis.set_ylabel(y_label)
    data.saveFigure(figure, figure_name)
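A minimal sketch of calling plotHistogram on made-up samples. It assumes the surrounding module's names (pyplot, seaborn, and the project's own data.saveFigure helper) are importable; the sample data and labels are invented:

# Hypothetical usage of plotHistogram with simulated data.
import numpy as np

samples = np.random.lognormal(mean=0.0, sigma=1.0, size=1000)
plotHistogram(samples, x_label="value", scale="log", normed=True, name="lognormal")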
def plot_dist(
        main_file, mask_file, xlabel, distribution=None, xlabel2=None,
        figsize=DINA4_LANDSCAPE):
    data = _get_values_inside_a_mask(main_file, mask_file)

    fig = plt.Figure(figsize=figsize)
    FigureCanvas(fig)
    gsp = GridSpec(2, 1)

    ax = fig.add_subplot(gsp[0, 0])
    sns.distplot(data.astype(np.double), kde=False, bins=100, ax=ax)
    ax.set_xlabel(xlabel)

    ax = fig.add_subplot(gsp[1, 0])
    sns.distplot(np.array(distribution).astype(np.double), ax=ax)
    cur_val = np.median(data)
    label = "{0:g}".format(cur_val)
    plot_vline(cur_val, label, ax=ax)
    ax.set_xlabel(xlabel2)
    return fig
def cross_section_hist(data, factor_name, date):
    '''Plot the cross-sectional histogram of a factor on a given date.

    Parameters
    --------------------------------
    data: DataFrame(index: [Date, IDs], factor1, factor2, ...)
    factor_name: str
    date: str
    '''
    plot_data = data.loc[(date,), factor_name].reset_index(drop=True)
    ax = sns.distplot(plot_data)
    return ax
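A minimal sketch of the (Date, IDs) MultiIndex layout the docstring describes, with made-up factor values and identifiers:

# Hypothetical usage of cross_section_hist on a toy two-level frame.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([['2017-01-03', '2017-01-04'],
                                  ['%06d' % i for i in range(20)]],
                                 names=['Date', 'IDs'])
factors = pd.DataFrame({'momentum': np.random.randn(40),
                        'value': np.random.randn(40)}, index=idx)
ax = cross_section_hist(factors, 'momentum', '2017-01-03')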
# Quantile-Quantile plot
def plot_predict_proba(y_pred_probs, clf, pdf=None):
    """Plots the predict_proba distribution."""
    fig, ax = plt.subplots(1, figsize=(18, 8))
    sns.set_style("white")
    sns.set_context("poster",
                    font_scale=2.25,
                    rc={"lines.linewidth": 1.25, "lines.markersize": 8})
    sns.distplot(y_pred_probs)
    plt.xlabel('predict_proba')
    plt.ylabel('frequency')
    plt.title(clf + ' proba')
    if pdf:
        pdf.savefig()
        plt.close()
    else:
        plt.show()
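A minimal sketch of calling plot_predict_proba on simulated probabilities; the classifier name string and the beta-distributed stand-in scores are made up:

# Hypothetical usage of plot_predict_proba with simulated probabilities.
import numpy as np

y_pred_probs = np.random.beta(2, 5, size=1000)  # stand-in for clf.predict_proba(X)[:, 1]
plot_predict_proba(y_pred_probs, clf="RandomForest")

# To collect several plots into one PDF instead of showing them:
# from matplotlib.backends.backend_pdf import PdfPages
# with PdfPages('proba_plots.pdf') as pdf:
#     plot_predict_proba(y_pred_probs, clf="RandomForest", pdf=pdf)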
def plot_mean_bootstrap_exponential_readme():
    X = np.random.exponential(7, 4)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    posterior_samples = mean(X, 10000)
    l, r = highest_density_interval(posterior_samples)
    classical_l, classical_r = highest_density_interval(classical_samples)

    plt.subplot(2, 1, 1)
    plt.title('Bayesian Bootstrap of mean')
    sns.distplot(posterior_samples, label='Bayesian Bootstrap Samples')
    plt.plot([l, r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()

    plt.subplot(2, 1, 2)
    plt.title('Classical Bootstrap of mean')
    sns.distplot(classical_samples, label='Classical Bootstrap Samples')
    plt.plot([classical_l, classical_r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()

    plt.savefig('readme_exponential.png', bbox_inches='tight')
def joint_plot(x, y, xlabel=None,
               ylabel=None, xlim=None, ylim=None,
               loc="best", color='#0485d1',
               size=8, markersize=50, kind="kde",
               scatter_color="r"):
    with sns.axes_style("darkgrid"):
        if xlabel and ylabel:
            g = SubsampleJointGrid(xlabel, ylabel,
                                   data=DataFrame(data={xlabel: x, ylabel: y}),
                                   space=0.1, ratio=2, size=size, xlim=xlim, ylim=ylim)
        else:
            g = SubsampleJointGrid(x, y, size=size,
                                   space=0.1, ratio=2, xlim=xlim, ylim=ylim)
        g.plot_joint(sns.kdeplot, shade=True, cmap="Blues")
        g.plot_sub_joint(plt.scatter, 1000, s=20, c=scatter_color, alpha=0.3)
        g.plot_marginals(sns.distplot, kde=False, rug=False)
        g.annotate(ss.pearsonr, fontsize=25, template="{stat} = {val:.2g}\np = {p:.2g}")
        g.ax_joint.set_yticklabels(g.ax_joint.get_yticks())
        g.ax_joint.set_xticklabels(g.ax_joint.get_xticks())
    return g
def joint_overplot(x, y, df, fig, color='r', marg_kws=None):
    """Overplot additional data on existing JointGrid instance.

    Args:
        x (str):
        y (str):
        df (DataFrame):
        fig: seaborn JointGrid instance.
        color (str): Color.
        marg_kws (dict): Keyword arguments to pass to plot_marginals().

    Returns:
        fig: seaborn JointGrid instance.
    """
    if marg_kws is None:
        marg_kws = dict(norm_hist=True,
                        hist_kws=dict(weights=df.Survivors.values))
    fig.x = df[x]
    fig.y = df[y]
    fig.plot_joint(plt.scatter, c=color)
    fig.plot_marginals(sns.distplot, color=color, kde=False, axlabel=False,
                       **marg_kws)
    return fig
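A minimal sketch of layering joint_overplot on top of an existing JointGrid. The toy columns, including Survivors (which the default marg_kws expects as histogram weights), are made up:

# Hypothetical usage of joint_overplot on a toy JointGrid.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

base = pd.DataFrame({'age': np.random.uniform(20, 60, 200),
                     'dose': np.random.uniform(0, 10, 200),
                     'Survivors': np.random.randint(1, 50, 200)})
extra = pd.DataFrame({'age': np.random.uniform(30, 50, 50),
                      'dose': np.random.uniform(2, 8, 50),
                      'Survivors': np.random.randint(1, 50, 50)})
g = sns.JointGrid(x='age', y='dose', data=base)
g.plot_joint(plt.scatter, c='b')
g.plot_marginals(sns.distplot, kde=False)
g = joint_overplot('age', 'dose', extra, g, color='r')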
def display_covariate_dist(self, covariate_list, save_file=None):
    '''Plot the distribution of each covariate, faceted by arm assignment.'''
    n_covars = len(covariate_list)
    for covariate in covariate_list:
        g = sns.FacetGrid(self.data, col="arm_assignment")
        if len(self.data[covariate].unique()) > 2:
            g.map(sns.distplot, covariate, kde=False)
        else:
            g.map(sns.distplot, covariate, kde=False)
        if save_file:
            g.savefig(save_file, dpi=450)
        if save_file is None:
            plt.show()
def weight_norm_histogram(rbm, show_plot=False, filename=None):
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig, ax = plt.subplots()
    for l in range(rbm.num_weights):
        num_inputs = rbm.weights[l].shape[0]
        norm = be.to_numpy_array(be.norm(rbm.weights[l].W(), axis=0) / sqrt(num_inputs))
        sns.distplot(norm, ax=ax, label=str(l))
    ax.legend()

    if show_plot:
        plt.show(fig)
    if filename is not None:
        fig.savefig(filename)
    plt.close(fig)
def screen_zscore(series, axis=None, z_score=False, plot=True):
    """
    Calculate screen z score (difference between positive and negative controls).
    """
    Z = lambda pos, neg: 1 - (3 * (np.std(pos) + np.std(neg)) / (abs(np.mean(pos) - np.mean(neg))))

    if z_score:
        series = (series - series.mean()) / series.std()

    pos = series.loc[series.index[series.index.str.contains("Essential")]]
    neg = series.loc[series.index[series.index.str.contains("CTRL")]]
    z = Z(pos, neg)

    # Plot
    if plot:
        pos.name = None
        neg.name = None
        if axis is None:
            fig, axis = plt.subplots(1)
        sns.distplot(pos, ax=axis, label="positive controls")
        sns.distplot(neg, ax=axis, label="negative controls; screen Z-score = {}".format(z))
    return z
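The Z lambda above is the standard Z-factor, Z = 1 - 3(σ_pos + σ_neg) / |μ_pos - μ_neg|, so values approach 1 as the two control groups separate. A minimal sketch on a made-up series whose index labels contain "Essential" and "CTRL", matching what the function expects:

# Hypothetical usage of screen_zscore on simulated control readouts.
import numpy as np
import pandas as pd

readouts = pd.Series(
    np.concatenate([np.random.normal(-6, 0.5, 20),   # essential-gene guides
                    np.random.normal(0, 0.5, 20)]),   # non-targeting controls
    index=(["Essential_%d" % i for i in range(20)] +
           ["CTRL_%d" % i for i in range(20)]))
z = screen_zscore(readouts, plot=False)
print(z)  # roughly 0.5 for these made-up parameters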
def target_plot(self):
    target_type = self.input_data.metadata.loc[self.target].type
    target_data = self.input_data.df[self.target]
    sns.set(style="white", color_codes=True)
    if not self.run_time_config['is_time_series']:
        if target_type == ColType.BINARY:
            plt.figure(figsize=(6, 1))
            sns.barplot(target_data.sum() / target_data.shape[0])
            plt.xlim([0, 1])
            plt.title(target_data.name + ' rate')
        elif target_type == ColType.NUMERIC or target_type == ColType.ORDINAL:
            plt.figure(figsize=(6, 2))
            ax = sns.distplot(target_data, hist_kws=dict(edgecolor='black'))
            ax.set_xlim(target_data.min(), target_data.max())
            plt.title(target_data.name + ' histogram')
    else:
        self.time_series_target_plot()
def distribution(data, xlabel="data", ylabel="percentage", name=None):
    ax = plt.axes()
    ax.set(xlabel=xlabel, ylabel=ylabel)
    ds = sns.distplot(data, ax=ax)
    plt.show()
    if name is not None:
        ds.get_figure().savefig(name)
def weather_distribution(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.gapdf = self.load_weatherdf(data_dir)
    print(self.gapdf['weather'].describe())
    # sns.distplot(self.gapdf['gap'], kde=False, bins=100)
    sns.countplot(x="weather", data=self.gapdf, palette="Greens_d")
    plt.title('Countplot of Weather')
    # self.gapdf['weather'].plot(kind='bar')
    # plt.xlabel('Weather')
    # plt.title('Histogram of Weather')
    return
def gapdistricution(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.gapdf = self.load_gapdf(data_dir)
    print(self.gapdf['gap'].describe())
    # sns.distplot(self.gapdf['gap'], kde=False, bins=100)
    self.gapdf['gap'].plot(kind='hist', bins=200)
    plt.xlabel('Gaps')
    plt.title('Histogram of Gaps')
    return
def generateDistributionPlot(test):
    sns.set(color_codes=True)
    for row in test:
        label = row.pop(0)
        d = [float(i) for i in row]
        # Plot a filled kernel density estimate
        sns.distplot(d, hist=False, kde_kws={"shade": True}, label=label)
    plt.xlim([-0.01, 0.1])
    plt.xlabel("time (s)")
    plt.ylabel("operations")
def plot_dist(train_y, dev_y, test_y):
    import seaborn as sns
    import matplotlib.pyplot as plt
    plt.rc('text', usetex=True)
    plt.rc('font', family='Times-Roman')
    sns.set_style(style='white')
    color = sns.color_palette("Set2", 10)

    fig = plt.figure(figsize=(8, 12))
    ax1 = fig.add_subplot(3, 1, 1)
    # plt.title("Label distribution", fontsize=20)
    sns.distplot(train_y, kde=False, label='Training', hist=True, norm_hist=True, color="blue")
    ax1.set_xlabel("Answer")
    ax1.set_ylabel("Frequency")
    ax1.set_xlim([0, 500])
    plt.legend(loc='best')

    ax2 = fig.add_subplot(3, 1, 2)
    sns.distplot(dev_y, kde=False, label='Validation', hist=True, norm_hist=True, color="green")
    ax2.set_xlabel("Answer")
    ax2.set_ylabel("Frequency")
    ax2.set_xlim([0, 500])
    plt.legend(loc='best')

    ax3 = fig.add_subplot(3, 1, 3)
    sns.distplot(test_y, kde=False, label='Test', hist=True, norm_hist=True, color="red")
    ax3.set_xlabel("Answer")
    ax3.set_ylabel("Frequency")
    ax3.set_xlim([0, 500])
    plt.legend(loc='best')

    plt.savefig('checkpoints/label_dist.pdf', format='pdf', dpi=300)
    plt.show()
def getPlot(self, params):
    n = int(params['bins'])
    fig = sns.distplot(self.likes, bins=n, rug=False, kde=False,
                       hist_kws=dict(alpha=0.75, edgecolor="k", linewidth=1))
    fig.set_xlabel('Number of likes')
    fig.set_ylabel('Count')
    return fig
def plot_z(self, indices=None, figsize=(15, 5), loc=1):
    import matplotlib.pyplot as plt
    import matplotlib.mlab as mlab
    import seaborn as sns

    plt.figure(figsize=figsize)
    for z in range(1, len(self.z_list) + 1):
        if indices is not None and z - 1 not in indices:
            continue
        else:
            if hasattr(self.z_list[z-1], 'sample'):
                sns.distplot(self.z_list[z-1].prior.transform(self.z_list[z-1].sample),
                             rug=False, hist=False,
                             label=self.z_list[z-1].method + ' estimate of ' + self.z_list[z-1].name)
            elif hasattr(self.z_list[z-1], 'value') and hasattr(self.z_list[z-1], 'std'):
                if self.z_list[z-1].prior.transform_name is None:
                    x = np.linspace(self.z_list[z-1].value - self.z_list[z-1].std * 3.5,
                                    self.z_list[z-1].value + self.z_list[z-1].std * 3.5, 100)
                    plt.plot(x, mlab.normpdf(x, self.z_list[z-1].value, self.z_list[z-1].std),
                             label=self.z_list[z-1].method + ' estimate of ' + self.z_list[z-1].name)
                else:
                    sims = self.z_list[z-1].prior.transform(
                        np.random.normal(self.z_list[z-1].value, self.z_list[z-1].std, 100000))
                    sns.distplot(sims, rug=False, hist=False,
                                 label=self.z_list[z-1].method + ' estimate of ' + self.z_list[z-1].name)
            else:
                raise ValueError("No information on latent variable to plot!")

    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.title('Latent Variable Plot')
    plt.legend(loc=1)
    plt.show()
def trace_plot(self, figsize=(15, 15)):
    import matplotlib.pyplot as plt
    import matplotlib.mlab as mlab
    import seaborn as sns

    if hasattr(self.z_list[0], 'sample'):
        fig = plt.figure(figsize=figsize)
        palette = [(0.2980392156862745, 0.4470588235294118, 0.6901960784313725),
                   (0.3333333333333333, 0.6588235294117647, 0.40784313725490196),
                   (0.7686274509803922, 0.3058823529411765, 0.3215686274509804),
                   (0.5058823529411764, 0.4470588235294118, 0.6980392156862745),
                   (0.8, 0.7254901960784313, 0.4549019607843137),
                   (0.39215686274509803, 0.7098039215686275, 0.803921568627451)] * len(self.z_list)

        for j in range(len(self.z_list)):
            chain = self.z_list[j].sample
            for k in range(4):
                iteration = j * 4 + k + 1
                ax = fig.add_subplot(len(self.z_list), 4, iteration)

                if iteration in range(1, len(self.z_list) * 4 + 1, 4):
                    a = sns.distplot(self.z_list[j].prior.transform(chain), rug=False, hist=False, color=palette[j])
                    a.set_ylabel(self.z_list[j].name)
                    if iteration == 1:
                        a.set_title('Density Estimate')
                elif iteration in range(2, len(self.z_list) * 4 + 1, 4):
                    a = plt.plot(self.z_list[j].prior.transform(chain), color=palette[j])
                    if iteration == 2:
                        plt.title('Trace Plot')
                elif iteration in range(3, len(self.z_list) * 4 + 1, 4):
                    plt.plot(np.cumsum(self.z_list[j].prior.transform(chain)) / np.array(range(1, len(chain) + 1)), color=palette[j])
                    if iteration == 3:
                        plt.title('Cumulative Average')
                elif iteration in range(4, len(self.z_list) * 4 + 1, 4):
                    plt.bar(range(1, 10), [acf(chain, lag) for lag in range(1, 10)], color=palette[j])
                    if iteration == 4:
                        plt.title('ACF Plot')

        plt.show()
    else:
        raise ValueError("No samples to plot!")
def plot_distribution(df, target, tag='eda', directory=None):
    r"""Display a Distribution Plot.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe containing the ``target`` feature.
    target : str
        The target variable for the distribution plot.
    tag : str
        Unique identifier for the plot.
    directory : str, optional
        The full specification of the plot location.

    Returns
    -------
    None : None.

    References
    ----------
    http://seaborn.pydata.org/generated/seaborn.distplot.html

    """
    logger.info("Generating Distribution Plot")

    # Generate the distribution plot
    dist_plot = sns.distplot(df[target])
    dist_fig = dist_plot.get_figure()

    # Save the plot
    write_plot('seaborn', dist_fig, 'distribution_plot', tag, directory)
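A minimal sketch of calling plot_distribution. It assumes the function's module (with its logger and write_plot helpers) is importable; the toy DataFrame, target column, tag, and directory are made up:

# Hypothetical usage of plot_distribution on a made-up dataframe.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'price': np.random.lognormal(mean=3.0, sigma=0.4, size=500)})
plot_distribution(toy, target='price', tag='toy_eda', directory='plots')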
#
# Function plot_box
#
def plot_action_distribution(actions, file="action_distribution.png"):
    plt.figure(figsize=(10, 10))
    sb.distplot(actions, kde=False)
    plt.ylabel("probability")
    plt.xlabel("action")
    plt.title("Action distribution")
    plt.savefig(file)
    plt.close()
def plot_value_distribution(values, file="value_distribution.png"):
    plt.figure(figsize=(10, 10))
    sb.distplot(values)
    plt.xlabel("critic value")
    plt.title("Value distribution")