def _compute_stats(self, pred, expo, loss, prem):
    n_samples, n_groups = pred.shape[0], self.n_groups
    pred_ser = pd.Series(pred)
    # overall loss ratio, used to normalize the per-group ratios below
    loss_to_returns = np.sum(loss) / np.sum(prem)
    rank = pd.qcut(pred_ser, n_groups, labels=False)
    # recompute the group count actually produced by qcut
    n_groups = np.amax(rank) + 1
    groups = np.arange(n_groups)  # if we ever go back to using n_groups...
    tab = pd.DataFrame({
        'rank': rank,
        'pred': pred,
        'prem': prem,
        'loss': loss,
        'expo': expo
    })
    grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
    # relative loss ratio per quantile group
    agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns
    return tab, agg_rlr, n_groups
Example source snippets for Python's pd.qcut()
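Before the collected snippets, a minimal sketch of what pd.qcut does (the data below is made up for illustration): it splits a series at its quantiles, so each bin holds roughly the same number of observations, unlike pd.cut's equal-width bins.

import numpy as np
import pandas as pd

values = pd.Series(np.random.default_rng(0).normal(size=100))

# labels=False returns integer bin indices; each quartile bin holds ~25 values
quartiles = pd.qcut(values, 4, labels=False)
print(quartiles.value_counts().sort_index())

# without labels, qcut returns the interval for each value instead
intervals = pd.qcut(values, 4)
print(intervals.cat.categories)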
def discretize(data, bins=5, quantile=False):
    '''
    Discretizes the data into `bins` bins.
    Uses pd.cut (equal-width bins) by default, pd.qcut (equal-frequency bins) otherwise.
    '''
    if quantile:
        new_data = pd.qcut(data, bins, labels=list(range(bins)))
    else:
        new_data = pd.cut(data, bins, labels=list(range(bins)))
    return new_data
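A quick illustration of the cut/qcut distinction on made-up data: equal-width bins concentrate skewed data in one bin, while quantile bins equalize the counts.

skewed = pd.Series(np.concatenate([np.zeros(90), np.arange(1, 11)]))
# equal-width: nearly everything falls into the lowest bin
print(discretize(skewed, bins=5).value_counts().sort_index())

spread = pd.Series(np.arange(100.0))
# equal-frequency: 20 values per bin (qcut would raise on heavily tied data)
print(discretize(spread, bins=5, quantile=True).value_counts().sort_index())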
def _recursive_category_gen(col, num_bins):
    """
    Generate quantile categories, recursively lowering the bin count
    until qcut succeeds.

    Parameters
    ----------
    col : pandas.Series
        the column of continuous values to bin
    num_bins : int
        how many quantiles to attempt first

    Returns
    -------
    num_bins : int
    categories : pandas.Series
    """
    bin_labels = range(num_bins)
    # base case catch
    if num_bins == 0:
        raise ValueError('Unable to perform qcut to 0 bins.')
    # we assume the num_bins count will work
    try:
        categories = pd.qcut(x=col, q=num_bins, labels=bin_labels)
        return num_bins, categories
    # if it does not, then we go down one bin and try again
    except ValueError:
        new_bin_count = num_bins - 1
        return _recursive_category_gen(col, new_bin_count)
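A usage sketch on made-up data with heavy ties: the requested bin count degrades until the quantile edges are unique.

tied = pd.Series([0] * 50 + list(range(1, 51)))
bins_used, cats = _recursive_category_gen(tied, 10)
print(bins_used)  # fewer than 10: tied values collapse the quantile edges
print(cats.value_counts().sort_index())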
def categorizeCI2(inputDF, subsampleFactor=10, title=None):
    #inputDF = normalize(inputDF)
    binLabels = ['Low', 'Medium', 'High']
    indices = range(0, inputDF.shape[0], subsampleFactor)
    plotDF = inputDF.iloc[indices].copy()
    # qcut runs on the full 'ci' column; pandas aligns on the index,
    # so each subsampled row receives its full-data tertile label
    plotDF['bin'] = pd.qcut(inputDF['ci'], len(binLabels), labels=binLabels)
    plotDF.drop(['ci'], axis=1, inplace=True)
    alpha = 0.3
    g = parallel_coordinates(plotDF, 'bin',
                             color=[[0.8, 0.0, 0.1, alpha],
                                    [0.0, 0.8, 0.1, alpha],
                                    [0.1, 0.1, 0.8, alpha],
                                    ])
    plt.xticks(rotation=270)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    if title:
        title += ' (factor=%d)' % subsampleFactor
        g.set_title(title)
    return g
From Chapter 03_Logistic Regression vs Random Forest.py (project: Statistics-for-Machine-Learning, author: PacktPublishing)
def IV_calc(data, var):
    # Information Value per bin: (good% - bad%) * log(good% / bad%)
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
    else:
        # rank(method='first') breaks ties so qcut can always find 10 unique decile edges
        data['bin_var'] = pd.qcut(data[var].rank(method='first'), 10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
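The per-bin I_V contributions sum to the variable's total Information Value. A usage sketch with a made-up frame (the binary 'class' column is what IV_calc expects; 'balance' is an illustrative name):

rng = np.random.default_rng(1)
demo = pd.DataFrame({'balance': rng.normal(size=1000),
                     'class': rng.integers(0, 2, size=1000)})
iv_table = IV_calc(demo, 'balance')
print(iv_table)
print('Total IV:', iv_table['I_V'].sum())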
def y_transform(Y, data, flatten):
    df_y = data[Y]
    # an int input thresholds at that value; a float input thresholds
    # at that quantile
    # below is for the case where the prediction is true or false
    # but the y-feature is in a different format (e.g. continuous)
    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))
    # below is for the case where the y-feature is converted
    # to a categorical, whether it is a number or a string
    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
    elif flatten == 'cat_numeric':
        # qcut into quintiles; duplicates='drop' tolerates repeated bin edges
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
    # for cases when the y-feature is already in the format
    # that the prediction output will be
    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)
    return df_y
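A hedged usage sketch (the frame and column name are made up): flatten controls how a continuous target is squashed into classes.

frame = pd.DataFrame({'price': np.random.default_rng(2).exponential(size=50)})
print(y_transform('price', frame, 'median').head())      # booleans: above/below median
print(y_transform('price', frame, 'cat_numeric').head()) # quintile codes 0-4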
def compute(self, today, assets, out, factor, bins):
    # a pipeline-style CustomFactor compute: write quantile labels into out
    out[:] = pd.qcut(factor, bins, labels=False)
From returns_quantization.py (project: deep-learning-bitcoin, author: philipperemy)
def add_returns_in_place(df):  # modifies df
    close_prices_returns = compute_returns(df)
    num_bins = 10
    returns_bins = pd.qcut(close_prices_returns, num_bins)  # decile intervals
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)  # integer labels 0-9
    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels
    return df, bins_categories
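compute_returns is defined elsewhere in the project; a minimal stand-in, assuming simple percent-change returns on a close-price column (the actual column name in the project may differ):

def compute_returns(df):
    # hypothetical stand-in: percent change of the close price, NaN-filled at the start
    return df['close'].pct_change().fillna(0)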
From test_groupby.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_apply_use_categorical_name(self):
    from pandas import qcut
    cats = qcut(self.df.C, 4)

    def get_stats(group):
        return {'min': group.min(),
                'max': group.max(),
                'count': group.count(),
                'mean': group.mean()}

    result = self.df.groupby(cats).D.apply(get_stats)
    self.assertEqual(result.index.names[0], 'C')
def get_node_colors_by_attr(G, attr, num_bins=None, cmap='viridis', start=0, stop=1):
    """
    Get a list of node colors by binning some continuous-variable attribute into
    quantiles.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the attribute
    num_bins : int
        how many quantiles (default None assigns each node to its own bin)
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
    """
    if num_bins is None:
        num_bins = len(G.nodes())
    bin_labels = range(num_bins)
    attr_values = pd.Series([data[attr] for node, data in G.nodes(data=True)])
    cats = pd.qcut(x=attr_values, q=num_bins, labels=bin_labels)
    colors = get_colors(num_bins, cmap, start, stop)
    node_colors = [colors[cat] for cat in cats]
    return node_colors
def get_edge_colors_by_attr(G, attr, num_bins=5, cmap='viridis', start=0, stop=1):
    """
    Get a list of edge colors by binning some continuous-variable attribute into
    quantiles.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the continuous-variable attribute
    num_bins : int
        how many quantiles
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
    """
    if num_bins is None:
        num_bins = len(G.edges())
    bin_labels = range(num_bins)
    attr_values = pd.Series([data[attr] for u, v, key, data in G.edges(keys=True, data=True)])
    cats = pd.qcut(x=attr_values, q=num_bins, labels=bin_labels)
    colors = get_colors(num_bins, cmap, start, stop)
    edge_colors = [colors[cat] for cat in cats]
    return edge_colors
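Both helpers call get_colors, which these snippets do not define. A plausible stand-in, assuming it samples evenly spaced colors from a matplotlib colormap (the project's actual implementation may differ):

import matplotlib.pyplot as plt
import numpy as np

def get_colors(n, cmap='viridis', start=0.0, stop=1.0):
    # hypothetical stand-in: n RGBA tuples sampled evenly across the colormap
    return [plt.get_cmap(cmap)(x) for x in np.linspace(start, stop, n)]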
From data_manag&visualization.py (project: -Python-Analysis_of_wine_quality, author: ekolik)
def quartileSplit(wine_set):
    print("This is the quantile split of the wines' quality. The first column contains the intervals of wine quality;")
    print("the second contains the number of wine samples with quality in the corresponding interval.")
    # note: despite the name, qcut with q=3 produces three quantile groups, not quartiles
    wine_set["quality_quart"] = pd.qcut(wine_set["quality"], 3)
    print(wine_set.groupby("quality_quart").size())
def bin_data(path, write_path, num_chunks, binning):
    """Bins the continuous features through bucket or quantile binning

    Parameters
    ----------
    path : str
        The path where the dataset to be binned is located.
    write_path : str
        The path where to save the binned dataset.
    num_chunks : int
        The number of file splits to perform on the binned dataset.
    binning : int
        The type of binning to perform on the dataset: 0 for bucket binning, 1 for quantile binning.
    """
    # get the list of files found in PATH
    files = nd.list_files(path=path)
    # read and concatenate the CSV files (DataFrame.append was removed in pandas 2.0)
    frames = []
    for file in files:
        frames.append(pd.read_csv(filepath_or_buffer=file, names=column_names))
        print('appending : {}'.format(file))
    df = pd.concat(frames, ignore_index=True)
    # remove dst_ip_add and src_ip_add features
    df = df.drop(labels=['dst_ip_add', 'src_ip_add'], axis=1)
    for index in range(len(cols_to_std)):
        if int(binning) == 0:
            # bucket binning: 10 equal-width edges over the column's range
            bins = np.linspace(df[cols_to_std[index]].min(), df[cols_to_std[index]].max(), 10)
            df[cols_to_std[index]] = np.digitize(df[cols_to_std[index]], bins, right=True)
            print('min : {}, max : {}'.format(df[cols_to_std[index]].min(), df[cols_to_std[index]].max()))
        if int(binning) == 1:
            # decile (quantile) binning; duplicates='drop' merges tied edges
            df[cols_to_std[index]] = pd.qcut(df[cols_to_std[index]], 10, labels=False, duplicates='drop')
            print('min : {}, max : {}'.format(df[cols_to_std[index]].min(), df[cols_to_std[index]].max()))
    for id, df_i in enumerate(np.array_split(df, num_chunks)):
        # split and save the dataframe to CSV files
        df_i.to_csv(path_or_buf=os.path.join(write_path, '{id}.csv'.format(id=id)), columns=columns_to_save,
                    header=None, index=False)
        print('Saving CSV file : {path}'.format(path=os.path.join(write_path, '{id}'.format(id=id))))
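The two modes differ in where the edges fall; a small illustration on one made-up column:

col = pd.Series(np.concatenate([np.zeros(900), np.arange(1, 101)]))

# bucket binning: equal-width edges, so most rows collapse into the lowest bucket
edges = np.linspace(col.min(), col.max(), 10)
print(pd.Series(np.digitize(col, edges, right=True)).value_counts().sort_index())

# quantile binning: equal-frequency deciles; duplicates='drop' merges the tied edges
print(pd.qcut(col, 10, labels=False, duplicates='drop').value_counts().sort_index())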
def discretize(data, vars_to_discretize, n_bins):
    '''
    Accepts data, a dictionary containing the discretization type for selected variables,
    and a dictionary containing the number of bins for selected variables.
    Returns data after the selected variables have been discretized,
    together with the binning definition for each variable.
    '''
    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        out = None
        binning = None
        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal':
            out, binning = ps.cut(data_subset.loc[:, i], bins=n_bins[i], labels=False, retbins=True)
        # discretize by frequency, lowering the bin count until qcut succeeds
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.loc[:, i], q=nb, labels=False, retbins=True)
                    break
                except ValueError:
                    nb -= 1
        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.loc[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]
        data_subset.loc[:, i] = out
        # replace NA values with a special index (1 + max) -
        # if it has not already been done automatically, as in np.digitize
        data_subset.loc[data_subset[i].isnull(), i] = data_subset[i].max() + 1
        bins[i] = binning
    return data_subset, bins
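A usage sketch on made-up data (assuming ps is the snippet's alias for pandas):

raw = ps.DataFrame({'a': np.random.default_rng(3).normal(size=200),
                    'b': np.random.default_rng(4).exponential(size=200)})
binned, bin_defs = discretize(raw, {'a': 'Equal', 'b': 'Freq'}, {'a': 4, 'b': 4})
print(binned.head())
print(bin_defs)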
def create_figure(df, x, y, discrete, quantileable, continuous, size, color, controls):
    xs = df[x.value].values
    ys = df[y.value].values
    # x_title = x.value.title()
    # y_title = y.value.title()
    x_title = "Marginal Effective Tax Rate"
    y_title = "Asset Category"
    source = ColumnDataSource(ColumnDataSource.from_df(df))
    kw = dict()
    if x.value in discrete:
        kw['x_range'] = sorted(set(xs))
    if y.value in discrete:
        kw['y_range'] = sorted(set(ys))
    # kw['title'] = "%s vs %s" % (x_title, y_title)
    # kw['title'] = "Marginal Effective Tax Rates on Typically Financed Corporate Investments, 2016 Law"
    # kw['title'] = "Marginal Effective Tax Rates on Corporate Investments, 2016 Law"
    kw['title'] = "METRs on Corporate Investments, 2016 Law"
    p = figure(plot_height=400, plot_width=600, tools='pan,box_zoom,reset,hover', **kw)
    p.xaxis.axis_label = x_title
    p.yaxis.axis_label = y_title
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [('Asset', '@Asset')]
    if x.value in discrete:
        p.xaxis.major_label_orientation = np.pi / 4  # pd.np was removed in pandas 1.0
    sz = 9
    if size.value != 'None':
        # bin marker sizes by quantile of the chosen column
        groups = pd.qcut(df[size.value].values, len(SIZES))
        sz = [SIZES[xx] for xx in groups.codes]
    c = "#73000A"
    if color.value != 'None':
        # bin marker colors by quantile of the chosen column
        groups = pd.qcut(df[color.value].values, len(COLORS))
        c = [COLORS[xx] for xx in groups.codes]
    p.circle(x=xs, y=ys, source=source, color=c, size=sz, line_color="white",
             alpha=0.6, hover_color='white', hover_alpha=0.5)
    # p.title.text_color = "black"
    # p.title.text_font = "Georgia"
    return p
From regression_modeling.py (project: -Python-Analysis_of_wine_quality, author: ekolik)
def log_regression(wine_set):
    # # examining the data before recoding
    # print(wine_set["sulphates"].describe())
    # wine_set["sulphates_c"] = pd.qcut(wine_set["sulphates"], 4)
    # print(wine_set.groupby("sulphates_c").size())
    # print()
    #
    # print(wine_set["alcohol"].describe())
    # wine_set["alcohol_c"] = pd.qcut(wine_set["alcohol"], 4)
    # print(wine_set.groupby("alcohol_c").size())
    # print()
    #
    # print(wine_set["quality"].describe())
    # wine_set["quality_c"] = pd.qcut(wine_set["quality"], 3)
    # print(wine_set.groupby("quality_c").size())
    # print()

    # recode quality into 2 groups: 0:{3,4,5,6}, 1:{7,8,9}
    recode = {3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)

    # recode sulphates into 2 groups: 0: <= mean, 1: > mean
    def sulphates_to_cat(x):
        if x['sulphates'] <= wine_set['sulphates'].mean():
            return 0
        else:
            return 1
    wine_set['sulphates_c'] = wine_set.apply(lambda x: sulphates_to_cat(x), axis=1)

    # recode alcohol into 2 groups: 0: <= mean, 1: > mean
    def alcohol_to_cat(x):
        if x['alcohol'] <= wine_set['alcohol'].mean():
            return 0
        else:
            return 1
    wine_set['alcohol_c'] = wine_set.apply(lambda x: alcohol_to_cat(x), axis=1)
    # print(wine_set.head(10))

    # logistic regression for sulphates+alcohol -> quality
    print("Logistic regression model for the association between wine's quality and sulphates&alcohol")
    model1 = smf.logit(formula="quality_c ~ sulphates_c + alcohol_c", data=wine_set)
    results1 = model1.fit()
    print(results1.summary())

    # odds ratios with 95% confidence intervals
    print("\nConfidence intervals")
    conf = results1.conf_int()
    conf['Odds ratio'] = results1.params
    conf.columns = ['Lower conf.int.', 'Upper conf.int.', 'Odds ratio']
    print(numpy.exp(conf))