def transform(self, x):
"""
Parameters:
x (Sequence): - ???????
Returns:
np.array: - ????????????numpy??
"""
s = pd.cut(x, bins=self.bins)
d = pd.get_dummies(s)
z = d.T.to_dict()
re = []
for i, v in z.items():
for j, u in v.items():
if u == 1:
re.append(str(j))
return np.array(re)
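The dictionary round-trip above rebuilds what pd.cut already returns: the interval each element falls into. A minimal equivalent sketch, assuming a fitted binner whose bins attribute holds the edges (values below are hypothetical):

import numpy as np
import pandas as pd

bins = [0, 10, 20, 30]   # hypothetical fitted bin edges
x = [5, 15, 25]          # sample input
# pd.cut returns a Categorical of intervals; casting to str yields the same
# strings the dictionary loop above collects, one per element.
labels = np.array(pd.cut(x, bins=bins).astype(str))
print(labels)            # ['(0, 10]' '(10, 20]' '(20, 30]']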
def test_facet_wrap_expression():
p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
assert p == 'facet_wrap_expression'
def update(attrname, old, new):
new_selected, new_x_factors, new_y_factors = get_subset(dictionary_selector.value, dictionary_selector.value)
    bins = np.linspace(new_selected.counts.min(), new_selected.counts.max(), 10)  # bin edges must number one more than the palette colors
new_selected["color"] = pd.cut(new_selected.counts, bins, labels = list(reversed(palettes.Blues9)), include_lowest=True)
new_selected["wikidataID"] = new_selected["x"].map(lambda x: wikidataIDs.get(x))
fig.xaxis.axis_label = dictionary_selector.value
fig.yaxis.axis_label = dictionary_selector.value
fig.title.text = "Top %d fact co-occurrences selected" % top_n.value
src = ColumnDataSource(dict(
x=new_selected["x"].astype(object),
y=new_selected["y"].astype(object),
color=new_selected["color"].astype(object),
wikidataID=new_selected["wikidataID"],
counts=new_selected["counts"].astype(int),
raw=new_selected["raw"].astype(int)))
source.data.update(src.data)
fig.x_range.update(factors=new_x_factors[:top_n.value])
fig.y_range.update(factors=new_y_factors[:top_n.value])
def plot_tendencies(word_list, pos_dic, bin_size, output_dir, file_name):
plt.figure()
dataframe_list = list()
for word in word_list:
if word not in pos_dic:
raise Exception('Word ' + word + ' not found')
df = pd.DataFrame(pos_dic[word], columns=['pos'])
df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size), labels=range(0, 100, bin_size))
df = df.groupby(['bins'])['bins'].count()
dataframe_list.append(df)
df_final = pd.DataFrame(pd.concat(dataframe_list, axis=1)).fillna(0)
df_final.columns = word_list
ax = df_final.plot()
ax.set_xlabel("Position (en % de la longueur de la description)")
ax.set_ylabel("Nombre d'occurrences")
plt.title('Position des mots dans les descriptions des offres', y=1.08)
plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
def discretize(data, bins=5, quantile=False):
    '''
    Creates `bins` bins and discretizes the data into them.
    Uses pd.cut (equal-width) by default, pd.qcut (quantile-based) otherwise.
    '''
if quantile:
new_data = pd.qcut(data, bins, labels=list(range(bins)))
else:
new_data = pd.cut(data, bins, labels=list(range(bins)))
return new_data
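Equal-width and quantile binning behave very differently on skewed data; a quick check of both paths, assuming the discretize function above is in scope:

import pandas as pd

data = pd.Series([1, 1, 2, 2, 3, 100])
print(discretize(data, bins=3))                 # equal-width: the outlier claims the top bin alone
print(discretize(data, bins=3, quantile=True))  # equal-frequency: two values per bin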
def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size=4000, normalization_by_all=False):
count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y], df[z_boolean]], axis=1)
count_table = count_table.groupby([x,z_boolean])[y].value_counts().unstack().fillna(0)
count_table = count_table.unstack()
count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
z_boolean_values = count_table_long[z_boolean].unique()
ratio = pd.DataFrame({'ratio':count_table_long.set_index([x,y,z_boolean]).unstack()['value'][z_boolean_values[1]] / (
count_table_long.set_index([x,y,z_boolean]).unstack()['value'].sum(axis=1) )})
count_table_long = count_table_long.set_index([x, y ])[['value']].merge(ratio, left_index=True, right_index=True).reset_index()
size_factor = maximal_bubble_size/count_table_long['value'].max()
    x_values_dict = {val: i for i, val in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx: get_point(xx) for xx in ordered_x_values}
    y_values_dict = {val: i for i, val in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
xticks = np.arange(len(ordered_x_values)) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
yticks = np.arange(len(ordered_y_values)) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
count_table_long[x] = count_table_long[x].map(x_values_dict)
count_table_long[y] = count_table_long[y].map(y_values_dict)
plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
c=count_table_long['ratio'], alpha=0.5,
cmap='cool')
return count_table_long, xticks, yticks, xticklabels, yticklabels
def test_probabilities(model: ClassifierMixin, X: np.array, y: pd.Series,
bins: int = 10, threshold: float = 0.5):
"""Print confusion matrix based on class probability."""
probs = [p[1] for p in model.predict_proba(X)]
print('\tProbabilities')
df = pd.DataFrame({'prob': probs, 'label': y})
step = 1 / bins
    cut_labels = [round(step * f, 1) for f in range(bins)]
by_prob = (df.groupby(pd.cut(df['prob'], bins, labels=cut_labels))
.agg(['sum', 'count'])['label'])
print('\t\tprobs\t1\t0\tacc')
for index, row in by_prob.iloc[::-1].iterrows():
ones = row['sum']
if math.isnan(ones):
ones = 0
else:
ones = int(ones)
count = row['count']
zeros = int(count) - ones
if count > 0:
acc = zeros / count if index < threshold else ones / count
else:
acc = 0.0
print(f'\t\t{index}\t{ones}\t{zeros}\t{acc:.3f}')
def _discretize_by_width(col, num_bins, labels):
    maxvalue = col.max()
    minvalue = col.min()
    width = float(maxvalue - minvalue) / num_bins
    bins = [minvalue + x * width for x in range(num_bins)] + [maxvalue]
    if labels:
        if len(labels) != num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins, labels=group_names, include_lowest=True)
def _discretize_by_frequency(col, num_bins, labels):
    percent = 1.0 / num_bins
    bins = sorted(set(col.quantile([x * percent for x in range(num_bins + 1)])))
    if len(bins) - 1 < num_bins:
        num_bins = len(bins) - 1
        print('...Only %d bins (unbalanced) generated due to overlapping percentile boundaries.' % num_bins)
    if labels:
        if len(labels) != num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins, labels=group_names, include_lowest=True)
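The two helpers differ only in how the edges are chosen; a short sketch on hypothetical data, assuming both functions are in scope:

import pandas as pd

col = pd.Series([1, 2, 2, 3, 3, 3, 4, 50])
# equal-width: edges evenly spaced over [1, 50], so most values land in bin 0
print(_discretize_by_width(col, num_bins=4, labels=None))
# equal-frequency: edges at the quartiles, so counts per bin are balanced
print(_discretize_by_frequency(col, num_bins=4, labels=None))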
def compute_group(cls, data, scales, **params):
bins = params['bins']
breaks = params['breaks']
binwidth = params['binwidth']
boundary = params['boundary']
func = make_summary_fun(params['fun_data'], params['fun_y'],
params['fun_ymin'], params['fun_ymax'],
params['fun_args'])
breaks = fuzzybreaks(scales.x, breaks, boundary, binwidth, bins)
data['bin'] = pd.cut(data['x'], bins=breaks, labels=False,
include_lowest=True)
def func_wrapper(data):
"""
Add `bin` column to each summary result.
"""
result = func(data)
result['bin'] = data['bin'].iloc[0]
return result
# This is a plyr::ddply
out = groupby_apply(data, 'bin', func_wrapper)
centers = (breaks[:-1] + breaks[1:]) * 0.5
bin_centers = centers[out['bin'].values]
out['x'] = bin_centers
out['bin'] += 1
if isinstance(scales.x, scale_discrete):
out['width'] = 0.9
else:
out['width'] = np.diff(breaks)[bins-1]
return out
def test_facet_grid_expression():
p = g + facet_grid(
['var2', 'pd.cut(var1, (0, 2, 4), include_lowest=True)'])
assert p == 'facet_grid_expression'
def plot_tendency(word, pos_dic, bin_size, output_dir, file_name):
plt.figure()
if word not in pos_dic:
        raise Exception('Word ' + word + ' not found')
df = pd.DataFrame(pos_dic[word], columns=['pos']) # .groupby(['pos'])['pos'].count()
df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size), labels=range(0, 100, bin_size))
df = df.groupby(['bins'])['bins'].count()
ax = df.plot(title="Position du mot '" + word + "' dans les descriptions des offres")
ax.set_xlabel("Position (en % de la longueur de la description)")
ax.set_ylabel("Nombre d'occurrences")
plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
def symbolize(self, xs):
"""
Symbolize a PPA
"""
alphabet_sz = len(self.alphabet)
cutpoints = self.cutpoints[alphabet_sz]
return pd.cut(xs, bins = cutpoints, labels = self.alphabet)
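The method assumes self.cutpoints maps an alphabet size to bin edges. A minimal standalone sketch using the standard SAX Gaussian breakpoints for a four-symbol alphabet (the attribute values here are assumptions, not the class's actual table):

import numpy as np
import pandas as pd

alphabet = ['a', 'b', 'c', 'd']
# Gaussian breakpoints for 4 symbols, padded with infinities so that
# pd.cut covers the whole real line.
cutpoints = [-np.inf, -0.6745, 0.0, 0.6745, np.inf]
xs = [-1.2, -0.3, 0.1, 1.5]
print(pd.cut(xs, bins=cutpoints, labels=alphabet))  # ['a', 'b', 'c', 'd']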
def is_not_uniform(idx, nbins=10, allowed_gap=0.75):
idx_bins = pd.cut(idx, bins=nbins, labels=False)
idx_bin_size = np.bincount(idx_bins)
diff = idx_bin_size[:-1] - idx_bin_size[1:]
monotonic = (diff < 0).all() or (diff > 0).all()
huge_gap = (idx_bin_size.min()*1.0 / idx_bin_size.max()) < allowed_gap
return monotonic or huge_gap
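A quick check on two index distributions (hypothetical data), assuming is_not_uniform is in scope:

import numpy as np

rng = np.random.RandomState(0)
print(is_not_uniform(rng.uniform(0, 1, 10000)))  # roughly equal bins: False expected
print(is_not_uniform(np.arange(100) ** 2))       # heavily skewed bins: True expected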
def make_object_map(data,field,**kwargs):
linear = False
    for key, value in kwargs.items():
        if key == 'linear':
            linear = value
    print(linear)
    if not linear:
colors,rangelist = make_distributed_range(data,field)
else:
colors = get_heatmap51()
colors2 = colors
maxvalue = data[field].max()
if maxvalue < 51:
totallist = range(maxvalue)
colors = reduce_color_list_size(totallist,colors)
colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors)
else:
colors = reduce_color_list_size(range(len(data)),colors)
colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors)
if not rangelist[0] == 0:
rangelist = [0] + rangelist[1:]
data['COLORKEY'] = pd.cut(data[field],bins=rangelist+[1000000000],labels=colors)
return data
colors2 = get_heatmap51()
if not rangelist[0] == 0:
rangelist = [0] + rangelist[1:]
data['COLORKEY'] = pd.cut(data[field],bins=rangelist,labels=colors[1:])
return data
# for a given dataframe and field, returns a not-yet-consumed groupby object that multiple operations can be run on
def process_dataset():
data_dir = os.path.dirname(__file__)
df = pd.read_csv(os.path.join(data_dir, 'data/frisk/frisk_with_noise.dat'), skiprows=6, delim_whitespace=True)
# compute proportion black in precinct, black = 1
# first aggregate by precinct/ethnicity, and sum over populations
popdf = df[['pop', 'precinct', 'eth']]. \
groupby(['precinct', 'eth'])['pop'].apply(sum)
    percent_black = np.array([popdf[i][1] / float(popdf[i].sum())
                              for i in range(1, 76)])
    precinct_type = pd.cut(percent_black, [0, .1, .4, 1.])
df['precinct_type'] = precinct_type.codes[df.precinct.values-1]
return df
def busmap_by_rectangular_grid(buses, divisions=10):
busmap = pd.Series(0, index=buses.index)
if isinstance(divisions, tuple):
divisions_x, divisions_y = divisions
else:
divisions_x = divisions_y = divisions
gb = buses.groupby([pd.cut(buses.x, divisions_x), pd.cut(buses.y, divisions_y)])
    for nk, oks in enumerate(gb.groups.values()):
busmap.loc[oks] = nk
return busmap
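A small sketch of the grid clustering on hypothetical coordinates, assuming busmap_by_rectangular_grid is in scope:

import pandas as pd

buses = pd.DataFrame({'x': [0.10, 0.90, 0.15, 0.85],
                      'y': [0.10, 0.90, 0.12, 0.88]},
                     index=['b1', 'b2', 'b3', 'b4'])
# with a 2x2 grid, b1/b3 share one cell and b2/b4 share another,
# so each pair receives the same cluster id
print(busmap_by_rectangular_grid(buses, divisions=2))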
From test_groupby.py in project PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia).
def test_groupby_categorical_unequal_len(self):
# GH3011
series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
        # the error is only raised when grouping by a Categorical, not by a
        # Series of dtype category
bins = pd.cut(series.dropna().values, 4)
# len(bins) != len(series) here
self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
From test_categorical.py in the same project.
def setUp(self):
self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c',
'c', 'c'])
df = DataFrame({'value': np.random.randint(0, 10000, 100)})
labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
df = df.sort_values(by=['value'], ascending=True)
df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False,
labels=labels)
self.cat = df
From test_categorical.py in the same project.
def test_series_functions_no_warnings(self):
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
with tm.assert_produces_warning(False):
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
labels=labels)
From test_categorical.py in the same project.
def test_assignment_to_dataframe(self):
# assignment
df = DataFrame({'value': np.array(
np.random.randint(0, 10000, 100), dtype='int32')})
labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
df = df.sort_values(by=['value'], ascending=True)
s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
d = s.values
df['D'] = d
str(df)
result = df.dtypes
expected = Series(
[np.dtype('int32'), com.CategoricalDtype()], index=['value', 'D'])
tm.assert_series_equal(result, expected)
df['E'] = s
str(df)
result = df.dtypes
expected = Series([np.dtype('int32'), com.CategoricalDtype(),
com.CategoricalDtype()],
index=['value', 'D', 'E'])
tm.assert_series_equal(result, expected)
result1 = df['D']
result2 = df['E']
self.assertTrue(result1._data._block.values.equals(d))
# sorting
s.name = 'E'
self.assertTrue(result2.sort_index().equals(s.sort_index()))
cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
df = pd.DataFrame(pd.Series(cat))
def bubble_plot(df, x, y, z_boolean=None, ordered_x_values=None, ordered_y_values=None, bins_x=10,
bins_y=10, fontsize=16, figsize=(10,5), maximal_bubble_size=4000,
normalization_by_all = False, log=False):
"""
:param df: dataframe
:param x: name of first numerical/categorical field (string) (for x-axis)
:param y: name of second numerical/categorical field (string) (for y-axis)
:param z_boolean: name of categorical field with two categories / boolean field (for coloring)
:param ordered_x_values: the values we would like to map from x categorical variable
according to the order we would like to present them
:param ordered_y_values: the values we would like to map from the y categorical variable
according to the order we would like to present them
    :param bins_x: the bins for the x values if x is numeric
    :param bins_y: the bins for the y values if y is numeric
    :param normalization_by_all: True shows the joint distribution p(x,y); False shows the conditional distribution p(y|x)
    :param maximal_bubble_size: if the bubbles are too big or too small, this is the parameter to change!
    :param log: whether to apply log to the count (influences the size of the bubbles)
    :return: a bubble plot; bubble size is proportional to the frequency of the bucket :)
"""
plt.figure(figsize=figsize)
x_is_numeric = df[x].dtype in (float, int) and ordered_x_values is None
y_is_numeric = df[y].dtype in (float, int) and ordered_y_values is None
count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]], axis=1)
count_table = count_table.groupby(x)[y].value_counts().unstack().fillna(0)
ordered_x_values = count_table.index.values if ordered_x_values is None else ordered_x_values
ordered_y_values = count_table.columns if ordered_y_values is None else ordered_y_values
if z_boolean is not None:
count_table_long, xticks, yticks, xticklabels, yticklabels = plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size,
normalization_by_all=normalization_by_all)
else:
count_table_long, xticks, yticks, xticklabels, yticklabels = plot_without_z(df, x, y, z_boolean, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values,
normalization_by_all=normalization_by_all, log=log, maximal_bubble_size=maximal_bubble_size )
plt.xticks(xticks, xticklabels,fontsize=fontsize)
plt.yticks(yticks, yticklabels,fontsize=fontsize)
plt.xlabel(x, fontsize=fontsize)
plt.ylabel(y, fontsize=fontsize)
if z_boolean is None:
plt.title("{} vs {} ".format(y,x),fontsize=fontsize+4);
else:
plt.title("{} vs {} and {} (in colors)".format(y,x, z_boolean),fontsize=fontsize+4);
def transform_with_woe(model_data):
cut_point = model_config.logistic_cut
for key in cut_point.keys():
cutss = cut_point[key]['cut_point']
wwoe = cut_point[key]['woe']
        model_data[key] = pd.cut(model_data[key], bins=cutss,
                                 labels=range(len(cutss) - 1)).map(lambda x: wwoe[x])
return model_data
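A standalone sketch of the model_config.logistic_cut structure the function appears to expect (all names and values below are hypothetical):

import pandas as pd

cut_point = {'age': {'cut_point': [0, 30, 60, 100],          # bin edges
                     'woe': {0: -0.25, 1: 0.05, 2: 0.40}}}   # WOE per bin index
model_data = pd.DataFrame({'age': [22, 45, 70]})
for key in cut_point:
    cutss = cut_point[key]['cut_point']
    wwoe = cut_point[key]['woe']
    binned = pd.cut(model_data[key], bins=cutss, labels=range(len(cutss) - 1))
    model_data[key] = binned.map(lambda i: wwoe[i])
print(model_data)  # age replaced by its WOE value: -0.25, 0.05, 0.40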
def create_categorical_features(df, label_list, random_state=None):
"""
Creates random categorical variables
:param df: data frame we're operation on
:param label_list: A list of lists, each list is the labels for one categorical variable
:param random_state: the numpy RandomState
:return: A modified dataframe
Example:
create_categorical_features(df, [['a','b'], ['red','blue']])
"""
random_state = get_random_state(random_state)
df = df.copy()
n_categorical = len(label_list)
# get numeric columns ONCE so we don't have to do it every time we loop:
numer_cols = [col for col in df.select_dtypes(include=['number']).columns if col != 'y']
for i in range(0, n_categorical):
# we might be out of numerical columns!
if not numer_cols:
break
# chose a random numeric column that isn't y
chosen_col = random_state.choice(numer_cols)
# pop the chosen_col out of the numer_cols
numer_cols.pop(numer_cols.index(chosen_col))
# use cut to convert that column to categorical
df[chosen_col] = pd.cut(df[chosen_col], bins=len(label_list[i]), labels=label_list[i])
return df
def binColumns(inputDF, bins=DEFAULT_BIN_COUNT):
columns = inputDF.columns
binned = pd.DataFrame(columns=columns)
for col in columns:
s = inputDF[col]
binned[col] = pd.cut(s, bins, labels=False)
return binned
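A quick check on a hypothetical frame, assuming binColumns is in scope (bins is passed explicitly, so DEFAULT_BIN_COUNT is not needed):

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.5, 9.9], 'b': [10, 20, 30]})
# labels=False makes pd.cut return integer bin indices per column
print(binColumns(df, bins=3))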
# TBD: Finish refactoring this
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
data = pd.read_pickle(file_in)['close']
data = data.reshape(-1, 24)
data = np.array([data[i:i + 24] for i in range(data.shape[0] - 24 + 1)])
data_s = {
'open_price': np.array([data[i][0][0]
for i in range(data.shape[0] - 1)]),
'close_price': np.array([data[i][int(NUM_PIX / 24) - 1][23]
for i in range(data.shape[0] - 1)]),
'max_price': np.array([data[i].max()
for i in range(data.shape[0] - 1)]),
'min_price': np.array([data[i].min()
for i in range(data.shape[0] - 1)]),
'mean_price': np.array([data[i].mean()
for i in range(data.shape[0] - 1)]),
'median_price': np.array([np.median(data[i])
for i in range(data.shape[0] - 1)]),
'buy_or_sell': np.array(
[int(data[i + 1][int(NUM_PIX / 24) - 1][23] > data[i + 1][0][0])
for i in range(data.shape[0] - 1)]),
'change': np.array(
[(data[i + 1][int(NUM_PIX / 24) - 1][23] - data[i + 1][0][0]) /
data[i + 1][int(NUM_PIX / 24) - 1][23] * 100
for i in range(data.shape[0] - 1)])}
data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5,
            0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
bins = [-100, -5, -2, 0, 2, 5, 100]
labels = [-3, -2, -1, 1, 2, 3]
data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
data = data.reshape(len(data), NUM_PIX)
np.save(file_out[0], data[:len(data) - 1])
data_s.to_pickle(file_out[1])
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
data = pd.read_pickle(file_in)['close']
data = np.array([data[i:i + 576] for i in range(data.shape[0] - 576 + 1)])
data = data.reshape(-1, 576)
data_s = {
'open_price': np.array([data[i][0]
for i in range(data.shape[0] - 576)]),
'close_price': np.array([data[i][575]
for i in range(data.shape[0] - 576)]),
'max_price': np.array([data[i].max()
for i in range(data.shape[0] - 576)]),
'min_price': np.array([data[i].min()
for i in range(data.shape[0] - 576)]),
'mean_price': np.array([data[i].mean()
for i in range(data.shape[0] - 576)]),
'median_price': np.array([np.median(data[i])
for i in range(data.shape[0] - 576)]),
'buy_or_sell': np.array(
[int(data[i + 576][575] > data[i + 576][0])
for i in range(data.shape[0] - 576)]),
'change': np.array(
[(data[i + 576][575] - data[i + 576][0]) /
data[i + 576][575] * 100
for i in range(data.shape[0] - 576)])}
data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5,
            0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
bins = [0.01 * x for x in bins]
labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
bins = [-100, -5, -2, 0, 2, 5, 100]
bins = [0.01 * x for x in bins]
labels = [-3, -2, -1, 1, 2, 3]
data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
np.save(file_out[0], data[:len(data) - 576])
data_s.to_pickle(file_out[1])
def period_by_hours(x, separation):
    ''' Aggregate x into periods of hours.
    The computation would be simpler if periods
    were not allowed to span midnight.
    '''
print(separation)
assert isinstance(separation, list)
assert all([sep < 24 for sep in separation])
separation.sort()
if 0 in separation:
separation.append(24)
hour_categ = pd.cut(x.dt.hour, separation, right=False)
date_categ = x.dt.date
return date_categ.astype(str) + ' ' + hour_categ.astype(str)
else:
hour = x.dt.hour
hour_categ = pd.cut(hour, separation, right=False).astype(str)
night_categ = '[' + str(separation[-1]) + ', ' + str(separation[0]) + ')'
hour_categ[(hour < separation[0]) | (hour >= separation[-1])] = night_categ
assert hour_categ.nunique(dropna=False) == len(separation)
date_categ = x.dt.date.astype(str)
        # shift hours before the first boundary back one day
        decale = x.dt.date[x.dt.hour < separation[0]] + pd.DateOffset(days=-1)
        date_categ[x.dt.hour < separation[0]] = decale.astype(str)
assert all(date_categ.str.len() == 10)
return date_categ + ' ' + hour_categ
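A quick check of the midnight-spanning branch on hypothetical timestamps, assuming period_by_hours is in scope:

import pandas as pd

x = pd.Series(pd.to_datetime(['2020-01-01 10:00',
                              '2020-01-01 21:00',
                              '2020-01-01 23:30']))
# boundaries [8, 20) and the wrapping night period [20, 8)
print(period_by_hours(x, [8, 20]))
# expected labels: '2020-01-01 [8, 20)', '2020-01-01 [20, 8)', '2020-01-01 [20, 8)'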
### 4 - special