def visualize_results(self):
# Visualize logistic curve using seaborn
sns.set(style="darkgrid")
sns.regplot(x="pageviews_cumsum",
y="is_conversion",
data=self.df,
logistic=True,
n_boot=500,
y_jitter=.01,
scatter_kws={"s": 60})
sns.set(font_scale=1.3)
    # Note: newer seaborn versions no longer expose pyplot as sns.plt; call matplotlib.pyplot directly.
    plt.title('Logistic Regression Curve')
    plt.ylabel('Conversion probability')
    plt.xlabel('Cumulative sum of pageviews')
    plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
    plt.show()
Python set() usage examples (source code)
business_case_solver.py (project: themarketingtechnologist, author: thomhopmans)
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def outlier_identification(self, model, x_train, y_train):
# Split the training data into an extra set of test
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print('\nOutlier shapes')
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
model.fit(x_train_split, y_train_split)
y_predicted = model.predict(x_test_split)
residuals = np.absolute(y_predicted - y_test_split)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
outliers_mask = residuals >= rmse_pred_vs_actual
outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
not_an_outlier = outliers_mask == 0
# Resample the training set from split, since the set was randomly split
x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
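A minimal usage sketch for the residual-based filter above, assuming an instance of the surrounding class (called analysis here) and any scikit-learn style regressor; RandomForestRegressor and the array names are illustrative assumptions, not part of the original project.
from sklearn.ensemble import RandomForestRegressor
# Placeholder instance and arrays; any estimator with fit/predict should work here.
x_clean, y_clean = analysis.outlier_identification(RandomForestRegressor(n_estimators=100), x_train, y_train)
print(np.shape(x_clean), np.shape(y_clean))  # rows whose held-out residual exceeds the RMSE are dropped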
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
y_test_split, title_name):
# Split the training data into an extra set of test
# x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
dtest_split = xgb.DMatrix(x_test_split)
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
y_predicted = gbdt.predict(dtest_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual y')
plt.ylabel('Predicted y')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def __init__(self, parent):
fig = Figure(figsize=(4, 4), dpi=100, tight_layout=True)
super(DefaultGraph, self).__init__(fig)
self.setParent(parent)
sns.set(style="dark")
for index, s in zip(range(9), np.linspace(0, 3, 10)):
axes = fig.add_subplot(3, 3, index + 1)
x, y = np.random.randn(2, 50)
cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True)
sns.kdeplot(x, y, cmap=cmap, shade=True, cut=5, ax=axes)
axes.set_xlim(-3, 3)
axes.set_ylim(-3, 3)
axes.set_xticks([])
axes.set_yticks([])
fig.suptitle("Activity Browser", y=0.5, fontsize=30, backgroundcolor=(1, 1, 1, 0.5))
self.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Expanding)
self.updateGeometry()
def plot_correlation_fig(data):
"""
Creates a correlation heat map for all columns in user data.
Parameters
----------
data: Pandas DataFrame
User data file as a Pandas DataFrame
Returns
-------
Matplotlib Figure object.
"""
sns.set(context='talk', style='white')
fig = plt.figure()
sns.heatmap(data.corr(), vmin=-1, vmax=1)
plt.tight_layout()
return fig
def plot_count_fig(tasks):
"""
Create count plot, as a 2-row x 3-col bar plot of data points for each k in each covar.
Parameters
----------
tasks: list(dict)
Returns
-------
Matplotlib Figure object.
"""
sns.set(context='talk', style='whitegrid')
df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied']))
df = df.loc[:, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']]
df['covar_type'] = [x.capitalize() for x in df['covar_type']]
df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
f = sns.factorplot(x='k', kind='count', col='covar_type', row='covar_tied', data=df,
row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'], legend=True, legend_out=True,
palette='Blues_d')
f.set_titles("{col_name}-{row_name}")
f.set_xlabels("Num. of Clusters (K)")
return f.fig
def generateRawPlot(test):
# set figure size
plt.figure(figsize=(15, 6))
handles = []
# draw plot
for raw in test:
label = raw.pop(0)
xAxis = range(len(raw))
yAxis = [float(i) for i in raw]
handle, = plt.plot(xAxis, yAxis, label=label)
handles.append(handle)
# put axis labels
plt.xlabel("operations")
plt.ylabel("time (s)")
plt.legend(handles=handles)
def generateMassPlot(test):
# set figure size
plt.figure(figsize=(15, 6))
handles = []
# draw plot
for raw in test:
label = raw.pop(0)
yAxis = [i / (len(raw)) for i in range(len(raw) + 1)]
values = sorted([float(i) for i in raw])
xAxis = [0] + values
handle, = plt.plot(xAxis, yAxis, label=label)
handles.append(handle)
# put axis labels
plt.xlabel("time (s)")
plt.ylabel("probability of completion")
plt.legend(handles=handles)
def cor_df(data, cols=None, xticklabels=False, yticklabels=False, close=True):
    '''
    Purpose: compute and plot a correlation-matrix heatmap for the selected columns.
    Inputs:
        data: input data, as a DataFrame.
        cols: columns to include, as a list; defaults to all columns of data.
        close: whether to close the figure after it is drawn.
    Outputs:
        corrmat: the correlation matrix, as a DataFrame.
        fig: the heatmap figure object.
    '''
if cols is None:
cols=list(data.columns)
corrmat = data[cols].corr()
fig = plt.figure()
ax = fig.add_subplot(111)
sns.set(context='paper', font='monospace')
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax, xticklabels=xticklabels, yticklabels=yticklabels)
ax.set_title('Heatmap of Correlation Matrix')
if close:
plt.close('all')
return corrmat, fig
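A short, hedged example of calling cor_df; the DataFrame and column names below are made up for illustration.
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randn(100, 4), columns=['a', 'b', 'c', 'd'])
corrmat, fig = cor_df(df, cols=['a', 'b', 'c'], xticklabels=True, yticklabels=True)
print(corrmat.round(2))  # correlation matrix of the selected columns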
#Distribution
def benchmark_spark(ratings, factors, iterations=5):
conf = (SparkConf()
.setAppName("implicit_benchmark")
.setMaster('local[*]')
.set('spark.driver.memory', '16G')
)
context = SparkContext(conf=conf)
spark = SparkSession(context)
times = {}
try:
ratings = convert_sparse_to_dataframe(spark, context, ratings)
for rank in factors:
als = ALS(rank=rank, maxIter=iterations,
alpha=1, implicitPrefs=True,
userCol="row", itemCol="col", ratingCol="data")
start = time.time()
als.fit(ratings)
elapsed = time.time() - start
times[rank] = elapsed / iterations
print("spark. factors=%i took %.3f" % (rank, elapsed/iterations))
finally:
spark.stop()
return times
def generate_speed_graph(data, filename="als_speed.png", keys=['gpu', 'cg2', 'cg3', 'cholesky'],
labels=None, colours=None):
labels = labels or {}
colours = colours or {}
seaborn.set()
fig, ax = plt.subplots()
factors = data['factors']
for key in keys:
ax.plot(factors, data[key],
color=colours.get(key, COLOURS.get(key)),
marker='o', markersize=6)
ax.text(factors[-1] + 5, data[key][-1], labels.get(key, LABELS[key]), fontsize=10)
ax.set_ylabel("Seconds per Iteration")
ax.set_xlabel("Factors")
plt.savefig(filename, bbox_inches='tight', dpi=300)
def find_n_most_similar_articles(self):
"""
Find the n most similar articles with the highest similarity score for each article in the DataFrame.
:return:
"""
# Iterate over each article in DataFrame
for index, row in self.df_article_vectors.iterrows():
# Get the similarity scores of the current article compared to all other articles
similarity_scores = self.similarity_score_dict[index]
# Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
for i in range(0, self.n_most_similar):
# Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
most_similar_article_score = similarity_scores[most_similar_article_index]
del similarity_scores[most_similar_article_index]
# Find corresponding title and set it as most similar article i in DataFrame
title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
            # DataFrame.set_value was removed in newer pandas; .at is the supported way to set a single cell
            self.df_article_vectors.at[index, 'most_similar_' + str(i + 1)] = title_plus_score
def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nl.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
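A hedged usage sketch for tokenize; because the two set() calls de-duplicate tokens, the order of the returned stems is not deterministic.
print(tokenize("cats are running faster than dogs"))
# e.g. ['cat', 'run', 'faster', 'dog']; order may vary due to the set() de-duplication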
def outlier_prediction(x_train, y_train):
# Use built-in isolation forest or use predicted vs. actual
# Compute squared residuals of every point
# Make a threshold criteria for inclusion
# The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
rng = np.random.RandomState(42)
clf_all_features = IsolationForest(max_samples=100, random_state=rng)
clf_all_features.fit(x_train)
# Predict if a particular sample is an outlier using all features for higher dimensional data set.
y_pred_train = clf_all_features.predict(x_train)
# Exclude suggested outlier samples for improvement of prediction power/score
    # Note: np.array(map(...)) yields a 0-d object array on Python 3; use a vectorized comparison instead
    outlier_map_out_train = y_pred_train == 1
x_train_modified = x_train[outlier_map_out_train, ]
y_train_modified = y_train[outlier_map_out_train, ]
return x_train_modified, y_train_modified
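A minimal sketch of calling outlier_prediction on random data; the shapes and variable names are assumptions for illustration.
import numpy as np
x = np.random.randn(500, 10)
y = np.random.randn(500)
x_in, y_in = outlier_prediction(x, y)
print(x_in.shape, y_in.shape)  # usually slightly fewer than 500 rows remain after removing suspected outliers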
def drop_variable(self, df):
# if HousePrices._is_one_hot_encoder:
# Drop all categorical feature helping columns ('Num')
# Todo: is it defined when importing data set? _feature_names_num
# for feature_name in HousePrices._feature_names_num:
# df = df.drop([feature_name], axis=1)
# is_with_feature_agglomeration = 0
# if is_with_feature_agglomeration:
# print(df.shape)
# df = HousePrices.feature_agglomeration(df)
# print(df.shape)
# df = df.drop(['Fireplaces'], axis=1)
df = df.drop(['Id'], axis=1)
if not any(tuple(df.columns == 'SalePrice')):
        # All feature variable names occurring in the test data are assigned to the public variable df_test_all_feature_var_names.
self.df_test_all_feature_var_names = df.columns
return df
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
# Split the training data into an extra set of test
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
0.3, 0.6, 1],
max_iter=50000, cv=10)
# lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
# 0.3, 0.6, 1], cv=10)
lasso.fit(x_train_split, y_train_split)
y_predicted = lasso.predict(X=x_test_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
# Split the training data into an extra set of test
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
dtest_split = xgb.DMatrix(x_test_split)
res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
y_predicted = gbdt.predict(dtest_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def ranges_to_list(x, start=0, stop=None):
s = set()
for xi in x:
xi = str(xi)
if xi.find('-') >= 0:
t = xi.split('-')
if len(t) != 2:
raise ValueError('Invalid range!')
if len(t[0]) == 0:
t[0] = start
if len(t[1]) == 0:
t[1] = stop
s |= set(range(int(t[0]), int(t[1]) + 1))
else:
s.add(int(xi))
s = sorted(list(s))
return s
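A quick example of the range expansion, assuming stop caps open-ended ranges:
print(ranges_to_list(['1-3', '7', '10-'], start=0, stop=12))
# [1, 2, 3, 7, 10, 11, 12]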
def scoped_mpl_import():
import matplotlib
matplotlib.rcParams['backend'] = MPL_BACKEND
import matplotlib.pyplot as plt
plt.rcParams['toolbar'] = 'None' # mute matplotlib toolbar
import seaborn as sns
sns.set(style="whitegrid", color_codes=True, font_scale=1.0,
rc={'lines.linewidth': 1.0,
'backend': matplotlib.rcParams['backend']})
palette = sns.color_palette("Blues_d")
palette.reverse()
sns.set_palette(palette)
return (matplotlib, plt, sns)
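A hedged usage sketch; MPL_BACKEND is assumed to be defined at module level (for example 'Agg' for headless rendering).
MPL_BACKEND = 'Agg'  # assumption: the real module defines this constant elsewhere
matplotlib, plt, sns = scoped_mpl_import()
fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 4])
fig.savefig('example.png')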
def cross_section_cndl(data, factor_name):
    '''Plot the cross-sectional distribution of a single factor as a box plot grouped by date.
    Parameters
    ------------------------------
    data: DataFrame(index:[Date,IDs], factor1, factor2, ...)
    factor_name: str
    '''
data = data.reset_index()
sns.set(style='ticks')
ax = sns.boxplot(x='Date', y=factor_name, data=data, palette='PRGn')
sns.despine(offset=10, trim=True)
return ax
def factor_plot(dataFrame, factors, prediction, color="Set3"):
# First, plot the total for each factor. Then, plot the total for each
# factor for the prediction variable (so in a conversion example, how
# many people converted, revenue per country, etc.)
# These refer to the rows and columns of the axis numpy array; not the
# data itself.
row = 0
column = 0
sns.set(style="whitegrid")
# TODO: Set the width based on the max number of unique
# values for the factors.
plots = plt.subplots(len(factors), 2, figsize=(8,12))
# It should
for factor in factors:
sns.countplot(x=factor, palette="Set3", data=dataFrame,
ax=plots[1][row][column])
# Then print the total for each prediction
sns.barplot(x=factor, y=prediction, data=dataFrame,
ax=plots[1][row][column+1])
row += 1
plt.tight_layout() # Need this or else plots will crash into each other
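A hedged example call for factor_plot; the DataFrame and column names are illustrative assumptions, with a binary prediction column.
import pandas as pd
df = pd.DataFrame({'country': ['US', 'US', 'NL', 'DE', 'NL', 'US'],
                   'device': ['mobile', 'desktop', 'mobile', 'mobile', 'desktop', 'desktop'],
                   'converted': [1, 0, 0, 1, 1, 0]})
factor_plot(df, factors=['country', 'device'], prediction='converted')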
def swarm(data,x,y,xscale='linear',yscale='linear'):
# set default pretty settings from Seaborn
sns.set(style="white", palette="muted")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 0.2})
    # create the plot
g = sns.swarmplot(x=x, y=y, data=data, palette='RdYlGn')
plt.tick_params(axis='both', which='major', pad=10)
g.set(xscale=xscale)
g.set(yscale=yscale)
# Setting plot limits
start = data[y].min().min()
plt.ylim(start,);
sns.despine()
def correlation(data,title=''):
corr = data.corr(method='spearman')
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.set(style="white")
sns.set_context("notebook", font_scale=2, rc={"lines.linewidth": 0.3})
rcParams['figure.figsize'] = 25, 12
rcParams['font.family'] = 'Verdana'
rcParams['figure.dpi'] = 300
g = sns.heatmap(corr, mask=mask, linewidths=1, cmap="RdYlGn", annot=False)
g.set_xticklabels(data,rotation=25,ha="right");
plt.tick_params(axis='both', which='major', pad=15);
def plot(self, ax=None, holdon=False):
sns.set(style="white")
data = self.X
if ax is None:
_, ax = plt.subplots()
for i, index in enumerate(self.clusters):
point = np.array(data[index]).T
ax.scatter(*point, c=sns.color_palette("hls", self.K + 1)[i])
for point in self.centroids:
ax.scatter(*point, marker='x', linewidths=10)
if not holdon:
plt.show()
def plot_mds(subjects, experiments, axes):
for subj, exp, ax in zip(subjects, experiments, axes):
res_fname = "correlation_analysis/{}_{}_ifs.pkz".format(subj, exp)
res = moss.load_pkl(res_fname)
sorter = np.argsort(np.abs(res.prefs))
x_, y_ = res.mds_coords.T.dot(res.prefs)
t = np.arctan2(y_, x_)
rot = [[np.cos(t), np.sin(t)], [-np.sin(t), np.cos(t)]]
x, y = np.dot(rot, res.mds_coords[sorter].T)
cmap = get_colormap(exp)
ax.scatter(x, y, c=res.prefs[sorter],
cmap=cmap, vmin=-1.75, vmax=1.75,
s=8, linewidth=0)
ax.set(xlim=(-.9, .9), ylim=(-.9, .9), aspect="equal")
ax.set_axis_off()
def visualize_housing_data(df):
sns.set(style='whitegrid', context='notebook')
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
sns.pairplot(df[cols], size=2.5)
plt.show()
correlation_matrix = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.5)
heatmap = sns.heatmap(
correlation_matrix,
cbar=True,
annot=True,
square=True,
fmt='.2f',
annot_kws={'size': 15},
yticklabels=cols,
xticklabels=cols,
)
plt.show()
business_case_solver_without_classes.py (project: themarketingtechnologist, author: thomhopmans)
def visualize_results(df):
# Visualize logistic curve using seaborn
sns.set(style="darkgrid")
sns.regplot(x="pageviews_cumsum",
y="is_conversion",
data=df,
logistic=True,
n_boot=500,
y_jitter=.01,
scatter_kws={"s": 60})
sns.set(font_scale=1.3)
    # Note: newer seaborn versions no longer expose pyplot as sns.plt; call matplotlib.pyplot directly.
    plt.title('Logistic Regression Curve')
    plt.ylabel('Conversion probability')
    plt.xlabel('Cumulative sum of pageviews')
    plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
    plt.show()
# Run the final program
def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nltk.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
def find_n_most_similar_articles(self):
"""
Find the n most similar articles with the highest similarity score for each TMT article in the DataFrame.
:return:
"""
# Iterate over each article in DataFrame
for index, row in self.df_article_vectors.iterrows():
# Get the similarity scores of the current article compared to all other articles
similarity_scores = self.similarity_score_dict[index]
# Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
for i in range(0, self.n_most_similar):
# Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
most_similar_article_score = similarity_scores[most_similar_article_index]
del similarity_scores[most_similar_article_index]
# Find corresponding title and set it as most similar article i in DataFrame
title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
            # DataFrame.set_value was removed in newer pandas; .at is the supported way to set a single cell
            self.df_article_vectors.at[index, 'most_similar_' + str(i + 1)] = title_plus_score
def image(path, costs):
ys = ['0', '1', '2', '3', '4', '5', '6', '7+', 'X']
xs = [costs.get(k, 0) for k in ys]
sns.set_style('white')
sns.set(font='Concourse C3', font_scale=3)
g = sns.barplot(ys, xs, palette=['grey'] * len(ys))
g.axes.yaxis.set_ticklabels([])
rects = g.patches
sns.set(font='Concourse C3', font_scale=2)
for rect, label in zip(rects, xs):
if label == 0:
continue
height = rect.get_height()
g.text(rect.get_x() + rect.get_width()/2, height + 0.5, label, ha='center', va='bottom')
g.margins(y=0, x=0)
sns.despine(left=True, bottom=True)
g.get_figure().savefig(path, transparent=True, pad_inches=0, bbox_inches='tight')
plt.clf() # Clear all data from matplotlib so it does not persist across requests.
return path
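A hedged usage sketch for image; the cost histogram below is made-up example data (note the function expects the 'Concourse C3' font to be installed).
costs = {'0': 1, '1': 4, '2': 7, '3': 5, '4': 3, '5': 2, '7+': 1}
image('curve.png', costs)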