def draw(self):
    """ Draw a heat map. """
    def get_crosstab(data, row_fact, col_fact, row_names, col_names):
        ct = pd.crosstab(data[row_fact], data[col_fact])
        # reindex() (the replacement for the deprecated reindex_axis())
        # enforces the expected level order and fills missing cells with 0:
        ct = ct.reindex(row_names, axis=0).fillna(0)
        ct = ct.reindex(col_names, axis=1).fillna(0)
        return ct

    def plot(data, color):
        ct = get_crosstab(
            data,
            self._groupby[0],
            self._groupby[1],
            self._levels[0],
            self._levels[1])
        sns.heatmap(ct,
                    robust=True,
                    annot=True,
                    cbar=False,
                    cmap=cmap,
                    fmt="g",
                    vmax=vmax,
                    #ax=plt.gca(),
                    linewidths=1)

    if len(self._groupby) < 2:
        # create a dummy cross tab with one dimension containing empty
        # values:
        data_column = self._table[self._groupby[0]].reset_index(drop=True)
        tab = pd.crosstab(
            pd.Series([""] * len(data_column), name=""),
            data_column)
        plot_facet = lambda data, color: sns.heatmap(
            tab,
            robust=True,
            annot=True,
            cbar=False,
            cmap=cmap,
            fmt="g",
            linewidths=1)
    else:
        plot_facet = plot

    vmax = pd.crosstab(
        [self._table[x] for x in [self._row_factor, self._groupby[0]] if x is not None],
        [self._table[x] for x in [self._col_factor, self._groupby[1]] if x is not None]).values.max()
    cmap = ListedColormap(self.options["color_palette_values"])
    self.map_data(plot_facet)
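# A standalone sketch of the crosstab-with-fixed-level-order pattern used above
# (reindex() is the replacement for the deprecated reindex_axis(); the data and
# level names here are made up for illustration):
import pandas as pd
import seaborn as sns

df = pd.DataFrame({"row_fact": ["a", "b", "a", "c"],
                   "col_fact": ["x", "x", "y", "y"]})
ct = (pd.crosstab(df["row_fact"], df["col_fact"])
        .reindex(["a", "b", "c", "d"], axis=0).fillna(0)
        .reindex(["x", "y", "z"], axis=1).fillna(0))
sns.heatmap(ct, annot=True, cbar=False, fmt="g", linewidths=1)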
def plot_ks_cdf(y_true, y_score, pos_label=1, label_map=None, color_map=None, decimals=0,
                xlabel='Score', ylabel='CumSum', fontsize=12, figsize=(18, 8), close=True):
    '''
    Purpose: plot the per-class cumulative score distributions (CDF curves) underlying
    the KS statistic, and return the KS value, the score at which it is reached, and
    the figure.
    Parameters:
    y_true: array-like or Series of true labels, e.g. {0, 1} or {-1, 1}.
    y_score: array-like or Series of predicted scores, one per sample.
    pos_label: int, the label treated as the positive class.
    label_map: dict mapping labels to display names, e.g. {0: 'Good', 1: 'Bad'}.
    color_map: dict mapping labels to line colors, e.g. {0: 'g', 1: 'r'}.
    decimals: number of decimals the scores are rounded to.
    xlabel: label of the x axis.
    ylabel: label of the y axis.
    fontsize: int, font size.
    figsize: size of the figure.
    close: whether to close the figure after it has been drawn.
    Returns:
    dict with keys {'ks': KS value, 'split': score at which KS is reached, 'fig': the figure}.
    '''
    if label_map is None:
        label_map = {0: 'Good', 1: 'Bad'}
    ks_dict = {}
    y_true = pd.Series(y_true)
    y_score = pd.Series(y_score)
    y_score_dataframe = pd.concat([y_true, y_score], axis=1)
    ks = cal_ks(y_true, y_score_dataframe, pos_label=pos_label, return_split=False, decimals=decimals)
    score_split = cal_ks(y_true, y_score_dataframe, pos_label=pos_label, return_split=True, decimals=decimals)
    # cumulative distribution of the rounded scores for each class
    crossfreq = pd.crosstab(y_score.round(decimals), y_true)
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    # fall back to matplotlib's default colors when no color_map is given
    if color_map is not None:
        color = crossdens.columns.map(lambda xx: color_map.get(xx, None))
    else:
        color = None
    crossdens = crossdens.rename(columns=label_map)
    crossdens.columns.name = ''
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    crossdens.plot(kind='line', ax=ax, fontsize=fontsize, color=color)
    ax.set_xlabel(xlabel, fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    ax.set_title('CDF Curve (KS=%.2f, SPLIT=%.*f)' % (ks, decimals, score_split), fontsize=fontsize)
    if close:
        plt.close('all')
    ks_dict['ks'] = ks
    ks_dict['split'] = score_split
    ks_dict['fig'] = fig
    return ks_dict
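# A minimal standalone sketch of the KS idea behind plot_ks_cdf(), on synthetic data
# (this is an illustration, not the project's cal_ks() implementation):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
y = pd.Series(rng.integers(0, 2, 1000), name="label")               # 0 = good, 1 = bad
score = pd.Series(np.where(y == 1,
                           rng.normal(0.6, 0.2, 1000),
                           rng.normal(0.4, 0.2, 1000)), name="score")
freq = pd.crosstab(score.round(2), y)        # rows: rounded scores, columns: classes
cdf = freq.cumsum(axis=0) / freq.sum()       # per-class cumulative distribution
print("KS = %.3f" % (cdf[1] - cdf[0]).abs().max())   # largest gap between the two CDFs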
def create_crosstabs(model):
    r"""Create cross-tabulations for categorical variables.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the data.

    Returns
    -------
    model : alphapy.Model
        The model object with the updated feature map.

    """
    logger.info("Creating Cross-Tabulations")

    # Extract model data
    X = model.X_train
    y = model.y_train

    # Extract model parameters
    factors = model.specs['factors']
    target_value = model.specs['target_value']

    # Iterate through columns, dispatching and transforming each feature.
    crosstabs = {}
    for fname in X:
        if fname in factors:
            logger.info("Creating crosstabs for feature %s", fname)
            ct = pd.crosstab(X[fname], y).apply(lambda r: r / r.sum(), axis=1)
            crosstabs[fname] = ct

    # Save crosstabs to the feature map
    model.feature_map['crosstabs'] = crosstabs
    return model
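# A small illustration of the row-normalized crosstab built above: each row of the
# result shows, for one level of the factor, the share of samples in each target class
# (the column and label names here are made up for the example):
import pandas as pd

X = pd.DataFrame({"color": ["red", "red", "blue", "blue", "blue", "green"]})
y = pd.Series([1, 0, 0, 0, 1, 1], name="target")
ct = pd.crosstab(X["color"], y).apply(lambda r: r / r.sum(), axis=1)
print(ct)   # e.g. the 'blue' row becomes 0.667 / 0.333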
#
# Function get_factors
#
def concordance(series1, series2, method, nreps=1000):
    """
    Measures the concordance between two pandas Series and returns a pvalue
    and measure of concordance.

    Parameters
    ----------
    series1, series2 : pandas Series
        Series with matching indexes.
    method : str
        ['fisher', 'spearman', 'kendalltau', 'empirical', 'cohen']
    nreps : int
        number of repetitions to build the null. Only needed if method is
        'empirical'

    Returns
    -------
    measure : float
        some sort of measure of concordance (e.g. r for the correlation
        methods, n_observed - mean(n_expected) for empirical, etc)
    p : float
        p value of observed concordance between series1 and series2
    """
    if method == 'fisher':
        # Note: this automatically ignores any bugs which were not present
        # in both series.
        mat = pd.crosstab(series1, series2)
        return fisher_exact(mat)
    elif method == 'spearman':
        return spearmanr(series1, series2)
    elif method == 'kendalltau':
        return kendalltau(series1, series2, nan_policy='omit')
    elif method == 'empirical':
        return empirical_pval(series1, series2, nreps)
    elif method == 'cohen':
        tmp = pd.concat((series1, series2), axis=1).dropna()
        return cohen_kappa_score(tmp.iloc[:, 0], tmp.iloc[:, 1]), np.nan
    else:
        raise ValueError('Unknown concordance method.')
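# Hypothetical usage of concordance(): the 'spearman' branch returns (rho, p) directly
# from scipy, and the 'fisher' branch builds a 2x2 presence/absence table with
# pd.crosstab() before calling fisher_exact(). Assumes the module's own imports
# (pandas, scipy.stats, etc.) are in place:
import pandas as pd

a = pd.Series([1.0, 0.0, 3.0, 2.0, 0.0], index=list("abcde"))
b = pd.Series([2.0, 0.0, 2.5, 1.0, 0.5], index=list("abcde"))
rho, p = concordance(a, b, method="spearman")
odds, p_fisher = concordance(a > 0, b > 0, method="fisher")
print(rho, p, odds, p_fisher)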
clustering_model_kmeans_external.py (project: ML-Predictions, author: ltfschoen)
def process_clustering(self):
    print("K-Means Clustering in progress...")
    dataset_choice = self.prediction_config.DATASET_LOCATION[self.prediction_config.DATASET_CHOICE]
    if "affiliation_column" not in dataset_choice or not dataset_choice["affiliation_column"]:
        return
    # Explore loaded data
    df = self.prediction_data
    target_column = dataset_choice["target_column"]
    affiliation_column = dataset_choice["affiliation_column"]
    centroids_quantity = self.prediction_config.CENTROIDS_QUANTITY

    # Initialise the K-Means Clustering Model with the specified quantity of clusters (centroids)
    # and train it on the whole dataset.
    kmeans_model = KMeans(n_clusters=centroids_quantity, random_state=1)
    df_numeric = df.select_dtypes(include=['int', 'int64', 'float64', 'floating'], exclude=['O'])
    print("Excluding non-numeric columns from K-Means Clustering: ", df.select_dtypes(include=['O']).columns.tolist())
    print("All dtypes: ", dict(df.dtypes))
    print("Any rows null?: ", df.isnull().values.any())
    print("Columns/rows with NaN values: ", df[df.isnull().any(axis=1)])

    # Fit the K-Means model to the DataFrame and compute the Euclidean distance of each row
    # to each cluster (centroid). The result is a NumPy array with one column per cluster
    # (centroid), containing the distance of each row to that centroid.
    # Important Note: pass only numeric DataFrame columns
    clustered_row_distances = kmeans_model.fit_transform(df_numeric)

    # Explore the clusters by cross-tabulating the cluster labels against the unique values
    # of the Affiliation column (i.e. 'party')
    labels = kmeans_model.labels_
    # Show how many are grouped into, say, Cluster 0
    # print(labels.tolist().count(0))
    # Count quantity of unique Clusters
    print("Clusters total count: %r" % (len(labels.tolist())))
    print("Clusters unique count: %r" % (len(set(labels.tolist()))))
    cluster_names = list(map(lambda cluster_name: ("Cluster " + str(cluster_name)) if cluster_name is not None else None, labels))
    print("Cross Tabulation between Clustered Labels and Affiliation i.e. 'party' column: \n%r" % (pd.crosstab(index=labels, columns=df[affiliation_column])))
    if self.prediction_config.PLOT_KMEANS_OUTLIERS:
        self.example_plot_outliers(df, affiliation_column, labels, cluster_names, clustered_row_distances)

    # Generate a new DataFrame column to be used as the Target Column for Prediction Algorithms
    # (i.e. to detect which roll call votes were most likely to cause extremism such
    # that Senators would not vote along their own party lines)
    extremism = (clustered_row_distances ** 3).sum(axis=1)
    df["extremism"] = extremism
    df.sort_values("extremism", inplace=True, ascending=False)
    print("Top 10 observations ranked in order of 'extremism': %r" % (df.head(10)))
    self.prediction_data.df_listings = df
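# A minimal standalone sketch of the same pattern (hypothetical column names,
# synthetic data): cluster numeric features, cross-tabulate the cluster labels
# against a categorical column, then rank rows by their distance to all centroids:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

rng = np.random.default_rng(1)
df = pd.DataFrame({"x1": rng.normal(size=100),
                   "x2": rng.normal(size=100),
                   "party": rng.choice(["D", "R"], size=100)})
kmeans = KMeans(n_clusters=2, random_state=1, n_init=10)
distances = kmeans.fit_transform(df[["x1", "x2"]])    # one column of distances per centroid
print(pd.crosstab(index=kmeans.labels_, columns=df["party"]))
df["extremism"] = (distances ** 3).sum(axis=1)        # same cubed-distance score as above
print(df.sort_values("extremism", ascending=False).head())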
def fishers_exact_plot(data, condition1, condition2, ax=None,
                       condition1_value=None,
                       alternative="two-sided", **kwargs):
    """
    Perform a Fisher's exact test to compare two binary columns

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from
    condition1: str
        First binary column to compare (and used for test sidedness)
    condition2: str
        Second binary column to compare
    ax : Axes, default None
        Axes to plot on
    condition1_value:
        If `condition1` is not a binary column, split on =/!= to condition1_value
    alternative:
        Specify the sidedness of the test: "two-sided", "less"
        or "greater"
    """
    plot = sb.barplot(
        x=condition1,
        y=condition2,
        ax=ax,
        data=data,
        **kwargs
    )
    plot.set_ylabel("Percent %s" % condition2)
    condition1_mask = get_condition_mask(data, condition1, condition1_value)
    count_table = pd.crosstab(data[condition1], data[condition2])
    print(count_table)
    oddsratio, p_value = fisher_exact(count_table, alternative=alternative)
    add_significance_indicator(plot=plot, significant=p_value <= 0.05)
    only_percentage_ticks(plot)

    if alternative != "two-sided":
        raise ValueError("We need to better understand the one-sided Fisher's Exact test")
    sided_str = "two-sided"
    print("Fisher's Exact Test: OR: {}, p-value={} ({})".format(oddsratio, p_value, sided_str))
    return FishersExactResults(oddsratio=oddsratio,
                               p_value=p_value,
                               sided_str=sided_str,
                               with_condition1_series=data[condition1_mask][condition2],
                               without_condition1_series=data[~condition1_mask][condition2],
                               plot=plot)
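# A minimal sketch of the statistical core of fishers_exact_plot(): a 2x2 contingency
# table built from two binary columns and SciPy's Fisher's exact test
# (the column names and data are hypothetical):
import pandas as pd
from scipy.stats import fisher_exact

data = pd.DataFrame({"mutated":   [True, True, False, False, True, False],
                     "responded": [True, False, False, True, True, False]})
count_table = pd.crosstab(data["mutated"], data["responded"])
oddsratio, p_value = fisher_exact(count_table, alternative="two-sided")
print(count_table)
print("OR=%.2f, p-value=%.3f" % (oddsratio, p_value))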
def rfFitScore(clf, dftrain, dftrain_y, dftest, dftest_y):
    '''random forest classifier fit and score.
    clf=RandomForestClassifier, dftrain=train data,
    dftrain_y=train data Y, dftest=test data,
    dftest_y=test data Y'''
    clfit = clf.fit(dftrain, dftrain_y['Y'])  # clf.fit(X, y)
    imp = clfit.feature_importances_  # ndarray of 562
    # clfit.fit_transform( X, y=None )  # returns X_new
    new_y = clfit.predict( dftest )  # returns predicted Y
    test_score = clfit.score( dftest, dftest_y['Y'] )
    print("test score:", test_score)  # clfit.oob_score_
    if (clf.oob_score):
        print("oob score", clfit.oob_score_)

    # calculate test score by other means
    print("predict True %.3f percent, %d out of %d" % \
        ((100 * sum(dftest_y['Y'] == new_y) / dftest_y.shape[0]), \
        sum(dftest_y['Y'] == new_y), dftest_y.shape[0]))
    print("predict False %.3f percent, %d out of %d" % \
        ((100 * sum(dftest_y['Y'] != new_y) / dftest_y.shape[0]), \
        sum(dftest_y['Y'] != new_y), dftest_y.shape[0]))

    # new_p = clfit.predict_proba( dftest )
    # # probability of each X variable to predict each y class
    # print("test predict probabilities head:\n", new_p[:5])

    # cross table of actual vs. predicted labels
    ptab = pd.crosstab(dftest_y['Y'], new_y, \
        rownames=['actual'], colnames=['predicted'])
    print("cross table:\n", ptab)

    # accuracy: percent labeled correctly
    # precision: true positives / (true positives + false positives)
    # recall: true positives / (true positives + false negatives)
    precision, recall, fbeta, support = prfs(dftest_y['Y'], new_y)
    print("precision", precision, "\nrecall", recall, \
        "\nfbeta", fbeta, "\nsupport", support)

    if (clf.oob_score):
        return test_score, imp, clfit.oob_score_
    else:
        return test_score, imp
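# A tiny self-contained illustration of the crosstab confusion table printed above:
import pandas as pd

actual = pd.Series([1, 0, 1, 1, 0, 0], name="actual")
predicted = pd.Series([1, 0, 0, 1, 0, 1], name="predicted")
print(pd.crosstab(actual, predicted, rownames=['actual'], colnames=['predicted']))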
def get_data():
    f_path = "../dataset/logistic_regression/UCLA_dataset.csv"
    df = pd.read_csv(f_path)
    print(df.head())
    print(df.describe())
    print(df.std())
    print(pd.crosstab(df['admit'], df['rank'], rownames=['admit']))
    # df.hist()
    # pl.show()
    # dummy_ranks = pd.get_dummies(df['rank'], prefix='rank')
    # print(dummy_ranks.head())
    # train_cols = df.columns[1:]
    # lr = sm.Logit(df['admit'], df[train_cols])
    # ret = lr.fit()
    # print(ret.summary())
    train, test = train_test_split(df, test_size=0.2)
    train_x, train_y = train[train.columns[1:]], train['admit']
    test_x, test_y = test[test.columns[1:]], test['admit']

    lr = LogisticRegression()
    lr.fit(train_x, train_y)
    y_pred = lr.predict(test_x)
    print(accuracy_score(test_y, y_pred))

    rf = RandomForestClassifier(n_jobs=4)
    rf.fit(train_x, train_y)
    Y_pred = rf.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, Y_pred)
    print(cnf_matrix)
    # accuracy_score() and recall_score() return fractions, so scale to percentages
    accuracy_percent = accuracy_score(test_y, Y_pred) * 100
    print("accuracy is: %s%s" % (accuracy_percent, '%'))
    recall_percent = recall_score(test_y, Y_pred) * 100
    print("recall is: %s%s" % (recall_percent, '%'))