def get_python_guangzhou():
    """Render a stacked bar chart of salary by experience for Python postings.

    Reads the module-level ``frame``, keeps rows whose ``kd`` is ``'Python'``
    and whose ``city`` equals the literal below, restricts them to postings
    published after 2016-04-01 and no later than 2016-05-02, cross-tabulates
    ``experience`` against ``salary`` and writes the chart to
    ``chart/guangzhou_salary.svg`` next to this file.
    """
    # NOTE(review): the city literal and the chart title look mis-encoded
    # ('?' characters); presumably they were non-ASCII text -- confirm
    # against the original file before relying on the filter.
    frame2 = frame[(frame.kd == 'Python') & (frame.city == u'??')]
    records = list(frame2.to_dict(orient='index').values())
    pattern = r'\d{4}-\d{2}-\d{2}'
    for record in records:
        # Values that do not start with YYYY-MM-DD are replaced by today's
        # UTC date so they can be parsed as dates below.
        if not re.match(pattern, record['published']):
            record['published'] = datetime.datetime.utcnow().strftime("%Y-%m-%d")
    df = DataFrame(records)
    df['published'] = pd.to_datetime(df['published'])
    mask = (df['published'] > '2016-04-01') & (df['published'] <= '2016-05-02')
    dataframe = df.loc[mask]
    # Both crosstab inputs must come from the same filtered frame; pairing
    # with the unfiltered global ``frame.salary`` matched unrelated rows
    # by index label.
    jobframe = pd.crosstab(
        dataframe.experience, dataframe.salary, margins=True
    ).sort_values(by='All', ascending=False)
    jobframe = jobframe.drop('All', axis=0).drop('All', axis=1)
    pie_chart = pygal.StackedBar()
    pie_chart.title = u'???python?????'
    pie_chart.x_labels = jobframe.index
    # ``items`` replaces ``iteritems``, which was removed in pandas 2.0.
    for cit, num in jobframe.items():
        pie_chart.add("%s" % (cit), num)
    pie_chart.render_to_file(os.path.dirname(__file__) + '/chart/guangzhou_salary.svg')
# Example source code for Python's crosstab() (heading carried over from the scraped page).
def flavor_profile(df, ingr, comp, ingr_comp):
    """Multiply ``df`` by an ingredient-by-compound count matrix.

    Args:
        df: DataFrame whose column labels are ingredient names; spaces in
            the labels are turned into underscores before matching.
        ingr: DataFrame joined on ``'# ingredient id'``; must provide an
            ``'ingredient name'`` column.
        comp: DataFrame joined on ``'compound id'``.
        ingr_comp: DataFrame with ``'# ingredient id'`` and ``'compound id'``
            columns linking the two tables.

    Returns:
        numpy.ndarray: ``df.values`` dotted with the crosstab of ingredient
        name against compound id, restricted to the ingredients in ``df``.
    """
    sorted_ingredients = df.columns
    underscore_ingredients = [item.replace(' ', '_') for item in sorted_ingredients]
    # Single-argument print() produces the same output on Python 2 and 3.
    print("%s %s" % (len(underscore_ingredients), len(sorted_ingredients)))

    ingr_total = ingr_comp.join(ingr, how='right', on='# ingredient id')
    ingr_total = ingr_total.join(comp, how='right', on='compound id')

    ingr_pivot = pd.crosstab(ingr_total['ingredient name'], ingr_total['compound id'])
    # NOTE(review): the dot product assumes the filtered crosstab rows line
    # up one-to-one, in order, with the columns of ``df`` -- confirm.
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]

    df_flavor = df.values.dot(ingr_flavor.values)
    print("%s %s" % (df.shape, df_flavor.shape))
    return df_flavor
#normalize flavor matrix with tfidf method
def get_crosstab(self,X,y):
    '''
    Cross-tabulate feature values against the target ``y``.

    X: one-dimensional input (for example a Series) or a two-dimensional
       DataFrame / array.
    y: target values passed straight to ``pd.crosstab``.

    Returns a single crosstab when X is one-dimensional. Otherwise returns
    a dict mapping each feature to its crosstab; the features come from
    ``self.feature_names`` when it is set, else from the DataFrame columns
    or the positional column indices of the array.
    '''
    if len(X.shape) == 1:
        return pd.crosstab(X, y)

    is_frame = isinstance(X, pd.DataFrame)
    feature_names = self.feature_names
    if feature_names is None:
        feature_names = list(X.columns) if is_frame else list(range(X.shape[1]))

    if is_frame:
        return {feature: pd.crosstab(X[feature], y) for feature in feature_names}
    return {feature: pd.crosstab(X[:, feature], y) for feature in feature_names}
def make(T):
    """Build per-user frequency features and pickle them.

    Keeps the rows of the module-level ``log`` whose ``order_number_rev``
    exceeds ``T``, cross-tabulates ``user_id`` against ``order_dow``,
    ``timezone`` and ``dow_tz`` (raw counts and row-normalised shares),
    concatenates the six tables column-wise and writes the result, with the
    index reset, to ``../feature/trainT-<T>/f103_user.p``.
    """
    log_tr = log[log.order_number_rev > T]
    users = log_tr.user_id

    # (values, prefix for raw counts, prefix for row-normalised shares)
    specs = [
        (log_tr.order_dow, 'user_dow_freq_', 'user_dow_norm_'),
        (log_tr.timezone, 'user_timezone_freq_', 'user_timezone_norm_'),
        (log_tr.dow_tz, 'user_dow-tz_freq_', 'user_dow-tz_norm_'),
    ]

    parts = []
    for values, freq_prefix, norm_prefix in specs:
        parts.append(pd.crosstab(users, values).add_prefix(freq_prefix))
        parts.append(
            pd.crosstab(users, values, normalize='index').add_prefix(norm_prefix)
        )

    tab = pd.concat(parts, axis=1)
    tab.reset_index().to_pickle('../feature/trainT-{}/f103_user.p'.format(T))
def flavor_profile(df, ingr, comp, ingr_comp):
    """Multiply ``df`` by an ingredient-by-compound count matrix.

    Args:
        df: DataFrame whose column labels are ingredient names; spaces in
            the labels are turned into underscores before matching.
        ingr: DataFrame joined on ``'# ingredient id'``; must provide an
            ``'ingredient name'`` column.
        comp: DataFrame joined on ``'compound id'``.
        ingr_comp: DataFrame with ``'# ingredient id'`` and ``'compound id'``
            columns linking the two tables.

    Returns:
        numpy.ndarray: ``df.values`` dotted with the crosstab of ingredient
        name against compound id, restricted to the ingredients in ``df``.
    """
    sorted_ingredients = df.columns
    underscore_ingredients = [item.replace(' ', '_') for item in sorted_ingredients]
    # Single-argument print() produces the same output on Python 2 and 3.
    print("%s %s" % (len(underscore_ingredients), len(sorted_ingredients)))

    ingr_total = ingr_comp.join(ingr, how='right', on='# ingredient id')
    ingr_total = ingr_total.join(comp, how='right', on='compound id')

    ingr_pivot = pd.crosstab(ingr_total['ingredient name'], ingr_total['compound id'])
    # NOTE(review): the dot product assumes the filtered crosstab rows line
    # up one-to-one, in order, with the columns of ``df`` -- confirm.
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]

    df_flavor = df.values.dot(ingr_flavor.values)
    print("%s %s" % (df.shape, df_flavor.shape))
    return df_flavor
#normalize flavor matrix with tfidf method
def ptr_stats(df):
    """Print and save a crosstab of four case fields against bail type.

    The rows are the combinations of the four fields listed below, the
    columns are the values of ``'bail type made simple'``, and margins are
    included. The table is printed and written to ``ptr_stats.csv``.
    """
    row_fields = [
        'CASE DISPOSED STATUS',
        'HCJ Booked',
        'MADE Y / N',
        'PRETRIAL STATUS AT DISPOSITION',
    ]
    column_field = 'bail type made simple'

    df = df[row_fields + [column_field]]
    crosstab = pd.crosstab(
        [df[field] for field in row_fields],
        df[column_field],
        margins=True,
    )
    print(crosstab)
    crosstab.to_csv('ptr_stats.csv')
def train_model(split=.25):
    """Train a random forest classifier on the iris dataset.

    The iris data is split into train and test sets, a
    ``RandomForestClassifier`` is fitted on the train set and used to
    predict the test set. A confusion matrix and the feature importances
    are computed from the result.

    Args:
        split (float): Fraction of observations in the test dataset.

    Returns:
        RandomForestClassifier: Trained model.
        pandas.DataFrame: Confusion matrix (actual vs. predicted species).
        list: ``(feature name, importance)`` pairs.
        pandas.Index: Feature names with whitespace runs replaced by ``_``
            and remaining non-word characters removed.
    """
    iris = load_iris()
    all_data = pd.DataFrame(iris.data, columns=iris.feature_names)
    # regex=True is required: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default. Raw strings avoid invalid
    # escape sequences.
    features = (all_data.columns
                .str.replace(r'\s+', '_', regex=True)
                .str.replace(r'\W+', '', regex=True))
    all_data['species'] = pd.Categorical.from_codes(iris.target,
                                                    iris.target_names)
    train, test = train_test_split(all_data, test_size=split)
    train_features = train.drop('species', axis=1)
    clf = RandomForestClassifier(n_jobs=1)
    clf.fit(train_features, train.species)
    preds = clf.predict(test.drop('species', axis=1))
    conf_matrix = pd.crosstab(test['species'], preds,
                              rownames=['Actual Species'],
                              colnames=['Predicted Species'])
    f_importances = list(zip(train_features.columns,
                             clf.feature_importances_))
    return clf, conf_matrix, f_importances, features
def output_confusion_matrix(self, y, y_pred):
    """Print class counts and the confusion matrix, then return the matrix.

    ``y`` and ``y_pred`` must have the same ``size`` (checked with an
    assertion). The returned crosstab has predictions as rows and actual
    values as columns.
    """
    assert y.size == y_pred.size
    for heading, labels in (("Actual IDV", y), ("Predicted IDV", y_pred)):
        print(heading)
        print(labels.value_counts())
    print()
    print("Confusion matrix:")
    cmat = pd.crosstab(y_pred, y, rownames=['predictions'], colnames=['actual'])
    print(cmat)
    # Flush so the report appears immediately even when stdout is buffered.
    sys.stdout.flush()
    return cmat
#-----------------------------------------------------------------------------
def plot_facet(self, data, color, **kwargs):
    """Stub for the line-plot facet; currently only reads its keyword options.

    The previous implementation is kept below as commented-out code. The
    function returns ``None`` and draws nothing.
    """
    # The looked-up options are not used by any active code yet.
    x, y = kwargs.get("x"), kwargs.get("y")
    levels_x, levels_y = kwargs.get("levels_x"), kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
    #data[self._time_column] = num
    #else:
    #data[self._time_column] = date
    #data.dropna(inplace=True)
    #if len(self._groupby) == 2:
    #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
    #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
    #ct = ct[pd.notnull(ct.index)]
    #else:
    #ct = pd.crosstab(
    #data[self._time_column],
    #pd.Series([""] * len(self._table[self._time_column]), name=""))
    ## Line plot:
    #self.vmax = max(self.vmax, ct.values.max())
    #ct.plot(ax=plt.gca(), color=self.get_palette())
def plot_facet(self, data, color, **kwargs):
    """Stub for the stacked-area facet; currently only reads its keyword options.

    The previous implementation is kept below as commented-out code. The
    function returns ``None`` and draws nothing.
    """
    # The looked-up options are not used by any active code yet.
    x, y = kwargs.get("x"), kwargs.get("y")
    levels_x, levels_y = kwargs.get("levels_x"), kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
    #data[self._time_column] = num
    #else:
    #data[self._time_column] = date
    #data.dropna(inplace=True)
    #if len(self._groupby) == 2:
    #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
    #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
    #ct = ct[pd.notnull(ct.index)]
    #else:
    #ct = pd.crosstab(
    #data[self._time_column],
    #pd.Series([""] * len(self._table[self._time_column]), name=""))
    ## Stacked area plot:
    #if len(self._groupby) == 2:
    #self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
    #ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
def plot_facet(self, data, color, **kwargs):
    """Stub for the percentage-area facet; currently only reads its keyword options.

    The previous implementation is kept below as commented-out code. The
    function returns ``None`` and draws nothing.
    """
    # The looked-up options are not used by any active code yet.
    x, y = kwargs.get("x"), kwargs.get("y")
    levels_x, levels_y = kwargs.get("levels_x"), kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
    #data[self._time_column] = num
    #else:
    #data[self._time_column] = date
    #data.dropna(inplace=True)
    #if len(self._groupby) == 2:
    #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
    #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
    #ct = ct[pd.notnull(ct.index)]
    #else:
    #ct = pd.crosstab(
    #data[self._time_column],
    #pd.Series([""] * len(self._table[self._time_column]), name=""))
    ## percentage area plot:
    ## if there is only one grouping variable (the time column),
    ## the cross table produces a Series, not a data frame. It
    ## isn't really very informative to plot it, but we provide
    ## for this special case anyway_
    #if type(ct) == pd.Series:
    #ct = ct.apply(lambda x: 100)
    #else:
    #ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
    #ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
def cal_prob(crosstab):
    '''
    Compute a smoothed positive rate for every category.

    For a category c the value is (N(x=c, y=1) + p) / (N(x=c) + 1), where
    p is the overall share of y == 1 across the whole table.

    crosstab: DataFrame whose index holds the category values and whose
              columns include the target label 1.
    Returns a dict mapping each category to its smoothed rate.
    '''
    total = crosstab.sum(axis=0)
    p = total.loc[1] / total.sum()
    smoothed_positives = crosstab[1] + p
    smoothed_counts = crosstab.sum(axis=1) + 1
    # The intermediate Series are deliberately not renamed: their index may
    # be the very object used by ``crosstab``, so clearing its name could
    # rename the caller's index, and the names do not affect the dict.
    return dict(smoothed_positives / smoothed_counts)
def cal_woe(crosstab):
    '''
    Compute the weight of evidence (WOE) of every category.

    For a category c the WOE is log(r(x=c, y=1) / r(x=c, y=0)), where
    r(x=c, y=k) = N(x=c, y=k) / N(y=k) is the share of class k that falls
    into category c.

    crosstab: DataFrame whose index holds the category values and whose
              columns are the target labels 0 and 1.
    Returns a dict mapping each category to its WOE.
    '''
    # Zero cells are counted as 1 so that neither the ratio nor the
    # logarithm is taken of zero; the input table itself is left untouched.
    counts = crosstab.mask(crosstab == 0, 1)
    shares = counts.div(counts.sum(axis=0))
    return dict(np.log(shares[1] / shares[0]))
def cal_ks(y,y_prob,pos_label=1,return_split=False,decimals=0):
    '''
    Compute the KS statistic between the scores of the two classes, or the
    score at which the two cumulative distributions are furthest apart.

    y: true labels (array-like), e.g. in {0, 1} or {-1, 1}.
    y_prob: predicted scores. A one-dimensional input is used as is; for a
            two-dimensional input the second column is used.
    pos_label: label treated as the positive class.
    return_split: when true, return the split score instead of the KS value.
    decimals: number of decimals the scores are rounded to when searching
              for the split score.

    Returns the KS statistic from ``stats.ks_2samp``, or the (rounded)
    score with the largest gap when ``return_split`` is true.
    '''
    y = pd.Series(pd.Series(y).values)
    if len(y_prob.shape) == 1:
        y_pred = pd.Series(pd.Series(y_prob).values)
    else:
        y_pred = pd.Series(pd.DataFrame(y_prob).iloc[:, 1].values)

    is_pos = (y == pos_label)
    ks, _ = stats.ks_2samp(y_pred[is_pos].values, y_pred[~is_pos].values)
    if not return_split:
        return ks

    # Tabulate against a 0/1 indicator of ``pos_label`` so the columns are
    # always 0 and 1, whatever the original label encoding is.
    crossfreq = pd.crosstab(y_pred.round(decimals), is_pos.astype(int))
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    crossdens['gap'] = abs(crossdens[0] - crossdens[1])
    score_split = crossdens[crossdens['gap'] == crossdens['gap'].max()].index[0]
    return score_split
def get_city_experience():
    """Render a bar chart of experience counts for the top cities.

    Cross-tabulates ``city`` against ``experience`` from the module-level
    ``frame``, keeps the first eleven rows after sorting by the ``All``
    margin (descending), drops the margins and writes one bar series per
    experience value to ``chart/city_experience.svg`` next to this file.
    """
    ranked = pd.crosstab(frame.city, frame.experience, margins=True).sort_values(
        by='All', ascending=False)
    top_rows = ranked[:11]
    table = top_rows.drop('All', axis=0).drop('All', axis=1)

    ce_chart = pygal.Bar()
    ce_chart.title = u'?????????????'
    ce_chart.x_labels = table.index
    # One series per column of the table (i.e. per row of its transpose).
    by_level = table.T
    for level, counts in zip(by_level.index, by_level.values):
        ce_chart.add(level, counts)
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_experience.svg')
def get_city_phase():
    """Render a stacked bar chart of phase counts for the top cities.

    Cross-tabulates ``city`` against ``phase`` from the module-level
    ``frame``, keeps the first eleven rows after sorting by the ``All``
    margin (descending), drops the margins and writes one stacked series
    per phase value to ``chart/phase.svg`` next to this file.
    """
    ranked = pd.crosstab(frame.city, frame.phase, margins=True).sort_values(
        by='All', ascending=False)
    top_rows = ranked[:11]
    table = top_rows.drop('All', axis=0).drop('All', axis=1)

    funnel_chart = pygal.StackedBar()
    funnel_chart.title = u'??????????????'
    funnel_chart.x_labels = table.index
    # One series per column of the table (i.e. per row of its transpose).
    by_phase = table.T
    for phase, counts in zip(by_phase.index, by_phase.values):
        funnel_chart.add(phase, counts)
    funnel_chart.render_to_file(os.path.dirname(__file__)+'/chart/phase.svg')
def get_city_education():
    """Render a bar chart of education counts for the top cities.

    Cross-tabulates ``city`` against ``education`` from the module-level
    ``frame``, keeps the first eleven rows after sorting by the ``All``
    margin (descending), drops the margins and writes one bar series per
    education value to ``chart/city_edu.svg`` next to this file.
    """
    ranked = pd.crosstab(frame.city, frame.education, margins=True).sort_values(
        by='All', ascending=False)
    top_rows = ranked[:11]
    table = top_rows.drop('All', axis=0).drop('All', axis=1)

    ce_chart = pygal.Bar()
    ce_chart.title = u'??????????????'
    ce_chart.x_labels = table.index
    # One series per column of the table (i.e. per row of its transpose).
    by_level = table.T
    for level, counts in zip(by_level.index, by_level.values):
        ce_chart.add(level, counts)
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_edu.svg')
def multi(uid):
    """Compute per-product ordering streaks for one user.

    Builds an order-by-product count table from the rows of the
    module-level ``log`` belonging to ``uid`` and walks every product
    column in row order. Nothing is recorded for a product until its first
    cell equal to 1. From then on, a cell of 1 extends a positive streak
    (or restarts it at 1), and a cell of 0 resets a positive streak to 0
    or pushes a non-positive streak one step further down. Cells that are
    neither 0 nor 1 leave the streak unchanged and record nothing.

    Returns a DataFrame with columns ``user_id``, ``product_id``,
    ``order_number`` (1-based row position) and ``streak``.
    """
    tmp = log[log.user_id == uid]
    ct = pd.crosstab(tmp.order_number, tmp.product_id).reset_index().set_index('order_number')

    rows = []
    for pid in ct.columns:
        streak = 0
        started = False
        for position, count in enumerate(ct[pid].values, start=1):
            if not started:
                if count != 1:
                    continue
                started = True
                streak = 1
            elif count == 1:
                streak = streak + 1 if streak > 0 else 1
            elif count == 0:
                streak = 0 if streak > 0 else streak - 1
            else:
                continue
            rows.append([uid, pid, position, streak])
    return pd.DataFrame(rows, columns=['user_id', 'product_id', 'order_number', 'streak'])
def confusion_matrix(Y_true, Y_pred):
    """Cross-tabulate true against predicted activity labels.

    Both inputs are reduced with ``argmax`` along axis 1 and the resulting
    indices are looked up in the module-level ``ACTIVITIES`` mapping.
    Rows of the result are labelled ``True``, columns ``Pred``.
    """
    def to_labels(scores):
        return pd.Series([ACTIVITIES[idx] for idx in np.argmax(scores, axis=1)])

    true_labels = to_labels(Y_true)
    pred_labels = to_labels(Y_pred)
    return pd.crosstab(true_labels, pred_labels, rownames=['True'], colnames=['Pred'])
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):
    """Score features against a classification target.

    Floating-point columns are treated as continuous and scored with an
    F-test and mutual information. Integer and boolean columns are treated
    as discrete and scored with a chi-squared test, the corrected Cramér's V
    and mutual information.

    Args:
        features: DataFrame of candidate features.
        target: Class labels, passed to the scoring functions as is.
        n_neighbors: Forwarded to ``mutual_info_classif``.
        random_state: Forwarded to ``mutual_info_classif``.

    Returns:
        tuple: ``(cont_imp, disc_imp)``, two DataFrames indexed by the
        continuous and the discrete column names. A frame has no columns
        when there are no features of its kind.
    """
    cont = features.select_dtypes(include=[np.floating])
    # ``np.bool_`` replaces the ``np.bool`` alias removed in NumPy 1.24.
    disc = features.select_dtypes(include=[np.integer, np.bool_])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:
        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:
        # Chi²-test
        chi2_tests = defaultdict(dict)
        for feature in disc.columns:
            # Named ``contingency`` so it no longer shadows the ``cont`` frame.
            contingency = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(contingency)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value
        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected); ``items`` replaces ``iteritems``, which
        # was removed in pandas 2.0.
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.items()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
def run_knn(trainx, trainy, testx, testy):
    """Fit a 5-nearest-neighbours classifier and print its test results.

    Prints the crosstab of actual against predicted labels for the test
    set, followed by the accuracy score. Nothing is returned.
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(trainx, trainy)
    predicted = classifier.predict(testx)

    table = pd.crosstab(testy, predicted, rownames=['Actual'],
                        colnames=['Predicted'])
    print(table)
    accuracy = accuracy_score(testy, predicted)
    print('\nAccuracy: ' + str(accuracy))
def output_confusion_matrix(self, y, y_pred):
    """Print class counts and the confusion matrix, then return the matrix.

    ``y`` and ``y_pred`` must have the same ``size`` (checked with an
    assertion). The returned crosstab has predictions as rows and actual
    values as columns.
    """
    assert y.size == y_pred.size
    for heading, labels in (("Actual IDV", y), ("Predicted IDV", y_pred)):
        print(heading)
        print(labels.value_counts())
    print()
    print("Confusion matrix:")
    cmat = pd.crosstab(y_pred, y, rownames=['predictions'], colnames=['actual'])
    print(cmat)
    # Flush so the report appears immediately even when stdout is buffered.
    sys.stdout.flush()
    return cmat
#-----------------------------------------------------------------------------
def calc_model_characteristics(self, performCV=True):
# Compute the key metrics used to analyse the classification model and
# store them in ``self.classification_output``.
#
# performCV: when true, the cross-validation score comes from
# ``self.KFold_CrossValidation``; otherwise mean and std are set to 0.0.
#
# NOTE(review): the indentation of this block was lost in this copy, so
# the nesting of the statements below (in particular whether the
# confusion matrix is computed inside the ``key != 'predict'`` branch)
# must be restored from the original source before the code can run.
for metric in [self.scoring_metric]+self.additional_display_metrics:
# Evaluate every data split except 'predict':
for key,data in self.dp.items():
if key!='predict':
name = '%s_%s'%(metric,key)
# Element [2] of the metrics_map entry selects whether the metric
# function (element [0]) receives predicted probabilities ...
if base_classification.metrics_map[metric][2]:
self.classification_output[name] = \
base_classification.metrics_map[metric][0](
data[self.datablock.target],
self.predictions_probabilities[key])
# ... or predicted classes.
else:
self.classification_output[name] = \
base_classification.metrics_map[metric][0](
data[self.datablock.target],
self.predictions_class[key])
# Confusion matrix (target vs. predicted class), stored as text:
name = 'ConfusionMatrix_%s'%key
self.classification_output[name] = pd.crosstab(
data[self.datablock.target],
self.predictions_class[key]
).to_string()
if performCV:
cv_score = self.KFold_CrossValidation(
scoring_metric=self.scoring_metric)
else:
# Placeholder scores when cross-validation is skipped.
cv_score = {
'mean_error': 0.0,
'std_error': 0.0
}
self.classification_output['CVMethod'] = \
'KFold - ' + str(self.cv_folds)
self.classification_output['CVScore_mean'] = cv_score['mean_error']
self.classification_output['CVScore_std'] = cv_score['std_error']
self.classification_output['Predictors'] = str(self.predictors)
def printReport(self, printConfusionMatrix, printModelParameters):
# Print the metrics stored in ``self.classification_output``.
#
# printConfusionMatrix: when true, print a crosstab of target vs.
# predicted class for every data split except 'predict'.
# printModelParameters: when true, print ``self.alg.get_params()``.
#
# NOTE(review): the indentation of this block was lost in this copy; the
# nesting of the statements below must be restored from the original
# source before the code can run.
print("\nModel Report")
# Output the parameters used for modeling
if printModelParameters:
print('\nModel being built with the following parameters:')
print(self.alg.get_params())
if printConfusionMatrix:
for key,data in self.dp.items():
if key!='predict':
print("\nConfusion Matrix for %s data:"%key)
print(pd.crosstab(
data[self.datablock.target],
self.predictions_class[key])
)
print('Note: rows - actual; col - predicted')
# Primary scoring metric for every split, formatted as a percentage.
print("\nScoring Metric:")
for key,data in self.dp.items():
if key!='predict':
name = '%s_%s'%(self.scoring_metric,key)
print("\t%s (%s): %s" %
(
self.scoring_metric,
key,
"{0:.3%}".format(self.classification_output[name])
)
)
# Cross-validation summary for the primary scoring metric.
print("\nCV Score for Scoring Metric (%s):"%self.scoring_metric)
print("\tMean - %f | Std - %f" % (
self.classification_output['CVScore_mean'],
self.classification_output['CVScore_std'])
)
# Any additional metrics, again for every split except 'predict'.
if self.additional_display_metrics:
print("\nAdditional Scoring Metrics:")
for metric in self.additional_display_metrics:
for key,data in self.dp.items():
if key!='predict':
name = '%s_%s'%(metric,key)
print("\t%s (%s): %s" % (
metric,
key,
"{0:.3%}".format(
self.classification_output[name])
)
)
def advanced_scoring_classifiers(probas, actuals, name=None):
    """Print a scoring report for binary probability predictions.

    The report contains the Brier score loss, the accuracy at a 0.5
    threshold, a confusion matrix and, for each 10% bucket of predicted
    probability, the observed rate of positive labels.

    Args:
        probas: Predicted probabilities, either one value per item or a
            sequence per item whose element ``[1]`` is the positive-class
            probability.
        actuals: True labels; converted to a list.
        name: Optional heading printed before the Brier score.
    """
    # pandas Series don't play nice here. Make sure our actuals list is indeed a list
    actuals = list(actuals)

    print('Here is our brier-score-loss, which is the default value we optimized for while training, and is the value returned from .score() unless you requested a custom scoring metric')
    print('It is a measure of how close the PROBABILITY predictions are.')
    if name is not None:
        print(name)

    # Sometimes we will be given "flattened" probabilities (only the probability of our positive label), while other times we might be given "nested" probabilities (probabilities of both positive and negative, in a list, for each item).
    try:
        probas = [proba[1] for proba in probas]
    except (TypeError, IndexError):
        # Scalars cannot be indexed: the probabilities are already flat.
        pass

    print(format(brier_score_loss(actuals, probas), '.4f'))

    print('\nHere is the trained estimator\'s overall accuracy (when it predicts a label, how frequently is that the correct label?)')
    predicted_labels = [1 if pred >= 0.5 else 0 for pred in probas]
    print(format(accuracy_score(y_true=actuals, y_pred=predicted_labels) * 100, '.1f') + '%')

    print('\nHere is a confusion matrix showing predictions and actuals by label')
    #it would make sense to use sklearn's confusion_matrix here but it apparently has no labels
    #took this idea instead from: http://stats.stackexchange.com/a/109015
    conf = pd.crosstab(pd.Series(actuals), pd.Series(predicted_labels), rownames=['v Actual v'], colnames=['Predicted >'], margins=True)
    print(conf)

    print('Here is the accuracy of our trained estimator at each level of predicted probabilities')

    # One bucket per 10% step, from 0 to 100 inclusive.
    summary_dict = OrderedDict((num, []) for num in range(0, 110, 10))

    for idx, proba in enumerate(probas):
        # Round the probability down to its 10% bucket.
        proba = math.floor(int(proba * 100) / 10) * 10
        summary_dict[proba].append(actuals[idx])

    for k, v in summary_dict.items():
        if len(v) > 0:
            print('Predicted probability: ' + str(k) + '%')
            actual = sum(v) * 1.0 / len(v)

            # Format into a prettier number
            actual = round(actual * 100, 0)
            print('Actual: ' + str(actual) + '%')
            print('# preds: ' + str(len(v)) + '\n')

    print('\n\n')
def test_alex(self):
    """Classify every image list and report the overall accuracy.

    For each entry of the module-level ``filenames`` the images are read,
    run through ``self.forward_all`` and the class with the highest output
    is counted. A file's position in ``filenames`` is taken as its true
    class index. Totals are printed and the accuracy is returned.
    """
    class_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        #query-feature
        X = self.read_imagelist(filelist_path + filename + extension)
        test_num = np.shape(X)[0]
        out = self.forward_all(data=X)
        predicts = out[self.outputs[0]]
        predicts = np.reshape(predicts, (test_num, 10))
        # Builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
        confusion_array = np.zeros((class_size), dtype=int)
        for i in range(test_num):
            actual.append(class_index)
            best = np.max(predicts[i])
            # Every class tied for the maximum is counted.
            for j in range(class_size):
                if best == predicts[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
        #print(confusion_array)
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    # Single-argument print() behaves the same on Python 2 and 3.
    print('total:%d' % (round(total_count)))
    print('accept:%d' % (accept_sum))
    print('reject:%d' % (round(total_count) - accept_sum))
    print('accuray:%.4f' % (accept_sum / total_count))

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)
#process a text file
def evaluate(self,metric='cosine'):
    """Match query images to sample images by cosine similarity.

    Features of the sample list are extracted once. For each entry of the
    module-level ``filenames`` the query features are compared with the
    sample features and the most similar sample is counted as the
    prediction. A file's position in ``filenames`` is taken as its true
    class index. Totals and a confusion matrix are printed and plotted,
    and the accuracy is returned.

    Args:
        metric: Currently unused; similarity is always cosine similarity.
    """
    #sample-feature
    X = self.read_imagelist(filelist_sample)
    sample_num = np.shape(X)[0]
    out = self.forward_all(data=X)
    feature1 = np.float64(out['deepid'])
    feature1 = np.reshape(feature1, (sample_num, feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    class_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        #query-feature
        X = self.read_imagelist(filelist_path + filename + extension)
        test_num = np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2 = np.float64(out['deepid'])
        feature2 = np.reshape(feature2, (test_num, feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt=pw.pairwise_distances(feature2, feature1, metric=metric)
        mt = pw.cosine_similarity(feature2, feature1)
        # Per-file match counts. The original never initialised this array
        # inside the function; it is created per file, as in ``test_alex``.
        confusion_array = np.zeros(sample_num, dtype=int)
        for i in range(test_num):
            actual.append(class_index)
            best = np.max(mt[i])
            # Every sample tied for the maximum is counted.
            for j in range(sample_num):
                if best == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    # Single-argument print() behaves the same on Python 2 and 3.
    print('total:%d' % (round(total_count)))
    print('accept:%d' % (accept_sum))
    print('reject:%d' % (round(total_count) - accept_sum))
    print('accuray:%.4f' % (accept_sum / total_count))

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    actual = np.array(actual)
    predict = np.array(predict)
    y_actual = pd.Series(actual, name='Actual')
    y_predict = pd.Series(predict, name='Predicted')
    df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
    plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)
#process a text file
def evaluate2(self,metric='cosine'):
    """Match query images to stored class features by cosine similarity.

    Class features are loaded from ``./features/<model_name>-features.dat``.
    For each entry of the module-level ``filenames`` the query features are
    compared with the class features and the most similar class is counted
    as the prediction. A file's position in ``filenames`` is taken as its
    true class index. Totals are printed and the accuracy is returned.

    Args:
        metric: Currently unused; similarity is always cosine similarity.
    """
    feature1 = np.fromfile('./features/' + model_name +'-features.dat',dtype=np.float64)
    feature1 = np.reshape(feature1, (class_size, feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    class_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        #query-feature
        X = self.read_imagelist(filelist_path + filename + extension)
        test_num = np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2 = np.float64(out['deepid'])
        feature2 = np.reshape(feature2, (test_num, feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt=pw.pairwise_distances(feature2, feature1, metric=metric)
        mt = pw.cosine_similarity(feature2, feature1)
        # Per-file match counts. The original never initialised this array
        # inside the function; it is created per file, as in ``test_alex``.
        confusion_array = np.zeros(class_size, dtype=int)
        for i in range(test_num):
            actual.append(class_index)
            best = np.max(mt[i])
            # Every class tied for the maximum is counted.
            for j in range(class_size):
                if best == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    # Single-argument print() behaves the same on Python 2 and 3.
    print('total:%d' % (round(total_count)))
    print('accept:%d' % (accept_sum))
    print('reject:%d' % (round(total_count) - accept_sum))
    print('accuray:%.4f' % (accept_sum / total_count))

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)
#process a text file
def _create_significance_table(self,data):
    """
    Create a table containing p values for significance tests. Add features of
    the distributions and the p values to the dataframe.

    The table has one row per variable in ``self.continuous`` and
    ``self.categorical`` and the columns ``continuous``, ``nonnormal``,
    ``min_observed``, ``pval`` and ``ptest``. The last two are filled from
    the pair returned by ``self._p_test``.
    """
    # list features of the variable e.g. matched, paired, n_expected
    df = pd.DataFrame(index=self.continuous+self.categorical,
                      columns=['continuous','nonnormal','min_observed','pval','ptest'])

    df.index.rename('variable', inplace=True)
    df['continuous'] = np.where(df.index.isin(self.continuous),True,False)
    df['nonnormal'] = np.where(df.index.isin(self.nonnormal),True,False)

    # list values for each variable, grouped by groupby levels
    for v in df.index:
        # Use logical ``not`` on plain bools: ``~`` applied to a Python bool
        # is a bitwise inversion (-1 / -2), which is always truthy.
        is_continuous = bool(df.loc[v, 'continuous'])
        is_categorical = not is_continuous
        is_normal = not bool(df.loc[v, 'nonnormal'])

        if is_continuous:
            # if continuous, group data into list of lists
            catlevels = None
            grouped_data = []
            for s in self.groupbylvls:
                lvl_data = data[data[self.groupby]==s].dropna(subset=[v])[v]
                grouped_data.append(lvl_data.values)
            min_observed = len(min(grouped_data,key=len))
        else:
            # if categorical, create contingency table
            catlevels = sorted(data[v].astype('category').cat.categories)
            grouped_data = pd.crosstab(data[self.groupby],data[v])
            min_observed = grouped_data.sum(axis=1).min()

        # minimum number of observations across all levels
        df.loc[v,'min_observed'] = min_observed

        # compute pvalues
        df.loc[v,'pval'],df.loc[v,'ptest'] = self._p_test(v,
            grouped_data,is_continuous,is_categorical,
            is_normal,min_observed,catlevels)

    return df
def draw(self, **kwargs):
""" Draw time series. """
# Defines a per-facet plotting callback, hands it to ``self.map_data`` and
# then sets the y limits, axis labels and (for two grouping variables)
# the legend.
#
# NOTE(review): the indentation of this block was lost in this copy; the
# nesting of the branches below must be restored from the original
# source before the code can run.
def plot_facet(data, color, **kwargs):
num = []
date = []
time = data[self._time_column]
# Convert the time column both ways and keep whichever conversion
# produced fewer null values (ties go to ``convert_to_datetime``).
num = data[self._time_column].apply(self.convert_to_datetime)
date = data[self._time_column].apply(self.convert_to_timeseries)
if pd.isnull(num).sum() <= pd.isnull(date).sum():
data[self._time_column] = num
else:
data[self._time_column] = date
data.dropna(inplace=True)
if len(self._groupby) == 2:
ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
# NOTE(review): ``reindex_axis`` was removed in pandas 1.0;
# ``ct.reindex(columns=...)`` is presumably the replacement -- confirm.
ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
ct = ct[pd.notnull(ct.index)]
else:
# Single grouping variable: tabulate against a constant, unnamed column.
ct = pd.crosstab(
data[self._time_column],
pd.Series([""] * len(self._table[self._time_column]), name=""))
# percentage area plot:
if self.percentage:
# if there is only one grouping variable (the time column),
# the cross table produces a Series, not a data frame. It
# isn't really very informative to plot it, but we provide
# for this special case anyway.
if type(ct) == pd.Series:
ct = ct.apply(lambda x: 100)
else:
# Scale each row so that its values sum to 100.
ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
else:
if self.area:
# Stacked area plot:
if len(self._groupby) == 2:
# Track the largest row total seen so far for the y limit.
self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
else:
# Line plot:
self.vmax = max(self.vmax, ct.values.max())
ct.plot(ax=plt.gca(), color=self.get_palette())
self.map_data(plot_facet)
# Percentages always span 0-100; otherwise use the tracked maximum.
if self.percentage:
self.g.set(ylim=(0, 100))
else:
self.g.set(ylim=(0, self.vmax))
self.g.set_axis_labels(self.options["label_x_axis"], self.options["label_y_axis"])
if len(self._groupby) == 2:
self.add_legend()