Python crosstab() example source code
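
pandas.crosstab cross-tabulates two or more factors into a frequency table; the snippets below show real-project uses. As a quick reference, here is a minimal sketch on toy data (hypothetical values):

import pandas as pd

city = pd.Series(['bj', 'sh', 'bj', 'sh', 'bj'], name='city')
edu = pd.Series(['ba', 'ma', 'ma', 'ba', 'ba'], name='education')

# raw counts, with row/column totals
print(pd.crosstab(city, edu, margins=True))

# row-normalized shares instead of counts
print(pd.crosstab(city, edu, normalize='index'))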

analysing.py (project: crawllagou, author: ScarecrowFu)
def get_python_guangzhou():
    frame2 = frame[(frame.kd == 'Python') & (frame.city == u'广州')]  # Guangzhou
    cframe = [v for k, v in frame2.to_dict(orient='index').items()]
    pattern = r'\d{4}-\d{2}-\d{2}'
    for c in cframe:
        if re.match(pattern, c['published']):
            pass
        else:
            c['published'] = datetime.datetime.utcnow().strftime("%Y-%m-%d")
    df = DataFrame(cframe)
    df['published'] = pd.to_datetime(df['published'])
    mask = (df['published'] > '2016-04-01') & (df['published'] <= '2016-05-02')
    dataframe = df.loc[mask]
    jobframe = pd.crosstab(dataframe.experience, dataframe.salary, margins=True).sort_values(by='All', ascending=False)
    jobframe = jobframe.drop('All', axis=0).drop('All', axis=1)
    pie_chart = pygal.StackedBar()
    pie_chart.title = u'Python salaries by experience in Guangzhou'
    pie_chart.x_labels = jobframe.index
    for cit, num in jobframe.items():
        pie_chart.add("%s" % (cit), num)
    pie_chart.render_to_file(os.path.dirname(__file__) + '/chart/guangzhou_salary.svg')
recipe_classification.py (project: Flavor-Network, author: lingcheng99)
def flavor_profile(df,ingr,comp,ingr_comp):
    sorted_ingredients = df.columns
    underscore_ingredients=[]
    for item in sorted_ingredients:
        underscore_ingredients.append(item.replace(' ','_'))

    print(len(underscore_ingredients), len(sorted_ingredients))

    ingr_total = ingr_comp.join(ingr,how='right',on='# ingredient id')
    ingr_total = ingr_total.join(comp,how='right',on='compound id')

    ingr_pivot = pd.crosstab(ingr_total['ingredient name'],ingr_total['compound id'])
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]

    df_flavor = df.values.dot(ingr_flavor.values)
    print(df.shape, df_flavor.shape)

    return df_flavor

#normalize flavor matrix with tfidf method
continuous.py (project: ModelFlow, author: yuezPrincetechs)
def get_crosstab(self,X,y):
        '''
        Cross-tabulate each feature (self.feature_names, or all columns when
        that is None) against the target.
        X: DataFrame of features, or a Series holding a single feature.
        y: Series with the same index as X, holding 0-1 class labels.
        Returns a single crosstab DataFrame when X is a Series; when X is a
        DataFrame or 2-D array, returns a dict mapping each feature name to
        its crosstab DataFrame.
        '''
        if len(X.shape)==1:
            result=pd.crosstab(X,y)
        else:
            result={}
            if self.feature_names is None:
                if isinstance(X,pd.DataFrame):
                    feature_names=list(X.columns)
                else:
                    feature_names=[i for i in range(X.shape[1])]
            else:
                feature_names=self.feature_names
            if isinstance(X,pd.DataFrame):
                for feature in feature_names:
                    result[feature]=pd.crosstab(X[feature],y)
            else:
                for feature in feature_names:
                    result[feature]=pd.crosstab(X[:,feature],y)
        return result
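
A usage sketch for the branch where X is a DataFrame and feature_names is None (hypothetical data); the method then reduces to one crosstab per column:

import pandas as pd

X = pd.DataFrame({'f1': [0, 1, 1, 0, 1], 'f2': [2, 2, 3, 3, 2]})
y = pd.Series([0, 1, 1, 0, 1], name='y')

result = {col: pd.crosstab(X[col], y) for col in X.columns}
print(result['f1'])  # rows: values of f1; columns: y in {0, 1}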
103_visit_time.py (project: Instacart, author: KazukiOnodera)
def make(T):
    log_tr = log[log.order_number_rev > T]  # order_number_rev presumably counts back from the latest order, so this keeps earlier orders only

    # dow
    dow  = pd.crosstab(log_tr.user_id, log_tr.order_dow).add_prefix('user_dow_freq_')
    dow_ = pd.crosstab(log_tr.user_id, log_tr.order_dow, normalize='index').add_prefix('user_dow_norm_')

    # timezone
    timezone  = pd.crosstab(log_tr.user_id, log_tr.timezone).add_prefix('user_timezone_freq_')
    timezone_ = pd.crosstab(log_tr.user_id, log_tr.timezone, normalize='index').add_prefix('user_timezone_norm_')

    # dow * timezone
    dow_tz  = pd.crosstab(log_tr.user_id, log_tr.dow_tz).add_prefix('user_dow-tz_freq_')
    dow_tz_ = pd.crosstab(log_tr.user_id, log_tr.dow_tz, normalize='index').add_prefix('user_dow-tz_norm_')

    tab = pd.concat([dow, dow_, timezone, timezone_, dow_tz, dow_tz_], axis=1)

    tab.reset_index().to_pickle('../feature/trainT-{}/f103_user.p'.format(T))
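
The freq/norm pairs built above put raw counts and per-user shares side by side; a minimal sketch of that pattern on a toy log (hypothetical values):

import pandas as pd

log_tr = pd.DataFrame({'user_id': [1, 1, 1, 2, 2],
                       'order_dow': [0, 0, 6, 3, 3]})

dow  = pd.crosstab(log_tr.user_id, log_tr.order_dow).add_prefix('user_dow_freq_')
dow_ = pd.crosstab(log_tr.user_id, log_tr.order_dow,
                   normalize='index').add_prefix('user_dow_norm_')
print(pd.concat([dow, dow_], axis=1))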
recipe_cleanup.py (project: Flavor-Network, author: lingcheng99)
def flavor_profile(df,ingr,comp,ingr_comp):
    sorted_ingredients = df.columns
    underscore_ingredients=[]
    for item in sorted_ingredients:
        underscore_ingredients.append(item.replace(' ','_'))

    print(len(underscore_ingredients), len(sorted_ingredients))

    ingr_total = ingr_comp.join(ingr,how='right',on='# ingredient id')
    ingr_total = ingr_total.join(comp,how='right',on='compound id')

    ingr_pivot = pd.crosstab(ingr_total['ingredient name'],ingr_total['compound id'])
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]

    df_flavor = df.values.dot(ingr_flavor.values)
    print(df.shape, df_flavor.shape)

    return df_flavor

#normalize flavor matrix with tfidf method
DescribeData.py (project: pretrial-release, author: natethedrummer)
def ptr_stats(df):

    df = df[['CASE DISPOSED STATUS','HCJ Booked','MADE Y / N','PRETRIAL STATUS AT DISPOSITION','bail type made simple']] 

    crosstab = pd.crosstab([df['CASE DISPOSED STATUS'],df['HCJ Booked'],df['MADE Y / N'],df['PRETRIAL STATUS AT DISPOSITION']], df['bail type made simple'],  margins=True)

    print(crosstab)

    crosstab.to_csv('ptr_stats.csv')
dummy_model.py (project: app-skeleton, author: rragundez)
def train_model(split=.25):
    """Tran model based on the iris dataset.

    This will split the iris dataset into train and test set, will
    train a Random Forest CLassifier and fit the trained model to
    the test dataset.
    In addition the confusion matrix and features importance will be
    calculated.

    Args:
        split (float): Fraction of observations in the test dataset.

    Returns:
        RandomForestClassifier: Trained model.
        pandas.DataFrame: Confusion matrix.
        dictionary: Features importance
    """
    iris = load_iris()
    all_data = pd.DataFrame(iris.data, columns=iris.feature_names)
    features = all_data.columns.str.replace(r'\s+', '_').str.replace(r'\W+', '')
    all_data['species'] = pd.Categorical.from_codes(iris.target,
                                                    iris.target_names)
    train, test = train_test_split(all_data, test_size=split)
    clf = RandomForestClassifier(n_jobs=1)
    clf.fit(train.drop('species', axis=1), train.species)
    preds = clf.predict(test.drop('species', axis=1))
    conf_matrix = pd.crosstab(test['species'], preds,
                              rownames=['Actual Species'],
                              colnames=['Predicted Species'])
    f_importances = list(zip(train.drop('species', axis=1).columns,
                             clf.feature_importances_))
    return clf, conf_matrix, f_importances, features
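
A usage sketch, assuming the module-level imports this excerpt relies on (pandas, sklearn's load_iris, train_test_split, and RandomForestClassifier):

clf, conf_matrix, f_importances, features = train_model(split=0.3)
print(conf_matrix)  # actual vs. predicted species counts
print(sorted(f_importances, key=lambda kv: kv[1], reverse=True))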
recommender_wide_and_deep.py (project: tflearn, author: tflearn)
def output_confusion_matrix(self, y, y_pred):
        assert y.size == y_pred.size
        print("Actual IDV")
        print(y.value_counts())
        print("Predicted IDV")
        print(y_pred.value_counts())
        print()
        print("Confusion matrix:")
        cmat = pd.crosstab(y_pred, y, rownames=['predictions'], colnames=['actual'])
        print(cmat)
        sys.stdout.flush()
        return cmat

#-----------------------------------------------------------------------------
timeseries.py (project: coquery, author: gkunter)
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Line plot:
        #self.vmax = max(self.vmax, ct.values.max())
        #ct.plot(ax=plt.gca(), color=self.get_palette())
timeseries.py (project: coquery, author: gkunter)
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Stacked area plot:
        #if len(self._groupby) == 2:
            #self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
        #ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
timeseries.py (project: coquery, author: gkunter)
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## percentage area plot:
        ## if there is only one grouping variable (the time column),
        ## the cross table produces a Series, not a data frame. It
        ## isn't really very informative to plot it, but we provide
        ## for this special case anyway.
        #if type(ct) == pd.Series:
            #ct = ct.apply(lambda x: 100)
        #else:
            #ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
        #ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
continuous.py (project: ModelFlow, author: yuezPrincetechs)
def cal_prob(crosstab):
        '''
        Estimate the smoothed probability of y=1 within each category:
        for category c, the estimate is (N(x=c,y=1)+p)/(N(x=c)+1), where p is
        the overall rate of y=1.
        crosstab: DataFrame whose index holds the feature categories and whose
        columns are the y values 0/1.
        Returns a dict mapping each category to its smoothed probability.
        '''
        total=crosstab.sum(axis=0)
        p=total.loc[1]/total.sum()
        N=crosstab.sum(axis=1)+1
        N1=crosstab[1]+p
        N.name=''
        N.index.name=''
        N1.name=''
        N1.index.name=''
        return dict(N1/N)
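
A worked check of the smoothing on a toy crosstab with categories a and b; for category c the estimate is (N(x=c,y=1) + p) / (N(x=c) + 1), with p the overall positive rate:

import pandas as pd

ct = pd.DataFrame({0: [8, 2], 1: [2, 8]}, index=['a', 'b'])
p = ct.sum(axis=0).loc[1] / ct.sum().sum()  # overall positive rate: 10/20 = 0.5
print((ct[1] + p) / (ct.sum(axis=1) + 1))   # a: (2+0.5)/11, b: (8+0.5)/11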
continuous.py (project: ModelFlow, author: yuezPrincetechs)
def cal_woe(crosstab):
        '''
        Compute the WOE (weight of evidence) of each category c:
        WOE(c) = log(r(x=c,y=1)/r(x=c,y=0)),
        where r(x=c,y=1)=N(x=c,y=1)/N(y=1) and r(x=c,y=0)=N(x=c,y=0)/N(y=0).
        crosstab: DataFrame whose index holds the feature categories and whose
        columns are the y values 0/1.
        Returns a dict mapping each category to its WOE.
        '''
        tmp=crosstab.copy()
        #bump zero cells to 1 to avoid log(0) and division by zero
        tmp[tmp==0]=1
        r=tmp/tmp.sum(axis=0)
        result=np.log(r[1]/r[0])
        return dict(result)
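
A worked check on the same toy crosstab; zero cells would be bumped to 1 first, but none occur here:

import numpy as np
import pandas as pd

ct = pd.DataFrame({0: [8, 2], 1: [2, 8]}, index=['a', 'b'])
r = ct / ct.sum(axis=0)     # column-wise rates r(x=c, y)
print(np.log(r[1] / r[0]))  # a: log(0.2/0.8), b: log(0.8/0.2)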
ModelEvaluate.py (project: ModelFlow, author: yuezPrincetechs)
def cal_ks(y,y_prob,pos_label=1,return_split=False,decimals=0):
    '''
    Compute the KS statistic, or the score split at which the KS gap peaks.
    y: Series of true labels, e.g. in {0,1} or {-1,1}.
    y_prob: predicted scores; a one-dimensional Series, or a DataFrame whose
            second column holds the positive-class probability.
    pos_label: int, the label treated as positive.
    return_split: if True, return the rounded score where the cumulative
                  distributions of the two classes differ the most.
    decimals: number of decimals scores are rounded to when locating the split.
    The KS statistic itself comes from scipy's two-sample KS test (ks_2samp).
    '''
    y=pd.Series(pd.Series(y).values)
    if len(y_prob.shape)==1:
        y_pred=pd.Series(pd.Series(y_prob).values)
    else:
        y_pred=pd.Series(pd.DataFrame(y_prob).iloc[:,1].values)
    Bad=y_pred[y==pos_label]
    Good=y_pred[y!=pos_label]
    ks, pvalue = stats.ks_2samp(Bad.values, Good.values)
    if not return_split:
        return ks
    crossfreq=pd.crosstab(y_pred.round(decimals),y)
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    crossdens['gap'] = abs(crossdens[0] - crossdens[1])
    score_split = crossdens[crossdens['gap'] == crossdens['gap'].max()].index[0]
    return score_split
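
A usage sketch with synthetic scores, assuming the cal_ks function above is in scope; scipy's ks_2samp supplies the statistic, and the crosstab-based cumulative densities locate the split:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
y = pd.Series(np.r_[np.ones(500), np.zeros(500)].astype(int))
y_prob = pd.Series(np.r_[rng.beta(5, 2, 500), rng.beta(2, 5, 500)])

print(cal_ks(y, y_prob))                                 # KS statistic
print(cal_ks(y, y_prob, return_split=True, decimals=1))  # score at the max gap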
analysing.py (project: crawllagou, author: ScarecrowFu)
def get_city_experience():
    city_experience = pd.crosstab(frame.city, frame.experience, margins=True).sort_values(by='All', ascending=False)[:11]
    city_experience = city_experience.drop('All', axis=0).drop('All', axis=1)
    ce_chart = pygal.Bar()
    ce_chart.title = u'Experience requirements by city'
    ce_chart.x_labels = city_experience.index
    for i in range(len(list(city_experience.T.index))):
        ce_chart.add(city_experience.T.index[i], city_experience.T.values[i])
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_experience.svg')
analysing.py (project: crawllagou, author: ScarecrowFu)
def get_city_phase():
    city_phase = pd.crosstab(frame.city, frame.phase, margins=True).sort_values(by='All', ascending=False)[:11]
    city_phase = city_phase.drop('All', axis=0).drop('All', axis=1)
    funnel_chart = pygal.StackedBar()
    funnel_chart.title = u'Company funding stage by city'
    funnel_chart.x_labels = city_phase.index
    for i in range(len(list(city_phase.T.index))):
        funnel_chart.add(city_phase.T.index[i], city_phase.T.values[i])
    funnel_chart.render_to_file(os.path.dirname(__file__)+'/chart/phase.svg')
analysing.py (project: crawllagou, author: ScarecrowFu)
def get_city_education():
    city_education = pd.crosstab(frame.city,frame.education,margins=True).sort_values(by='All',ascending=False)[:11]
    city_education = city_education.drop('All',axis=0).drop('All',axis=1)
    ce_chart = pygal.Bar()
    ce_chart.title = u'Education requirements by city'
    ce_chart.x_labels = city_education.index
    for i in range(len(list(city_education.T.index))):
        ce_chart.add(city_education.T.index[i], city_education.T.values[i])
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_edu.svg')
010_streak.py (project: Instacart, author: KazukiOnodera)
def multi(uid):
    tmp = log[log.user_id==uid]
    ct = pd.crosstab(tmp.order_number, tmp.product_id).reset_index().set_index('order_number')
    li = []
    for pid in ct.columns:
        streak = 0
        sw_odr = False
        for onb,odr in enumerate(ct[pid].values):
            onb+=1
            if sw_odr == False and odr == 1:
                sw_odr = True
                streak = 1
                li.append([uid, pid, onb, streak])
                continue
            if sw_odr == True:
                if odr == 1 and streak>0:
                    streak += 1
                    li.append([uid, pid, onb, streak])
                elif odr == 1 and streak<=0:
                    streak = 1
                    li.append([uid, pid, onb, streak])
                elif odr == 0 and streak>0:
                    streak = 0
                    li.append([uid, pid, onb, streak])
                elif odr == 0 and streak<=0:
                    streak -= 1
                    li.append([uid, pid, onb, streak])
    return pd.DataFrame(li, columns=['user_id', 'product_id', 'order_number', 'streak'])
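
The crosstab at the top turns a user's purchase log into a 0/1 order-by-product grid that the streak counter then walks; a minimal sketch of that first step (toy log):

import pandas as pd

log = pd.DataFrame({'user_id': [1, 1, 1, 1],
                    'order_number': [1, 1, 2, 3],
                    'product_id': [10, 20, 10, 10]})

tmp = log[log.user_id == 1]
print(pd.crosstab(tmp.order_number, tmp.product_id))  # 1 where the product was in the order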
utils.py (project: Human-Activity-Recognition, author: servomac)
def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_true, axis=1)])
    Y_pred = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_pred, axis=1)])

    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])
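
A usage sketch, assuming the function above is in scope and a hypothetical ACTIVITIES label map; the inputs are one-hot (or score) arrays reduced with argmax:

import numpy as np

ACTIVITIES = {0: 'WALKING', 1: 'SITTING'}  # hypothetical label map
Y_true = np.array([[1, 0], [0, 1], [0, 1]])
Y_pred = np.array([[1, 0], [1, 0], [0, 1]])
print(confusion_matrix(Y_true, Y_pred))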
eda.py (project: xam, author: MaxHalford)
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):

    cont = features.select_dtypes(include=[np.floating])
    disc = features.select_dtypes(include=[np.integer, bool])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # Chi²-test
        chi2_tests = defaultdict(dict)

        for feature in disc.columns:
            ct = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(ct)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value

        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected)
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.items()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
knn.py (project: CKME136, author: asterix135)
def run_knn(trainx, trainy, testx, testy):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainx, trainy)
    pred_y = knn.predict(testx)
    print(pd.crosstab(testy, pred_y, rownames=['Actual'],
                      colnames=['Predicted']))
    print('\nAccuracy: ' + str(accuracy_score(testy, pred_y)))
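
A self-contained usage sketch on iris, assuming run_knn above is in scope; the imports mirror what the excerpt relies on:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
trainx, testx, trainy, testy = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)
run_knn(trainx, trainy, testx, testy)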
tflearn_wide_and_deep.py (project: tflearn_wide_and_deep, author: ichuang)
def output_confusion_matrix(self, y, y_pred):
        assert y.size == y_pred.size
        print("Actual IDV")
        print(y.value_counts())
        print("Predicted IDV")
        print(y_pred.value_counts())
        print()
        print("Confusion matrix:")
        cmat = pd.crosstab(y_pred, y, rownames=['predictions'], colnames=['actual'])
        print(cmat)
        sys.stdout.flush()
        return cmat

#-----------------------------------------------------------------------------
models_classification.py (project: easyML, author: aarshayj)
def calc_model_characteristics(self, performCV=True):
        # Determine key metrics to analyze the classification model. These
        # are stored in the classification_output series object belonging to
        # this class.
        for metric in [self.scoring_metric]+self.additional_display_metrics:
            #Determine for both test and train, except predict:
            for key,data in self.dp.items():
                if key!='predict':  
                    name = '%s_%s'%(metric,key)
                    #Case where probabilities to be passed as arguments
                    if base_classification.metrics_map[metric][2]:
                        self.classification_output[name] = \
                            base_classification.metrics_map[metric][0](
                                data[self.datablock.target],
                                self.predictions_probabilities[key])
                    #case where class predictions are passed as arguments
                    else:                                                   
                        self.classification_output[name] = \
                            base_classification.metrics_map[metric][0](
                                data[self.datablock.target],
                                self.predictions_class[key])

                #Determine confusion matrix:
                name = 'ConfusionMatrix_%s'%key
                self.classification_output[name] = pd.crosstab(
                        data[self.datablock.target], 
                        self.predictions_class[key]
                    ).to_string()

        if performCV:
            cv_score = self.KFold_CrossValidation(
                        scoring_metric=self.scoring_metric)
        else:
            cv_score = {
                'mean_error': 0.0, 
                'std_error': 0.0
            }

        self.classification_output['CVMethod'] = \
                                        'KFold - ' + str(self.cv_folds)
        self.classification_output['CVScore_mean'] = cv_score['mean_error']
        self.classification_output['CVScore_std'] = cv_score['std_error']
        self.classification_output['Predictors'] = str(self.predictors)
models_classification.py (project: easyML, author: aarshayj)
def printReport(self, printConfusionMatrix, printModelParameters):
        # Print the metric determined in the previous function.

        print("\nModel Report")
        #Output the parameters used for modeling
        if printModelParameters:
            print('\nModel being built with the following parameters:')
            print(self.alg.get_params())

        if printConfusionMatrix:
            for key,data in self.dp.items():
                if key!='predict':
                    print("\nConfusion Matrix for %s data:"%key)
                    print(pd.crosstab(
                            data[self.datablock.target], 
                            self.predictions_class[key])
                    )
            print('Note: rows - actual; col - predicted')

        print("\nScoring Metric:")
        for key,data in self.dp.items():
            if key!='predict':
                name = '%s_%s'%(self.scoring_metric,key)
                print("\t%s (%s): %s" % 
                    (
                    self.scoring_metric,
                    key,
                    "{0:.3%}".format(self.classification_output[name])
                    )
                )

        print("\nCV Score for Scoring Metric (%s):"%self.scoring_metric)
        print("\tMean - %f | Std - %f" % (
            self.classification_output['CVScore_mean'],
            self.classification_output['CVScore_std'])
        )

        if self.additional_display_metrics:
            print("\nAdditional Scoring Metrics:")
            for metric in self.additional_display_metrics:
                for key,data in self.dp.items():
                    if key!='predict':
                        name = '%s_%s'%(metric,key)
                        print("\t%s (%s): %s" % (
                            metric,
                            key,
                            "{0:.3%}".format(
                                    self.classification_output[name])
                            )
                        )
utils_scoring.py (project: auto_ml, author: doordash)
def advanced_scoring_classifiers(probas, actuals, name=None):
    # pandas Series don't play nice here. Make sure our actuals list is indeed a list
    actuals = list(actuals)
    predictions = list(probas)

    print('Here is our brier-score-loss, which is the default value we optimized for while training, and is the value returned from .score() unless you requested a custom scoring metric')
    print('It is a measure of how close the PROBABILITY predictions are.')
    if name is not None:
        print(name)

    # Sometimes we will be given "flattened" probabilities (only the probability of our positive label), while other times we might be given "nested" probabilities (probabilities of both positive and negative, in a list, for each item).
    try:
        probas = [proba[1] for proba in probas]
    except (TypeError, IndexError):
        pass

    print(format(brier_score_loss(actuals, probas), '.4f'))


    print('\nHere is the trained estimator\'s overall accuracy (when it predicts a label, how frequently is that the correct label?)')
    predicted_labels = []
    for pred in probas:
        if pred >= 0.5:
            predicted_labels.append(1)
        else:
            predicted_labels.append(0)
    print(format(accuracy_score(y_true=actuals, y_pred=predicted_labels) * 100, '.1f') + '%')


    print('\nHere is a confusion matrix showing predictions and actuals by label')
    #it would make sense to use sklearn's confusion_matrix here but it apparently has no labels
    #took this idea instead from: http://stats.stackexchange.com/a/109015
    conf = pd.crosstab(pd.Series(actuals), pd.Series(predicted_labels), rownames=['v Actual v'], colnames=['Predicted >'], margins=True)
    print(conf)


    print('Here is the accuracy of our trained estimator at each level of predicted probabilities')

    # create summary dict
    summary_dict = OrderedDict()
    for num in range(0, 110, 10):
        summary_dict[num] = []

    for idx, proba in enumerate(probas):
        proba = math.floor(int(proba * 100) / 10) * 10
        summary_dict[proba].append(actuals[idx])

    for k, v in summary_dict.items():
        if len(v) > 0:
            print('Predicted probability: ' + str(k) + '%')
            actual = sum(v) * 1.0 / len(v)

            # Format into a prettier number
            actual = round(actual * 100, 0)
            print('Actual: ' + str(actual) + '%')
            print('# preds: ' + str(len(v)) + '\n')

    print('\n\n')
eval-all.py (project: image-classifier, author: gustavkkk)
def test_alex(self):

    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []

    for filename in filenames:
        #query-feature
        X=self.read_imagelist(filelist_path + filename + extension)
        test_num=np.shape(X)[0]
        out = self.forward_all(data=X)
        predicts=out[self.outputs[0]]
        predicts=np.reshape(predicts,(test_num,10))
        confusion_array = np.zeros((class_size), dtype=int)
        for i in range(test_num):
            actual.append(class_index)
            for j in range(class_size):
                if np.max(predicts[i]) == predicts[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1
        #print(confusion_array)
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    print('total:%d' % (round(total_count)))
    print('accept:%d' % (accept_sum))
    print('reject:%d' % (round(total_count) - accept_sum))
    print('accuracy:%.4f' % (accept_sum / total_count))

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

    #process a text file
eval-all.py (project: image-classifier, author: gustavkkk)
def evaluate(self,metric='cosine'):
    #sample-feature
    X=self.read_imagelist(filelist_sample)
    sample_num=np.shape(X)[0]
    out = self.forward_all(data=X)
    feature1=np.float64(out['deepid'])
    feature1=np.reshape(feature1,(sample_num,feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []

    for filename in filenames:
        #query-feature
        X=self.read_imagelist(filelist_path + filename + extension)
        test_num=np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2=np.float64(out['deepid'])
        feature2=np.reshape(feature2,(test_num,feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt=pw.pairwise_distances(feature2, feature1, metric=metric)
        mt=pw.cosine_similarity(feature2, feature1)
        # note: this excerpt never initializes confusion_array; presumably it
        # is reset per file as in test_alex above
        confusion_array = np.zeros((sample_num), dtype=int)
        for i in range(test_num):
            actual.append(class_index)
            for j in range(sample_num):
                if np.max(mt[i]) == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1

        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    print('total:%d' % (round(total_count)))
    print('accept:%d' % (accept_sum))
    print('reject:%d' % (round(total_count) - accept_sum))
    print('accuracy:%.4f' % (accept_sum / total_count))

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    actual = np.array(actual)
    predict = np.array(predict)
    y_actual = pd.Series(actual, name='Actual')
    y_predict = pd.Series(predict, name='Predicted')
    df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
    plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

    #process a text file
eval-all.py (project: image-classifier, author: gustavkkk)
def evaluate2(self,metric='cosine'):
    feature1=np.fromfile('./features/' + model_name +'-features.dat',dtype=np.float64)
    feature1=np.reshape(feature1,(class_size,feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        #query-feature
        X=self.read_imagelist(filelist_path + filename + extension)
        test_num=np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2=np.float64(out['deepid'])
        feature2=np.reshape(feature2,(test_num,feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt=pw.pairwise_distances(feature2, feature1, metric=metric)
        mt=pw.cosine_similarity(feature2, feature1)
        # note: this excerpt never initializes confusion_array; presumably it
        # is reset per file as in test_alex above
        confusion_array = np.zeros((class_size), dtype=int)
        for i in range(test_num):
            actual.append(class_index)
            for j in range(class_size):
                if np.max(mt[i]) == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1

        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    print('total:%d' % (round(total_count)))
    print('accept:%d' % (accept_sum))
    print('reject:%d' % (round(total_count) - accept_sum))
    print('accuracy:%.4f' % (accept_sum / total_count))

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

    #process a text file
tableone.py (project: tableone, author: tompollard)
def _create_significance_table(self,data):
        """
        Create a table containing p values for significance tests. Add features of
        the distributions and the p values to the dataframe.
        """

        # list features of the variable e.g. matched, paired, n_expected
        df=pd.DataFrame(index=self.continuous+self.categorical,
            columns=['continuous','nonnormal','min_observed','pval','ptest'])

        df.index.rename('variable', inplace=True)
        df['continuous'] = np.where(df.index.isin(self.continuous),True,False)
        df['nonnormal'] = np.where(df.index.isin(self.nonnormal),True,False)

        # list values for each variable, grouped by groupby levels
        for v in df.index:

            # compute p value
            is_continuous = df.loc[v]['continuous']
            is_categorical = ~df.loc[v]['continuous']
            is_normal = ~df.loc[v]['nonnormal']

            # if continuous, group data into list of lists
            if is_continuous:
                catlevels = None
                grouped_data = []
                for s in self.groupbylvls:
                    lvl_data = data[data[self.groupby]==s].dropna(subset=[v])[v]
                    grouped_data.append(lvl_data.values)
                min_observed = len(min(grouped_data,key=len))
            # if categorical, create contingency table
            elif is_categorical:
                catlevels = sorted(data[v].astype('category').cat.categories)
                grouped_data = pd.crosstab(data[self.groupby],data[v])
                min_observed = grouped_data.sum(axis=1).min()

            # minimum number of observations across all levels
            df.loc[v,'min_observed'] = min_observed

            # compute pvalues
            df.loc[v,'pval'],df.loc[v,'ptest'] = self._p_test(v, 
                grouped_data,is_continuous,is_categorical,
                is_normal,min_observed,catlevels)

        return df
timeseries.py (project: coquery, author: gkunter)
def draw(self, **kwargs):
        """ Draw time series. """

        def plot_facet(data, color, **kwargs):
            num = []
            date = []
            time = data[self._time_column]
            num = data[self._time_column].apply(self.convert_to_datetime)
            date = data[self._time_column].apply(self.convert_to_timeseries)
            if pd.isnull(num).sum() <= pd.isnull(date).sum():
                data[self._time_column] = num
            else:
                data[self._time_column] = date

            data.dropna(inplace=True)
            if len(self._groupby) == 2:
                ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
                ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
                ct = ct[pd.notnull(ct.index)]
            else:
                ct = pd.crosstab(
                    data[self._time_column],
                    pd.Series([""] * len(self._table[self._time_column]), name=""))

            # percentage area plot:
            if self.percentage:
                # if there is only one grouping variable (the time column), 
                # the cross table produces a Series, not a data frame. It 
                # isn't really very informative to plot it, but we provide 
                # for this special case anyway.
                if type(ct) == pd.Series:
                    ct = ct.apply(lambda x: 100)
                else:
                    ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
                ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
            else:
                if self.area:
                    # Stacked area plot:
                    if len(self._groupby) == 2:
                        self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
                    ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
                else:
                    # Line plot:
                    self.vmax = max(self.vmax, ct.values.max())
                    ct.plot(ax=plt.gca(), color=self.get_palette())

        self.map_data(plot_facet)

        if self.percentage:
            self.g.set(ylim=(0, 100))
        else:
            self.g.set(ylim=(0, self.vmax))
        self.g.set_axis_labels(self.options["label_x_axis"], self.options["label_y_axis"])

        if len(self._groupby) == 2:
            self.add_legend()

