python类crosstab()的实例源码

heatmap.py 文件源码 项目:coquery 作者: gkunter 项目源码 文件源码 阅读 38 收藏 0 点赞 0 评论 0
def draw(self):
        """
        Draw a heat map of cross-tabulated counts.

        With fewer than two grouping variables, the single grouping column
        of ``self._table`` is tabulated against a dummy one-level factor, so
        the heat map has a single row.  Otherwise the first two grouping
        variables are tabulated against each other, and the colour scale is
        capped at the largest cell count of the cross-tabulation that also
        includes the row and column facet factors (when these are set).

        The plotting callable is handed to ``self.map_data``, which is
        presumably responsible for calling it once per facet (not visible
        here -- confirm against ``map_data``).
        """

        # The closures below look `cmap` (and `vmax`) up when they are
        # called, so both only need to be bound before map_data() runs.
        cmap = ListedColormap(self.options["color_palette_values"])

        def get_crosstab(data, row_fact, col_fact, row_names, col_names):
            """
            Cross-tabulate two columns of `data`, forcing the given row and
            column labels (and their order); missing cells become 0.
            """
            ct = pd.crosstab(data[row_fact], data[col_fact])
            # DataFrame.reindex_axis() was removed in pandas 1.0; reindex()
            # with index=/columns= gives the same result on old and new
            # pandas versions.
            ct = ct.reindex(index=row_names, columns=col_names).fillna(0)
            return ct

        def plot(data, color):
            """ Plot the two-factor heat map for one subset of the data. """
            ct = get_crosstab(
                    data,
                    self._groupby[0],
                    self._groupby[1],
                    self._levels[0],
                    self._levels[1])

            sns.heatmap(ct,
                robust=True,
                annot=True,
                cbar=False,
                cmap=cmap,
                fmt="g",
                vmax=vmax,
                linewidths=1)

        if len(self._groupby) < 2:
            # create a dummy cross tab with one dimension containing empty
            # values:
            data_column = self._table[self._groupby[0]].reset_index(drop=True)
            tab = pd.crosstab(
                pd.Series([""] * len(data_column), name=""),
                data_column)
            # Note that this callable ignores its `data` argument and always
            # draws the table computed from the complete data column.
            plot_facet = lambda data, color: sns.heatmap(
                tab,
                robust=True,
                annot=True,
                cbar=False,
                cmap=cmap,
                fmt="g",
                linewidths=1)
        else:
            plot_facet = plot
            # Largest count over all facet/group combinations, used as a
            # common upper limit of the colour scale in plot().
            row_keys = [self._table[x]
                        for x in [self._row_factor, self._groupby[0]]
                        if x is not None]
            col_keys = [self._table[x]
                        for x in [self._col_factor, self._groupby[1]]
                        if x is not None]
            vmax = pd.crosstab(row_keys, col_keys).values.max()

        self.map_data(plot_facet)
ModelEvaluate.py 文件源码 项目:ModelFlow 作者: yuezPrincetechs 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def plot_ks_cdf(y_true,y_score,pos_label=1,label_map=None,color_map=None,decimals=0,
                xlabel='Score',ylabel='CumSum',fontsize=12,figsize=(18,8),close=True):
    '''
    Plot the cumulative score distribution of each class, titled with the KS value.

    Parameters:
    y_true: one-dimensional labels (converted to a pandas Series); the
        original documentation mentions label sets {0,1} or {-1,1}.
    y_score: one-dimensional scores (converted to a pandas Series).
    pos_label: int, label of the positive class; forwarded to cal_ks.
    label_map: dict mapping label values to legend names.
        Defaults to {0:'Good',1:'Bad'}.
    color_map: dict mapping label values to line colors; labels without an
        entry get None.  Defaults to {0:'g',1:'r'}.
    decimals: number of decimals the scores are rounded to before they are
        tabulated; also forwarded to cal_ks and used as the precision of
        the split value in the title.
    xlabel: label of the x axis.
    ylabel: label of the y axis.
    fontsize: int, font size of tick labels, axis labels and title.
    figsize: figure size passed to plt.figure.
    close: if true, plt.close('all') is called after plotting.

    Returns:
    dict {'ks': KS value, 'split': score split returned by cal_ks,
    'fig': the matplotlib figure}.
    '''
    if label_map is None:
        label_map={0:'Good',1:'Bad'}
    if color_map is None:
        # Previously a missing color_map crashed on color_map.get() below.
        color_map={0:'g',1:'r'}
    ks_dict = {}
    y_true=pd.Series(y_true)
    y_score=pd.Series(y_score)
    y_score_dataframe=pd.concat([y_true,y_score],axis=1)
    ks=cal_ks(y_true,y_score_dataframe,pos_label=pos_label,return_split=False,decimals=decimals)
    score_split=cal_ks(y_true,y_score_dataframe,pos_label=pos_label,return_split=True,decimals=decimals)

    # Rows: rounded score values; columns: labels.  The cumulative counts
    # divided by the column totals give one empirical CDF per label.
    crossfreq = pd.crosstab(y_score.round(decimals),y_true)
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    # Colors have to be looked up before the columns are renamed, because
    # color_map is keyed by the raw label values.
    color=crossdens.columns.map(lambda xx: color_map.get(xx,None))
    crossdens=crossdens.rename(columns=label_map)
    crossdens.columns.name=''
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    crossdens.plot(kind='line',ax=ax,fontsize=fontsize,color=color)
    ax.set_xlabel(xlabel,fontsize=fontsize)
    ax.set_ylabel(ylabel,fontsize=fontsize)
    ax.set_title('CDF Curve (KS=%.2f, SPLIT=%.*f)'%(ks,decimals,score_split),fontsize=fontsize)
    if close:
        # NOTE(review): this closes every open pyplot figure, not only `fig`.
        plt.close('all')
    ks_dict['ks'] = ks
    ks_dict['split'] = score_split
    ks_dict['fig'] = fig
    return ks_dict
features.py 文件源码 项目:AlphaPy 作者: ScottFreeLLC 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def create_crosstabs(model):
    r"""Build row-normalized cross-tabulations of the factor features.

    Every column of the training data that is listed among the model's
    factors is cross-tabulated against the training target; each row of the
    resulting table is divided by its row sum.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the data.

    Returns
    -------
    model : alphapy.Model
        The same model object, with ``feature_map['crosstabs']`` set to a
        dictionary that maps feature names to their cross-tabulations.

    """

    logger.info("Creating Cross-Tabulations")

    # Training data
    features = model.X_train
    target = model.y_train

    # Specifications; 'target_value' is read but not used in this function.
    factors = model.specs['factors']
    target_value = model.specs['target_value']

    def row_shares(row):
        # Turn the counts of one row into proportions of the row total.
        return row / row.sum()

    # One table per factor feature; all other columns are skipped.
    crosstabs = {}
    for fname in features:
        if fname not in factors:
            continue
        logger.info("Creating crosstabs for feature %s", fname)
        counts = pd.crosstab(features[fname], target)
        crosstabs[fname] = counts.apply(row_shares, axis=1)

    # Store the tables in the feature map
    model.feature_map['crosstabs'] = crosstabs
    return model


#
# Function get_factors
#
concordance_analysis.py 文件源码 项目:microbiomeHD 作者: cduvallet 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def concordance(series1, series2, method, nreps=1000):
    """
    Quantify how concordant two pandas Series are, using the chosen method.

    Parameters
    ----------
    series1, series2 : pandas Series
        Series with matching indexes.
    method : str
        One of 'fisher', 'spearman', 'kendalltau', 'empirical', 'cohen'.
    nreps : int
        Number of repetitions used to build the null distribution. Only
        passed on when method is 'empirical'.

    Returns
    -------
    measure : float
        Measure of concordance as returned by the underlying routine (e.g.
        the odds ratio for 'fisher', the correlation for 'spearman' and
        'kendalltau', Cohen's kappa for 'cohen').
    p : float
        p value of the observed concordance between series1 and series2.
        Always NaN for 'cohen'.

    Raises
    ------
    ValueError
        If `method` is none of the supported names.
    """

    if method == 'fisher':
        # The contingency table is built by crosstab, so entries which are
        # not present in both series are left out automatically.
        contingency = pd.crosstab(series1, series2)
        return fisher_exact(contingency)

    if method == 'spearman':
        return spearmanr(series1, series2)

    if method == 'kendalltau':
        return kendalltau(series1, series2, nan_policy='omit')

    if method == 'empirical':
        return empirical_pval(series1, series2, nreps)

    if method == 'cohen':
        # Keep only the entries that have a value in both series.
        paired = pd.concat((series1, series2), axis=1).dropna()
        kappa = cohen_kappa_score(paired.iloc[:, 0], paired.iloc[:, 1])
        return kappa, np.nan

    raise ValueError('Unknown concordance method.')
clustering_model_kmeans_external.py 文件源码 项目:ML-Predictions 作者: ltfschoen 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def process_clustering(self):
        """
        Cluster the numeric columns of the prediction data with K-Means and
        add an "extremism" column derived from the cluster distances.

        Nothing is done (apart from the progress message) when the selected
        dataset configuration has no non-empty "affiliation_column" entry.

        Side effects: prints diagnostics, optionally plots outliers, adds the
        "extremism" column to the prediction data, sorts it in place by that
        column (descending) and stores the result in
        ``self.prediction_data.df_listings``.
        """
        print("K-Means Clustering in progress...")

        dataset_choice = self.prediction_config.DATASET_LOCATION[self.prediction_config.DATASET_CHOICE]

        if "affiliation_column" not in dataset_choice or not dataset_choice["affiliation_column"]:
            return

        # Explore loaded data
        df = self.prediction_data
        affiliation_column = dataset_choice["affiliation_column"]

        centroids_quantity = self.prediction_config.CENTROIDS_QUANTITY
        # Initialise K-Means Clustering Model using specified quantity of clusters (centroids)
        # for training the model using the whole dataset.
        kmeans_model = KMeans(n_clusters=centroids_quantity, random_state=1)

        df_numeric = df.select_dtypes(include=['int', 'int64', 'float64', 'floating'], exclude=['O'])
        print("Excluding non-numeric columns from K-Means Clustering: ", df.select_dtypes(include=['O']).columns.tolist())

        print("All dtypes: ", dict(df.dtypes))
        print("Any rows null?: ", df.isnull().values.any())
        print("Columns/rows with NaN values: ", df[df.isnull().any(axis=1)])

        # Fit the K-Means Model to the DataFrame and transform it into a Numpy array with one
        # column per cluster (centroid); each value is the distance of that row to that
        # cluster (centroid).
        # Important Note: Pass only numeric dataframe columns
        clustered_row_distances = kmeans_model.fit_transform(df_numeric)

        # Explore the clusters by cross-tabulating the cluster label of each row against the
        # unique values of the Affiliation column (i.e. 'party')
        labels = kmeans_model.labels_
        # Total number of labelled rows and number of distinct clusters found
        print("Clusters total count: %r" % (len(labels.tolist())))
        print("Clusters unique count: %r" % (len(set(labels.tolist()))))
        # Every label gets a name. The previous truthiness test on the label turned
        # cluster 0 into None, because 0 is falsy.
        cluster_names = ["Cluster " + str(cluster_label) for cluster_label in labels]

        print("Cross Tabulation between Clustered Labels and Affiliation i.e. 'party' column: \n%r" % (pd.crosstab(index=labels, columns=df[affiliation_column])))

        if self.prediction_config.PLOT_KMEANS_OUTLIERS == True:
            self.example_plot_outliers(df, affiliation_column, labels, cluster_names, clustered_row_distances)

        # Generate new DataFrame column to be used as Target Column for Prediction Algorithms
        # (i.e. to detect which roll call votes were most likely to cause extremism such
        # that Senators would not vote along their own party lines).
        # Cubing the distances before summing makes large distances dominate the total.
        extremism = (clustered_row_distances ** 3).sum(axis=1)
        df["extremism"] = extremism
        df.sort_values("extremism", inplace=True, ascending=False)
        print("Top 10 observations ranked in order of 'extremism': %r" % (df.head(10)))
        # NOTE(review): `df` is self.prediction_data itself, so this stores the frame as an
        # attribute of itself -- confirm that this is what the readers of df_listings expect.
        self.prediction_data.df_listings = df
plot.py 文件源码 项目:cohorts 作者: hammerlab 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def fishers_exact_plot(data, condition1, condition2, ax=None,
                       condition1_value=None,
                       alternative="two-sided", **kwargs):
    """
    Perform a Fisher's exact test to compare two binary columns

    Draws a bar plot of `condition2` by `condition1`, prints the contingency
    table and the test result, and marks the plot with a significance
    indicator when the p-value is at most 0.05.

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from

    condition1: str
        First binary column to compare (and used for test sidedness)

    condition2: str
        Second binary column to compare

    ax : Axes, default None
        Axes to plot on

    condition1_value:
        If `condition1` is not a binary column, split on =/!= to condition1_value

    alternative:
        Sidedness of the test. Only "two-sided" is currently accepted;
        any other value (e.g. "less" or "greater") raises ValueError.

    **kwargs:
        Passed on to the bar plot call.

    Returns
    -------
    FishersExactResults
        Odds ratio, p-value, sidedness string, the `condition2` values of
        the rows with and without condition1, and the plot.

    Raises
    ------
    ValueError
        If `alternative` is not "two-sided".
    """
    # Reject unsupported sidedness up front, before anything is plotted,
    # printed or computed.
    if alternative != "two-sided":
        raise ValueError("We need to better understand the one-sided Fisher's Exact test")
    sided_str = "two-sided"

    plot = sb.barplot(
        x=condition1,
        y=condition2,
        ax=ax,
        data=data,
        **kwargs
    )

    plot.set_ylabel("Percent %s" % condition2)
    condition1_mask = get_condition_mask(data, condition1, condition1_value)
    # NOTE(review): the table is built from the raw condition1 column, not
    # from condition1_mask -- confirm this is intended when condition1_value
    # is given for a non-binary column.
    count_table = pd.crosstab(data[condition1], data[condition2])
    print(count_table)
    oddsratio, p_value = fisher_exact(count_table, alternative=alternative)
    add_significance_indicator(plot=plot, significant=p_value <= 0.05)
    only_percentage_ticks(plot)

    print("Fisher's Exact Test: OR: {}, p-value={} ({})".format(oddsratio, p_value, sided_str))
    return FishersExactResults(oddsratio=oddsratio,
                               p_value=p_value,
                               sided_str=sided_str,
                               with_condition1_series=data[condition1_mask][condition2],
                               without_condition1_series=data[~condition1_mask][condition2],
                               plot=plot)
read_clean_data.py 文件源码 项目:human_activity 作者: bfetler 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def rfFitScore(clf, dftrain, dftrain_y, dftest, dftest_y):
    '''Fit a random forest classifier and report how it scores on test data.

       clf=RandomForestClassifier, dftrain=train data,
       dftrain_y=train data Y (column 'Y' is used), dftest=test data,
       dftest_y=test data Y (column 'Y' is used).

       Prints the test score, the out-of-bag score (if enabled on clf),
       hit/miss percentages, a cross table of actual vs. predicted labels,
       and precision/recall/fbeta/support.

       Returns (test_score, feature_importances), with the out-of-bag
       score appended as third element when clf.oob_score is set.'''

    fitted = clf.fit(dftrain, dftrain_y['Y'])
    importances = fitted.feature_importances_
    predicted = fitted.predict(dftest)

    test_score = fitted.score(dftest, dftest_y['Y'])
    print("test score:", test_score)
    if clf.oob_score:
        print("oob score", fitted.oob_score_)

    # Re-derive the score by counting matching and non-matching predictions
    n_rows = dftest_y.shape[0]
    n_right = sum(dftest_y['Y'] == predicted)
    print("predict True %.3f percent, %d out of %d" %
          (100 * n_right / n_rows, n_right, n_rows))
    n_wrong = sum(dftest_y['Y'] != predicted)
    print("predict False %.3f percent, %d out of %d" %
          (100 * n_wrong / n_rows, n_wrong, n_rows))

    # Cross table of actual labels (rows) against predicted labels (columns)
    ptab = pd.crosstab(dftest_y['Y'], predicted,
                       rownames=['actual'], colnames=['predicted'])
    print("cross table:\n", ptab)

    # accuracy:  share of samples labeled correctly
    # precision: true positives / (true positives + false positives)
    # recall:    true positives / (true positives + false negatives)
    precision, recall, fbeta, support = prfs(dftest_y['Y'], predicted)
    print("precision", precision, "\nrecall", recall,
          "\nfbeta", fbeta, "\nsupport", support)

    if not clf.oob_score:
        return test_score, importances
    return test_score, importances, fitted.oob_score_
lr_pd_2.py 文件源码 项目:python_utils 作者: Jayhello 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def get_data():
    """
    Load the UCLA admission dataset, print summaries and evaluate two models.

    Reads the CSV file from a fixed relative path, prints head, describe(),
    std() and a cross-tabulation of 'admit' by 'rank', then splits the data
    80/20, fits a logistic regression and a random forest on the training
    part and prints their test-set metrics. Nothing is returned.

    Every column except the first one is used as a feature (this assumes
    that 'admit' is the first column of the file -- TODO confirm).
    """
    f_path = "../dataset/logistic_regression/UCLA_dataset.csv"
    df = pd.read_csv(f_path)

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print(df.head())
    print(df.describe())
    print(df.std())
    print(pd.crosstab(df['admit'], df['rank'], rownames=['admit']))

    train, test = train_test_split(df, test_size=0.2)
    train_x, train_y = train[train.columns[1:]], train['admit']
    test_x, test_y = test[test.columns[1:]], test['admit']

    # Logistic regression: accuracy on the held-out part (as a fraction)
    lr = LogisticRegression()
    lr.fit(train_x, train_y)
    lr_pred = lr.predict(test_x)
    print(accuracy_score(test_y, lr_pred))

    # Random forest: confusion matrix, accuracy and recall
    rf = RandomForestClassifier(n_jobs=4)
    rf.fit(train_x, train_y)
    rf_pred = rf.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, rf_pred)
    print(cnf_matrix)

    # Both scores are fractions in [0, 1]; scale them by 100 so that the
    # value matches the "%" sign that is printed after it.
    accuracy_percent = accuracy_score(test_y, rf_pred) * 100
    print("accuracy is: %s%s" % (accuracy_percent, '%'))
    recall_percent = recall_score(test_y, rf_pred) * 100
    print("recall is: %s%s" % (recall_percent, '%'))


问题


面经


文章

微信
公众号

扫码关注公众号