import numpy as np
import pandas as pd


def calc_tvd(label_dict, attr):
    '''
    attr should be a 0/1 pandas DataFrame with
    columns corresponding to label names,
    for example:
        names = list(zip(*self.graph))[0]
        calc_tvd(label_dict, attr[names])
    label_dict should be a dictionary mapping label name -> 1d array of samples
    '''
    #### Calculate Total Variation Distance ####
    if np.min(attr.values) < 0:
        raise ValueError('calc_tvd received attr that may not have been in {0,1}')
    label_names = list(label_dict.keys())
    attr = attr[label_names]
    # Assign an integer ID to every distinct attribute combination
    df2 = attr.drop_duplicates()
    df2 = df2.reset_index(drop=True).reset_index()
    df2 = df2.rename(columns={'index': 'ID'})
    # Empirical distribution of the real data over the IDs
    real_data_id = pd.merge(attr, df2)
    real_counts = pd.value_counts(real_data_id['ID'])
    real_pdf = real_counts / len(attr)
    # Empirical distribution of the (rounded) sampled labels over the same IDs
    label_list_dict = {k: np.round(v.ravel()) for k, v in label_dict.items()}
    df_dat = pd.DataFrame.from_dict(label_list_dict)
    dat_id = pd.merge(df_dat, df2, on=label_names, how='left')
    dat_counts = pd.value_counts(dat_id['ID'])
    dat_pdf = dat_counts / dat_counts.sum()
    # TVD = 0.5 * L1 distance between the two distributions
    diff = real_pdf.subtract(dat_pdf, fill_value=0)
    tvd = 0.5 * diff.abs().sum()
    return tvd
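
# A minimal usage sketch for calc_tvd (not part of the original source; the toy
# attribute table and label_dict below are invented). Comparing a distribution
# with itself should give a total variation distance of 0.
toy_attr = pd.DataFrame({'a': [0, 1, 1, 0], 'b': [1, 1, 0, 0]})
toy_labels = {'a': np.array([0, 1, 1, 0]), 'b': np.array([1, 1, 0, 0])}
print(calc_tvd(toy_labels, toy_attr))  # expected: 0.0
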
def global_stats(articles: pd.DataFrame):
    """Calculate global stats on article db."""
    print(f'Number of articles: {len(articles):,}')
    num_sources = len(pd.value_counts(articles['base_url'], sort=False))
    print(f'Number of news sources: {num_sources}')
    mean_wc = articles['word_count'].mean()
    print(f'Global mean word count: {mean_wc:.1f}')
    missing_authors = (articles['authors'] == '').sum()
    print(f'Missing authors: {missing_authors:,}')
    missing_titles = (articles['title'] == '').sum()
    print(f'Missing titles: {missing_titles}')
    missing_texts = (articles['text'] == '').sum()
    print(f'Missing texts: {missing_texts:,}')
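
# A hedged sketch calling global_stats on a toy article table (the column
# names mirror what the function reads; the rows themselves are invented).
toy_articles = pd.DataFrame({
    'base_url': ['example.com', 'example.com', 'news.org'],
    'word_count': [120, 340, 210],
    'authors': ['Jane Doe', '', 'John Roe'],
    'title': ['A headline', 'Another headline', ''],
    'text': ['Body text', '', 'Body text'],
})
global_stats(toy_articles)
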
def return_mean(datafile, mapping, flag_columns=None):
    # Rows whose first column is one of the mapped region labels
    mapped_regions = pd.DataFrame(datafile[datafile.iloc[:, 0].isin(mapping)])
    mean_values = mapped_regions.iloc[:, 1:].applymap(float).mean()
    # For flag columns, take the most frequent value instead of the mean
    if flag_columns is not None and flag_columns.any() and (len(mapping) > 1):
        mean_values[flag_columns] = (datafile[datafile.iloc[:, 0].isin(mapping)][flag_columns]
                                     ).apply(lambda x: pd.value_counts(x).index[0])
    return mean_values
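
# A small illustration (invented data) of the flag_columns pattern above:
# pd.value_counts sorts by frequency, so .index[0] picks the most common
# value, i.e. the mode of the column.
toy_flags = pd.Series(['A', 'B', 'A', 'A', 'B'])
print(pd.value_counts(toy_flags).index[0])  # 'A'
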
def normalizedIntradayCountStats(intradayStats, limitCount=5):
    # For each minute, number of days for which we have a valid measure (record)
    notNullCount = intradayStats.count()
    # Ignore minutes where we have low level of records
    notNullCount[notNullCount < limitCount] = None
    # Count how many times each value appears for each minute
    valueCount = intradayStats.apply(pd.value_counts)
    # Normalize each minute by records count
    res = valueCount.div(notNullCount, axis=1)
    return res
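
# A hedged sketch of normalizedIntradayCountStats on a tiny frame. The shape
# (rows = days, columns = minutes) is assumed from the comments above; the
# values are invented. With limitCount=2, the sparsely recorded second column
# is excluded from the normalisation (it comes back as NaN).
toy_intraday = pd.DataFrame({'00:00': [0, 1, 1], '00:01': [np.nan, np.nan, 2]})
print(normalizedIntradayCountStats(toy_intraday, limitCount=2))
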
def classify_user():
    # Requires: from sklearn.cluster import DBSCAN
    new_df_log_scaled = get_scaled_user()
    # Cluster users (columns) with DBSCAN on the log-scaled profiles
    c = DBSCAN(eps=90, min_samples=50, metric='manhattan').fit(new_df_log_scaled.T)
    pd.value_counts(c.labels_)  # no-op here; useful interactively to inspect cluster sizes
    d = c.labels_
    types = pd.DataFrame(d, index=new_df_log_scaled.columns)[0]
    # Map DBSCAN's noise label (-1) to its own user type
    types[types == -1] = 2
    return types
def word_count(string):
    return pd.value_counts(string.split()).to_dict()
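
# Quick check of word_count on an invented sentence.
print(word_count('to be or not to be'))  # {'to': 2, 'be': 2, 'or': 1, 'not': 1}
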
def get_entity_features(self):
    # First we will calculate the rates, so let's drop all the NaN
    rate_df = self.df.dropna(subset=['rate'])
    # Calculate the rates by hour and delete the old rate column.
    rate_df = rate_df.\
        merge(mean_hourly_rate_df(rate_df),
              left_on=['_id'], right_on=['_id']).\
        drop('rate', axis=1).\
        drop_duplicates()
    # Now get the stats we want for rate
    rate_df = self.calculate_entity_rate_features(rate_df)
    # Get a count of the entities
    df = pd.value_counts(self.df[self.entity]).\
        reset_index().\
        rename(columns={'index': self.entity,
                        self.entity: self.entity+'_count'})
    # Get counts of unique locations
    for loc_col, unique_loc_col in [('city_wikidata_id', 'unique_cities'),
                                    ('state_wikidata_id', 'unique_states')]:
        unique_loc_df = self.df.loc[:, [self.entity, loc_col]].\
            dropna().\
            drop_duplicates().\
            groupby(self.entity).\
            count().\
            reset_index().\
            rename(columns={loc_col: unique_loc_col})
        df = df.merge(unique_loc_df,
                      how='left',
                      left_on=self.entity,
                      right_on=self.entity)
        df.loc[:, unique_loc_col] = \
            df.loc[:, unique_loc_col].fillna(0).astype(int)
        del unique_loc_df
    # Reset the index on our rate dataframe and rename the columns
    rate_df.reset_index(level=0, inplace=True)
    rate_df.columns = [self.entity, 'rate_count', 'rate_mean',
                       'rate_std', 'rate_median']
    # Lastly merge the two dataframes
    return df.merge(rate_df, how='outer')
    # Save this code as we may use it later
    """df['incall_count'] = df['index'].apply(lambda x: self.get_incall_count(x))
    df['outcall_count'] = df['index'].apply(lambda x: self.get_outcall_count(x))"""
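
# A standalone sketch (toy data, not the original class) of the entity-count
# step above. Using rename_axis/reset_index(name=...) yields the same
# [entity, entity_count] layout without relying on the 'index' column name
# that older versions of value_counts().reset_index() produce.
toy_df = pd.DataFrame({'entity': ['p1', 'p2', 'p1', 'p3', 'p1']})
entity_counts = (toy_df['entity'].value_counts()
                 .rename_axis('entity')
                 .reset_index(name='entity_count'))
print(entity_counts)  # p1 -> 3, p2 -> 1, p3 -> 1
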
def test_plottingOnIntradayStats(self):
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic01.csv"
    data1 = utils.loadIntradayData(filepath)
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic02.csv"
    data2 = utils.loadIntradayData(filepath)
    stats = sleepStats.generateStatsFrom([data1, data2],
                                         sleepStats.STATS_NAME_INTRADAY)
    data = stats.apply(pd.value_counts)
    mplot.plotSleepValueHeatmap(data, sleepValue=1)
def describe_data(data, info=False, describe=False, value_counts=None, unique=None,
                  univariate_feature_selection=None, description=None):
    # Data diagnostics
    if description is not None:
        print("\n" + description)
    # Info
    if info:
        print("\nInfo:")
        print(data.info())
    # Description
    if describe:
        print("\nDescribe:")
        print(data.describe())
    # Value counts
    if value_counts is not None:
        for feature in value_counts:
            print("\nValue Counts [" + feature + "]")
            print(pd.value_counts(data[feature]))
    # Unique values
    if unique is not None:
        for feature in unique:
            print("\nUnique [" + feature + "]")
            print(data[feature].unique())
    # Univariate feature selection
    if univariate_feature_selection is not None:
        # Extract predictors and target
        predictors = univariate_feature_selection[0]
        target = univariate_feature_selection[1]
        # Perform feature selection
        selector = SelectKBest(f_classif, k="all")
        selector.fit(data[predictors], data[target])
        # Get the raw p-values for each feature, and transform from p-values into scores
        scores = -np.log10(selector.pvalues_)
        print("\nUnivariate Feature Selection:")
        for feature, imp in sorted(zip(predictors, scores),
                                   key=lambda x: x[1] if pd.notnull(x[1]) else 0):
            print(feature, imp)
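
# A hedged usage sketch for describe_data on an invented frame; it exercises
# only the info/describe/value_counts/unique branches, so the scikit-learn
# imports (SelectKBest, f_classif) are not needed here.
toy_data = pd.DataFrame({'colour': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
describe_data(toy_data, info=True, describe=True, value_counts=['colour'],
              unique=['colour'], description='Toy data')
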
def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
    def get_title(name):
        title_search = re.search(r' ([A-Za-z]+)\.', name)
        if title_search:
            return title_search.group(1)
        return ""

    def normalize_fare(data):
        new_data = None
        for embarked in (0, 1, 2):
            temp = data[data.Embarked == embarked].copy()
            temp['Fare'] /= temp['Fare'].values.mean()
            if new_data is None:
                new_data = temp
            else:
                new_data = pd.concat([new_data, temp])
        new_data = new_data.sort_values('PassengerId')
        return new_data

    data = pd.read_csv(self.file_name).replace('male', 0).replace('female', 1)
    data['Age'].fillna(data.Age.median(), inplace=True)
    data['Fare'].fillna(data.Fare.median(), inplace=True)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Embarked'] = data['Embarked'].replace('S', 0).replace('C', 1).replace('Q', 2)
    data['Embarked'].fillna(0, inplace=True)
    if norm_fare:
        data = normalize_fare(data)
    # Get all the titles and print how often each one occurs.
    titles = data["Name"].apply(get_title)
    print(pd.value_counts(titles))
    # Map each title to an integer. Some titles are very rare, and are compressed
    # into the same codes as other titles.
    title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5,
                     "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9,
                     "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7,
                     "Ms": 2}
    for k, v in title_mapping.items():
        titles[titles == k] = v
    # Add in the title column.
    data['Title'] = titles
    data['Title'].fillna(1, inplace=True)
    # data['Pos'] = data["Title"] + data['Pclass']
    if drop:
        # data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1)
        data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
        # data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1)
    print(data.keys())
    if title_to_onehot:
        self.encode(data, 'Title', [i for i in range(1, 11)])
        data = data.drop(['Title'], axis=1)
    return data
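
# A small sketch (invented, Titanic-style names) of the title-extraction step
# inside pre_process: pull the honorific out of 'Surname, Title. First' names
# and count how often each occurs with pd.value_counts.
import re

toy_names = pd.Series(['Braund, Mr. Owen', 'Cumings, Mrs. John', 'Heikkinen, Miss. Laina'])
toy_titles = toy_names.apply(lambda n: re.search(r' ([A-Za-z]+)\.', n).group(1))
print(pd.value_counts(toy_titles))  # Mr, Mrs and Miss each appear once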