import numpy as np
import pandas as pd


def calc_tvd(label_dict, attr):
    '''
    attr should be a 0/1 pandas DataFrame with
    columns corresponding to label names,
    for example:
        names = list(zip(*self.graph))[0]
        calc_tvd(label_dict, attr[names])
    label_dict should be a dictionary mapping label name -> 1d array of samples
    '''
    #### Calculate Total Variation Distance ####
    if np.min(attr.values) < 0:
        raise ValueError('calc_tvd received attr that may not have been in {0,1}')
    label_names = list(label_dict.keys())
    attr = attr[label_names]
    # Assign an integer ID to every distinct attribute combination
    df2 = attr.drop_duplicates()
    df2 = df2.reset_index(drop=True).reset_index()
    df2 = df2.rename(columns={'index': 'ID'})
    # Empirical distribution of the real data over the IDs
    real_data_id = pd.merge(attr, df2)
    real_counts = pd.value_counts(real_data_id['ID'])
    real_pdf = real_counts / len(attr)
    # Empirical distribution of the (rounded) sampled labels over the same IDs
    label_list_dict = {k: np.round(v.ravel()) for k, v in label_dict.items()}
    df_dat = pd.DataFrame.from_dict(label_list_dict)
    dat_id = pd.merge(df_dat, df2, on=label_names, how='left')
    dat_counts = pd.value_counts(dat_id['ID'])
    dat_pdf = dat_counts / dat_counts.sum()
    # TVD = 0.5 * L1 distance between the two distributions
    diff = real_pdf.subtract(dat_pdf, fill_value=0)
    tvd = 0.5 * diff.abs().sum()
    return tvd
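
# A minimal usage sketch for calc_tvd (not part of the original source; the toy
# attribute table and label_dict below are invented). Comparing a distribution
# with itself should give a total variation distance of 0.
toy_attr = pd.DataFrame({'a': [0, 1, 1, 0], 'b': [1, 1, 0, 0]})
toy_labels = {'a': np.array([0, 1, 1, 0]), 'b': np.array([1, 1, 0, 0])}
print(calc_tvd(toy_labels, toy_attr))  # expected: 0.0
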
def global_stats(articles: pd.DataFrame):
    """Calculate global stats on article db."""
    print(f'Number of articles: {len(articles):,}')
    num_sources = len(pd.value_counts(articles['base_url'], sort=False))
    print(f'Number of news sources: {num_sources}')
    mean_wc = articles['word_count'].mean()
    print(f'Global mean word count: {mean_wc:.1f}')
    missing_authors = (articles['authors'] == '').sum()
    print(f'Missing authors: {missing_authors:,}')
    missing_titles = (articles['title'] == '').sum()
    print(f'Missing titles: {missing_titles}')
    missing_texts = (articles['text'] == '').sum()
    print(f'Missing texts: {missing_texts:,}')
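
# A hedged sketch calling global_stats on a toy article table (the column
# names mirror what the function reads; the rows themselves are invented).
toy_articles = pd.DataFrame({
    'base_url': ['example.com', 'example.com', 'news.org'],
    'word_count': [120, 340, 210],
    'authors': ['Jane Doe', '', 'John Roe'],
    'title': ['A headline', 'Another headline', ''],
    'text': ['Body text', '', 'Body text'],
})
global_stats(toy_articles)
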
def return_mean(datafile, mapping, flag_columns=None):
    # Rows whose first column is one of the mapped region labels
    mapped_regions = pd.DataFrame(datafile[datafile.iloc[:, 0].isin(mapping)])
    mean_values = mapped_regions.iloc[:, 1:].applymap(float).mean()
    # For flag columns, take the most frequent value instead of the mean
    if flag_columns is not None and flag_columns.any() and (len(mapping) > 1):
        mean_values[flag_columns] = (datafile[datafile.iloc[:, 0].isin(mapping)][flag_columns]
                                     ).apply(lambda x: pd.value_counts(x).index[0])
    return mean_values
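
# A small illustration (invented data) of the flag_columns pattern above:
# pd.value_counts sorts by frequency, so .index[0] picks the most common
# value, i.e. the mode of the column.
toy_flags = pd.Series(['A', 'B', 'A', 'A', 'B'])
print(pd.value_counts(toy_flags).index[0])  # 'A'
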
def normalizedIntradayCountStats(intradayStats, limitCount=5):
    # For each minute, number of days for which we have a valid measure (record)
    notNullCount = intradayStats.count()
    # Ignore minutes where we have low level of records
    notNullCount[notNullCount < limitCount] = None
    # Count how many times each value appears for each minute
    valueCount = intradayStats.apply(pd.value_counts)
    # Normalize each minute by records count
    res = valueCount.div(notNullCount, axis=1)
    return res
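
# A hedged sketch of normalizedIntradayCountStats on a tiny frame. The shape
# (rows = days, columns = minutes) is assumed from the comments above; the
# values are invented. With limitCount=2, the sparsely recorded second column
# is excluded from the normalisation (it comes back as NaN).
toy_intraday = pd.DataFrame({'00:00': [0, 1, 1], '00:01': [np.nan, np.nan, 2]})
print(normalizedIntradayCountStats(toy_intraday, limitCount=2))
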
def classify_user():
    # Requires: from sklearn.cluster import DBSCAN
    new_df_log_scaled = get_scaled_user()
    # Cluster users (columns) with DBSCAN on the log-scaled profiles
    c = DBSCAN(eps=90, min_samples=50, metric='manhattan').fit(new_df_log_scaled.T)
    pd.value_counts(c.labels_)  # no-op here; useful interactively to inspect cluster sizes
    d = c.labels_
    types = pd.DataFrame(d, index=new_df_log_scaled.columns)[0]
    # Map DBSCAN's noise label (-1) to its own user type
    types[types == -1] = 2
    return types
def word_count(string):
    return pd.value_counts(string.split()).to_dict()
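
# Quick check of word_count on an invented sentence.
print(word_count('to be or not to be'))  # {'to': 2, 'be': 2, 'or': 1, 'not': 1}
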
def get_entity_features(self):
    # First we will calculate the rates, so let's drop all the NaN
    rate_df = self.df.dropna(subset=['rate'])
    # Calculate the rates by hour and delete the old rate column.
    rate_df = rate_df.\
        merge(mean_hourly_rate_df(rate_df),
              left_on=['_id'], right_on=['_id']).\
        drop('rate', axis=1).\
        drop_duplicates()
    # Now get the stats we want for rate
    rate_df = self.calculate_entity_rate_features(rate_df)
    # Get a count of the entities
    df = pd.value_counts(self.df[self.entity]).\
        reset_index().\
        rename(columns={'index': self.entity,
                        self.entity: self.entity+'_count'})
    # Get counts of unique locations
    for loc_col, unique_loc_col in [('city_wikidata_id', 'unique_cities'),
                                    ('state_wikidata_id', 'unique_states')]:
        unique_loc_df = self.df.loc[:, [self.entity, loc_col]].\
            dropna().\
            drop_duplicates().\
            groupby(self.entity).\
            count().\
            reset_index().\
            rename(columns={loc_col: unique_loc_col})
        df = df.merge(unique_loc_df,
                      how='left',
                      left_on=self.entity,
                      right_on=self.entity)
        df.loc[:, unique_loc_col] = \
            df.loc[:, unique_loc_col].fillna(0).astype(int)
        del unique_loc_df
    # Reset the index on our rate dataframe and rename the columns
    rate_df.reset_index(level=0, inplace=True)
    rate_df.columns = [self.entity, 'rate_count', 'rate_mean',
                       'rate_std', 'rate_median']
    # Lastly merge the two dataframes
    return df.merge(rate_df, how='outer')
    # Save this code as we may use it later
    """df['incall_count'] = df['index'].apply(lambda x: self.get_incall_count(x))
    df['outcall_count'] = df['index'].apply(lambda x: self.get_outcall_count(x))"""
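
# A standalone sketch (toy data, not the original class) of the entity-count
# step above. Using rename_axis/reset_index(name=...) yields the same
# [entity, entity_count] layout without relying on the 'index' column name
# that older versions of value_counts().reset_index() produce.
toy_df = pd.DataFrame({'entity': ['p1', 'p2', 'p1', 'p3', 'p1']})
entity_counts = (toy_df['entity'].value_counts()
                 .rename_axis('entity')
                 .reset_index(name='entity_count'))
print(entity_counts)  # p1 -> 3, p2 -> 1, p3 -> 1
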
def test_plottingOnIntradayStats(self):
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic01.csv"
    data1 = utils.loadIntradayData(filepath)
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic02.csv"
    data2 = utils.loadIntradayData(filepath)
    stats = sleepStats.generateStatsFrom([data1, data2],
                                         sleepStats.STATS_NAME_INTRADAY)
    data = stats.apply(pd.value_counts)
    mplot.plotSleepValueHeatmap(data, sleepValue=1)
def describe_data(data, info=False, describe=False, value_counts=None, unique=None,
                  univariate_feature_selection=None, description=None):
    # Data diagnostics
    if description is not None:
        print("\n" + description)
    # Info
    if info:
        print("\nInfo:")
        print(data.info())
    # Description
    if describe:
        print("\nDescribe:")
        print(data.describe())
    # Value counts
    if value_counts is not None:
        for feature in value_counts:
            print("\nValue Counts [" + feature + "]")
            print(pd.value_counts(data[feature]))
    # Unique values
    if unique is not None:
        for feature in unique:
            print("\nUnique [" + feature + "]")
            print(data[feature].unique())
    # Univariate feature selection
    if univariate_feature_selection is not None:
        # Extract predictors and target
        predictors = univariate_feature_selection[0]
        target = univariate_feature_selection[1]
        # Perform feature selection
        selector = SelectKBest(f_classif, k="all")
        selector.fit(data[predictors], data[target])
        # Get the raw p-values for each feature, and transform from p-values into scores
        scores = -np.log10(selector.pvalues_)
        print("\nUnivariate Feature Selection:")
        for feature, imp in sorted(zip(predictors, scores),
                                   key=lambda x: x[1] if pd.notnull(x[1]) else 0):
            print(feature, imp)
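
# A hedged usage sketch for describe_data on an invented frame; it exercises
# only the info/describe/value_counts/unique branches, so the scikit-learn
# imports (SelectKBest, f_classif) are not needed here.
toy_data = pd.DataFrame({'colour': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
describe_data(toy_data, info=True, describe=True, value_counts=['colour'],
              unique=['colour'], description='Toy data')
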
def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
    def get_title(name):
        title_search = re.search(r' ([A-Za-z]+)\.', name)
        if title_search:
            return title_search.group(1)
        return ""

    def normalize_fare(data):
        new_data = None
        for embarked in (0, 1, 2):
            temp = data[data.Embarked == embarked].copy()
            temp['Fare'] /= temp['Fare'].values.mean()
            if new_data is None:
                new_data = temp
            else:
                new_data = pd.concat([new_data, temp])
        new_data = new_data.sort_values('PassengerId')
        return new_data

    data = pd.read_csv(self.file_name).replace('male', 0).replace('female', 1)
    data['Age'].fillna(data.Age.median(), inplace=True)
    data['Fare'].fillna(data.Fare.median(), inplace=True)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Embarked'] = data['Embarked'].replace('S', 0).replace('C', 1).replace('Q', 2)
    data['Embarked'].fillna(0, inplace=True)
    if norm_fare:
        data = normalize_fare(data)
    # Get all the titles and print how often each one occurs.
    titles = data["Name"].apply(get_title)
    print(pd.value_counts(titles))
    # Map each title to an integer. Some titles are very rare, and are compressed
    # into the same codes as other titles.
    title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5,
                     "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9,
                     "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7,
                     "Ms": 2}
    for k, v in title_mapping.items():
        titles[titles == k] = v
    # Add in the title column.
    data['Title'] = titles
    data['Title'].fillna(1, inplace=True)
    # data['Pos'] = data["Title"] + data['Pclass']
    if drop:
        # data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1)
        data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
        # data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1)
    print(data.keys())
    if title_to_onehot:
        self.encode(data, 'Title', [i for i in range(1, 11)])
        data = data.drop(['Title'], axis=1)
    return data
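
# A small sketch (invented, Titanic-style names) of the title-extraction step
# inside pre_process: pull the honorific out of 'Surname, Title. First' names
# and count how often each occurs with pd.value_counts.
import re

toy_names = pd.Series(['Braund, Mr. Owen', 'Cumings, Mrs. John', 'Heikkinen, Miss. Laina'])
toy_titles = toy_names.apply(lambda n: re.search(r' ([A-Za-z]+)\.', n).group(1))
print(pd.value_counts(toy_titles))  # Mr, Mrs and Miss each appear once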