def get_entity_features(self):
    """Build a per-entity feature table from ``self.df``.

    For every distinct value of the ``self.entity`` column the result holds:

    * ``<entity>_count``: number of rows of ``self.df`` carrying that value.
    * ``unique_cities`` / ``unique_states``: number of distinct non-null
      ``city_wikidata_id`` / ``state_wikidata_id`` values observed together
      with that entity (0 when none were observed).
    * ``rate_count``, ``rate_mean``, ``rate_std``, ``rate_median``: the
      output of ``self.calculate_entity_rate_features``, relabelled here.
      NOTE(review): the labels assume that helper returns exactly these four
      statistics, in this order, indexed by entity -- confirm against its
      implementation.

    Returns:
        pandas.DataFrame: outer merge, on ``self.entity``, of the count
        features and the rate features.
    """
    # Rate statistics only make sense for rows that actually carry a rate.
    rate_df = self.df.dropna(subset=['rate'])
    # Attach the output of mean_hourly_rate_df (joined on '_id'), then drop
    # the raw 'rate' column and any rows that became duplicates.
    hourly_df = mean_hourly_rate_df(rate_df)
    rate_df = (
        rate_df
        .merge(hourly_df, on=['_id'])
        .drop('rate', axis=1)
        .drop_duplicates()
    )
    # Now get the stats we want for rate.
    rate_df = self.calculate_entity_rate_features(rate_df)

    # Count the rows per entity. rename_axis + reset_index(name=...) yields
    # the columns [<entity>, <entity>_count] regardless of how the installed
    # pandas version names the value_counts() result.
    count_col = self.entity + '_count'
    df = (
        self.df[self.entity]
        .value_counts()
        .rename_axis(self.entity)
        .reset_index(name=count_col)
    )

    # Count the distinct locations seen with each entity.
    location_columns = (
        ('city_wikidata_id', 'unique_cities'),
        ('state_wikidata_id', 'unique_states'),
    )
    for loc_col, unique_loc_col in location_columns:
        unique_loc_df = (
            self.df.loc[:, [self.entity, loc_col]]
            .dropna()
            .drop_duplicates()
            .groupby(self.entity)
            .count()
            .reset_index()
            .rename(columns={loc_col: unique_loc_col})
        )
        df = df.merge(unique_loc_df, how='left', on=self.entity)
        # Entities without any known location come out of the left merge as
        # NaN. Assign the whole column (not df.loc[:, col] = ...) so the
        # integer dtype actually replaces the float column.
        df[unique_loc_col] = df[unique_loc_col].fillna(0).astype(int)

    # Move the entity out of the index and label the rate columns. Rebinding
    # (instead of inplace=True) leaves the helper's returned object untouched.
    rate_df = rate_df.reset_index(level=0)
    rate_df.columns = [self.entity, 'rate_count', 'rate_mean',
                       'rate_std', 'rate_median']

    # Saved for possible later use (was an unreachable string after return):
    # df['incall_count'] = df['index'].apply(lambda x: self.get_incall_count(x))
    # df['outcall_count'] = df['index'].apply(lambda x: self.get_outcall_count(x))

    # Lastly merge the two dataframes, keeping entities present in either.
    return df.merge(rate_df, how='outer', on=self.entity)
评论列表
文章目录