def app_activity_features():
    """Build a device-by-app matrix of mean app activity.

    Reads the following CSV files from the current working directory:
    ``gender_age_train.csv``, ``gender_age_test.csv``, ``events.csv`` and
    ``app_events.csv``.

    Returns:
        A dense ``pandas.DataFrame`` with one row per ``device_id`` that has
        at least one app event (sorted by device id) and one column per
        ``app_id`` (ordered by first appearance when walking the
        (device_id, app_id) pairs in sorted order).  Each cell holds the mean
        of ``is_active`` over all events of that app on that device, or 0 when
        the app never appears for the device.  An empty DataFrame is returned
        when no app event belongs to a known device.
    """
    # Only the device ids of the train/test sets are needed to filter events.
    # Left-merging phone_brand_device_model.csv onto them can never add or
    # remove a device id, so that merge does not influence the result and is
    # not performed here.
    train = pd.read_csv("gender_age_train.csv", usecols=['device_id'])
    test = pd.read_csv("gender_age_test.csv", usecols=['device_id'])
    device_ids = pd.concat([train['device_id'], test['device_id']]).unique()
    del train, test
    print("data build")

    # For each (device, app) pair compute how active the app is:
    # - join app_events to events on event_id to learn the device of each row
    # - group by device_id and app_id and average is_active
    events = pd.read_csv("events.csv", usecols=['event_id', 'device_id'])
    events = events[events['device_id'].isin(device_ids)]
    app_events = pd.read_csv(
        "app_events.csv", usecols=['event_id', 'app_id', 'is_active'])
    merged = pd.merge(app_events, events, on='event_id')
    del events, app_events
    activity = merged.groupby(['device_id', 'app_id'])['is_active'].mean()
    del merged
    print("events build")

    if activity.empty:
        return pd.DataFrame()

    # Reshape so that each app becomes a feature column.  A single unstack
    # replaces the per-app loop with chained ``reshaped[app].loc[...] = ...``
    # assignment, which is slow and writes to a temporary under pandas
    # copy-on-write semantics.
    column_order = activity.index.get_level_values('app_id').unique()
    reshaped = activity.unstack(fill_value=0).reindex(columns=column_order)
    # Keep the axes unnamed, as callers received them before.
    reshaped.index.name = None
    reshaped.columns.name = None
    return reshaped
评论列表
文章目录