# test_submission.py -- Python source code snippet

def app_activity_features():
    """Build a device-by-app matrix of mean app activity.

    Reads ``gender_age_train.csv``, ``gender_age_test.csv``, ``events.csv``
    and ``app_events.csv`` from the current working directory. Events are
    restricted to devices listed in the train or test file, joined to
    ``app_events`` on ``event_id``, and ``is_active`` is averaged for every
    ``(device_id, app_id)`` pair.

    Returns:
        pandas.DataFrame: one row per device that has at least one app
        event (sorted by ``device_id``) and one column per app seen on those
        devices (in order of first appearance in the grouped data). Each
        cell holds the mean ``is_active`` of that device/app pair, or 0 when
        the pair never occurs. Neither axis carries a name.
    """
    # Only the device ids feed the event filter below, so brand/model data
    # is not loaded: a left merge against it cannot change the set of ids.
    train = pd.read_csv("gender_age_train.csv")
    test = pd.read_csv("gender_age_test.csv")
    device_ids = pd.concat([train['device_id'], test['device_id']],
                           ignore_index=True)
    del train, test
    print("data build")

    # Keep only events of known devices, attach the device id to every app
    # event, then average the activity flag per (device, app) pair.
    events = pd.read_csv("events.csv")
    events = events[events['device_id'].isin(device_ids)]
    apps = pd.read_csv("app_events.csv")
    apps = pd.merge(apps[['event_id', 'app_id', 'is_active']],
                    events[['event_id', 'device_id']],
                    on='event_id')
    del events
    activity = apps.groupby(['device_id', 'app_id'])['is_active'].mean()
    del apps
    print("events build")

    # Reshape so that each app becomes a feature column. A single unstack
    # replaces the per-app loop that wrote through chained indexing
    # (``frame[col].loc[rows] = ...``), which pandas does not guarantee to
    # update the parent frame. ``fill_value`` only fills pairs that are
    # absent, so genuine NaN means are left untouched.
    app_order = pd.unique(activity.index.get_level_values('app_id'))
    reshaped = activity.unstack('app_id', fill_value=0)
    # unstack sorts the columns; restore first-appearance order.
    reshaped = reshaped.reindex(columns=app_order)
    reshaped.index.name = None
    reshaped.columns.name = None
    return reshaped