feature_engineering_func.py 文件源码-python代码片段

def feature_engineering_ensemble(df,category,sample=60000,purpose='word',\
                                            countries = ['US','BR','RU','KR']):
    '''
    function:
    - aggregates multiple user defined functions to create dataframe for ensemble method modeling.
    - it also prints out how long it takes to run
    - processes google quickdraw raw data dataframe
    - after this processing, dataframe contains 404 features
    - the output of this function will be used for ensemble method modeling.

    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe. (Used only when purpose = 'word')  
    - purpose = 'word' or 'country'. prepares data for different purposes.
        'word' for image recognition, 'country' for country prediction
    - countries = list of country code used in country prediction

    output:
    - pickled dataframe that will be used for ensemble method (404 features)
    filename: "./data/MY_feature_{}.pkl".format(category)
    '''
    start_time = time.time()
    #runs feature_eng_pt1 through pt5.
    df_test1 = feature_eng_pt1(df)
    df_test2 = feature_eng_pt2(df_test1)
    df_test3 = feature_eng_pt3(df_test2)
    df_subset = feature_eng_pt4(df_test3)
    df_subset2 = feature_eng_pt5(df_test3)
    df_final = pd.concat([df_test3,df_subset,df_subset2], axis=1)

    # prepares final dataframe
    #If purpose = 'word' it will randomly select 'sample' number of datapoints from df_final
    if purpose == 'word':
        df_final.index = xrange(len(df_final))
        random_ind = np.random.choice(list(df_final.index), sample, replace=False)
        df_final = df_final.loc[list(random_ind)]
    #if purpose = 'country', it will correct all datapoints from the selected countries.
    elif purpose == 'country':
        df_final = df_final[(df_final['countrycode']==countries[0])|\
                (df_final['countrycode']==countries[1])|\
               (df_final['countrycode']==countries[2])|(df_final['countrycode']==countries[3])]
    df_final.index = df_final['key_id']
    df_final.to_pickle("./data/MY_feature_{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))