# feature_engineering_func.py - Python source code snippet

def feature_engineering_CNN(df,category,sample=60000,purpose='word',countries=('US','BR','RU','KR')):
    '''
    function:
    - runs the two CNN feature engineering steps (CNN_feat_eng_pt1, then
      CNN_feat_eng_pt2) on df, selects rows according to purpose, and pickles
      the result for CNN modeling.
    - it also prints out how long it takes to run.

    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe. (Used only when purpose = 'word')
        np.random.choice raises ValueError when sample is larger than the
        number of rows available, because rows are drawn without replacement.
    - purpose = 'word' or 'country'. prepares data for different purposes.
        'word' for image recognition, 'country' for country prediction.
        any other value keeps every row produced by the feature engineering steps.
    - countries = list-like of country codes used in country prediction.
        may have any length; rows whose 'countrycode' is in it are kept.

    output:
    - pickled dataframe that will be used for CNN modeling (1176 features)
    - each row represents 42 by 28 pixel image
    - the returned dataframe is indexed by 'key_id' (the column is kept as well)
    file name: "./data/{}.pkl".format(category)
    '''

    start_time = time.time()
    #runs CNN feature engineering functions
    df_2 = CNN_feat_eng_pt2(CNN_feat_eng_pt1(df))
    #If purpose = 'word' it will randomly select 'sample' number of datapoints from df_2
    if purpose == 'word':
        # Renumber rows 0..n-1 so the drawn integers are row positions;
        # reset_index returns a new frame and works on Python 2 and 3 alike.
        df_2 = df_2.reset_index(drop=True)
        random_ind = np.random.choice(len(df_2), sample, replace=False)
        df_2 = df_2.iloc[random_ind]
    #If purpose = 'country', it will collect all datapoints from the selected countries.
    elif purpose == 'country':
        # isin handles any number of country codes instead of exactly four.
        df_2 = df_2[df_2['countrycode'].isin(list(countries))]
    # set_index returns a new frame, so a frame handed back by the helper
    # functions is never re-indexed in place; drop=False keeps the column.
    df_2 = df_2.set_index('key_id', drop=False)
    df_2.to_pickle("./data/{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
    return df_2



##############################################################################
#           functions for feature engineering for ensemble methods           #
##############################################################################