# feature_engineering_func.py -- source file, Python code snippet

def feature_eng_pt4(df_cf, max_strokes=15):
    '''
    function:
    - feature engineering pt4
      create a new dataframe that needs to be combined with the output
      dataframe of feature_eng_pt3
    - it creates 5 features per stroke, for the first `max_strokes` strokes
      of every image (row). Strokes beyond `max_strokes` are ignored; slots
      for strokes an image does not have are left at 0.

    - Create following features (i = 0 .. max_strokes-1):
      datapoint_percentage_stroke'i' = i-th entry of the row's
                            dp_percent_per_stroke list
            * do not confuse with the dp_percent_per_stroke column itself:
            dp_percent_per_stroke is a list, datapoint_percentage_stroke'i'
            is a single number.

      direction_stroke'i' = i-th entry of the row's direction list

      time_stroke'i' = i-th entry of the row's total_time_of_stroke list

      datapoint_stroke'i' = i-th entry of the row's dp_per_stroke list

      switch_stroke'i' = flag telling whether stroke 'i' exists in the image
                            0: stroke exists  1: stroke does not exist

      All values are stored in one float array, so every output column has a
      float dtype (including the count-like and flag columns).

    input:
      df_cf = dataframe with the columns stroke_number (number of strokes in
              the row) and dp_percent_per_stroke, direction,
              total_time_of_stroke, dp_per_stroke (one sequence per row with
              one entry per stroke), i.e. the output of feature_eng_pt3
      max_strokes = number of leading strokes to create features for
                    (default 15)

    output:
      new dataframe, indexed like df_cf, with 5 * max_strokes features
      (75 with the default), ordered block by block: all
      datapoint_percentage_stroke columns, then direction_stroke,
      time_stroke, datapoint_stroke and finally switch_stroke.
    '''
    # (output column prefix, source column in df_cf), in output block order.
    per_stroke_sources = (
        ("datapoint_percentage_stroke", 'dp_percent_per_stroke'),
        ("direction_stroke", 'direction'),
        ("time_stroke", 'total_time_of_stroke'),
        ("datapoint_stroke", 'dp_per_stroke'),
    )
    switch_start = max_strokes * len(per_stroke_sources)
    n_features = switch_start + max_strokes

    ar = np.zeros((len(df_cf), n_features))
    # Start with every stroke slot flagged as "does not exist" (1); the slots
    # of strokes that are present are cleared row by row below.
    ar[:, switch_start:] = 1

    # Work positionally on the raw values so duplicated index labels in
    # df_cf cannot turn a per-row lookup into a multi-row Series.
    source_values = [df_cf[source].values for _, source in per_stroke_sources]
    for row, count in enumerate(df_cf['stroke_number'].values):
        # Only the first `max_strokes` strokes get features; without this cap
        # a longer image would spill into the next feature block.
        used = min(int(count), max_strokes)
        for block, values in enumerate(source_values):
            start = block * max_strokes
            # Truncate to max_strokes only: a list whose length disagrees
            # with stroke_number still raises instead of being hidden.
            ar[row, start:start + used] = np.asarray(values[row])[:max_strokes]
        ar[row, switch_start:switch_start + used] = 0

    prefixes = [prefix for prefix, _ in per_stroke_sources] + ["switch_stroke"]
    column_names = ["{}{}".format(prefix, num)
                    for prefix in prefixes
                    for num in range(max_strokes)]
    # Name the columns once at construction instead of renaming them one by
    # one (each rename copies the whole frame).
    return pd.DataFrame(ar, index=df_cf.index, columns=column_names)