def feature_eng_pt4(df_cf):
    """
    function:
        - feature engineering pt4
          Build a new dataframe meant to be combined with the output
          dataframe of feature_eng_pt3.
        - It creates 5 features per stroke, for the first 15 strokes of an
          image (75 columns in total). Strokes beyond the 15th are ignored.
        - Created features (i = 0 .. 14):
            datapoint_percentage_stroke{i} = value i of the row's
                dp_percent_per_stroke list (a single float per column,
                unlike dp_percent_per_stroke which holds a whole list)
            direction_stroke{i}  = value i of the row's direction list
            time_stroke{i}       = value i of the row's total_time_of_stroke
                list
            datapoint_stroke{i}  = value i of the row's dp_per_stroke list
            switch_stroke{i}     = 0 if stroke i exists in the image,
                1 if it does not
          Slots for strokes that do not exist are left at 0.
    input:
        df_cf = output dataframe from feature_eng_pt3. Must provide the
            columns stroke_number, dp_percent_per_stroke, direction,
            total_time_of_stroke and dp_per_stroke.
    output:
        new float dataframe with 75 columns (5 * 15 features), sharing the
        index of df_cf.
    """
    max_strokes = 15
    # (source column in df_cf, prefix of the generated column names).
    # The order defines the layout of the 15-wide blocks in the output.
    per_stroke_features = (
        ('dp_percent_per_stroke', 'datapoint_percentage_stroke'),
        ('direction', 'direction_stroke'),
        ('total_time_of_stroke', 'time_stroke'),
        ('dp_per_stroke', 'datapoint_stroke'),
    )
    # The switch block comes right after the per-stroke value blocks.
    switch_start = max_strokes * len(per_stroke_features)
    ar = np.zeros((len(df_cf), switch_start + max_strokes))

    for row, index_ in enumerate(df_cf.index):
        # Clamp so an image with more than 15 strokes cannot spill into the
        # neighbouring feature block (it used to corrupt data or raise).
        stroke = min(int(df_cf.at[index_, 'stroke_number']), max_strokes)
        for block, (source_col, _) in enumerate(per_stroke_features):
            start = block * max_strokes
            # Only the first 15 values are kept; a list whose length still
            # disagrees with stroke_number raises ValueError as before.
            values = np.asarray(df_cf.at[index_, source_col])[:max_strokes]
            ar[row, start:start + stroke] = values
        # ar starts as zeros, so only the "stroke is missing" flags are set.
        ar[row, switch_start + stroke:] = 1

    prefixes = [prefix for _, prefix in per_stroke_features]
    prefixes.append('switch_stroke')
    columns = []
    for prefix in prefixes:
        columns.extend(
            "{}{}".format(prefix, num) for num in range(max_strokes)
        )
    # Name the columns once instead of renaming them one at a time.
    return pd.DataFrame(ar, index=df_cf.index, columns=columns)
feature_engineering_func.py 文件源码
python
阅读 32
收藏 0
点赞 0
评论 0
评论列表
文章目录