def parse_context_ftl(path):
    raw = read(path, "ftl")
    df = raw.copy()
    df['ut_ms'] = pd.to_datetime(raw['utb_ms'], unit='ms')
    df.sort_values("ut_ms", inplace=True)
    # hourly counts of each event type via one-hot dummies
    df = df.set_index('ut_ms')
    dummies = pd.get_dummies(df.type).join(df['flagcomms'], how="outer")
    dummies = dummies.resample("1h").sum().fillna(0.0)
    # minutes-per-hour duration features per (type, flagcomms) event
    df = raw.copy()
    df['event'] = df.type + df.flagcomms.astype(str)
    del df['type'], df['flagcomms']
    df['ute_ms'] = pd.to_datetime(df['ute_ms'], unit='ms')
    df['utb_ms'] = pd.to_datetime(df['utb_ms'], unit='ms')
    durations = [event_to_min_per_hour(df, event) for event in df.event.unique()]
    durations = pd.concat(durations, axis=1).fillna(0)
    return dummies.join(durations, how="outer")
Python get_dummies() usage examples
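For reference, a minimal sketch of what pd.get_dummies does (toy data, not taken from any of the snippets below): each distinct value of a column becomes its own 0/1 indicator column.

import pandas as pd

# hypothetical toy frame with a single categorical column
toy = pd.DataFrame({'color': ['red', 'green', 'red']})
dummies = pd.get_dummies(toy['color'], prefix='color').astype(int)
# dummies now holds the indicator columns color_green and color_red
print(dummies)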
def predict():
    if clf:
        try:
            json_ = request.json
            query = pd.get_dummies(pd.DataFrame(json_))
            # https://github.com/amirziai/sklearnflask/issues/3
            # Thanks to @lorenzori
            query = query.reindex(columns=model_columns, fill_value=0)
            prediction = list(clf.predict(query))
            return jsonify({'prediction': prediction})
        except Exception as e:
            return jsonify({'error': str(e), 'trace': traceback.format_exc()})
    else:
        print('train first')
        return 'no model here'
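The reindex call above is the important detail: get_dummies on a single request only produces columns for the categories present in that request, so the query frame is realigned to the column list saved at training time, filling missing indicators with 0 and dropping unseen ones. A minimal sketch of the same idea on toy data (model_columns here is an invented example, not the snippet's actual list):

import pandas as pd

model_columns = ['age', 'state_CA', 'state_NY']  # columns seen at training time
query = pd.get_dummies(pd.DataFrame([{'age': 40, 'state': 'CA'}]))
query = query.reindex(columns=model_columns, fill_value=0)
# query now has exactly the training columns: state_CA is 1, state_NY is 0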
def preprocess(file, istrain):
    df = pd.read_csv(file, parse_dates=['Date'], dayfirst=True)
    end_missing = ['Average_Atmospheric_Pressure', 'Max_Atmospheric_Pressure',
                   'Min_Atmospheric_Pressure', 'Min_Ambient_Pollution', 'Max_Ambient_Pollution']
    df = df.fillna(-1)
    if istrain:
        outcome = df.Footfall
        df = df.drop(['Footfall'], axis=1)
    else:
        outcome = np.nan
    df['month'] = df['Date'].apply(lambda x: x.month)
    df['date'] = df['Date'].apply(lambda x: x.day)
    df['weekday'] = df['Date'].apply(lambda x: x.weekday())
    df['sardiya'] = df['month'].apply(lambda x: 1 if x in [1, 2, 11, 12, 3] else 0)
    df.date = df.date.apply(get_normal_date)
    # one-hot encode park and location and join them back onto df
    park_dummies = pd.get_dummies(df.Park_ID, prefix='park')
    location_dummies = pd.get_dummies(df.Location_Type, prefix='location')
    df = pd.concat([df, park_dummies, location_dummies], axis=1)
    df['Direction_Of_Wind2'] = df.Direction_Of_Wind.apply(get_wind_dir)
    return df, outcome
#load training set
def get_comment_product_fea(endtime):
    enddt = pd.to_datetime(endtime, format='%Y-%m-%d')
    if enddt == pd.to_datetime('2016-04-15', format='%Y-%m-%d'):
        commentdata = pd.read_csv(FilePath + CommentFile)
        commentdata = commentdata[(commentdata["dt"] == "2016-04-15")]
        commentdata = commentdata.sort_values(by="sku_id").reset_index()[["sku_id", "comment_num", "has_bad_comment", "bad_comment_rate"]]
        return commentdata
    else:
        startdt = enddt - pd.Timedelta(days=7)
        commentpath = FilePath + CommentFile
        commentdata_ALL = pd.read_csv(commentpath)  # read the full comment file
        commentdata_ALL.dt = pd.to_datetime(commentdata_ALL.dt, format='%Y-%m-%d')  # parse dt as a date
        comment = commentdata_ALL[(commentdata_ALL.dt <= enddt) & (commentdata_ALL.dt > startdt)]
        df = pd.get_dummies(comment['comment_num'], prefix='comment_num')
        comment = pd.concat([comment, df], axis=1)
        comment = comment[['sku_id', 'has_bad_comment', 'bad_comment_rate', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]
        sorted_comment = comment.sort_values(by=['sku_id']).reset_index().drop('index', axis=1)
        # sorted_comment.to_csv(FilePath + 'skuFeaInComment_before' + str(enddt), index=False)
        return sorted_comment
# action features within a given time window
def get_action_feat(start_time, end_time, action_data):
    actions = action_data[(action_data['time'] >= start_time) & (action_data['time'] <= end_time)]
    # actions = get_actions(start_time, end_time)
    # actions = actions[actions['cate'] == 8]
    actions = actions[['user_id', 'sku_id', 'type']]
    df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_time, end_time))
    actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
    actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
    actions.fillna(0, inplace=True)
    name = '%s-%s-action' % (start_time, end_time)
    actions[name + '_1256'] = actions[name + '_1'] + actions[name + '_2'] + actions[name + '_5'] + actions[name + '_6']
    actions[name + '_1256_d_4'] = actions[name + '_4'] / actions[name + '_1256']
    del actions['type']
    # action_fea_file = 'action_fea_' + STARTdt_str + 'to' + ENDdt_str + '.csv'
    # action_fea.to_csv(FilePath + action_fea_file, index=False)
    return actions
# basic user features: one-hot encode age, sex and user level
def get_basic_user_fea():
    user = pd.read_csv(FilePath + UserFile, encoding='gbk')
    # user['age'] = user['age'].map(convert_age)
    user['age'] = user['age'].replace([u'16-25岁', u'26-35岁', u'36-45岁', u'46-55岁', u'56岁以上'], [1, 2, 3, 4, 5])
    user = user[((user['age'] == 1) |
                 (user['age'] == 2) |
                 (user['age'] == 3) |
                 (user['age'] == 4) |
                 (user['age'] == 5) |
                 (user['age'] == -1))]
    age_df = pd.get_dummies(user["age"], prefix="age")
    sex_df = pd.get_dummies(user["sex"], prefix="sex")
    user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
    user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
    user.to_csv(FilePath + 'user_basic_fea.csv', index=False)
    return user
# assemble the next minibatch (inputs, targets and sequence lengths)
def next_batch(self):
    df = self.batch_df[self.pointer]
    x = np.array([d[0] for d in df])
    xl = np.array([d[1] for d in df])
    xr = np.array([d[2] for d in df])
    tar = np.array([d[3] for d in df])
    y = np.array([d[-1] for d in df])
    y = pd.get_dummies(y).values.astype(np.int32)
    seq_len = [len(seq) for seq in x]
    seq_len_l = [len(seq) for seq in xl]
    seq_len_r = [len(seq) for seq in xr]
    if self.dynamic_padding:
        x = np.array(self.pad_minibatches(x, 'RIGHT'))
        xl = np.array(self.pad_minibatches(xl, 'RIGHT'))
        xr = np.array(self.pad_minibatches(xr, 'RIGHT'))
    self.pointer += 1
    return x, y, seq_len, xl, seq_len_l, xr, seq_len_r, tar
def next_batch(self):
    df = self.batch_df[self.pointer]
    x = np.array([d[0] for d in df])
    xl = np.array([d[1] for d in df])
    xr = np.array([d[2] for d in df])
    tar = np.array([d[3] for d in df])
    y = np.array([d[-1] for d in df])
    # y = pd.get_dummies(y).values.astype(np.int32)
    seq_len = [len(seq) for seq in x]
    seq_len_l = [len(seq) for seq in xl]
    seq_len_r = [len(seq) for seq in xr]
    if self.dynamic_padding:
        x = np.array(self.pad_minibatches(x, 'RIGHT'))
        xl = np.array(self.pad_minibatches(xl, 'RIGHT'))
        xr = np.array(self.pad_minibatches(xr, 'RIGHT'))
    self.pointer += 1
    return x, y, seq_len, xl, seq_len_l, xr, seq_len_r, tar
def load_data(in_file):
    # read csv file prepared by freddie_data_analysis module
    df = pd.read_csv(in_file)
    # drop unneeded columns
    columns = df.columns.tolist()
    for col in columns:
        if 'Unnamed' in col:
            df.drop(col, axis=1, inplace=True)
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
    df.drop(['published_date'], axis=1, inplace=True)
    # replace inf and nan values with 0
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    # apply get_dummies to particular columns
    df = pd.get_dummies(df, prefix=['state'], columns=['property_state'])
    df = pd.get_dummies(df, prefix=['ss'], columns=['special_servicer'])
    # return prepared dataframe
    return df
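When get_dummies is given a whole DataFrame together with columns= and prefix=, it expands only the listed columns and leaves the rest untouched, so the two calls above add state_* and ss_* indicators while dropping the original property_state and special_servicer columns. A small sketch of that behaviour (column names invented for illustration):

import pandas as pd

df = pd.DataFrame({'loan_amt': [100, 250],
                   'property_state': ['CA', 'NY']})
df = pd.get_dummies(df, prefix=['state'], columns=['property_state'])
# df columns are now: loan_amt, state_CA, state_NY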
def gen_fer2013_csv(csv_path, reshape_width=48, reshape_height=48):
    data = pd.read_csv(csv_path)
    pixels = data['pixels'].tolist()
    width, height = 48, 48
    faces = []
    for pixel_sequence in pixels:
        face = [int(pixel) for pixel in pixel_sequence.split(' ')]
        face = np.asarray(face).reshape(width, height)
        face = cv2.resize(face.astype('uint8'),
                          (reshape_width, reshape_height))
        faces.append(face.astype('float32'))
    faces = np.asarray(faces)
    faces = np.expand_dims(faces, -1)
    emotions = pd.get_dummies(data['emotion']).values
    return faces, emotions
def make_x(self, df):
    x_spec = self.get_individualised_x_spec()
    X = df[XY.reduce_tuples(
        [a for a, b in x_spec if b != 'linear_by_categorical']
    )].copy()
    cats = XY.reduce_tuples(
        [a for a, b in x_spec if b == 'categorical' or b == 'ordinal']
    )
    X = self.prep_work(X, x_spec)
    X = pd.get_dummies(
        X, prefix=cats, prefix_sep='_', columns=cats,
        drop_first=False, dummy_na=False
    )
    return X
def get_comments_product_feat(start_date, end_date):
    dump_path = './cache/comments_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        comments = pickle.load(open(dump_path, 'rb'))
    else:
        comments = pd.read_csv(comment_path)
        comment_date_end = end_date
        comment_date_begin = comment_date[0]
        for date in reversed(comment_date):
            if date < comment_date_end:
                comment_date_begin = date
                break
        comments = comments[(comments.dt >= comment_date_begin) & (comments.dt < comment_date_end)]
        df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
        comments = pd.concat([comments, df], axis=1)  # type: pd.DataFrame
        # del comments['dt']
        # del comments['comment_num']
        comments = comments[
            ['sku_id', 'has_bad_comment', 'bad_comment_rate', 'comment_num_1', 'comment_num_2', 'comment_num_3',
             'comment_num_4']]
        pickle.dump(comments, open(dump_path, 'wb'))
    return comments
def get_accumulate_product_feat(start_date, end_date):
    feature = ['sku_id', 'product_action_1_ratio', 'product_action_2_ratio', 'product_action_3_ratio',
               'product_action_5_ratio', 'product_action_6_ratio']
    dump_path = './cache/product_feat_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['sku_id'], df], axis=1)
        actions = actions.groupby(['sku_id'], as_index=False).sum()
        actions['product_action_1_ratio'] = actions['action_4'] / actions['action_1']
        actions['product_action_2_ratio'] = actions['action_4'] / actions['action_2']
        actions['product_action_3_ratio'] = actions['action_4'] / actions['action_3']
        actions['product_action_5_ratio'] = actions['action_4'] / actions['action_5']
        actions['product_action_6_ratio'] = actions['action_4'] / actions['action_6']
        actions = actions[feature]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions
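One caveat with these ratio features: the groupby-sum can leave a zero count for some action type, and dividing by zero in pandas yields inf (or NaN for 0/0), which many models will reject. The brand version further below caps inf at 9999; a sketch of an alternative clean-up right after computing the ratios (values chosen arbitrarily here):

import numpy as np
import pandas as pd

ratios = pd.DataFrame({'product_action_1_ratio': [0.5, np.inf, np.nan]})
ratios = ratios.replace([np.inf, -np.inf], np.nan).fillna(0)
# inf and NaN ratios are now 0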
def get_basic_user_feat():
    dump_path = './cache/basic_user.csv'
    # one-hot coding age, sex, lv-cd
    if os.path.exists(dump_path):
        # user = pickle.load(open(dump_path))
        user = pd.read_csv(dump_path)
    else:
        user = pd.read_csv(user_path, encoding='gbk')
        user['age'] = user['age'].map(convert_age)  # map age bands to integer codes
        user['user_reg_tm'] = user['user_reg_tm'].map(convert_reg_date)
        age_df = pd.get_dummies(user["age"], prefix="age")
        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        # user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user = pd.concat([user[['user_id', 'user_reg_tm', 'user_lv_cd']], age_df, sex_df], axis=1)
        # pickle.dump(user, open(dump_path, 'w'))
        user.to_csv(dump_path, index=False, encoding='utf-8')
    print('finish get basic user info')
    return user
def get_basic_product_feat():
    dump_path = './cache/basic_product.csv'
    # one-hot coding a1, a2, a3
    if os.path.exists(dump_path):
        # product = pickle.load(open(dump_path))
        product = pd.read_csv(dump_path)
    else:
        product = pd.read_csv(product_path)
        attr1_df = pd.get_dummies(product["a1"], prefix="a1")
        attr2_df = pd.get_dummies(product["a2"], prefix="a2")
        attr3_df = pd.get_dummies(product["a3"], prefix="a3")
        cate_df = pd.get_dummies(product['cate'], prefix='cate')
        brand_df = pd.get_dummies(product['brand'], prefix='brand')
        # product = pd.concat([product[['sku_id','brand']], attr1_df, attr2_df, attr3_df, cate_df], axis=1)
        product = pd.concat([product[['sku_id', 'brand']], attr1_df, attr2_df, attr3_df, brand_df, cate_df], axis=1)
        # pickle.dump(product, open(dump_path, 'w'))
        product.to_csv(dump_path, index=False)
    print('finish get basic product info')
    return product
def get_action_feat(start_date, end_date):
    '''
    Action types:
    1. browse (view the product detail page)
    2. add to cart  3. delete from cart  4. place order  5. follow  6. click
    '''
    dump_path = './cache/action_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        # actions = pickle.load(open(dump_path))
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        # actions = pd.read_csv(action_1_path)
        actions = actions[['user_id', 'sku_id', 'type']]
        df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        # aggregate the one-hot action counts per (user_id, sku_id) pair
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del actions['type']
        # pickle.dump(actions, open(dump_path, 'w'))
        actions.to_csv(dump_path, index=False)
    print('finish get action feat')
    return actions
def get_accumulate_brand_feat(start_date, end_date):
    feature = ['brand', 'brand_action_1_ratio', 'brand_action_2_ratio', 'brand_action_3_ratio',
               'brand_action_5_ratio', 'brand_action_6_ratio', 'brand_action_num']
    dump_path = './cache/brand_feat_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['brand'], df], axis=1)
        actions = actions.groupby(['brand'], as_index=False).sum()
        actions['brand_action_1_ratio'] = actions['action_4'] / actions['action_1']
        actions['brand_action_2_ratio'] = actions['action_4'] / actions['action_2']
        actions['brand_action_3_ratio'] = actions['action_4'] / actions['action_3']
        actions['brand_action_5_ratio'] = actions['action_4'] / actions['action_5']
        actions['brand_action_6_ratio'] = actions['action_4'] / actions['action_6']
        actions['brand_action_num'] = actions['action_1'] + actions['action_2'] + actions['action_3'] + actions[
            'action_4'] + actions['action_5'] + actions['action_6']
        actions = actions[feature]
        actions = actions.replace(np.inf, 9999)
        actions.to_csv(dump_path)
    return actions
def load_data(shuffle=True, n_cols=None):
    train_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.train.csv')
    test_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.test.csv')
    usecols = list(range(n_cols)) if n_cols else None
    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)
    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)
    X_train = df_train.iloc[:, 2:].values
    X_test = df_test.iloc[:, 2:].values
    y_train = pd.get_dummies(df_train[['cancer_type']]).values
    y_test = pd.get_dummies(df_test[['cancer_type']]).values
    return (X_train, y_train), (X_test, y_test)
def build(self):
    train, _, test, _ = data.get()
    cset = []
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    for sc in ['height', 'weight', 'ap_hi', 'ap_lo']:
        tc = df[sc].apply(str)
        maxc = tc.apply(len).max()
        for n in range(maxc):
            df['ft_l_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[n]) if n < len(s) else -1)
            df['ft_r_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[-n]) if n < len(s) else -1)
            cset.append('ft_l_' + sc + '_' + str(n))
            cset.append('ft_r_' + sc + '_' + str(n))
    df = pd.get_dummies(df, columns=cset).drop(to_drop, axis=1)
    self.train_ = df[:ntrain]
    self.test_ = df[ntrain:]
    return self.train_, self.test_, None
def build(self):
    train, y, test, _ = data.get()
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    dcn = []
    for n in [2, 5, 10, 15, 25]:
        cname = 'kmeans_' + str(n)
        dcn.append(cname)
        df[cname] = cluster.KMeans(n_clusters=n).fit_predict(df)
    df = pd.get_dummies(df, columns=dcn)
    df = df.drop(to_drop, axis=1)
    train = df[:ntrain]
    test = df[ntrain:].copy()
    return train.astype('int32'), test.astype('int32'), None