def run():
    """Fit one Lasso grid search per (symptom, cluster) pair and dump each model.

    Steps, in order:

    1. Load the raw data with ``load_binary`` and build the feature tables
       with ``process_level2``.  The result is indexed with the keys ``'X'``,
       ``'Y'`` and ``'X_all'``.
    2. Remove the ``'user_id'`` column from ``'X'`` and ``'X_all'``, take their
       ``.values`` matrices and replace NaNs with 0.  NaNs in ``'Y'`` are
       filled with 0 in place.
    3. Truncate ``result.txt`` and write the CSV header line to it.
    4. For every symptom and every cluster id in ``range(3)``, run a
       ``GridSearchCV`` over the Lasso ``alpha`` on the rows whose label
       equals the cluster id, then hand the fitted search object to ``dump``.

    Relies on names defined outside this function: ``load_binary``,
    ``process_level2``, ``final_labels``, ``dump``, ``np``, ``pd``, ``join``,
    ``Pipeline``, ``VarianceThreshold``, ``Lasso`` and ``GridSearchCV``.

    NOTE(review): ``dump`` presumably appends prediction rows to
    ``result.txt`` after the header written here -- confirm against its
    definition.
    NOTE(review): the boolean masks assume ``labels`` is array-like, has one
    entry per row, and is row-aligned with ``X``, ``X_all``, ``cycles0`` and
    ``data['users']`` -- confirm.
    """
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    # Remove the 'user_id' column before taking the raw value matrix.
    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0

    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)

    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']

    # Start the output file from scratch with only the header line.
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    labels = final_labels['labels']

    # Loop-invariant inputs: read the cycles table and build the search grid
    # once instead of once per (symptom, cluster) iteration.
    data_dir = 'data'
    cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
    param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}

    for symptom in symptoms:
        print(symptom)
        # Keep the Y columns whose second key component equals this symptom
        # (presumably the column labels are tuples -- TODO confirm).
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            ('estimator', Lasso()),
        ])

        for cluster in range(3):  # number of clusters
            print(cluster)
            # Row selector for the current cluster, reused for every lookup below.
            in_cluster = labels == cluster

            model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4,
                                 verbose=2)
            model.fit(X[in_cluster], s_Y.values[in_cluster])

            print("dumping...")
            # Map each user id in this cluster to its expected cycle length.
            c_length = dict(zip(cycles0.user_id.values[in_cluster],
                                cycles0.expected_cycle_length[in_cluster]))
            dump(symptom, model, X_all[in_cluster], c_length,
                 data['users'].user_id[in_cluster])
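# --- Web-page residue captured with the code (not part of the program) ---
# pipeline_with_clustering.py 文件源码  (source code of pipeline_with_clustering.py)
# python
# 阅读 23  (views: 23)
# 收藏 0  (favorites: 0)
# 点赞 0  (likes: 0)
# 评论 0  (comments: 0)
# 评论列表  (comment list)
# 文章目录  (table of contents)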