def get_data():
    """Load the UCLA admissions CSV, print summaries, and score two classifiers.

    Reads the dataset from a hard-coded relative path, prints exploratory
    statistics (head, describe, per-column standard deviation, and an
    admit-by-rank crosstab), then fits a logistic regression and a random
    forest on a random 80/20 train/test split and prints their metrics.

    No ``random_state`` is passed to ``train_test_split``, so the split is
    not fixed by this code. Nothing is returned; all results go to stdout.
    """
    f_path = "../dataset/logistic_regression/UCLA_dataset.csv"
    df = pd.read_csv(f_path)

    # Exploratory output. Single-argument print(...) calls behave the same
    # on Python 2 and Python 3.
    print(df.head())
    print(df.describe())
    print(df.std())
    print(pd.crosstab(df['admit'], df['rank'], rownames=['admit']))

    train, test = train_test_split(df, test_size=0.2)
    # Features are every column after the first; the label is 'admit'.
    # NOTE(review): this assumes 'admit' is the first CSV column. If it is
    # not, the label leaks into the features. Confirm against the dataset.
    train_x, train_y = train[train.columns[1:]], train['admit']
    test_x, test_y = test[test.columns[1:]], test['admit']

    # Logistic regression: its accuracy is printed as a raw fraction.
    lr = LogisticRegression()
    lr.fit(train_x, train_y)
    y_pred = lr.predict(test_x)
    print(accuracy_score(test_y, y_pred))

    # Random forest: confusion matrix, accuracy, and recall.
    rf = RandomForestClassifier(n_jobs=4)
    rf.fit(train_x, train_y)
    rf_pred = rf.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, rf_pred)
    print(cnf_matrix)

    # The scores are fractions in [0, 1]; scale by 100 so the trailing '%'
    # in the printed message is actually correct.
    accuracy_percent = accuracy_score(test_y, rf_pred) * 100
    print("accuracy is: %s%s" % (accuracy_percent, '%'))
    recall_percent = recall_score(test_y, rf_pred) * 100
    print("recall is: %s%s" % (recall_percent, '%'))
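# Web-page residue captured along with the snippet (not program code):
# "Comment list" / "Article table of contents"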