# Imports assumed for this standalone example (scikit-learn >= 0.18):
from sklearn import datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def main():
iris = datasets.load_iris()
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)
clrTree = tree.DecisionTreeClassifier()
clrTree = clrTree.fit(x_train, y_train)
outTree = clrTree.predict(x_test)
clrKN = KNeighborsClassifier()
clrKN = clrKN.fit(x_train, y_train)
outKN = clrKN.predict(x_test)
# Prediction accuracy
print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, outTree)*100)+"%")
print("Accuracy for KNeighbors Classifier: " + str(accuracy_score(y_test, outKN)*100)+"%")
def main():
iris = datasets.load_iris()
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)
    clr = NewClassifier()  # NewClassifier: a user-defined classifier exposing fit/predict
clr.fit(x_train, y_train)
prediction = clr.predict(x_test)
# Prediction accuracy
print("Accuracy: " + str(accuracy_score(y_test, prediction) * 100) + "%")
# Run main
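The two main() definitions above are never invoked in this excerpt; the conventional entry point the comment refers to would be:

if __name__ == '__main__':
    main()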
def do_ml(ticker):
X, y, df = extract_featuresets(ticker)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,
y,
test_size=0.25)
#clf = neighbors.KNeighborsClassifier()
clf = VotingClassifier([('lsvc',svm.LinearSVC()),
('knn',neighbors.KNeighborsClassifier()),
('rfor',RandomForestClassifier())])
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)
print('accuracy:',confidence)
predictions = clf.predict(X_test)
print('predicted class counts:',Counter(predictions))
print()
print()
return confidence
# examples of running:
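The example invocations themselves are missing from this excerpt; a hedged sketch (the ticker symbol is purely illustrative):

# Hypothetical call; any ticker present in the feature data would do.
do_ml('BAC')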
def threshold_estimate(x,y):
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.1, random_state=0)
weight = float(len(y_train[y_train == 0]))/float(len(y_train[y_train == 1]))
    w1 = np.ones(y_train.shape[0])  # float array; an integer dtype would truncate the fractional weight below
w1[y_train==1]=weight
print("samples: %d %d %f" % (x_train.shape[0], x_test.shape[0], weight))
estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
estimator.fit(x_train, y_train, sample_weight=w1)
y_scores = estimator.predict_proba(x_test)[:,1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
f1 = 2*precision[2:]*recall[2:]/(precision[2:]+recall[2:])
m_idx = np.argmax(f1)
m_thresh = thresholds[2+m_idx]
print("%d %f %f" % (precision.shape[0], f1[m_idx], m_thresh))
return m_thresh
# Estimate threshold for the classifier using inner-round cross validation
def load_data():
global training_data, testing_data
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
xs = lfw_people.data
ys = lfw_people.target
inputs = []
labels = list(ys)
for face in xs:
V = Vol(50, 37, 1, 0.0)
V.w = list(face)
inputs.append(augment(V, 30))
x_tr, x_te, y_tr, y_te = train_test_split(inputs, labels, test_size=0.25)
training_data = zip(x_tr, y_tr)
testing_data = zip(x_te, y_te)
print 'Dataset made...'
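load_data() is Python 2 code; under Python 3, zip() returns a lazy iterator, so the two pairings would need to be materialized:

# Python 3 equivalent of the two zip() assignments above
training_data = list(zip(x_tr, y_tr))
testing_data = list(zip(x_te, y_te))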
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
from sklearn.cross_validation import train_test_split
from sklearn.metrics import silhouette_score
shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)
train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
full_mat = np.array(list(shape_df.values))
centroids = None
labels = None
best_score = 0
for k in k_range:
res = cluster_shapes(train_mat, full_mat, k)
score = silhouette_score(full_mat, res[1])
if score > best_score:
centroids = res[0]
labels = res[1]
best_score = score
mols[cluster_key] = labels
return mols, centroids
def rede_neural(X, y):
print("Iniciando treinamento da Rede Neural")
X2 = normalize(X)
clf = MLPClassifier(hidden_layer_sizes=(100,50), activation='tanh', algorithm='adam', alpha=1e-5,
learning_rate='constant',tol=1e-8,learning_rate_init=0.0002,
early_stopping=True,validation_fraction=0.2)
kf = KFold(len(y),n_folds=3)
i = 0
for train,test in kf:
start = time.time()
i = i + 1
print("Treinamento",i)
# dividindo dataset em treino e test
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1)
X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]
# fit
clf.fit(X_train, y_train)
print("score:",clf.score(X_test, y_test),"(",(time.time()-start)/60.0,"minutos )")
return clf
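rede_neural() uses the pre-0.18 KFold API, where the sample count and n_folds are passed to the constructor; the sklearn.model_selection equivalent, for reference:

# Modern KFold: set n_splits on the constructor, then iterate over kf.split(X)
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
for train, test in kf.split(X2):
    X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]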
# data_preparation_tools.py (project: corpus-to-graph-ml, author: CatalystCode)
def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE):
d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size)
d_test_2 = []
l_test_2 = []
c_test_2 = []
train_dict = {}
for d in d_train:
train_dict[d] = 1
for d,l,c in zip(d_test, l_test, c_test):
        if d in train_dict:  # has_key() is Python 2 only; 'in' works in both
continue
d_test_2.append(d)
l_test_2.append(l)
c_test_2.append(c)
return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2)
# utility to extract entities from preprocessed files
def get_train_test(pandas_data, target_col):
# Separating target from the rest of the data
x = pandas_data.drop(target_col, 1)
x = data_scaling.scale_numeric_data(x)
# Selection of training/target data for validation and training.
target_loc = pandas_data.columns.get_loc(target_col)
data = pd.DataFrame.as_matrix(pandas_data)
y = data[:, target_loc]
x = pd.DataFrame.as_matrix(x)
# Selecting training and test sets
return cross_validation.train_test_split(x, y, test_size=0.2)
# Removes the target column from the input data.
# Returns two DataFrames.
def getDatas(dataset_dir_name):
movie_reviews = load_files(dataset_dir_name)
doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    # word_tokenizer: the token-splitting callable built by CountVectorizer
vectorizer = CountVectorizer(binary = True, decode_error = u'ignore')
word_tokenizer = vectorizer.build_tokenizer()
    # segment each document into a list of Chinese terms (getChList)
doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)
return vectorizer, doc_str_list_train, doc_str_list_test,doc_class_list_train, doc_class_list_test, doc_terms_list_train
def fastLapModel(xList, labels, names, multiple=0, full_set=0):
X = numpy.array(xList)
y = numpy.array(labels)
featureNames = []
featureNames = numpy.array(names)
# take fixed holdout set 30% of data rows
xTrain, xTest, yTrain, yTest = train_test_split(
X, y, test_size=0.30, random_state=531)
# for final model (no CV)
if full_set:
xTrain = X
yTrain = y
check_set(xTrain, xTest, yTrain, yTest)
print "Fitting the model to the data set..."
# train random forest at a range of ensemble sizes in order to see how the
# mse changes
mseOos = []
m = 10 ** multiple
nTreeList = range(500 * m, 1000 * m, 100 * m)
# iTrees = 10000
for iTrees in nTreeList:
depth = None
        maxFeat = int(numpy.sqrt(numpy.shape(xTrain)[1])) + 1  # try tweaking (numpy alias matches the rest of the function)
RFmd = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat,
oob_score=False, random_state=531, n_jobs=-1)
# RFmd.n_features = 5
RFmd.fit(xTrain, yTrain)
# Accumulate mse on test set
prediction = RFmd.predict(xTest)
mseOos.append(mean_squared_error(yTest, prediction))
# plot training and test errors vs number of trees in ensemble
plot.plot(nTreeList, mseOos)
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Mean Squared Error')
    #plot.ylim([0.0, 1.1*max(mseOos)])
plot.show()
print("MSE")
print(mseOos[-1])
return xTrain, xTest, yTrain, yTest, RFmd
def build_decision_tree(filename):
"""
??????????????
"""
f=open(sys.argv[1],'r')
reader=csv.reader(f)
x=[]
y=[]
for line in reader:
        if line[1] in ['1','2','3']:  # keep only samples whose label is 1, 2, or 3
x.append(line[2:4]+line[5:])
y.append(line[1])
x_train,x_test,y_train,y_test=cross_validation.train_test_split(x,y, test_size=0.2, random_state=42)
clf=tree.DecisionTreeClassifier(max_depth=5)
clf=clf.fit(x_train,y_train)
score=clf.score(x_test,y_test)
print score
return clf,score
def train_xgboost():
df = pd.read_csv('data/stage1_labels.csv')
print(df.head())
x = np.array([np.mean(np.load('npy_result/%s.npy' % str(id)), axis=0) for id in df['id'].tolist()])
y = df['cancer'].as_matrix()
trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
test_size=0.20)
clf = xgb.XGBRegressor(max_depth=10,
n_estimators=1500,
min_child_weight=9,
learning_rate=0.05,
nthread=8,
subsample=0.80,
colsample_bytree=0.80,
seed=4242)
clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=50)
return clf
def pipeline(iteration,C,gamma,random_seed):
x_train, _x , y_train, _y = train_test_split(train_x,train_y,test_size=0.4,random_state=random_seed)
print x_train.shape
clf = SVC(C=C,kernel='rbf',gamma=gamma,probability=True,cache_size=7000,class_weight='balanced',verbose=True,random_state=random_seed)
clf.fit(x_train,y_train)
#predict test set
pred = clf.predict_proba(test_x)
test_result = pd.DataFrame(columns=["Idx","score"])
test_result.Idx = test_Idx
test_result.score = pred[:,1]
test_result.to_csv('./test/svm_{0}.csv'.format(iteration),index=None)
#predict val set
pred = clf.predict_proba(val_x)
val_result = pd.DataFrame(columns=["Idx","score"])
val_result.Idx = val_Idx
val_result.score = pred[:,1]
val_result.to_csv('./val/svm_{0}.csv'.format(iteration),index=None)
def splitValidateModel(self, visualizePredictions = False):
(label_vector, input_vector) = loadData(self.featureFile)
indexArray = range(0, len(input_vector))
trainData, testData, trainLabels, expectedLabels, trainIndices, testIndices = \
cross_validation.train_test_split(input_vector, label_vector, indexArray, test_size=(1.0 - self.percentSplit))
kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
kNNClassifier.fit(trainData, trainLabels)
predictedLabels = kNNClassifier.predict(testData)
print("Classification report for classifier %s:\n%s\n"
% ('k-NearestNeighbour', metrics.classification_report(expectedLabels, predictedLabels)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
print('Split Validation training :: Done.\n')
if visualizePredictions:
self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
def trainLimited(self, featureFile, n_datapoints):
(label_vector, input_vector) = loadData(featureFile)
trainData, testData, trainLabels, testLabels = \
cross_validation.train_test_split(input_vector, label_vector, test_size=(0))
n_totalrows = int((len(label_vector)/n_datapoints))
for n in range(0, n_totalrows):
limited_label_vector = trainLabels[0: (n+1) * n_datapoints]
limited_input_vector = trainData[0: (n+1) * n_datapoints]
kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
kNNClassifier.fit(limited_input_vector, limited_label_vector)
scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector, limited_label_vector, cv = 5)
print '%f on %d datapoints' % ((sum(scores) / len(scores)), len(limited_label_vector))
def trainLimitedSoftmax(self, featureFile, n_datapoints):
(label_vector, input_vector) = self.__loadData__(featureFile)
n_totalrows = int((len(label_vector)/n_datapoints))
k=[]
trainData, testData, trainLabels, testLabels = \
cross_validation.train_test_split(input_vector, label_vector, test_size=(0.2))
for n in range(0, n_totalrows):
limited_label_vector = trainLabels[0: (n+1) * n_datapoints]
limited_input_vector = trainData[0: (n+1) * n_datapoints]
_, maxVal = self.trainSoftmaxWithData(limited_input_vector, limited_label_vector, 1000)
print 'Total Average Value: %s \n\n' % (maxVal)
k.append(maxVal)
print('Limited Softmax training result ----------')
for i in range (0,len(k)):
print '%f on %d datapoints' % (k[i], (n_datapoints * (i+1)))
print '------------------------------------------'
# tune_nn.py (project: Kaggle-Competition---Facebook-V---Predicting-Check-Ins, author: TenaciousTerrier)
def load_split_data(grid_variable):
""" Load train_validation and validation data sets for testing and tuning different
machine learning models.
"""
# Set work directory
os.chdir('C://Users//thep3//OneDrive//Documents//Kaggle//Facebook V - Predicting Check Ins//data//')
# Load data
train = pd.read_csv("train_modified.csv", sep = ",")
grid_variables = ['grid_cell_20x40', 'grid_cell_50x50', 'grid_cell_100x100', 'grid_cell_50x100', 'grid_cell_75x150', 'grid_cell_100x200']
grid_variables.remove(grid_variable)
train = train.drop(grid_variables, 1)
train, test = train_test_split(train, test_size = 0.3, random_state = 0)
# Return data
return train, test
# tune_nn_features.py (same project and author) defines an identical load_split_data(), omitted here as a verbatim duplicate
def split_dataset(data_set,split=0.5):
    '''
    Split the dataset into train and test sets according to `split`.
    :param data_set: a Bunch object
    :param split: float, fraction of the data held out as the test set
    :return: x_train, x_test, y_train, y_test: training and test data with target values
    '''
    print('splitting dataset......')
start_time = time.time()
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data_set.data, data_set.target,
test_size=split, random_state=0)
    print('splitting took %.2f s' % (time.time() - start_time))
# train_set=(x_train,y_train)
# test_set=(x_test,y_test)
# return train_set,test_set
return x_train, x_test, y_train, y_test
def test_onehot():
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=333)
train = pd.DataFrame(X_train)
test = pd.DataFrame(X_test)
t_train, t_test = onehot_features(train.copy(deep=True), test.copy(deep=True), [8, 1, 12], full=False,
dummy_na=True)
assert t_train.shape[1] == t_test.shape[1]
assert t_train.shape[1] == 441
t_train, t_test = onehot_features(train.copy(deep=True), test.copy(deep=True), [8, 1, 12], full=True,
dummy_na=False)
assert t_train.shape[1] == t_test.shape[1]
assert t_train.shape[1] == 500
def baseline_logisticRegression():
train_data = pd.read_csv(r"data/train.csv")
#print u"?????\n",train_data.info()
#print u'?????\n',train_data.describe()
#display_data(train_data) # ????????
#display_with_process(train_data) # ??????????????????,????
process_data = pre_processData(train_data,'process_train_data') # ????????????
train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*') # ???????????
train_np = train_data.as_matrix() # ????
'''??model'''
X = train_np[:,1:]
y = train_np[:,0]
#=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
#=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X,y)
print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
#=prediction = model.predict(X_test)
#=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction!=y_test)]),columns=list(train_data.columns)[1:])
#=cv_error.to_csv(r'error.csv',index=True)
#=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])
    '''Predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data,'process_test_data')  # preprocess the test data
test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
test_np = test_data.as_matrix()
predict = model.predict(test_np)
result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
result.to_csv(r'baseline_logisticRegression_result/prediction.csv',index=False)
#clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
#print cross_validation.cross_val_score(clf, X,y,cv=5)
# baseline SVM score: 0.78947
def baseline_randomForest():
train_data = pd.read_csv(r"data/train.csv")
print u"?????\n",train_data.info()
print u'?????\n',train_data.describe()
#display_data(train_data) # ????????
#display_with_process(train_data) # ??????????????????,????
process_data = pre_processData(train_data,'process_train_data',optimize=False) # ????????????
train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*') # ???????????
train_np = train_data.as_matrix() # ????
'''??model'''
X = train_np[:,1:]
y = train_np[:,0]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
model = RandomForestClassifier(n_estimators=100).fit(X,y)
#predictions = model.predict(X_test)
#print np.float32(np.sum(predictions == y_test))/np.float32(predictions.shape[0])
    '''Predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data,'process_test_data',optimize=False)  # preprocess the test data
test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
test_np = test_data.as_matrix()
predict = model.predict(test_np)
result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
result.to_csv(r'baseline_randomForest_result/prediction.csv',index=False)
# baseline SVM with cross-validation
def baseline_svm_crossValidate():
origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(origin_train_data,'process_train_data')  # preprocess the raw training data
    process_data_train,process_data_cv = train_test_split(process_data,test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array
    '''Train the model'''
X_train = train_np[:,1:]
y_train = train_np[:,0]
model = svm.SVC(kernel='rbf',tol=1e-6).fit(X_train,y_train)
#print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
cv_np = cv_data.as_matrix()
X_cv = cv_np[:,1:]
y_cv = cv_np[:,0]
predictions = model.predict(X_cv)
print np.float32(np.sum(predictions == y_cv))/np.float32(predictions.shape[0])
error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
predictions_item.columns=['error_PassengerId']
# error_items = error_items.reset_index(drop=True)
error_result = pd.concat([error_items,predictions_item],axis=1)
error_result.to_csv(r'error.csv',index=False)
    '''Predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data,'process_test_data',optimize=False)  # preprocess the test data
test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
test_np = test_data.as_matrix()
predict = model.predict(test_np)
result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
result.to_csv(r'svm_result/prediction.csv',index=False)'''
# baseline logistic regression with cross-validation
def baseline_logisticRegression_crossValidate():
origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data,'process_train_data')  # preprocess the raw training data
    process_data_train,process_data_cv = train_test_split(process_data,test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array
    '''Train the model'''
X_train = train_np[:,1:]
y_train = train_np[:,0]
model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
print pd.DataFrame({'columns':list(train_data.columns[1:]),'coef_':list(model.coef_.T)})
cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
cv_np = cv_data.as_matrix()
X_cv = cv_np[:,1:]
y_cv = cv_np[:,0]
predictions = model.predict(X_cv)
print np.float32(np.sum(predictions == y_cv))/np.float32(predictions.shape[0])
    '''Collect the misclassified cross-validation samples for error analysis'''
error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
predictions_item.columns=['error_PassengerId']
error_result = pd.concat([error_items,predictions_item],axis=1)
error_result.to_csv(r'error.csv',index=False)
#=print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
#=prediction = model.predict(X_test)
#=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])
    '''Predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data',optimize=True)  # preprocess the test data
test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
test_np = test_data.as_matrix()
predict = model.predict(test_np)
result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
result.to_csv(r'logisticRegression_result/prediction.csv',index=False)'''
#clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
#print cross_validation.cross_val_score(clf, X,y,cv=5)
def optimize_logisticRegression():
train_data = pd.read_csv(r"data/train.csv")
print u"?????\n",train_data.info()
print u'?????\n',train_data.describe()
#display_data(train_data) # ????????
#display_with_process(train_data) # ??????????????????,????
process_data = fe_preprocessData(train_data,'process_train_data') # ????????????
train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*') # ???????????
train_np = train_data.as_matrix() # ????
'''??model'''
X = train_np[:,1:]
y = train_np[:,0]
#=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
#=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X,y)
print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    '''Predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data')  # preprocess the test data
test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
test_np = test_data.as_matrix()
predict = model.predict(test_np)
result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
result.to_csv(r'optimize_logisticRegression_result/prediction.csv',index=False)
#clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
#print cross_validation.cross_val_score(clf, X,y,cv=5)
## Model testing helpers
def logistic_test(X,y):
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print 'First round:',metrics.accuracy_score(y_test,y_pred)
#tune parameter C
crange =[0.01,0.1,1,10,100]
for num in crange:
model = LogisticRegression(C=num)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print 'C=', num, ',score=', metrics.accuracy_score(y_test,y_pred)
def svm_test(X,y):
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10)
model = svm.LinearSVC(C=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print 'First round:',metrics.accuracy_score(y_test,y_pred)
#tune parameter C
crange =[0.01,0.1,1,10,100]
for num in crange:
model = svm.LinearSVC(C=num)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print 'C=', num, ',score=', metrics.accuracy_score(y_test,y_pred)
def nb_test(X,y):
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print metrics.accuracy_score(y_test,y_pred)
def rf_test(X,y):
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10)
rf_model = RandomForestClassifier(n_estimators = 100, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print metrics.accuracy_score(y_test,y_pred)
#plot confusion_matrix, 'col' is the y target
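The plotting helper announced by the comment above is not included in this excerpt; a minimal sketch with matplotlib (the function name and signature are assumptions, not the original code):

import matplotlib.pyplot as plt
from sklearn import metrics

def plot_confusion(y_test, y_pred):
    # Hypothetical helper; the project's own version is not shown here.
    cm = metrics.confusion_matrix(y_test, y_pred)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()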