def fastLapModel(xList, labels, names, multiple=0, full_set=0):
X = numpy.array(xList)
y = numpy.array(labels)
featureNames = []
featureNames = numpy.array(names)
# take fixed holdout set 30% of data rows
xTrain, xTest, yTrain, yTest = train_test_split(
X, y, test_size=0.30, random_state=531)
# for final model (no CV)
if full_set:
xTrain = X
yTrain = y
check_set(xTrain, xTest, yTrain, yTest)
print "Fitting the model to the data set..."
# train random forest at a range of ensemble sizes in order to see how the
# mse changes
mseOos = []
m = 10 ** multiple
nTreeList = range(500 * m, 1000 * m, 100 * m)
# iTrees = 10000
for iTrees in nTreeList:
depth = None
maxFeat = int(np.sqrt(np.shape(xTrain)[1])) + 1 # try tweaking
RFmd = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat,
oob_score=False, random_state=531, n_jobs=-1)
# RFmd.n_features = 5
RFmd.fit(xTrain, yTrain)
# Accumulate mse on test set
prediction = RFmd.predict(xTest)
mseOos.append(mean_squared_error(yTest, prediction))
# plot training and test errors vs number of trees in ensemble
plot.plot(nTreeList, mseOos)
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Mean Squared Error')
#plot.ylim([0.0, 1.1*max(mseOob)])
plot.show()
print("MSE")
print(mseOos[-1])
return xTrain, xTest, yTrain, yTest, RFmd
评论列表
文章目录