# RFfastestLap.py -- Python source code snippet

def fastLapModel(xList, labels, names, multiple=0, full_set=0):
    """Fit random forests at several ensemble sizes and plot the test MSE.

    The data is split into a fixed 70/30 train/holdout partition
    (``random_state=531``). A ``RandomForestRegressor`` is then fitted for
    each ensemble size in ``range(500 * m, 1000 * m, 100 * m)`` with
    ``m = 10 ** multiple``, and the mean squared error on the holdout rows
    is recorded, plotted against the number of trees, and the last value
    printed.

    Args:
        xList: Feature rows; converted with ``numpy.array``. The second axis
            is read as the feature count, so the data must be 2-D.
        labels: Target values; converted with ``numpy.array``.
        names: Feature names. Currently unused by this function; kept so
            existing callers keep working.
        multiple: Exponent scaling the ensemble sizes by ``10 ** multiple``.
            Presumably a non-negative integer (a negative value would make
            the ``range`` bounds non-integer) -- confirm with callers.
        full_set: If truthy, train on every row of the data instead of the
            70% training partition ("final model" mode).

    Returns:
        Tuple ``(xTrain, xTest, yTrain, yTest, RFmd)`` where ``RFmd`` is the
        model fitted in the last loop iteration (the largest ensemble size).
    """
    X = numpy.array(xList)
    y = numpy.array(labels)

    # take fixed holdout set 30% of data rows
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.30, random_state=531)

    # for final model (no CV)
    # NOTE: in this mode the holdout rows are also part of the training data,
    # so the MSE reported below is no longer an out-of-sample estimate.
    if full_set:
        xTrain = X
        yTrain = y
    check_set(xTrain, xTest, yTrain, yTest)

    # Call form of print: same output on Python 2 and valid on Python 3.
    print("Fitting the model to the data set...")

    # train random forest at a range of ensemble sizes in order to see how the
    # mse changes
    m = 10 ** multiple
    nTreeList = list(range(500 * m, 1000 * m, 100 * m))

    # Hyper-parameters do not depend on the ensemble size, so compute once.
    depth = None
    nFeatures = numpy.shape(xTrain)[1]
    # sqrt heuristic (+1); try tweaking. Clamp to the number of columns:
    # int(sqrt(n)) + 1 exceeds n when there is a single feature column.
    maxFeat = min(int(numpy.sqrt(nFeatures)) + 1, nFeatures)

    mseOos = []
    for iTrees in nTreeList:
        RFmd = ensemble.RandomForestRegressor(
            n_estimators=iTrees, max_depth=depth, max_features=maxFeat,
            oob_score=False, random_state=531, n_jobs=-1)
        RFmd.fit(xTrain, yTrain)

        # Accumulate mse on test set
        prediction = RFmd.predict(xTest)
        mseOos.append(mean_squared_error(yTest, prediction))

    # plot test error vs number of trees in ensemble
    plot.plot(nTreeList, mseOos)
    plot.xlabel('Number of Trees in Ensemble')
    plot.ylabel('Mean Squared Error')
    plot.show()

    # Report the MSE of the largest ensemble (the model that is returned).
    print("MSE")
    print(mseOos[-1])
    return xTrain, xTest, yTrain, yTest, RFmd