python类cross_val_score()的实例源码

utilities.py 文件源码 项目:Python-Machine-Learning-Cookbook 作者: PacktPublishing 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def print_accuracy_report(classifier, X, y, num_validations=5):
    """Print cross-validated accuracy, F1, precision and recall percentages.

    One ``cross_validation.cross_val_score`` run is made per metric, with
    ``num_validations`` passed as ``cv``. The mean of the returned scores is
    multiplied by 100, rounded to two decimals and printed. Nothing is
    returned.
    """
    # (printed prefix, scoring name), in the order the lines are emitted.
    report_rows = (
        ("Accuracy: ", 'accuracy'),
        ("F1: ", 'f1_weighted'),
        ("Precision: ", 'precision_weighted'),
        ("Recall: ", 'recall_weighted'),
    )
    for prefix, scoring_name in report_rows:
        fold_scores = cross_validation.cross_val_score(
            classifier, X, y, scoring=scoring_name, cv=num_validations)
        percent = round(100 * fold_scores.mean(), 2)
        print(prefix + str(percent) + "%")
sk_feature_process.py 文件源码 项目:python_utils 作者: Jayhello 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def rfr_feature_select():
    """Score each Boston-housing feature on its own and print the ranking.

    For every column of the data matrix a random forest is cross-validated
    on that single column with R^2 scoring. The mean score (rounded to three
    decimals) is paired with the feature name, and the pairs are printed in
    descending order. Nothing is returned.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    dataset = load_boston()
    X, Y = dataset["data"], dataset["target"]
    names = dataset["feature_names"]

    rf = RandomForestRegressor(n_estimators=20, max_depth=4)

    def _single_column_score(col):
        # Slice with col:col + 1 (not X[:, col]) so the estimator still
        # receives a 2-D, one-column matrix.
        fold_r2 = cross_val_score(rf, X[:, col:col + 1],
                                  Y, scoring="r2",
                                  cv=ShuffleSplit(len(X), 3, .3))
        return round(np.mean(fold_r2), 3)

    scores = [(_single_column_score(col), names[col])
              for col in range(X.shape[1])]

    print(sorted(scores, reverse=True))
classification_model.py 文件源码 项目:BotBoosted 作者: brityboy 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def evaluate_model(model, X_train, y_train):
    '''
    Print the mean cross-validation score of a model, then fit it.

    INPUT
         - model: estimator handed to cross_val_score and then fitted
         - X_train: feature data passed through to the estimator
         - y_train: target data passed through to the estimator
    OUTPUT
         - prints the mean of the scores from cross_val_score with cv=10
           (run with n_jobs=-1 and verbose=10)
         - model: the same object after model.fit(X_train, y_train)
    '''
    fold_scores = cross_val_score(model, X_train, y_train,
                                  cv=10, n_jobs=-1, verbose=10)
    print(np.mean(fold_scores))

    model.fit(X_train, y_train)
    return model
utilities.py 文件源码 项目:Python-Machine-Learning-Cookbook 作者: PacktPublishing 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def print_accuracy_report(classifier, X, y, num_validations=5):
    """Report four cross-validated metrics for ``classifier`` as percentages.

    Accuracy, weighted F1, weighted precision and weighted recall are each
    obtained from a separate ``cross_validation.cross_val_score`` call that
    receives ``num_validations`` as ``cv``. Each mean score is printed as a
    percentage rounded to two decimals. Returns ``None``.
    """
    def _mean_percent(scoring_name):
        # One independent cross-validation run per metric.
        fold_scores = cross_validation.cross_val_score(
            classifier, X, y, scoring=scoring_name, cv=num_validations)
        return str(round(100 * fold_scores.mean(), 2))

    print("Accuracy: " + _mean_percent('accuracy') + "%")
    print("F1: " + _mean_percent('f1_weighted') + "%")
    print("Precision: " + _mean_percent('precision_weighted') + "%")
    print("Recall: " + _mean_percent('recall_weighted') + "%")
spam.py 文件源码 项目:ml-talks-duolingo 作者: burrsettles 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def experiment(model_class, vectorizer, xval):
    """Fit a model, dump its weights to a file and print five CV metrics.

    ``model_class`` is used as an estimator *instance* (its ``fit`` and
    ``penalty`` are accessed directly). The training data ``X`` and ``y`` are
    read from module scope.

    Side effects:
        * writes ``weights.<ClassName>.<penalty>.txt`` with the intercept
          followed by one ``name<TAB>weight`` line per item of the first
          mapping returned by ``vectorizer.inverse_transform(model.coef_)``;
        * prints a separator line and the mean cross-validated accuracy
          (default scorer), ROC AUC, precision, recall and F1, using ``xval``
          as the ``cv`` argument.
    """
    name = model_class.__class__.__name__ + '.' + model_class.penalty
    model = model_class.fit(X, y)

    model_weights = vectorizer.inverse_transform(model.coef_)[0]
    with open('weights.%s.txt' % name, 'w') as f:
        f.write('%s\t%f\n' % ('(intercept)', model.intercept_))
        for feature_and_weight in model_weights.items():
            f.write('%s\t%f\n' % feature_and_weight)

    # (output format, scoring name); None selects the estimator's own scorer.
    report_rows = (
        ('acc\t%.4f\t%s', None),
        ('auc\t%.4f\t%s', 'roc_auc'),
        ('prec\t%.4f\t%s', 'precision'),
        ('recall\t%.4f\t%s', 'recall'),
        ('f1\t%.4f\t%s', 'f1'),
    )
    # All cross-validation runs finish before anything is printed.
    results = [
        (line_format,
         cross_validation.cross_val_score(model, X, y,
                                          scoring=scoring_name, cv=xval))
        for line_format, scoring_name in report_rows
    ]

    print('-'*80)
    for line_format, fold_scores in results:
        print(line_format % (np.mean(fold_scores), name))
training.py 文件源码 项目:static-gesture-recognition 作者: windmark 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def trainLimited(self, featureFile, n_datapoints):
  """Print 5-fold CV accuracy of a kNN classifier on growing data prefixes.

  The labelled data is loaded from ``featureFile`` via ``loadData`` and
  passed through ``train_test_split`` with ``test_size=0``. For
  k = 1 .. len(labels) // n_datapoints a distance-weighted k-nearest-
  neighbours classifier (``self.n_neighbors`` neighbours) is fitted and
  cross-validated on the first ``k * n_datapoints`` training rows, and the
  mean score is printed together with the number of rows used.
  """
  label_vector, input_vector = loadData(featureFile)

  # NOTE(review): test_size=0 presumably keeps every row in the training
  # part, so this call would only shuffle -- confirm against the sklearn
  # version in use.
  trainData, _unused_test_data, trainLabels, _unused_test_labels = \
    cross_validation.train_test_split(input_vector, label_vector, test_size=(0))

  n_totalrows = int((len(label_vector)/n_datapoints))
  for step in range(1, n_totalrows + 1):
    upper = step * n_datapoints
    limited_label_vector = trainLabels[0:upper]
    limited_input_vector = trainData[0:upper]

    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    kNNClassifier.fit(limited_input_vector, limited_label_vector)

    scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector, limited_label_vector, cv = 5)
    mean_score = sum(scores) / len(scores)
    print('%f on %d datapoints' % (mean_score, len(limited_label_vector)))
code.py 文件源码 项目:The_Ultimate_Student_Hunt 作者: analyticsvidhya 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def run_model(model,dtrain,predictor_var,target,scoring_method='mean_squared_error'):
    """Cross-validate ``model`` on ``dtrain``, then fit/predict on a year split.

    Step 1 runs a 5-fold ``KFold`` cross-validation over all rows with
    ``scoring_method``. Step 2 fits the model on rows whose ``'Year'`` is
    below 2000 and predicts the rows whose ``'Year'`` is above 1999.

    The fold scores and the predictions are computed but neither printed nor
    returned; the function returns ``None`` and its only lasting effect is
    that ``model`` ends up fitted on the pre-2000 rows.
    """
    folds = KFold(len(dtrain), 5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target],
                                cv=folds, scoring=scoring_method)

    # Year-based hold-out: train on everything before 2000, test on the rest.
    years = dtrain['Year']
    dtrain_for_val = dtrain[years < 2000]
    dtest_for_val = dtrain[years > 1999]

    train_inputs = dtrain_for_val[predictor_var]
    test_inputs = dtest_for_val[predictor_var]
    model.fit(train_inputs, dtrain_for_val[target])
    pred_for_val = model.predict(test_inputs)
svm.py 文件源码 项目:dancedeets-monorepo 作者: mikelambert 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def eval_model(name, model, data):
    """Fit ``model`` on ``data``, report test metrics and CV scores.

    Relies on module-level state that is not passed in: ``train`` and
    ``test`` (both expose ``.target``), ``sample_weights``,
    ``processed_test_data`` and ``grammar_processed_data``.

    Args:
        name: label printed in front of every progress/result line.
        model: estimator; fitted in place with sample weights.
        data: training features passed to ``model.fit``.

    Returns:
        ``(model, predictions)`` where ``predictions`` is the output of
        ``model.predict(processed_test_data)``.
    """
    print '=' * 20
    print name, 'training'
    model.fit(data, train.target, sample_weight=sample_weights)
    print name, 'trained'

    # Evaluate on the held-out test set (module-level globals).
    predictions = model.predict(processed_test_data)
    # Fraction of test predictions equal to the true target.
    print name, 'accuracy', np.mean(predictions == test.target)

    print(metrics.classification_report(test.target, predictions))
    print metrics.confusion_matrix(test.target, predictions)

    # NOTE(review): cross-validation always uses grammar_processed_data, not
    # the `data` argument the model was fitted on above -- confirm that this
    # is intended for every caller.
    print name, 'f1 cross validation', cross_validation.cross_val_score(model, grammar_processed_data, train.target, scoring='f1')
    print name, 'precision cross validation', cross_validation.cross_val_score(
        model, grammar_processed_data, train.target, scoring='precision'
    )
    return model, predictions


# SVMs need balanced input features: similar ranges, variances, and so on
svm.py 文件源码 项目:svm-text-classification-api 作者: viniciusbo 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def cross_validation_report(clf, dataset):
  """Return ``cross_val_score`` results for ``clf`` on a list of rows.

  Element 0 of each row is vectorised with the module-level
  ``count_vectorizer`` and element 1 is used as the target. Cross-validation
  runs with the library defaults for ``cv`` and ``scoring``.
  """
  documents = [row[0] for row in dataset]
  data = count_vectorizer.transform(documents)
  target = [row[1] for row in dataset]
  scores = cross_validation.cross_val_score(clf, data, target)
  return scores
04_model_preparation.py 文件源码 项目:uda-da-p5-enron-fraud-detection 作者: watanabe8760 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def evaluate(model, name):
    """
    Cross-validate ``model`` on F1, precision and recall and report them.

    Uses the module-level ``X``, ``y`` and ``splitter_``. The three score
    arrays are handed to ``save_dist`` together with ``name``; their mean and
    standard deviation are collected in a DataFrame, and the model and that
    table are printed. Nothing is returned.
    """
    metric_names = ['f1', 'precision', 'recall']

    # One cross-validation run per metric, in the order listed above.
    fold_scores = dict(
        (metric, cross_val_score(model, X, y, scoring=metric, cv=splitter_))
        for metric in metric_names)

    # Save image of score distributions
    save_dist(name, fold_scores['f1'], fold_scores['precision'],
              fold_scores['recall'])

    # Fill the summary table cell by cell: all means first, then all stds.
    result = DataFrame(index=['f1', 'precision', 'recall'],
                       columns=['mean', 'std'])
    for column, statistic in (('mean', np.mean), ('std', np.std)):
        for metric in metric_names:
            result.loc[metric, column] = statistic(fold_scores[metric])

    print(model)
    print(result)
rf.py 文件源码 项目:SMAC3 作者: automl 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def rf_from_cfg(cfg, seed):
    """
        Build a random forest regressor from a configuration and return its
        cross-validated RMSE on the module-level ``boston`` dataset.

        Parameters:
        -----------
        cfg: Configuration
            mapping-like object providing the forest hyperparameters
            ("num_trees", "criterion", "min_samples_to_split", ...)
        seed: int or RandomState
            passed to the forest as ``random_state``

        Returns:
        -----------
        float
            mean over the 11 cv-folds of the root mean square error
            (returned as a positive number)
    """
    forest_params = {
        "n_estimators": cfg["num_trees"],
        "criterion": cfg["criterion"],
        "min_samples_split": cfg["min_samples_to_split"],
        "min_samples_leaf": cfg["min_samples_in_leaf"],
        "min_weight_fraction_leaf": cfg["min_weight_frac_leaf"],
        "max_features": cfg["max_features"],
        "max_leaf_nodes": cfg["max_leaf_nodes"],
        "bootstrap": cfg["do_bootstrapping"],
    }
    forest = RandomForestRegressor(random_state=seed, **forest_params)

    def rmse(y, y_pred):
        residuals = y_pred - y
        return np.sqrt(np.mean(residuals ** 2))

    # greater_is_better=False makes the scorer report the negated RMSE, so
    # the fold scores below are <= 0.
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    fold_scores = cross_val_score(forest, boston.data, boston.target,
                                  cv=11, scoring=rmse_scorer)

    # Flip the sign back to obtain a positive error value.
    return -1 * np.mean(fold_scores)
Models.py 文件源码 项目:Stock-Prediction-Time-Series-Analysis-Python 作者: Nekooeimehr 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    """Tune an RBF kernel ridge regressor and score it with leave-one-out CV.

    A 5-fold grid search over ``alpha`` and ``gamma`` picks the
    hyperparameters; a fresh ``KernelRidge`` with those values is then
    evaluated by leave-one-out cross-validation.

    Returns:
        tuple: ``(mean_cv_score, fitted_grid_search)``.

    NOTE(review): both the grid search and the final evaluation use
    ``scoring="mean_absolute_error"`` although the names and the printed
    message say "MSE". The printed value is multiplied by -1, but the returned
    mean is not -- confirm which sign callers expect.
    """
    start = time.time()
    n = len(Scaled_Input_Data)

    param_grid = {"alpha": [1e0, 1e-1, 1e-2],"gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5 ,param_grid=param_grid, scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)

    best = krr_Tuned.best_params_
    tuned_model = KernelRidge(kernel='rbf', alpha=best['alpha'], gamma=best['gamma'])

    # The timer stops here, i.e. before the leave-one-out evaluation below.
    elapsed = time.time() - start
    print('The computational time of Kernel Ridge Regression for ', n, ' examples is: ', elapsed)

    loo_scores = cross_validation.cross_val_score(tuned_model, Scaled_Input_Data, Output_Data, cv=cross_validation.LeaveOneOut(n), scoring="mean_absolute_error")
    mean_score = np.mean(list(loo_scores))
    print('The average MSE of Kernel Ridge Regression for ', n, ' examples is: ', (-1*mean_score))
    return(mean_score, krr_Tuned)
lightweight_classifier.py 文件源码 项目:BotBoosted 作者: brityboy 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def evaluate_model(model, X_train, y_train):
    """
    Print the mean 10-fold cross-validation score, then fit the model.

    Args:
        model: estimator that is cross-validated and afterwards fitted
        X_train: feature data forwarded to ``cross_val_score`` and ``fit``
        y_train: target data forwarded to ``cross_val_score`` and ``fit``
    Returns:
        model: the same object after ``model.fit(X_train, y_train)``; the
        mean of the ``cv=10`` scores is printed as a side effect
    """
    cv_options = dict(cv=10, n_jobs=-1, verbose=10)
    mean_score = np.mean(cross_val_score(model, X_train, y_train,
                                         **cv_options))
    print(mean_score)
    model.fit(X_train, y_train)
    return model
insights.py 文件源码 项目:menrva 作者: amirziai 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def clf_scores(clf, x_train, y_train, x_test, y_test):
    """Collect cross-validation and test-set metrics for a classifier.

    The classifier is cross-validated on the training data (using the
    module-level ``cv``), fitted once on the full training data while being
    timed, and then evaluated on the test data.

    Returns:
        dict with the keys
        ``runtime`` (seconds spent in ``clf.fit``),
        ``accuracy`` (the *lowest* fold score),
        ``accuracy_test``, ``accuracy_folds``, ``confusion_matrix``,
        ``fpr``, ``tpr`` and ``auc`` (ROC data computed on the test set).
    """
    info = dict()

    # TODO: extend this to a confusion matrix per fold for more flexibility downstream (tuning)
    # TODO: calculate a set of ROC curves per fold instead of running it on test, currently introducing bias
    scores = cross_val_score(clf, x_train, y_train, cv=cv, n_jobs=-1)

    # Time a single fit on the full training set.
    runtime = time()
    clf.fit(x_train, y_train)
    runtime = time() - runtime

    y_test_predicted = clf.predict(x_test)
    info['runtime'] = runtime
    # NOTE(review): this is the worst fold score, not the mean -- confirm
    # that the pessimistic value is what downstream code expects.
    info['accuracy'] = min(scores)
    info['accuracy_test'] = accuracy_score(y_test, y_test_predicted)
    info['accuracy_folds'] = scores
    info['confusion_matrix'] = confusion_matrix(y_test, y_test_predicted)

    # Reuse the estimator fitted above. Re-fitting here doubled the training
    # cost and, for estimators with unseeded randomness, made the ROC curve
    # come from a different model than accuracy_test / confusion_matrix.
    fpr, tpr, _ = roc_curve(y_test, clf_predict_proba(clf, x_test))
    info['fpr'] = fpr
    info['tpr'] = tpr
    info['auc'] = auc(fpr, tpr)

    return info
test_cross_validation.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target

    # Reference scores: folds given as integer index arrays.
    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)

    # Same folds, converted to boolean masks over all samples.
    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        # Use the builtin bool: the np.bool alias was removed from NumPy.
        mask_train = np.zeros(len(y), dtype=bool)
        mask_test = np.zeros(len(y), dtype=bool)
        mask_train[train] = True
        mask_test[test] = True
        # Append the masks themselves; appending (train, test) would only
        # repeat the index-based run and never exercise mask support.
        cv_masks.append((mask_train, mask_test))
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks)
test_cross_validation.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def test_cross_val_score_precomputed():
    # A precomputed linear Gram matrix must score exactly like the built-in
    # linear kernel.
    iris = load_iris()
    X, y = iris.data, iris.target
    linear_kernel = np.dot(X, X.T)

    score_precomputed = cval.cross_val_score(SVC(kernel="precomputed"),
                                             linear_kernel, y)
    score_linear = cval.cross_val_score(SVC(kernel="linear"), X, y)
    assert_array_equal(score_precomputed, score_linear)

    precomputed_svm = SVC(kernel="precomputed")

    # A non-square X is not a valid precomputed kernel -> ValueError.
    assert_raises(ValueError, cval.cross_val_score, precomputed_svm, X, y)

    # A precomputed kernel passed as nested lists (neither array-like nor
    # sparse) must be rejected as well.
    assert_raises(ValueError, cval.cross_val_score, precomputed_svm,
                  linear_kernel.tolist(), y)
test_cross_validation.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test_cross_val_score_with_score_func_classification():
    iris = load_iris()
    clf = SVC(kernel='linear')

    # All three scorers are expected to give the same per-fold values:
    #   None          -> default estimator score (accuracy)
    #   "accuracy"    -> zero / one score, same as the default
    #   "f1_weighted" -> classes are balanced, so F1 equals zero / one score
    expected = [0.97, 1., 0.97, 0.97, 1.]
    for scoring in (None, "accuracy", "f1_weighted"):
        fold_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                           scoring=scoring, cv=5)
        assert_array_almost_equal(fold_scores, expected, 2)
test_cross_validation.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    expected_r2 = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(default_scores, expected_r2, 2)

    # Explicit R2 (determination coefficient) must match the default score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, expected_r2, 2)

    # Mean squared error is a loss, so the reported "scores" are negative
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance, wrapped into a scorer by hand
    ev_scorer = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_scores, expected_r2, 2)
test_cross_validation.py 文件源码 项目:Parallel-SGD 作者: angadgill 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test_cross_val_score_multilabel():
    # Precision scorers with micro / macro / samples averaging on a small
    # two-label indicator problem.
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)

    averages = ('micro', 'macro', 'samples')
    scorers = [make_scorer(precision_score, average=average)
               for average in averages]
    # Run every cross-validation before checking any result.
    fold_scores = [cval.cross_val_score(clf, X, y, scoring=scorer, cv=5)
                   for scorer in scorers]
    score_micro, score_macro, score_samples = fold_scores

    # NOTE(review): the fractions below need true division; under Python 2
    # that presumably comes from a module-level
    # `from __future__ import division` -- confirm.
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
lr.py 文件源码 项目:2016CCF-unicom 作者: xuguanggen 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def run():
    """Print 10-fold cross-validated accuracy of a logistic regression.

    The training and test matrices are read with ``np.loadtxt``; column 0 of
    the training file is used as the label and the remaining columns as
    features. Only the training data takes part in the cross-validation.
    """
    tr_data = np.loadtxt('../new/TRAIN_LRFORMAT.txt')
    te_data = np.loadtxt('../new/TEST_LRFORMAT.txt')

    tr_x, tr_y = tr_data[:,1:], tr_data[:,0]
    # Test features are sliced but not used further in this function.
    te_x = te_data[:,1:]

    lr_settings = dict(
        solver='liblinear',
        multi_class='ovr',
        class_weight='balanced',
        penalty='l2',
        n_jobs=-1)
    lr = LogisticRegression(**lr_settings)

    cv = 10
    scores = cross_val_score(lr, tr_x, tr_y, cv=cv, scoring='accuracy')
    print(str(scores))
linearRegression_lassoRegularization.py 文件源码 项目:HousePricePredictionKaggle 作者: Nuwantha 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def rmse_cv(model, X, y):
    """Return the mean ``cross_val_score`` of ``model`` on ``(X, y)``.

    Scoring uses the module-level ``scorer`` and the library's default ``cv``.
    NOTE(review): no square root is taken here, so whether the result is an
    RMSE depends entirely on how ``scorer`` is defined -- confirm.
    """
    fold_scores = cross_val_score(model, X, y, scoring=scorer)
    return fold_scores.mean()
solution.py 文件源码 项目:Kaggle 作者: lawlite19 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def baseline_logisticRegression():
    """Train a baseline logistic regression and write test-set predictions.

    Steps:
        1. read ``data/train.csv`` and run it through ``pre_processData``;
        2. keep the label and feature columns selected by the regex, with the
           label (``Survived``) expected in column 0 of the resulting matrix;
        3. fit ``LogisticRegression(C=1.0, tol=1e-6)`` on all training rows
           and print one coefficient entry per feature column;
        4. pre-process ``data/test.csv`` the same way, predict, and write
           ``PassengerId``/``Survived`` to
           ``baseline_logisticRegression_result/prediction.csv``.

    Returns ``None``.
    """
    raw_train = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(raw_train,'process_train_data')
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

    # `.values` replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in pandas 1.0; both return the same ndarray.
    train_np = train_data.values

    # Column 0 holds the label, the remaining columns are the features.
    X = train_np[:,1:]
    y = train_np[:,0]
    model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X,y)
    coef_table = pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    print(coef_table)

    # Predict the test set. The test regex has no 'Survived' entry, so every
    # selected column is a feature.
    raw_test = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(raw_test,'process_test_data')
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.values
    predict = model.predict(test_np)

    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].values,'Survived':predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv',index=False)


# baseline?SVM??——0.78947
solution.py 文件源码 项目:Kaggle 作者: lawlite19 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def baseline_logisticRegression_crossValidate():
    """Fit on 80% of the training data and export the misclassified rows.

    The pre-processed training data is split 80/20 with ``train_test_split``.
    A ``LogisticRegression(C=1.0, tol=1e-6)`` is fitted on the larger part,
    its coefficients are printed, and the fraction of correctly predicted
    hold-out rows is printed. The original (unprocessed) rows of every
    misclassified hold-out passenger are written to ``error.csv`` together
    with an ``error_PassengerId`` column.

    Returns ``None``.
    """
    column_regex = 'Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'

    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data,'process_train_data')
    process_data_train,process_data_cv = train_test_split(process_data,test_size=0.2)

    # `.values` replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in pandas 1.0; both return the same ndarray.
    train_data = process_data_train.filter(regex=column_regex)
    train_np = train_data.values

    # Column 0 holds the label, the remaining columns are the features.
    X_train = train_np[:,1:]
    y_train = train_np[:,0]
    model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    print(pd.DataFrame({'columns':list(train_data.columns[1:]),'coef_':list(model.coef_.T)}))

    # Same column selection and layout for the hold-out part.
    cv_data = process_data_cv.filter(regex=column_regex)
    cv_np = cv_data.values
    X_cv = cv_np[:,1:]
    y_cv = cv_np[:,0]
    predictions = model.predict(X_cv)

    # Hold-out accuracy: number of correct predictions / number of rows.
    print(np.float32(np.sum(predictions == y_cv))/np.float32(predictions.shape[0]))

    # Look up the misclassified passengers in the original data and export
    # them for manual inspection.
    misclassified = predictions != y_cv
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[misclassified]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[misclassified]['PassengerId'])
    predictions_item.columns=['error_PassengerId']
    error_result = pd.concat([error_items,predictions_item],axis=1)
    error_result.to_csv(r'error.csv',index=False)
solution.py 文件源码 项目:Kaggle 作者: lawlite19 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def optimize_logisticRegression():
    """Train a logistic regression on engineered features and write predictions.

    Reads ``data/train.csv``, prints ``info()`` / ``describe()`` of the raw
    frame, pre-processes it with ``fe_preprocessData``, fits
    ``LogisticRegression(C=1.0, tol=1e-6)`` on all rows and prints the
    coefficients. ``data/test.csv`` is then pre-processed the same way and the
    predictions are written to
    ``optimize_logisticRegression_result/prediction.csv``.

    Returns ``None``.
    """
    train_data = pd.read_csv(r"data/train.csv")
    # Python 2 print statements: the label is written first, then the call on
    # the right is evaluated and its return value printed.
    print u"?????\n",train_data.info()
    print u'?????\n',train_data.describe()  
    #display_data(train_data)
    #display_with_process(train_data)
    process_data = fe_preprocessData(train_data,'process_train_data')  # pre-process / feature-engineer the training frame
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the columns matching the regex
    train_np = train_data.as_matrix()  # convert to a plain matrix
    # Model fitting section (the bare string below is a no-op statement).
    '''??model'''
    # Column 0 is used as the label, the remaining columns as features.
    X = train_np[:,1:]
    y = train_np[:,0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X,y)
    print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})

    # Test-set prediction section (the bare string below is a no-op statement).
    '''??????'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data')  # same pre-processing for the test frame
    # No 'Survived' in this regex: every selected test column is a feature.
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv',index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)    
## ????????
rank_tags.py 文件源码 项目:TGIF-Release 作者: raingo 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def stump(X, y):
    """Score and fit a linear SVM, returning a threshold-style summary.

    Returns a 3-tuple:
        * mean 5-fold cross-validated average precision of ``LinearSVC``;
        * sign of the first coefficient of a ``LinearSVC`` fitted on all data;
        * the first intercept divided by the absolute value of that
          coefficient.

    Only ``coef_[0, 0]`` is inspected -- presumably ``X`` has a single
    feature column; verify against the caller.
    """
    fold_scores = cross_val_score(LinearSVC(), X, y, cv = 5, n_jobs=5, scoring = 'average_precision')

    fitted = LinearSVC()
    fitted.fit(X, y)
    weight = fitted.coef_[0,0]
    offset = fitted.intercept_[0]

    return np.mean(fold_scores), np.sign(weight), offset / np.abs(weight)
sklearnbasemodel.py 文件源码 项目:Supply-demand-forecasting 作者: LevinJ 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def run_croos_validation(self):
    """Cross-validate ``self.clf`` and return the negated absolute mean score.

    Features, labels and the ``cv`` object come from
    ``self.getFeaturesLabel()``; scoring uses
    ``mean_absolute_percentage_error_scoring``. The absolute mean, the
    standard deviation and the absolute per-fold scores are printed.
    """
    features, labels, cv = self.getFeaturesLabel()
    scores = cross_validation.cross_val_score(
        self.clf, features, labels, cv=cv,
        scoring=mean_absolute_percentage_error_scoring, n_jobs = -1)

    abs_mean = np.absolute(scores.mean())
    summary = "cross validation scores: means, {}, std, {}, details,{}".format(
        abs_mean, scores.std(), np.absolute(scores))
    print(summary)

    # Always non-positive, whatever sign the scorer itself reports.
    return -abs_mean
training.py 文件源码 项目:bguFinalProject 作者: liranfar 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def build_random_forest_model(x_train, y_train):
    """Fit a seeded random forest and print its mean 10-fold CV score.

    Returns the ``RandomForestClassifier(random_state=42)`` fitted on
    ``x_train`` and the flattened ``y_train``.

    NOTE(review): the fit uses ``y_train.ravel()`` while the cross-validation
    receives ``y_train`` as passed in -- confirm both shapes are intended.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(x_train, y_train.ravel())

    print("10-fold Cross validation score is :")
    fold_scores = cross_val_score(forest, x_train, y_train, cv=10)
    print(np.mean(fold_scores))

    return forest
sklearn_data.py 文件源码 项目:-Classification-on-Chinese-Magazine- 作者: lixiaosi33 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def evaluate_cross_validation(clf, X, y, K):
    """Print the K-fold cross-validation scores of ``clf`` and their mean.

    The folds are shuffled with a fixed ``random_state=0``. Scoring uses the
    estimator's own ``score`` method (no ``scoring`` argument is passed). The
    second printed line shows the mean score and ``sem(scores)``.
    """
    # K-fold cross-validation iterator over all len(y) samples.
    folds = KFold(len(y), K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=folds)
    print(scores)

    summary = "Mean score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(scores), sem(scores))
    print(summary)
hack_dev.py 文件源码 项目:ml-projects 作者: saopayne 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def hackathon_GBC_model(clf, train, features):
    """Fit ``clf`` on ``train``, print its metrics and plot feature importances.

    Args:
        clf: estimator exposing ``fit``, ``predict``, ``predict_proba`` and
            ``feature_importances_``.
        train: frame holding the feature columns and a ``"Class"`` column.
        features: names of the columns used as model inputs.

    Accuracy and AUC are computed on the training data itself; the CV line
    reports the mean/max/min/std of a 5-fold ROC-AUC cross-validation.
    Returns ``None``; a bar chart is shown via ``plt.show()``.
    """
    inputs = train[features]
    clf.fit(inputs, train["Class"])

    # In-sample predictions: probability in column 1, and hard labels.
    probab_of_predict = clf.predict_proba(train[features])[:,1]
    predict_train = clf.predict(train[features])
    cv_score = cross_val_score(clf, train[features], train["Class"], cv=5, scoring="roc_auc")

    print("----------------------Model performance-----------------------")
    print("Accuracy score: ", accuracy_score(train["Class"].values, predict_train))
    print("AUC: ", roc_auc_score(train["Class"],probab_of_predict) )
    cv_summary = "CV score: Mean - {}, Max - {}, Min - {}, Std - {}".format(
        np.mean(cv_score), np.max(cv_score), np.min(cv_score), np.std(cv_score))
    print(cv_summary)

    # Importances indexed by feature name, largest first.
    importances = pd.Series(clf.feature_importances_, features)
    Relative_Feature_importance = importances.sort_values(ascending=False)
    Relative_Feature_importance.plot(kind='bar', title='Order of Feature Importance')
    plt.ylabel('Feature Importance')
    plt.show()
TwitterResults.py 文件源码 项目:Movie-Success-Predictor 作者: Blueteak 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def print_metrics(clf):
    """Plot per-fold and mean ROC curves of ``clf`` and save them to a file.

    Uses the module-level ``features`` and ``labels``. The classifier is
    refitted on each of the 5 stratified folds; each fold's ROC curve is
    drawn, the curves are interpolated onto a common 100-point FPR grid and
    averaged, and the figure is saved as ``auc_sent.png``. Returns ``None``.
    """
    cv = cross_validation.StratifiedKFold(labels,n_folds=5)

    # Common false-positive-rate grid for averaging the fold curves.
    mean_fpr = np.linspace(0,1,100)
    mean_tpr = 0.0

    for fold_no, (train_idx, test_idx) in enumerate(cv):
        fitted = clf.fit(features[train_idx],labels[train_idx])
        probas_ = fitted.predict_proba(features[test_idx])

        # Column 1 of predict_proba is used as the ROC score.
        fpr,tpr,thresholds = metrics.roc_curve(labels[test_idx],probas_[:,1])
        mean_tpr += interp(mean_fpr,fpr,tpr)
        mean_tpr[0] = 0.0  # pin the running sum to 0 at FPR = 0
        roc_auc = metrics.auc(fpr,tpr)

        plt.plot(fpr,tpr,lw=1,label='ROC fold %d (area = %0.2f)' % (fold_no,roc_auc))

    # Diagonal reference line.
    plt.plot([0,1],[0,1],'--',color=(0.6,0.6,0.6),label='Luck')

    # Turn the accumulated sum into a mean and force the end point to 1.
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('auc_sent.png')


问题


面经


文章

微信
公众号

扫码关注公众号