Python examples of train_test_split()
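The snippets below are collected from open-source projects and show how scikit-learn's train_test_split is used in practice. As a quick reference, here is a minimal, self-contained sketch of the core call (the toy data is invented for illustration; the parameters follow the scikit-learn API):

import numpy as np
from sklearn.model_selection import train_test_split

# Toy data: 10 samples with 3 features each, and binary labels.
X = np.arange(30).reshape(10, 3)
y = np.array([0, 1] * 5)

# Hold out part of the rows for testing; random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

print(X_train.shape, X_test.shape)  # (7, 3) (3, 3)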

Source: utils_testing.py (project: auto_ml, author: ClimbsRocks)
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.csv')

    try:
        # 'rU' file mode is deprecated in Python 3; let pandas open the path itself
        df_twitter = pd.read_csv(file_name, encoding='latin-1', engine='python')
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates
        df_twitter.to_csv(file_name, index=False, encoding='latin-1')

    # Grab only 10% of the dataset; it runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Source: utils_testing.py (project: auto_ml, author: ClimbsRocks)
def get_titanic_binary_classification_dataset(basic=True):

    dir_name = os.path.abspath(os.path.dirname(__file__))
    file_name = os.path.join(dir_name, 'titanic.csv')
    print('file_name')
    print(file_name)
    print('dir_name')
    print(dir_name)
    try:
        df_titanic = pd.read_csv(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        df_titanic = pd.read_csv(dataset_url)
        # Do not write the index that pandas automatically creates
        df_titanic.to_csv(file_name, index=False)

    df_titanic = df_titanic.drop(['boat', 'body'], axis=1)

    if basic:
        df_titanic = df_titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    df_titanic_train, df_titanic_test = train_test_split(df_titanic, test_size=0.33, random_state=42)
    return df_titanic_train, df_titanic_test
Source: utils_testing.py (project: auto_ml, author: ClimbsRocks)
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates
        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset; it runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Source: Adaboost.py (project: Machine-Learning-Tools-on-Iris-Dataset, author: debjitpaul)
def get_data(iris):
    # Only petal length and petal width are considered
    X = iris.data[:, [2, 3]]
    y = iris.target

    # Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

    # View the data
    print(iris_df.head())

    # Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

    print('Training set has {} samples and test set has {} samples'.format(
        X_train.shape[0], X_test.shape[0]))
    print()
    return X_train, X_test, y_train, y_test, iris_df, X, y
# Scale the training data before training
(Random_forest.py, logistic.py, and svm_oop.py from the same project, Machine-Learning-Tools-on-Iris-Dataset by debjitpaul, contain this exact get_data helper verbatim; the duplicate listings are omitted here.)
Source: AIserver.py (project: Using-machine-learning-to-detect-malicious-URLs, author: faizann24)
def TL():
    allurls = './data/data.csv'  # path to our file of all URLs
    allurlsdata = pd.read_csv(allurls, sep=',', error_bad_lines=False)  # read the file; read_csv already returns a dataframe

    allurlsdata = np.array(allurlsdata)  # converting it into an array
    random.shuffle(allurlsdata)  # shuffling

    y = [d[1] for d in allurlsdata]  # all labels
    corpus = [d[0] for d in allurlsdata]  # all URLs corresponding to a label (either good or bad)
    vectorizer = TfidfVectorizer(tokenizer=getTokens)  # get a vector for each URL, using our customized tokenizer
    X = vectorizer.fit_transform(corpus)  # get the X matrix

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # split into training and testing sets at an 80/20 ratio

    lgs = LogisticRegression()  # using logistic regression
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))  # print the score; it comes out to about 98%
    return vectorizer, lgs
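The snippet relies on a getTokens helper defined elsewhere in the project. A minimal sketch of what such a URL tokenizer might look like (the splitting rules below are an assumption for illustration, not the project's actual implementation):

import re

def getTokens(url):
    # Hypothetical URL tokenizer: split on common URL delimiters and drop
    # empty pieces, e.g. "example.com/login.php" -> ['example', 'com', 'login', 'php'].
    tokens = re.split(r'[/\-._?=&]', url)
    return [t for t in tokens if t]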
Source: decomposition.py (project: NLPWorks, author: thautwarm)
def de_lda(X, y):
    """ LDA-based dimensionality reduction """
    dim = X.shape[1]
    # Note: scikit-learn's LinearDiscriminantAnalysis caps n_components at
    # n_classes - 1, so a value this large only matters for many-class data.
    de = min(2000, dim)
    clf = LDA(n_components=de)
    # Fit on a 33% subsample to keep the fit cheap.
    _, x_mini, _, y_mini = train_test_split(X, y, test_size=0.33)
    clf.fit(x_mini, y_mini)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func

# def de_ps(X,y):
#     """ pearsonr method """
#     dim = X.shape[1]
#     de = min(2000,dim)
#     clf = SelectKBest(Pearsonr , k=de)
#     clf.fit(X,y)
#     def _func(X1,X2):
#         return clf.transform(X1),clf.transform(X2)
#     return _func
Source: Sentiment.py (project: sentiment_comments_zh, author: zhouhoo)
def prepare_train_data(self):
        texts, labels = load_corpus()
        volcabulary, train_words = get_volcabulary_and_list_words(texts)

        self.set_volcabulary(volcabulary)

        del volcabulary, texts
        words_index = self.get_word_index(train_words, self.volcabulary, self.max_words, self.max_length)

        # del reviews_words, volcabulary

        index = np.arange(words_index.shape[0])
        train_index, valid_index = train_test_split(
            index, train_size=0.8, random_state=520)
        train_data = words_index[train_index]
        valid_data = words_index[valid_index]
        labels = np.asarray(labels)
        train_labels = labels[train_index]
        valid_labels = labels[valid_index]
        print(train_data.shape)
        print(valid_data.shape)

        pickle.dump((words_index, labels), open("output/zh_comments.pkl", 'wb'))

        return train_data, train_labels, valid_data, valid_labels
Source: tp3_solutions.py (project: TPs, author: DataMiningP7)
def get_train_test_sets(X, y):
    """ Split X and y into a train and a test sets.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        y: a binary vector where the i-th value indicates whether the i-th
           document is spam or ham.
    Returns:
        X_train: train subset of X
        X_test: test subset of X
        y_train: train subset of y
        y_test: test subset of y
    """
    return train_test_split(X, y)

# Ex4.2, 4.3, 4.4
Source: onehot.py (project: House-Pricing, author: playing-kaggle)
def GDBT_regression(X=train_df_munged, Y=label_df['SalePrice']):
    est = GradientBoostingRegressor(n_estimators=50, max_depth=3, learning_rate=0.1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    est.fit(X_train, Y_train)
    y_pred = est.predict(X_test)  # predictions on the held-out test set

    # Plot residuals
    plt.scatter(y_pred, y_pred - Y_test, c='blue', marker='s', label='error on test data')
    plt.title("Regression with GBDT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions against real values
    plt.scatter(Y_test, y_pred, c="blue", marker="s", label="test data")
    plt.title("Regression with GBDT")
    plt.xlabel("Real values")
    plt.ylabel("Predicted values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    print('rmse value:', rmse(Y_test, y_pred))

    return est
Source: Stock_Prediction_Model_Random_Forrest.py (project: StockRecommendSystem, author: doncat99)
def best_window(self, X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max):
        w_opt = 0
        t_opt = 0
        f_opt = 0
        accur_opt = 0.

        x_w = []
        y_accu = []

        # Range of window: w_min --> w_max
        # Note: the per-window preprocessing below is commented out, so every
        # iteration evaluates the same X_train/y_train.
        for w in range(w_min, w_max + 1):
            #X,y = preprocess_data(w)
            #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
            t, f, accur = self.best_forrest(X_train, y_train, 10, t_min, t_max, f_min, f_max)
            print('Window = ' + str(w) + ' days --> Best forest: number of trees: ' + str(t) + ', maximum of features: ' + str(f) + ', with accuracy: ' + str(accur))

            if accur > accur_opt:
                w_opt, t_opt, f_opt, accur_opt = w, t, f, accur
            x_w.append(w)
            y_accu.append(accur)

        print('Best window: w = ' + str(w_opt) + '. Best forest: number of trees: ' + str(t_opt) + ', maximum of features: ' + str(f_opt) + ', with accuracy: ' + str(accur_opt))
        return w_opt, t_opt, f_opt
Source: Stock_Prediction_Model_Random_Forrest.py (project: StockRecommendSystem, author: doncat99)
def prepare_train_test_data(self, data_feature, LabelColumnName):
        firstloop = 1
        for ticker, data in data_feature.items():
            X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.3)
            # print('Train shape X:', X_train_temp.shape, ',y:', y_train_temp.shape)
            # print('Test shape X:', X_test_temp.shape, ',y:', y_test_temp.shape)

            if firstloop == 1:
                firstloop = 0
                X_train = X_train_temp
                X_test = X_test_temp
                y_train = y_train_temp
                y_test = y_test_temp
            else:
                X_train = np.append(X_train, X_train_temp, 0)
                X_test = np.append(X_test, X_test_temp, 0)
                y_train = np.append(y_train, y_train_temp, 0)
                y_test = np.append(y_test, y_test_temp, 0)

        #print('Train shape X:', X_train.shape, ',y:', y_train.shape)
        #print('Test shape X:', X_test.shape, ',y:', y_test.shape)
        return X_train, y_train, X_test, y_test
Source: Stock_Prediction_Model_DBN.py (project: StockRecommendSystem, author: doncat99)
def prepare_train_test_data(self, data_feature, LabelColumnName):
        firstloop = 1
        for ticker, data in data_feature.items():
            X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)

            if firstloop == 1:
                firstloop = 0
                X_train = X_train_temp
                X_test = X_test_temp
                y_train = y_train_temp
                y_test = y_test_temp
            else:
                X_train = np.append(X_train, X_train_temp, 0)
                X_test = np.append(X_test, X_test_temp, 0)
                y_train = np.append(y_train, y_train_temp, 0)
                y_test = np.append(y_test, y_test_temp, 0)

        return X_train, y_train, X_test, y_test
Source: Stock_Prediction_Recommand_System.py (project: StockRecommendSystem, author: doncat99)
def prepare_train_test_data(self, data_feature, LabelColumnName):

        firstloop = 1
        for ticker, data in data_feature.items():
            X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False, array_format=False)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)
            # print('Train shape X:', X_train_temp.shape, ',y:', y_train_temp.shape)
            # print('Test shape X:', X_test_temp.shape, ',y:', y_test_temp.shape)
            if firstloop == 1:
                firstloop = 0
                X_train = X_train_temp
                X_test = X_test_temp
                y_train = y_train_temp
                y_test = y_test_temp
            else:
                # DataFrame.append returns a new object, so assign the result back
                X_train = X_train.append(X_train_temp, ignore_index=True)
                X_test = X_test.append(X_test_temp, ignore_index=True)
                y_train = np.append(y_train, y_train_temp, 0)
                y_test = np.append(y_test, y_test_temp, 0)

        # print('Train shape X:', X_train.shape, ',y:', y_train.shape)
        # print('Test shape X:', X_test.shape, ',y:', y_test.shape)
        return X_train, y_train, X_test, y_test
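The prepare_train_test_data variants above grow the accumulated arrays with np.append (or DataFrame.append) inside the loop, which recopies all data on every iteration. A sketch of the usual alternative under the same assumptions (preprocessing_data and the 0.2 split mirror the snippets; the helper itself is not shown here) collects the per-ticker splits in lists and concatenates once at the end:

import numpy as np
from sklearn.model_selection import train_test_split

def prepare_train_test_data(self, data_feature, LabelColumnName):
    X_train_parts, X_test_parts, y_train_parts, y_test_parts = [], [], [], []
    for ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName,
                                  one_hot_label_proc=False)
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
        X_train_parts.append(X_tr)
        X_test_parts.append(X_te)
        y_train_parts.append(y_tr)
        y_test_parts.append(y_te)
    # A single concatenation replaces the repeated np.append copies.
    return (np.concatenate(X_train_parts), np.concatenate(y_train_parts),
            np.concatenate(X_test_parts), np.concatenate(y_test_parts))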
Source: models.py (project: xcessiv, author: reiinakano)
def return_train_dataset(self):
        """Returns train data set

        Returns:
            X (numpy.ndarray): Features

            y (numpy.ndarray): Labels
        """
        X, y = self.return_main_dataset()

        if self.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=self.test_dataset['split_ratio'],
                random_state=self.test_dataset['split_seed'],
                stratify=y
            )

        return X, y
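This split passes stratify=y so that class proportions are preserved in both pieces. A small self-contained demonstration of the effect (toy data, not from the xcessiv project):

import numpy as np
from sklearn.model_selection import train_test_split

y = np.array([0] * 90 + [1] * 10)  # 90/10 class imbalance
X = np.arange(100).reshape(-1, 1)

_, _, y_small, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Both pieces keep the original 90/10 class ratio.
print(np.bincount(y_small), np.bincount(y_holdout))  # [72 8] [18 2]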
Source: data_process.py (project: pytorch_crowd_count, author: BingzheWu)
def gen_train_data(dataset_paths):
    X_fs = []
    Y_fs = []

    for path in dataset_paths:
        images, gts, densities = load_images_and_gts(path)
        X_fs += images
        Y_fs += densities
    from sklearn.model_selection import train_test_split
    X_fs_train, X_fs_test, Y_fs_train, Y_fs_test = train_test_split(X_fs, Y_fs, test_size=0.2)
    X_train, Y_train = X_fs_train, Y_fs_train
    X_test, Y_test = X_fs_test, Y_fs_test
    print(len(X_train))
    X_train, Y_train = multiscale_pyramidal(X_train, Y_train)
    #X_train, Y_train = adapt_images_and_densities(X_train, Y_train, slice_w, slice_h)
    print(len(X_train))
    X_train, Y_train = generate_slices(X_train, Y_train, slice_w = patch_w, slice_h = patch_h, offset = 8)
    print(len(X_train))
    #X_train, Y_train = crop_slices(X_train, Y_train)
    X_train, Y_train = flip_slices(X_train, Y_train)
    print(len(X_train))
    X_train, Y_train = samples_distribution(X_train, Y_train)
    print(len(X_train))
    X_train, Y_train = shuffle_slices(X_train, Y_train)
    return X_train, Y_train
Source: convert_tfrecord.py (project: GestureRecognition, author: gkchai)
def main(unused_argv):

    # Get the data.
    data_train = np.loadtxt(os.path.join(FLAGS.input_directory,'train'), delimiter=',')
    data_test = np.loadtxt(os.path.join(FLAGS.input_directory, 'test'), delimiter=',')

    X_train, X_val, y_train, y_val = train_test_split(data_train[:,1:], data_train[:,0].astype(np.int32),
                                                          test_size=FLAGS.validation_ratio,
                                                          random_state=100)
    X_test = data_test[:, 1:]
    y_test = data_test[:, 0].astype(np.int32)

    # Convert to Examples and write the result to TFRecords.
    convert_to((X_train, y_train), PREFIX + '_train')
    convert_to((X_val, y_val), PREFIX + '_validation')
    convert_to((X_test, y_test), PREFIX + '_test')
Source: gbm.py (project: MLAlgorithms, author: rushter)
def classification():
    # Generate a random binary classification problem.
    X, y = make_classification(n_samples=350, n_features=15, n_informative=10,
                               random_state=1111, n_classes=2,
                               class_sep=1., n_redundant=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                        random_state=1111)

    model = GradientBoostingClassifier(n_estimators=50, max_depth=4,
                                       max_features=8, learning_rate=0.1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)
    print(predictions.min())
    print(predictions.max())
    print('classification, roc auc score: %s'
          % roc_auc_score(y_test, predictions))

