Python StandardScaler() class: example source code
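The snippets below are collected from open-source projects and show sklearn.preprocessing.StandardScaler in a range of settings. The pattern they share is fitting the scaler on training data only and reusing the learned statistics on held-out data; a minimal, self-contained sketch of that pattern (with made-up data, not taken from any of the projects below) is:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X_train = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])
    X_test = np.array([[1.5, 250.0]])

    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)  # learn per-feature mean and std on the training set
    X_test_std = scaler.transform(X_test)        # reuse the same statistics on unseen data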

chapter_5.py (project: python-machine-learning-book, author: jeremyn)
def get_standardized_wine_data():
    df = pd.read_csv(os.path.join('datasets', 'wine.data'), header=None)
    df.columns = [
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ]
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=0,
    )
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_test_std = sc.transform(X_test)

    return X_train_std, X_test_std, y_train, y_test
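get_standardized_wine_data assumes imports made elsewhere in chapter_5.py. A plausible import block and usage call follow; this is a sketch, and depending on the scikit-learn version the book targets, train_test_split may come from sklearn.cross_validation instead:

    import os
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X_train_std, X_test_std, y_train, y_test = get_standardized_wine_data()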
fbank-dnn.py (project: leap-scd, author: smittal6)
def load_data_train(trainfile):
        print "Getting the training data"
        a=htk.open(trainfile)
        train_data=a.getall()
        print "Done with Loading the training data: ",train_data.shape
        data=filter_data_train(train_data)
        # x_train=cnn_reshaper(data[:,:-2]) #Set to different column based on different model
        x_train=data[:,:-2] #Set to different column based on different model
        scaler=StandardScaler().fit(x_train)
        # x_train=scaler.transform(x_train)
        Y_train=data[:,-2]
        print Y_train.shape
        # print np.where(Y_train==2)
        Y_train=Y_train.reshape(Y_train.shape[0],1)
        y_train=np_utils.to_categorical(Y_train,2)
        print y_train[0:5,:]
        gender_train=data[:,-1]
        del data
        return x_train,y_train,gender_train,scaler
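In this and the following leap-scd loaders, scaler.transform(x_train) is left commented out: the StandardScaler is only fitted on the training features and returned so the caller can apply it later. A hedged usage sketch (the test-side variable names are assumptions, not taken from the project):

    x_train, y_train, gender_train, scaler = load_data_train(trainfile)
    # later, on held-out features with the same column layout:
    x_test_scaled = scaler.transform(x_test)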
gamma-pitch-dnn-800.py (project: leap-scd, author: smittal6)
def load_data_train(trainfile):
        print "Getting the training data"
        a=htk.open(trainfile)
        train_data=a.getall()
        print "Done with Loading the training data: ",train_data.shape
        data=filter_data_train(train_data)
        # x_train=cnn_reshaper(data[:,:-2]) #Set to different column based on different model
        x_train=data[:,:-2] #Set to different column based on different model
        scaler=StandardScaler().fit(x_train)
        # x_train=scaler.transform(x_train)
        Y_train=data[:,-2]
        print Y_train.shape
        # print np.where(Y_train==2)
        Y_train=Y_train.reshape(Y_train.shape[0],1)
        y_train=np_utils.to_categorical(Y_train,2)
        print y_train[0:5,:]
        gender_train=data[:,-1]
        del data
        #x_train has complete data, that is gammatone and also the pitch variance values.
        return x_train,y_train,gender_train,scaler
gamma-pitch-dnn.py (project: leap-scd, author: smittal6)
def load_data_train(trainfile):
        print "Getting the training data"
        a=htk.open(trainfile)
        train_data=a.getall()
        print "Done with Loading the training data: ",train_data.shape
        data=filter_data_train(train_data)
        # x_train=cnn_reshaper(data[:,:-2]) #Set to different column based on different model
        x_train=data[:,:-2] #Set to different column based on different model
        scaler=StandardScaler().fit(x_train)
        # x_train=scaler.transform(x_train)
        Y_train=data[:,-2]
        print Y_train.shape
        # print np.where(Y_train==2)
        Y_train=Y_train.reshape(Y_train.shape[0],1)
        y_train=np_utils.to_categorical(Y_train,2)
        print y_train[0:5,:]
        gender_train=data[:,-1]
        del data
        #x_train has complete data, that is gammatone and also the pitch variance values.
        return x_train,y_train,gender_train,scaler
gammatone-dnn.py (project: leap-scd, author: smittal6)
def load_data_train(trainfile):
        print "Getting the training data"
        a=htk.open(trainfile)
        train_data=a.getall()
        print "Done with Loading the training data: ",train_data.shape
        data=filter_data_train(train_data)
        # x_train=cnn_reshaper(data[:,:-2]) #Set to different column based on different model
        x_train=data[:,:-2] #Set to different column based on different model
        scaler=StandardScaler().fit(x_train)
        # x_train=scaler.transform(x_train)
        Y_train=data[:,-2]
        print Y_train.shape
        # print np.where(Y_train==2)
        Y_train=Y_train.reshape(Y_train.shape[0],1)
        y_train=np_utils.to_categorical(Y_train,2)
        print y_train[0:5,:]
        gender_train=data[:,-1]
        del data
        return x_train,y_train,gender_train,scaler
fbank-cnn.py (project: leap-scd, author: smittal6)
def load_data_train(trainfile):
        print "Getting the training data"
        a=htk.open(trainfile)
        train_data=a.getall()
        print "Done with Loading the training data: ",train_data.shape
        data=filter_data_train(train_data)
        x_train=data[:,:-2]
        scaler=StandardScaler().fit(x_train)
        # x_train=scaler.transform(x_train)
        x_train=cnn_reshaper(data[:,:-2]) #Set to different column based on different model
        Y_train=data[:,-2]
        print Y_train.shape
        # print np.where(Y_train==2)
        Y_train=Y_train.reshape(Y_train.shape[0],1)
        y_train=np_utils.to_categorical(Y_train,2)
        gender_train=data[:,-1]
        del data
        return x_train,y_train,gender_train,scaler
test_mlp_classifier.py (project: muffnn, author: civisanalytics)
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.

    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
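In this test the whole dataset is scaled before cross_val_predict, which is fine for a smoke test. In application code the scaler would usually be nested inside the cross-validation via a Pipeline so its statistics come only from each training fold; a sketch of that variant (illustrative, not part of the original test):

    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    pipe = make_pipeline(StandardScaler(),
                         MLPClassifier(n_epochs=10,
                                       solver_kwargs={'learning_rate': 0.05},
                                       random_state=4567))
    y_oos = cross_val_predict(pipe, X, y, cv=cv, method='predict_proba')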
data_analysis.py (project: algo-trading-pipeline, author: NeuralKnot)
def create_model(self, training_articles):
        model = OneVsRestClassifier(svm.SVC(probability=True))

        features = []
        labels = []
        i = 0
        for article in training_articles:
            print("Generating features for article " + str(i) + "...")
            google_cloud_response = self.analyze_text_google_cloud(article["article"])
            relevant_entities = self.get_relevant_entities(google_cloud_response["entities"], article["market"]["entities"], article["market"]["wikipedia_urls"])

            # Only count this article if a relevant entity is present
            if relevant_entities:
                article_features = self.article_features(relevant_entities, article["market"], google_cloud_response, article["article"])
                features.append(article_features)
                labels.append(article["label"])
            else:
                print("Skipping article " + str(i) + "...")

            i = i + 1

        print("Performing feature scaling...")
        scaler = preprocessing.StandardScaler().fit(features)
        features_scaled = scaler.transform(features)

        print("Fitting model...")
        model.fit(features_scaled, labels)

        print("Saving model...")
        joblib.dump(scaler, "data_analysis/scaler.pkl")
        joblib.dump(model, "data_analysis/model.pkl")

        print("Done!")

    # For use in prod
spikesorting.py (project: NeoAnalysis, author: neoanalysis)
def __load_chn_data(self,selectChan,file_name):
        spk_startswith = "spike_{0}".format(selectChan)
        with hp.File(file_name,"r") as f:
            times = list()
            waveforms = list()
            units = list()
            for chn_unit in f["spikes"].keys():
                if chn_unit.startswith(spk_startswith):
                    tep_time = f["spikes"][chn_unit]["times"].value
                    waveform = f["spikes"][chn_unit]["waveforms"].value
                    unit = int(chn_unit.split("_")[-1])
                    unit = np.ones(tep_time.shape,dtype=np.int32)*unit
                    times.append(tep_time)
                    waveforms.append(waveform)
                    units.append(unit)
            if times:
                times = np.hstack(times)
                units = np.hstack(units)
                waveforms = np.vstack(waveforms)
                sort_index = np.argsort(times)
                units = units[sort_index]
                waveforms = waveforms[sort_index]
                times = times[sort_index]
                # calculate waveform_range 
                waveforms_max = np.apply_along_axis(max,1,waveforms)
                waveforms_min = np.apply_along_axis(min,1,waveforms)
                waveforms_range = np.vstack([waveforms_min,waveforms_max]).T
                # calculate PCA of waveforms
                scaler = StandardScaler()
                scaler.fit(waveforms)
                waveforms_scaled = scaler.transform(waveforms)
                pca = PCA(n_components=self.pca_used_num)
                pca.fit(waveforms_scaled)
                wavePCAs = pca.transform(waveforms_scaled)
                return times,units,waveforms_range,wavePCAs
            else:
                return None,None,None,None
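The StandardScaler followed by PCA used above could also be expressed as a single scikit-learn Pipeline. A minimal sketch (pca_used_num stands in for self.pca_used_num; this is not code from NeoAnalysis):

    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    waveform_projector = make_pipeline(StandardScaler(), PCA(n_components=pca_used_num))
    wavePCAs = waveform_projector.fit_transform(waveforms)  # scale, then project onto principal components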
spikesorting.py (project: NeoAnalysis, author: neoanalysis)
def __load_chn_data(self,selectChan,file_name):
        spk_startswith = "spike_{0}".format(selectChan)
        with hp.File(file_name,"r") as f:
            times = list()
            waveforms = list()
            units = list()
            for chn_unit in f["spikes"].keys():
                if chn_unit.startswith(spk_startswith):
                    tep_time = f["spikes"][chn_unit]["times"].value
                    waveform = f["spikes"][chn_unit]["waveforms"].value
                    unit = int(chn_unit.split("_")[-1])
                    unit = np.ones(tep_time.shape,dtype=np.int32)*unit
                    times.append(tep_time)
                    waveforms.append(waveform)
                    units.append(unit)
            if times:
                times = np.hstack(times)
                units = np.hstack(units)
                waveforms = np.vstack(waveforms)
                sort_index = np.argsort(times)
                units = units[sort_index]
                waveforms = waveforms[sort_index]
                times = times[sort_index]
                # calculate waveform_range 
                waveforms_max = np.apply_along_axis(max,1,waveforms)
                waveforms_min = np.apply_along_axis(min,1,waveforms)
                waveforms_range = np.vstack([waveforms_min,waveforms_max]).T
                # calculate PCA of waveforms
                scaler = StandardScaler()
                scaler.fit(waveforms)
                waveforms_scaled = scaler.transform(waveforms)
                pca = PCA(n_components=self.pca_used_num)
                pca.fit(waveforms_scaled)
                wavePCAs = pca.transform(waveforms_scaled)
                return times,units,waveforms_range,wavePCAs
            else:
                return None,None,None,None
solution.py (project: Kaggle, author: lawlite19)
def pre_processData(train_data,file_path):
    train_data.loc[(train_data.Age.isnull()), 'Age' ] = np.mean(train_data.Age)  # fill missing Age values with the mean age
    train_data.loc[(train_data.Cabin.notnull(),'Cabin')] = 'yes' # non-null Cabin entries become 'yes'
    train_data.loc[(train_data.Cabin.isnull(),'Cabin')] = 'no'   # missing Cabin entries become 'no'
    '''0/1 (dummy) encoding of the categorical features'''
    dummies_cabin = pd.get_dummies(train_data['Cabin'],prefix='Cabin')  # get_dummies expands a column into 0/1 indicator columns; prefix names them, e.g. Cabin_*
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'],prefix='Pclass')
    train_data = pd.concat([train_data,dummies_cabin,dummies_Embarked,dummies_Pclass,dummies_Sex], axis=1)  # concatenate the dummy columns onto the DataFrame along axis=1
    train_data.drop(['Pclass','Name','Sex','Embarked','Cabin','Ticket'],axis=1,inplace=True)   # drop the original columns that have been encoded or are unused
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a comma-separated header string
    np.savetxt(file_path+r'/pre_processData1.csv', train_data, delimiter=',',header=header_string)  # save the intermediate pre-processed data to CSV
    '''Standardize the numerical features (Age and Fare)'''
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data['Age'])
    train_data['Age'] = age_scaler.fit_transform(train_data['Age'])
    if np.sum(train_data.Fare.isnull()):  # if Fare has missing values, fill them with the mean fare
        train_data.loc[(train_data.Fare.isnull(),'Fare')]=np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data['Fare'])
    train_data['Fare'] = fare_scaler.transform(train_data['Fare'])
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a comma-separated header string
    np.savetxt(file_path+r'/pre_processData_scaled.csv', train_data, delimiter=',',header=header_string)  # save the scaled pre-processed data to CSV
    return train_data
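Note that this snippet relies on an older scikit-learn API in which StandardScaler accepted a one-dimensional column such as train_data['Age']; recent versions require two-dimensional input. A hedged sketch of the equivalent scaling under the newer API (an assumption about versions, not part of the original solution):

    scaler = StandardScaler()
    # pass a (n_samples, 1) frame and flatten the result back into the column
    train_data['Age'] = scaler.fit_transform(train_data[['Age']]).ravel()
    train_data['Fare'] = scaler.fit_transform(train_data[['Fare']]).ravel()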






## feature engineering
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def feature_scaling(self, df):
        df = df.copy()
        # Standardization (centering and scaling) of dataset that removes mean and scales to unit variance
        standard_scaler = StandardScaler()
        numerical_feature_names_of_non_modified_df = TwoSigmaFinModTools._numerical_feature_names
        if any(tuple(df.columns == 'y')):
            if not TwoSigmaFinModTools._is_one_hot_encoder:
                numerical_feature_names_of_non_modified_df = np.concatenate(
                    [TwoSigmaFinModTools._feature_names_num.values, numerical_feature_names_of_non_modified_df.values])
            # Include scaling of y
            y = df['y'].values
            relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
                (df[numerical_feature_names_of_non_modified_df].columns != 'y')
                & (df[numerical_feature_names_of_non_modified_df].columns != 'id')]
            mask = ~df[relevant_features].isnull()
            res = standard_scaler.fit_transform(X=df[relevant_features][mask].values, y=y)
            if (~mask).sum().sum() > 0:
                df = self.standardize_relevant_features(df, relevant_features, res)
            else:
                df.loc[:, tuple(relevant_features)] = res
        else:
            if not TwoSigmaFinModTools._is_one_hot_encoder:
                numerical_feature_names_of_non_modified_df = np.concatenate(
                    [TwoSigmaFinModTools._feature_names_num.values, numerical_feature_names_of_non_modified_df.values])
            relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
                (df[numerical_feature_names_of_non_modified_df].columns != 'id')]
            mask = ~df[relevant_features].isnull()
            res = standard_scaler.fit_transform(df[relevant_features][mask].values)
            if mask.sum().sum() > 0:
                df = self.standardize_relevant_features(df, relevant_features, res)
            else:
                df.loc[:, tuple(relevant_features)] = res
        return df
preprocess.py (project: pokedex-as-it-should-be, author: leotok)
def make_standard(X_train, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    pickle.dump(scaler, open("scaler_model.sav", 'wb'))
    return X_train, X_test
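The fitted scaler is pickled to scaler_model.sav so that new data can later be scaled with the same statistics. A hedged usage sketch (the function and file name come from the snippet above; the rest is assumed):

    import pickle

    X_train_std, X_test_std = make_standard(X_train, X_test)

    # later, e.g. at prediction time:
    with open("scaler_model.sav", "rb") as f:
        scaler = pickle.load(f)
    X_new_std = scaler.transform(X_new)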
setup_ifruitfly.py (project: iFruitFly, author: AdnanMuhib)
def v_demo(dir, prefix, pre_prefix, file_name, _dir):
    _val = []
    _coords = []
    file_dir_fix = dir + "\\output_INFLO.csv"
    #f = "C:\Users\Abdullah Akmal\Documents\ifruitfly_temp\output_files\output_INFLO.csv"
    with open(file_dir_fix, 'rU') as inp:
        rd = csv.reader(inp)
        for row in rd:
            _val.append([row[1], row[2], row[0]])

    #print(_center)
    _val = np.asarray(_val)
    _val_original = _val
    _val_original = map(myFloat, _val_original)
    _val_original = map(myInt, _val_original)
    #_val_original = map(myTemp, _val_original)
    _val_original = np.asarray(_val_original)
    _val = preprocessing.StandardScaler().fit_transform(_val)
    #_center = preprocessing.MinMaxScaler()
    #_center.fit_transform(_val)
    #_arr = StandardScaler().inverse_transform(_center)
    #print(_arr)
    #print(_center)
    new_file = prefix + file_name + ".png"
    dbFun(_val, _val_original, new_file)
    #_len = len(_center)
    return
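v_demo is Python 2 code: map() returns a list there, so np.asarray(map(...)) yields a numeric array. Under Python 3 the same conversion step would need an explicit comprehension; a sketch, assuming myFloat and myInt are the project's row-wise converters as their use here suggests:

    # Python 3 equivalent of the two map() calls above
    _val_original = np.asarray([myInt(myFloat(row)) for row in _val_original])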
iFruitFly_v2.0.py (project: iFruitFly, author: AdnanMuhib)
def v_demo(dir, prefix, pre_prefix, file_name, _dir):
    _val = []
    _coords = []
    file_dir_fix = dir + "\\output_INFLO.csv"
    #f = "C:\Users\Abdullah Akmal\Documents\ifruitfly_temp\output_files\output_INFLO.csv"
    with open(file_dir_fix, 'rU') as inp:
        rd = csv.reader(inp)
        for row in rd:
            _val.append([row[1], row[2], row[0]])

    #print(_center)
    _val = np.asarray(_val)
    _val_original = _val
    _val_original = map(myFloat, _val_original)
    _val_original = map(myInt, _val_original)
    #_val_original = map(myTemp, _val_original)
    _val_original = np.asarray(_val_original)
    _val = preprocessing.StandardScaler().fit_transform(_val)
    #_center = preprocessing.MinMaxScaler()
    #_center.fit_transform(_val)
    #_arr = StandardScaler().inverse_transform(_center)
    #print(_arr)
    #print(_center)
    new_file = prefix + file_name + ".png"
    dbFun(_val, _val_original, new_file)
    #_len = len(_center)
    return
iFruitFly_Testing_weka.py (project: iFruitFly, author: AdnanMuhib)
def v_demo(dir, prefix, pre_prefix, file_name, _dir):
    _val = []
    _coords = []
    file_dir_fix = dir + "\\output_INFLO.csv"
    #f = "C:\Users\Abdullah
    #Akmal\Documents\ifruitfly_temp\output_files\output_INFLO.csv"
    with open(file_dir_fix, 'rU') as inp:
        rd = csv.reader(inp)
        for row in rd:
            _val.append([row[1], row[2], row[0]])

    #print(_center)
    _val = np.asarray(_val)
    _val_original = _val
    _val_original = map(myFloat, _val_original)
    _val_original = map(myInt, _val_original)
    #_val_original = map(myTemp, _val_original)
    _val_original = np.asarray(_val_original)
    _val = preprocessing.StandardScaler().fit_transform(_val)
    #_center = preprocessing.MinMaxScaler()
    #_center.fit_transform(_val)
    #_arr = StandardScaler().inverse_transform(_center)
    #print(_arr)
    #print(_center)
    new_file = prefix + file_name + ".png"
    dbFun(_val, _val_original, new_file)
    #_len = len(_center)
    return

##############################################################################################
# Getting the clusters and printing in the most trivial way as asked by Dr Sheikh Faisal
supervised_reduction_multiple.py (project: sef, author: passalis)
def supervised_reduction(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)

    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)


    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    if dataset == 'yale':
        regularizer_weight = 0.0001
    else:
        regularizer_weight = 1

    n_classes = len(np.unique(train_labels))

    if method == 'lda':
        proj = LinearDiscriminantAnalysis(n_components=n_classes - 1)
        proj.fit(train_data, train_labels)
    elif method == 's-lda':
        proj = LinearSEF(train_data.shape[1], output_dimensionality=(n_classes - 1))
        proj.cuda()
        loss = proj.fit(data=train_data, target_labels=train_labels, epochs=100,
                        target='supervised', batch_size=256, regularizer_weight=regularizer_weight, learning_rate=0.001,
                        verbose=False)

    elif method == 's-lda-2x':
        # SEF output dimensions are not limited
        proj = LinearSEF(train_data.shape[1], output_dimensionality=2 * (n_classes - 1))
        proj.cuda()
        loss = proj.fit(data=train_data, target_labels=train_labels, epochs=100,
                        target='supervised', batch_size=256, regularizer_weight=regularizer_weight, learning_rate=0.001,
                        verbose=False)

    acc = evaluate_svm(proj.transform(train_data), train_labels,
                       proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
linear_outofsample_mutiple.py (project: sef, author: passalis)
def outofsample_extensions(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)

    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)

    # Learn a new space using Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
    train_data_isomap = np.float32(isomap.fit_transform(train_data))

    if method == 'linear-regression':
        from sklearn.preprocessing import StandardScaler
        std = StandardScaler()
        train_data = std.fit_transform(train_data)
        test_data = std.transform(test_data)

        # Use linear regression to provide baseline out-of-sample extensions
        proj = LinearRegression()
        proj.fit(np.float64(train_data), np.float64(train_data_isomap))
        acc = evaluate_svm(proj.predict(train_data), train_labels,
                           proj.predict(test_data), test_labels)
    elif method == 'c-ISOMAP-10d' or method == 'c-ISOMAP-20d':
        # Use the SEF to provide out-of-sample extensions
        if method == 'c-ISOMAP-10d':
            proj = LinearSEF(train_data.shape[1], output_dimensionality=10)
            proj.cuda()
        else:
            proj = LinearSEF(train_data.shape[1], output_dimensionality=20)
            proj.cuda()
        loss = proj.fit(data=train_data, target_data=train_data_isomap, target='copy',
                        epochs=50, batch_size=1024, verbose=False, learning_rate=0.001, regularizer_weight=1)
        acc = evaluate_svm(proj.transform(train_data), train_labels,
                           proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
sef_base.py (project: sef, author: passalis)
def __init__(self, input_dimensionality, output_dimensionality, scaler='default'):
        """
        SEF_Base constructor
        :param input_dimensionality: dimensionality of the input space
        :param output_dimensionality: dimensionality of the target space
        :param scaler: the scaler used to scale the data
        """

        self.input_dimensionality = input_dimensionality
        self.output_dimensionality = output_dimensionality

        if scaler == 'default':
            self.scaler = StandardScaler()
        elif scaler is not None:
            self.scaler = scaler()
        else:
            self.scaler = None

        # Scaling factor for computing the similarity matrix of the projected data
        self.sigma_projection = np.float32(0.1)
        self.use_gpu = False

        # The parameters of the model that we want to learn
        self.trainable_params = []

        # Other non-trainable parameters
        self.non_trainable_params = []
standardscaler.py (project: AutoFolio, author: mlindauer)
def add_params(cs: ConfigurationSpace):
        '''
            adds parameters to ConfigurationSpace 
        '''
        switch = CategoricalHyperparameter(
            "StandardScaler", choices=[True, False], default=True)
        cs.add_hyperparameter(switch)
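add_params only registers a boolean StandardScaler hyperparameter on the ConfigurationSpace; the scaling itself would be applied elsewhere once a sampled configuration enables it. A minimal sketch of how such a flag is typically consumed (hypothetical helper, not AutoFolio's actual code):

    def maybe_scale(config, X):
        # apply standardization only when the sampled configuration asks for it
        if config["StandardScaler"]:
            scaler = StandardScaler()
            return scaler.fit_transform(X), scaler
        return X, None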

