def get_standardized_wine_data():
    df = pd.read_csv(os.path.join('datasets', 'wine.data'), header=None)
    df.columns = [
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ]
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=0,
    )
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_test_std = sc.transform(X_test)
    return X_train_std, X_test_std, y_train, y_test
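A quick usage sketch for the function above (the LogisticRegression classifier and accuracy print are illustrative additions, not part of the original snippet):

from sklearn.linear_model import LogisticRegression

X_train_std, X_test_std, y_train, y_test = get_standardized_wine_data()
clf = LogisticRegression(max_iter=1000, random_state=0)
clf.fit(X_train_std, y_train)
print('Test accuracy: {:.3f}'.format(clf.score(X_test_std, y_test)))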
Python StandardScaler() example source code
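Before the project-specific examples below, a minimal sketch of the usual pattern: fit the scaler on the training data only, then reuse the fitted statistics for any later data (the toy arrays are illustrative):

from sklearn.preprocessing import StandardScaler
import numpy as np

X_train = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])
X_new = np.array([[1.5, 250.0]])

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)   # learn mean/variance from the training data
X_new_std = sc.transform(X_new)           # apply the same statistics to new data
X_back = sc.inverse_transform(X_new_std)  # recover the original units if needed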
def load_data_train(trainfile):
    print("Getting the training data")
    a = htk.open(trainfile)
    train_data = a.getall()
    print("Done with loading the training data:", train_data.shape)
    data = filter_data_train(train_data)
    # x_train = cnn_reshaper(data[:, :-2])  # set to a different column slice for other models
    x_train = data[:, :-2]  # set to a different column slice for other models
    scaler = StandardScaler().fit(x_train)
    # x_train = scaler.transform(x_train)
    Y_train = data[:, -2]
    print(Y_train.shape)
    # print(np.where(Y_train == 2))
    Y_train = Y_train.reshape(Y_train.shape[0], 1)
    y_train = np_utils.to_categorical(Y_train, 2)
    print(y_train[0:5, :])
    gender_train = data[:, -1]
    del data
    # x_train holds the complete feature set: the gammatone features plus the pitch-variance values.
    return x_train, y_train, gender_train, scaler
def load_data_train(trainfile):
    print("Getting the training data")
    a = htk.open(trainfile)
    train_data = a.getall()
    print("Done with loading the training data:", train_data.shape)
    data = filter_data_train(train_data)
    x_train = data[:, :-2]
    scaler = StandardScaler().fit(x_train)
    # x_train = scaler.transform(x_train)
    x_train = cnn_reshaper(data[:, :-2])  # set to a different column slice for other models
    Y_train = data[:, -2]
    print(Y_train.shape)
    # print(np.where(Y_train == 2))
    Y_train = Y_train.reshape(Y_train.shape[0], 1)
    y_train = np_utils.to_categorical(Y_train, 2)
    gender_train = data[:, -1]
    del data
    return x_train, y_train, gender_train, scaler
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)
    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)
    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)
    assert np.all(auc >= 0.96)
def create_model(self, training_articles):
    model = OneVsRestClassifier(svm.SVC(probability=True))
    features = []
    labels = []
    i = 0
    for article in training_articles:
        print("Generating features for article " + str(i) + "...")
        google_cloud_response = self.analyze_text_google_cloud(article["article"])
        relevant_entities = self.get_relevant_entities(google_cloud_response["entities"], article["market"]["entities"], article["market"]["wikipedia_urls"])
        # Only count this article if a relevant entity is present
        if relevant_entities:
            article_features = self.article_features(relevant_entities, article["market"], google_cloud_response, article["article"])
            features.append(article_features)
            labels.append(article["label"])
        else:
            print("Skipping article " + str(i) + "...")
        i = i + 1
    print("Performing feature scaling...")
    scaler = preprocessing.StandardScaler().fit(features)
    features_scaled = scaler.transform(features)
    print("Fitting model...")
    model.fit(features_scaled, labels)
    print("Saving model...")
    joblib.dump(scaler, "data_analysis/scaler.pkl")
    joblib.dump(model, "data_analysis/model.pkl")
    print("Done!")
# For use in prod
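The production-side counterpart is not included in this excerpt; a hypothetical sketch of how the saved scaler and model might be reloaded for prediction (the method name and argument layout are assumptions):

def predict(self, article, market):
    # Hypothetical inference path: reload the persisted scaler and model,
    # build features the same way as in training, scale, then predict.
    scaler = joblib.load("data_analysis/scaler.pkl")
    model = joblib.load("data_analysis/model.pkl")
    google_cloud_response = self.analyze_text_google_cloud(article)
    relevant_entities = self.get_relevant_entities(
        google_cloud_response["entities"], market["entities"], market["wikipedia_urls"])
    features = [self.article_features(relevant_entities, market, google_cloud_response, article)]
    return model.predict_proba(scaler.transform(features))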
def __load_chn_data(self, selectChan, file_name):
    spk_startswith = "spike_{0}".format(selectChan)
    with hp.File(file_name, "r") as f:
        times = list()
        waveforms = list()
        units = list()
        for chn_unit in f["spikes"].keys():
            if chn_unit.startswith(spk_startswith):
                tep_time = f["spikes"][chn_unit]["times"].value
                waveform = f["spikes"][chn_unit]["waveforms"].value
                unit = int(chn_unit.split("_")[-1])
                unit = np.ones(tep_time.shape, dtype=np.int32) * unit
                times.append(tep_time)
                waveforms.append(waveform)
                units.append(unit)
        if times:
            times = np.hstack(times)
            units = np.hstack(units)
            waveforms = np.vstack(waveforms)
            sort_index = np.argsort(times)
            units = units[sort_index]
            waveforms = waveforms[sort_index]
            times = times[sort_index]
            # calculate waveform_range
            waveforms_max = np.apply_along_axis(max, 1, waveforms)
            waveforms_min = np.apply_along_axis(min, 1, waveforms)
            waveforms_range = np.vstack([waveforms_min, waveforms_max]).T
            # calculate PCA of waveforms
            scaler = StandardScaler()
            scaler.fit(waveforms)
            waveforms_scaled = scaler.transform(waveforms)
            pca = PCA(n_components=self.pca_used_num)
            pca.fit(waveforms_scaled)
            wavePCAs = pca.transform(waveforms_scaled)
            return times, units, waveforms_range, wavePCAs
        else:
            return None, None, None, None
def pre_processData(train_data, file_path):
    # Fill missing Age values with the mean age
    train_data.loc[(train_data.Age.isnull()), 'Age'] = np.mean(train_data.Age)
    # Recode Cabin as 'yes' (known) / 'no' (missing)
    train_data.loc[(train_data.Cabin.notnull(), 'Cabin')] = 'yes'
    train_data.loc[(train_data.Cabin.isnull(), 'Cabin')] = 'no'
    # One-hot (0/1) encode the categorical features; prefix marks the source column
    dummies_cabin = pd.get_dummies(train_data['Cabin'], prefix='Cabin')
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'], prefix='Pclass')
    train_data = pd.concat([train_data, dummies_cabin, dummies_Embarked, dummies_Pclass, dummies_Sex], axis=1)  # join the dummy columns along axis=1
    train_data.drop(['Pclass', 'Name', 'Sex', 'Embarked', 'Cabin', 'Ticket'], axis=1, inplace=True)  # drop the original categorical columns
    header_string = ','.join(train_data.columns.tolist())  # column names as a CSV header string
    np.savetxt(file_path + r'/pre_processData1.csv', train_data, delimiter=',', header=header_string)  # save the encoded data
    # Standardize the wide-ranging numeric features (Age and Fare)
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data[['Age']])
    train_data['Age'] = age_scaler.transform(train_data[['Age']]).ravel()
    if np.sum(train_data.Fare.isnull()):  # fill missing Fare values with the mean fare first
        train_data.loc[(train_data.Fare.isnull(), 'Fare')] = np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data[['Fare']])
    train_data['Fare'] = fare_scaler.transform(train_data[['Fare']]).ravel()
    header_string = ','.join(train_data.columns.tolist())
    np.savetxt(file_path + r'/pre_processData_scaled.csv', train_data, delimiter=',', header=header_string)  # save the scaled data
    return train_data
## feature engineering
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def feature_scaling(self, df):
    df = df.copy()
    # Standardization (centering and scaling) of dataset that removes mean and scales to unit variance
    standard_scaler = StandardScaler()
    numerical_feature_names_of_non_modified_df = TwoSigmaFinModTools._numerical_feature_names
    if any(tuple(df.columns == 'y')):
        if not TwoSigmaFinModTools._is_one_hot_encoder:
            numerical_feature_names_of_non_modified_df = np.concatenate(
                [TwoSigmaFinModTools._feature_names_num.values, numerical_feature_names_of_non_modified_df.values])
        # Include scaling of y
        y = df['y'].values
        relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
            (df[numerical_feature_names_of_non_modified_df].columns != 'y')
            & (df[numerical_feature_names_of_non_modified_df].columns != 'id')]
        mask = ~df[relevant_features].isnull()
        res = standard_scaler.fit_transform(X=df[relevant_features][mask].values, y=y)
        if (~mask).sum().sum() > 0:
            df = self.standardize_relevant_features(df, relevant_features, res)
        else:
            df.loc[:, tuple(relevant_features)] = res
    else:
        if not TwoSigmaFinModTools._is_one_hot_encoder:
            numerical_feature_names_of_non_modified_df = np.concatenate(
                [TwoSigmaFinModTools._feature_names_num.values, numerical_feature_names_of_non_modified_df.values])
        relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
            (df[numerical_feature_names_of_non_modified_df].columns != 'id')]
        mask = ~df[relevant_features].isnull()
        res = standard_scaler.fit_transform(df[relevant_features][mask].values)
        if (~mask).sum().sum() > 0:  # handle missing values the same way as the branch above
            df = self.standardize_relevant_features(df, relevant_features, res)
        else:
            df.loc[:, tuple(relevant_features)] = res
    return df
def make_standard(X_train, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    pickle.dump(scaler, open("scaler_model.sav", 'wb'))
    return X_train, X_test
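A minimal sketch of reloading the pickled scaler later, assuming the scaler_model.sav file written above (the new sample X_live is illustrative):

import pickle
import numpy as np

with open("scaler_model.sav", "rb") as f:
    scaler = pickle.load(f)

X_live = np.array([[0.3, 1.7, 5.2]])  # illustrative new sample with the same feature layout as X_train
X_live_std = scaler.transform(X_live)  # reuse the training-time mean/variance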
def v_demo(dir, prefix, pre_prefix, file_name, _dir):
    _val = []
    _coords = []
    file_dir_fix = dir + "\\output_INFLO.csv"
    # f = "C:\Users\Abdullah Akmal\Documents\ifruitfly_temp\output_files\output_INFLO.csv"
    with open(file_dir_fix, 'r') as inp:
        rd = csv.reader(inp)
        for row in rd:
            _val.append([row[1], row[2], row[0]])
    # print(_center)
    _val = np.asarray(_val)
    _val_original = _val
    _val_original = list(map(myFloat, _val_original))
    _val_original = list(map(myInt, _val_original))
    # _val_original = list(map(myTemp, _val_original))
    _val_original = np.asarray(_val_original)
    _val = preprocessing.StandardScaler().fit_transform(_val)
    # _center = preprocessing.MinMaxScaler()
    # _center.fit_transform(_val)
    # _arr = StandardScaler().inverse_transform(_center)
    # print(_arr)
    # print(_center)
    new_file = prefix + file_name + ".png"
    dbFun(_val, _val_original, new_file)
    # _len = len(_center)
    return
##############################################################################################
# Getting the clusters and printing in the most trivial way as asked by Dr Sheikh Faisal
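The dbFun helper called above is not part of this excerpt; a hypothetical stand-in, assuming DBSCAN clustering on the standardized values, could look like:

from sklearn.cluster import DBSCAN

def dbFun(_val, _val_original, new_file):
    # Hypothetical stand-in: cluster the standardized rows and print a trivial summary.
    # new_file (a .png path) would be used for plotting in the real project.
    labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(_val)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print("Clusters found:", n_clusters)
    for cluster_id in sorted(set(labels)):
        print("cluster", cluster_id, ":", (labels == cluster_id).sum(), "points")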
def supervised_reduction(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)
    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    if dataset == 'yale':
        regularizer_weight = 0.0001
    else:
        regularizer_weight = 1
    n_classes = len(np.unique(train_labels))
    if method == 'lda':
        proj = LinearDiscriminantAnalysis(n_components=n_classes - 1)
        proj.fit(train_data, train_labels)
    elif method == 's-lda':
        proj = LinearSEF(train_data.shape[1], output_dimensionality=(n_classes - 1))
        proj.cuda()
        loss = proj.fit(data=train_data, target_labels=train_labels, epochs=100,
                        target='supervised', batch_size=256, regularizer_weight=regularizer_weight,
                        learning_rate=0.001, verbose=False)
    elif method == 's-lda-2x':
        # SEF output dimensions are not limited
        proj = LinearSEF(train_data.shape[1], output_dimensionality=2 * (n_classes - 1))
        proj.cuda()
        loss = proj.fit(data=train_data, target_labels=train_labels, epochs=100,
                        target='supervised', batch_size=256, regularizer_weight=regularizer_weight,
                        learning_rate=0.001, verbose=False)
    acc = evaluate_svm(proj.transform(train_data), train_labels,
                       proj.transform(test_data), test_labels)
    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
def outofsample_extensions(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)
    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)
    # Learn a new space using Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
    train_data_isomap = np.float32(isomap.fit_transform(train_data))
    if method == 'linear-regression':
        from sklearn.preprocessing import StandardScaler
        std = StandardScaler()
        train_data = std.fit_transform(train_data)
        test_data = std.transform(test_data)
        # Use linear regression to provide baseline out-of-sample extensions
        proj = LinearRegression()
        proj.fit(np.float64(train_data), np.float64(train_data_isomap))
        acc = evaluate_svm(proj.predict(train_data), train_labels,
                           proj.predict(test_data), test_labels)
    elif method == 'c-ISOMAP-10d' or method == 'c-ISOMAP-20d':
        # Use the SEF to provide out-of-sample extensions
        if method == 'c-ISOMAP-10d':
            proj = LinearSEF(train_data.shape[1], output_dimensionality=10)
            proj.cuda()
        else:
            proj = LinearSEF(train_data.shape[1], output_dimensionality=20)
            proj.cuda()
        loss = proj.fit(data=train_data, target_data=train_data_isomap, target='copy',
                        epochs=50, batch_size=1024, verbose=False, learning_rate=0.001,
                        regularizer_weight=1)
        acc = evaluate_svm(proj.transform(train_data), train_labels,
                           proj.transform(test_data), test_labels)
    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
def __init__(self, input_dimensionality, output_dimensionality, scaler='default'):
    """
    SEF_Base constructor
    :param input_dimensionality: dimensionality of the input space
    :param output_dimensionality: dimensionality of the target space
    :param scaler: the scaler used to scale the data
    """
    self.input_dimensionality = input_dimensionality
    self.output_dimensionality = output_dimensionality
    if scaler == 'default':
        self.scaler = StandardScaler()
    elif scaler is not None:
        self.scaler = scaler()
    else:
        self.scaler = None
    # Scaling factor for computing the similarity matrix of the projected data
    self.sigma_projection = np.float32(0.1)
    self.use_gpu = False
    # The parameters of the model that we want to learn
    self.trainable_params = []
    # Other non-trainable parameters
    self.non_trainable_params = []
def add_params(cs: ConfigurationSpace):
    '''
    adds parameters to ConfigurationSpace
    '''
    switch = CategoricalHyperparameter(
        "StandardScaler", choices=[True, False], default=True)
    cs.add_hyperparameter(switch)
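A minimal sketch of how a sampled configuration could toggle the scaling step downstream (the pipeline assembly is illustrative, not from the original project):

from ConfigSpace import ConfigurationSpace
from sklearn.preprocessing import StandardScaler

cs = ConfigurationSpace()
add_params(cs)
config = cs.sample_configuration()
if config["StandardScaler"]:
    # only insert the scaling step when the sampled configuration enables it
    steps = [("scaler", StandardScaler())]
else:
    steps = []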