def load_data(data_dir, num_files=30):
files_list = os.listdir(data_dir)
data = None
ac_data = None
for fname in files_list[:num_files]:
print(fname)
f = os.path.join(data_dir, fname)
with netcdf.netcdf_file(f, 'r') as fid:
m = fid.variables['outputMeans'][:].copy()
s = fid.variables['outputStdevs'][:].copy()
feats = fid.variables['targetPatterns'][:].copy()
ac_feats = fid.variables['inputs'][:].copy()
scaler = preprocessing.StandardScaler()
scaler.mean_ = m
scaler.scale_ = s
feats = scaler.inverse_transform(feats)
assert feats.shape[0] == ac_feats.shape[0]
# feats = np.concatenate((feats,ac_feats),axis=1)
if data is None and ac_data is None:
data = feats
ac_data = ac_feats
else:
data = np.vstack((data, feats))
ac_data = np.vstack((ac_data, ac_feats))
return data, ac_data
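The de-normalization trick above, populating a bare StandardScaler's mean_ and scale_ by hand and then calling inverse_transform, can be shown in isolation. A minimal sketch with made-up statistics; the values and shapes below are assumptions, not data from the NetCDF files:
import numpy as np
from sklearn import preprocessing

# Assumed per-dimension statistics standing in for the 'outputMeans' /
# 'outputStdevs' variables read from the NetCDF file.
m = np.array([1.0, -2.0, 0.5])
s = np.array([0.5, 2.0, 1.5])

scaler = preprocessing.StandardScaler()
scaler.mean_ = m
scaler.scale_ = s

feats_norm = np.random.randn(4, 3)            # normalized features
feats = scaler.inverse_transform(feats_norm)  # x = x_norm * scale_ + mean_
print(feats.shape)                            # (4, 3)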
def computeNeighboursScores(self):
all_instances = self.iteration.datasets.instances
# Connectivity matrix
pipeline = Pipeline([
('scaler', StandardScaler()),
('model', NearestNeighbors(n_neighbors=self.num_neighbours, n_jobs=-1))])
pipeline.fit(all_instances.getFeatures())
# Labels
labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
# Compute neighbour scores
scores = []
all_neighbours = pipeline.named_steps['model'].kneighbors(return_distance = False)
for i, label in enumerate(labels):
if label != 0:
continue
else:
neighbours = all_neighbours[i]
score = sum(labels[neighbours] + 1) / (2.0 * self.num_neighbours)
scores.append(score)
return np.array(scores)
def scale_numeric_data(pandas_data):
# Scaling is important because features on very different scales can
# dominate the model.
# EX: if one variable averages around 1000 and another around 0.5, the
# larger one tends to dominate the fit unless both are standardized.
for col in pandas_data.columns:
if pandas_data[col].dtype == np.float64 or pandas_data[col].dtype == np.int64:
pandas_data[col] = preprocessing.scale(pandas_data[col])
return pandas_data
# Creates a standard scaler based on the training data and applies it to both train
# and test data.
# Input:
# - Two Pandas DataFrames, same number of columns
# Output:
# - Two Pandas DataFrames, both of which have been scaled based on StandardScaler
# trained on training data.
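The comment above describes a train/test scaling helper whose body is not shown here. A minimal sketch of what it could look like, assuming pandas DataFrames with identical columns; the function and variable names are illustrative, not taken from the original project:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_train_test(train_df, test_df):
    """Fit a StandardScaler on the training data and apply it to both sets."""
    scaler = StandardScaler().fit(train_df.values)
    train_scaled = pd.DataFrame(scaler.transform(train_df.values),
                                columns=train_df.columns, index=train_df.index)
    test_scaled = pd.DataFrame(scaler.transform(test_df.values),
                               columns=test_df.columns, index=test_df.index)
    return train_scaled, test_scaled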
def build_ensemble(**kwargs):
"""Generate ensemble."""
ens = SuperLearner(**kwargs)
prep = {'Standard Scaling': [StandardScaler()],
'Min Max Scaling': [MinMaxScaler()],
'No Preprocessing': []}
est = {'Standard Scaling':
[ElasticNet(), Lasso(), KNeighborsRegressor()],
'Min Max Scaling':
[SVR()],
'No Preprocessing':
[RandomForestRegressor(random_state=SEED),
GradientBoostingRegressor()]}
ens.add(est, prep)
ens.add(GradientBoostingRegressor(), meta=True)
return ens
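A possible way to exercise build_ensemble on synthetic data; a hedged sketch that assumes mlens is installed, SEED is defined as in the snippet, and that the keyword arguments (folds, random_state) are accepted by SuperLearner:
import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=10, noise=0.1, random_state=0)
ens = build_ensemble(folds=3, random_state=SEED)  # kwargs are forwarded to SuperLearner
ens.fit(X, y)
print(ens.predict(X[:5]))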
def tf2npz(tf_path, export_folder=FAST):
vid_ids = []
labels = []
mean_rgb = []
mean_audio = []
tf_basename = os.path.basename(tf_path)
npz_basename = tf_basename[:-len('.tfrecord')] + '.npz'
isTrain = '/test' not in tf_path
for example in tf.python_io.tf_record_iterator(tf_path):
tf_example = tf.train.Example.FromString(example).features
vid_ids.append(tf_example.feature['video_id'].bytes_list.value[0].decode(encoding='UTF-8'))
if isTrain:
labels.append(np.array(tf_example.feature['labels'].int64_list.value))
mean_rgb.append(np.array(tf_example.feature['mean_rgb'].float_list.value).astype(np.float32))
mean_audio.append(np.array(tf_example.feature['mean_audio'].float_list.value).astype(np.float32))
save_path = export_folder + '/' + npz_basename
np.savez(save_path,
rgb=StandardScaler().fit_transform(np.array(mean_rgb)),
audio=StandardScaler().fit_transform(np.array(mean_audio)),
ids=np.array(vid_ids),
labels=labels
)
def preprocess_data(train_data_matrix, valid_data_matrix, test_data_matrix):
"""
Function to preprocess the data with the StandardScaler from scikit-learn.
It takes in the training, validation, and testing matrices and returns the
standardized versions of them.
Input: train_data_matrix The data matrix with the training set data
valid_data_matrix The data matrix with the validation set data
test_data_matrix The data matrix with the testing set data
Output: transform_train_data_matrix The data matrix with the standardized training set data
transform_valid_data_matrix The data matrix with the standardized validation set data
transform_test_data_matrix The data matrix with the standardized testing set data
Usage: preprocess_data(train_data_matrix, valid_data_matrix, test_data_matrix)
"""
reg_scaler = prep.StandardScaler().fit(train_data_matrix)
transform_train_data_matrix = reg_scaler.transform(train_data_matrix)
transform_valid_data_matrix = reg_scaler.transform(valid_data_matrix)
transform_test_data_matrix = reg_scaler.transform(test_data_matrix)
return transform_train_data_matrix, transform_valid_data_matrix, transform_test_data_matrix
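A minimal usage sketch for preprocess_data; the random matrices below are placeholders for real training, validation, and test sets with the same number of columns:
import numpy as np

rng = np.random.RandomState(0)
train = rng.normal(10.0, 3.0, size=(100, 5))
valid = rng.normal(10.0, 3.0, size=(20, 5))
test = rng.normal(10.0, 3.0, size=(20, 5))
train_s, valid_s, test_s = preprocess_data(train, valid, test)
print(train_s.mean(axis=0).round(2))  # ~0: the scaler is fit on the training split only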
def load_norm_stats(stats_file, dim, method="MVN"):
#### load norm stats ####
io_funcs = BinaryIOCollection()
norm_matrix, frame_number = io_funcs.load_binary_file_frame(stats_file, dim)
assert frame_number==2
if method=="MVN":
scaler = preprocessing.StandardScaler()
scaler.mean_ = norm_matrix[0, :]
scaler.scale_ = norm_matrix[1, :]
elif method=="MINMAX":
scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99))
scaler.min_ = norm_matrix[0, :]
scaler.scale_ = norm_matrix[1, :]
return scaler
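A hedged usage sketch for load_norm_stats: the stats file path, feature dimension, and feature matrix are placeholders, and the BinaryIOCollection dependency is assumed to be importable. The pattern is to build the scaler once from the stored means and scales, then call transform on raw features:
import numpy as np

dim = 60  # assumed feature dimension
scaler = load_norm_stats('norm_info.dat', dim, method="MVN")  # placeholder path
raw_feats = np.random.randn(100, dim)     # stand-in for real acoustic features
norm_feats = scaler.transform(raw_feats)  # (x - mean_) / scale_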
def test_group_lasso():
"""Group Lasso test."""
n_samples, n_features = 100, 90
# assign group ids
groups = np.zeros(n_features)
groups[0:30] = 1
groups[30:60] = 2
groups[60:] = 3
# sample random coefficients
beta0 = np.random.normal(0.0, 1.0, 1)
beta = np.random.normal(0.0, 1.0, n_features)
beta[groups == 2] = 0.
# create an instance of the GLM class
glm_group = GLM(distr='softplus', alpha=1.)
# simulate training data
Xr = np.random.normal(0.0, 1.0, [n_samples, n_features])
yr = simulate_glm(glm_group.distr, beta0, beta, Xr)
# scale and fit
scaler = StandardScaler().fit(Xr)
glm_group.fit(scaler.transform(Xr), yr)
def load_data(data_dir, num_files=30):
files_list = os.listdir(data_dir)
data = None
for fname in files_list[:num_files]:
print(fname)
f = os.path.join(data_dir, fname)
with netcdf.netcdf_file(f, 'r') as fid:
m = fid.variables['outputMeans'][:].copy()
s = fid.variables['outputStdevs'][:].copy()
feats = fid.variables['targetPatterns'][:].copy()
scaler = preprocessing.StandardScaler()
scaler.mean_ = m
scaler.scale_ = s
feats = scaler.inverse_transform(feats)
if data is None:
data = feats
else:
data = np.vstack((data, feats))
return data
def load_data(data_dir, num_files=30):
files_list = os.listdir(data_dir)
dataset = []
ac_dataset = []
for fname in files_list[:num_files]:
#print(fname)
f = os.path.join(data_dir, fname)
with netcdf.netcdf_file(f, 'r') as fid:
m = fid.variables['outputMeans'][:].copy()
s = fid.variables['outputStdevs'][:].copy()
feats = fid.variables['targetPatterns'][:].copy()
ac_feats = fid.variables['inputs'][:].copy()
scaler = preprocessing.StandardScaler()
scaler.mean_ = m
scaler.scale_ = s
feats = scaler.inverse_transform(feats)
assert feats.shape[0] == ac_feats.shape[0]
dataset.extend(feats)
ac_dataset.extend(ac_feats)
dataset = np.asarray(dataset)
ac_dataset = np.asarray(ac_dataset)
#print(dataset.shape, ac_dataset.shape)
return dataset, ac_dataset
def init_state(indata, test=False):
close = indata['close'].values
diff = np.diff(close)
diff = np.insert(diff, 0, 0)
sma15 = SMA(indata, timeperiod=15)
sma60 = SMA(indata, timeperiod=60)
rsi = RSI(indata, timeperiod=14)
atr = ATR(indata, timeperiod=14)
#--- Preprocess data
xdata = np.column_stack((close, diff, sma15, close-sma15, sma15-sma60, rsi, atr))
xdata = np.nan_to_num(xdata)
if not test:
    scaler = preprocessing.StandardScaler()
    xdata = np.expand_dims(scaler.fit_transform(xdata), axis=1)
    joblib.dump(scaler, 'data/scaler.pkl')
else:
    scaler = joblib.load('data/scaler.pkl')
    # apply the scaler fitted at training time; do not re-fit on test data
    xdata = np.expand_dims(scaler.transform(xdata), axis=1)
state = xdata[0:1, 0:1, :]
return state, xdata, close
#Take Action
def init_state(data):
close = data
diff = np.diff(data)
diff = np.insert(diff, 0, 0)
#--- Preprocess data
xdata = np.column_stack((close, diff))
xdata = np.nan_to_num(xdata)
scaler = preprocessing.StandardScaler()
xdata = scaler.fit_transform(xdata)
state = xdata[0:1, :]
return state, xdata
#Take Action
def sample_pipelines(pca_kernels=None, svr_kernels=None):
"""
Pipelines that can't be fit in a reasonable amount of time on the whole
dataset
"""
# Model instances
model_steps = []
if pca_kernels is None:
pca_kernels = ['poly', 'rbf', 'sigmoid', 'cosine']
for pca_kernel in pca_kernels:
model_steps.append([
KernelPCA(n_components=2, kernel=pca_kernel),
LinearRegression(),
])
if svr_kernels is None:
svr_kernels = ['poly', 'rbf', 'sigmoid']
for svr_kernel in svr_kernels:
model_steps.append(SVR(kernel=svr_kernel, verbose=True, cache_size=1000))
# Pipelines
pipelines = []
for m in model_steps:
# Steps
common_steps = [
StandardScaler(),
]
single_model_steps = m if isinstance(m, list) else [m]
steps = common_steps + single_model_steps
pipelines.append(make_pipeline(*steps))
return pipelines
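A small sketch of fitting the returned pipelines on synthetic data (synthetic data only; the real dataset is not part of this snippet):
import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=300, n_features=8, noise=5.0, random_state=0)
for pipe in sample_pipelines(pca_kernels=['rbf'], svr_kernels=['rbf']):
    pipe.fit(X, y)
    print(pipe.steps[-1][0], round(pipe.score(X, y), 3))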
def PCA_analysis(data, mode, cell_stages = None):
"""Principal Component Analysis.
"""
assert mode in {'pca', 'pca2'}
mean_shifter = StandardScaler(with_std = False)
if mode == 'pca':
pca = PCA(min(data.shape))
projected_data = pca.fit_transform(mean_shifter.fit_transform(data))
components = pca.components_
else:
assert isinstance(cell_stages, np.ndarray)
idx = np.where(cell_stages == np.max(cell_stages))[0]
pca = PCA(min(idx.size, data.shape[1]))
pca.fit(mean_shifter.fit_transform(data[idx]))
components = pca.components_
projected_data = np.dot(data, components.T)
return components, projected_data
targetAudioProcessing.py (project: jingjuSingingPhraseMatching, author: ronggong)
def mfccFeature_audio(filename_wav,index_keep,feature_type='mfcc'):
audio = ess.MonoLoader(downmix = 'left', filename = filename_wav, sampleRate = fs)()
if feature_type == 'mfcc':
feature = getFeature(audio)
elif feature_type == 'mfccBands1D':
feature = getMFCCBands1D(audio)
elif feature_type == 'mfccBands2D':
feature = getMFCCBands2D(audio,nbf=True)
if feature_type == 'mfccBands1D' or feature_type == 'mfccBands2D':
feature = np.log(100000 * feature + 1)
scaler = pickle.load(open(kerasScaler_path,'rb'))
feature = scaler.transform(feature)
# feature = preprocessing.StandardScaler().fit_transform(feature)
# index_keep = pitchProcessing_audio(filename_wav)
feature_out = feature[index_keep[0],:]
for index in index_keep[1:]:
feature_out = np.vstack((feature_out,feature[index,:]))
if feature_type == 'mfccBands2D':
feature_out = featureReshape(feature_out)
return feature_out
acousticModelTraining.py (project: jingjuSingingPhraseMatching, author: ronggong)
def trainValidationSplit(dic_pho_feature_train,validation_size=0.2):
'''
split the features in dic_pho_feature_train into train and validation sets
:param dic_pho_feature_train: input dictionary, key: phoneme, value: feature vectors
:param validation_size: fraction of the data held out for validation
:return:
'''
feature_all = []
label_all = []
for key in dic_pho_feature_train:
feature = dic_pho_feature_train[key]
label = [dic_pho_label[key]] * len(feature)
if len(feature):
if not len(feature_all):
feature_all = feature
else:
feature_all = np.vstack((feature_all, feature))
label_all += label
label_all = np.array(label_all,dtype='int64')
feature_all = preprocessing.StandardScaler().fit_transform(feature_all)
feature_train, feature_validation, label_train, label_validation = \
train_test_split(feature_all, label_all, test_size=validation_size, stratify=label_all)
return feature_train, feature_validation, label_train, label_validation
def test_boston(self):
from sklearn.datasets import load_boston
scikit_data = load_boston()
scikit_model = StandardScaler().fit(scikit_data.data)
spec = converter.convert(scikit_model, scikit_data.feature_names, 'out').get_spec()
input_data = [dict(zip(scikit_data.feature_names, row))
for row in scikit_data.data]
output_data = [{"out" : row} for row in scikit_model.transform(scikit_data.data)]
metrics = evaluate_transformer(spec, input_data, output_data)
assert metrics["num_errors"] == 0
def test_boston_OHE_plus_normalizer(self):
data = load_boston()
pl = Pipeline([
("OHE", OneHotEncoder(categorical_features = [8], sparse=False)),
("Scaler",StandardScaler())])
pl.fit(data.data, data.target)
# Convert the model
spec = convert(pl, data.feature_names, 'out')
input_data = [dict(zip(data.feature_names, row)) for row in data.data]
output_data = [{"out" : row} for row in pl.transform(data.data)]
result = evaluate_transformer(spec, input_data, output_data)
assert result["num_errors"] == 0
def dataset_generator():
"""
generate dataset for binary classification
:return:
"""
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
make_circles(noise=0.2, factor=0.5, random_state=1),
linearly_separable
]
X, y = datasets[0]
y[y == 0] = -1
X = StandardScaler().fit_transform(X)
return X, y
def make_x_y(self, data, code):
data_x = []
data_y = []
data.loc[:, 'month'] = data.loc[:, '??']%10000/100
data = data.drop(['??', '????'], axis=1)
# normalization
data = np.array(data)
if len(data) <= 0 :
return np.array([]), np.array([])
if code not in self.scaler:
    self.scaler[code] = StandardScaler()
    data = self.scaler[code].fit_transform(data)
else:
    data = self.scaler[code].transform(data)
for i in range(self.frame_len, len(data)-self.predict_dist+1):
data_x.extend(np.array(data[i-self.frame_len:i, :]))
data_y.append(data[i+self.predict_dist-1][0])
np_x = np.array(data_x).reshape(-1, 23*30)
np_y = np.array(data_y)
return np_x, np_y
def make_x_y(self, data, code):
data_x = []
data_y = []
data.loc[:, 'month'] = data.loc[:, '??']%10000/100
data = data.drop(['??', '????'], axis=1)
# normalization
data = np.array(data)
if len(data) <= 0 :
return np.array([]), np.array([])
if code not in self.scaler:
    self.scaler[code] = StandardScaler()
    data = self.scaler[code].fit_transform(data)
else:
    data = self.scaler[code].transform(data)
for i in range(self.frame_len, len(data)-self.predict_dist+1):
data_x.extend(np.array(data[i-self.frame_len:i, :]))
data_y.append(data[i+self.predict_dist-1][0])
np_x = np.array(data_x).reshape(-1, 23*self.frame_len)
np_y = np.array(data_y)
return np_x, np_y
def fit_model(self, logging_uuid, model=None, epochs=1000, batch_size=10):
if model is not None:
self.model = model
X, y, _ = self.get_formulation_training_data()
scaler = StandardScaler().fit(X)
lcb = LambdaCallback(
on_epoch_end=
lambda epoch, logs:
r.set(logging_uuid, json.dumps({'model_state': 'training',
'epoch': epoch,
'epochs': epochs,
'loss': logs['loss']})),
on_train_end=
lambda logs:
r.set(logging_uuid, json.dumps({'model_state': 'training',
'epoch': epochs,
'epochs': epochs})),
)
self.fit_history = self.model.fit(scaler.transform(X), y,
epochs=epochs,
batch_size=batch_size,
verbose=0,
callbacks=[lcb])
return self.model, self.fit_history
def save_grid_to_db(self, model=None):
if model is not None:
self.model = model
f_instance = Formulation.query.get(self.f_id)
f_instance.formulation_data_grid.delete()
# prepare data lines to plot
X, y, data_traces = self.get_formulation_training_data()
# train model to fit data lines
scaler = StandardScaler().fit(X)
# prepare mesh grid to plot
max_t, max_f = np.amax(X, axis=0)
min_t, min_f = np.amin(X, axis=0)
xv, yv = np.meshgrid(np.arange(floor(min_t), ceil(max_t)),
np.arange(floor(min_f), ceil(max_f)),
indexing='ij')
xv = xv.reshape((xv.shape[0], xv.shape[1], -1))
yv = yv.reshape((yv.shape[0], yv.shape[1], -1))
grid_xys = np.concatenate((xv, yv), axis=2).reshape((-1, 2))
# predict z for grid
grid_zs = self.model.predict(scaler.transform(grid_xys)).reshape((-1))
for x, y, z in zip(grid_xys[:, 0], grid_xys[:, 1], grid_zs):
f_instance.formulation_data_grid.append(FormulationDataGrid(x_value=x, y_value=y, z_value=z))
db.session.commit()
def scale_features(features, train):
"""Scale features, using test set to learn parameters.
Returns:
Scaled copy of features.
"""
if FLAGS.scaling is None:
return features
logging.info('Scaling features with %s', FLAGS.scaling)
if FLAGS.scaling == 'max_abs':
scaler = preprocessing.MaxAbsScaler()
elif FLAGS.scaling == 'standard':
scaler = preprocessing.StandardScaler()
else:
raise ValueError('Unrecognized scaling %s' % FLAGS.scaling)
scaler.fit(features[train])
return scaler.transform(features)
def test_multiprocessing():
generator = check_random_state(0)
data = genData(n_samples=200, n_features=4, n_redundant=2,strRel=2,
n_repeated=0, class_sep=1, flip_y=0, random_state=generator)
X_orig, y = data
X_orig = StandardScaler().fit(X_orig).transform(X_orig)
X = np.c_[X_orig, generator.normal(size=(len(X_orig), 6))]
y = list(y) # regression test: list should be supported
# Test using the score function
fri = EnsembleFRI(FRIClassification(random_state=generator),n_bootstraps=5,n_jobs=2, random_state=generator)
fri.fit(X, y)
# non-regression test for missing worst feature:
assert len(fri.allrel_prediction_) == X.shape[1]
assert len(fri.interval_) == X.shape[1]
# All strongly relevant features have a lower bound > 0
assert np.all(fri.interval_[0:2,0]>0)
# All weakly relevant features should have a lower bound 0
assert np.any(fri.interval_[2:4,0]>0) == False
def __init__(self, X=None, y=None, ax=None, scale=True, color=None, proj_dim=2,
colormap=palettes.DEFAULT_SEQUENCE, **kwargs):
super(PCADecomposition, self).__init__(ax=ax, **kwargs)
# Data Parameters
if proj_dim not in (2, 3):
raise YellowbrickValueError("proj_dim must be either 2 or 3.")
self.color = color
self.pca_features_ = None
self.scale = scale
self.proj_dim = proj_dim
self.pca_transformer = Pipeline([('scale', StandardScaler(with_std=self.scale)),
('pca', PCA(self.proj_dim, ))
])
# Visual Parameters
self.colormap = colormap
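For reference, the scale-then-project transform wrapped by pca_transformer can be reproduced with plain scikit-learn. A minimal sketch on random data (the visualizer's drawing logic is omitted):
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(0).normal(size=(100, 6))
projector = Pipeline([('scale', StandardScaler(with_std=True)),
                      ('pca', PCA(n_components=2))])
X_2d = projector.fit_transform(X)  # roughly what pca_features_ would hold after fitting
print(X_2d.shape)                  # (100, 2)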