from sklearn.preprocessing import MinMaxScaler

def sk_min_max(X):
    # Rescale each feature to the [0, 1] range.
    min_max_scaler = MinMaxScaler()
    # X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
    return min_max_scaler.fit_transform(X)
Python scale() example source code
from sklearn.preprocessing import scale

def sk_scale(X):
    # Standardize each column to zero mean and unit variance.
    return scale(X, axis=0, with_mean=True, with_std=True, copy=True)
def Standardization(self):
# feature 10: minimum price so far; feature 11: maximum price so far
# feature 12: current price
scaled = preprocessing.scale(self.X_train[:, 10:13])
self.X_train[:, 10:13] = scaled
scaled = preprocessing.scale(self.X_test[:, 10:13])
self.X_test[:, 10:13] = scaled
def Standardization(self):
scaled = preprocessing.scale(self.X_train[:, 10:12])
self.X_train[:, 10:12] = scaled
scaled = preprocessing.scale(self.X_test[:, 10:12])
self.X_test[:, 10:12] = scaled
def prepare(self):
with open('%s' % self.cfg.pca_pkl, 'r') as pklfile:
self.pca = pickle.load(pklfile)
try:
self.df = self.df.query('face == 1')
except:
print 'Face column not found in the dataframe',
print 'Treated as not being processed by skin_filter.'
x = self.df[self.ftcols].as_matrix()
x = preprocessing.scale(x)
xp = self.pca.transform(x)
self.dfp = pd.DataFrame(xp)
self.dfp[['number','time']] = self.df[['number','time']]
def fit(self, X, STANDARDIZE=True, n=10):
if not isinstance(X, np.ndarray):
X = to_array(X)
assert(X.ndim == 2), "Input array must have two dimensions."
if not check_standardized(X):
if STANDARDIZE:
X = preprocessing.scale(X)
print "Standardize input data for fit."
else:
print "WARNING: data is not standardized and you switch off STANDARDIZE option.",
print "Make sure this is what you intended."
self.model = PCA(n_components=n)
self.model.fit(X)
model.py source — project: 5th_place_solution_facebook_check_ins, author: aikinogard
def svc_rbf_xyat(df_cell_train_feats, y_train, df_cell_test_feats):
def prepare_feats(df):
df_new = pd.DataFrame()
df_new["x"] = df["x"]
df_new["y"] = df["y"]
df_new["hour"] = df["hour"]
df_new["weekday"] = df["weekday"]
df_new["accuracy"] = df["accuracy"].apply(np.log10)
return preprocessing.scale(df_new.values)
logging.info("train svc_rbf_xyat model")
clf = SVC(kernel='rbf', probability=True, cache_size=3000)
clf.fit(prepare_feats(df_cell_train_feats), y_train)
y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
return y_test_pred
model.py source — project: 5th_place_solution_facebook_check_ins, author: aikinogard
def svc_lin_xyat(df_cell_train_feats, y_train, df_cell_test_feats):
def prepare_feats(df):
df_new = pd.DataFrame()
df_new["x"] = df["x"]
df_new["y"] = df["y"]
df_new["hour"] = df["hour"]
df_new["weekday"] = df["weekday"]
df_new["accuracy"] = df["accuracy"].apply(np.log10)
return preprocessing.scale(df_new.values)
logging.info("train svc_lin_xyat model")
clf = SVC(kernel='linear', probability=True, cache_size=3000)
clf.fit(prepare_feats(df_cell_train_feats), y_train)
y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
return y_test_pred
model.py source — project: 5th_place_solution_facebook_check_ins, author: aikinogard
def svc_rbf_xyatu(df_cell_train_feats, y_train, df_cell_test_feats):
def prepare_feats(df):
df_new = pd.DataFrame()
df_new["x"] = df["x"]
df_new["y"] = df["y"]
df_new["hour"] = df["hour"]
df_new["weekday"] = df["weekday"]
df_new["accuracy"] = df["accuracy"]
return preprocessing.scale(df_new.values)
logging.info("train svc_rbf_xyatu model")
clf = SVC(kernel='rbf', probability=True, cache_size=3000)
clf.fit(prepare_feats(df_cell_train_feats), y_train)
y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
return y_test_pred
model.py source — project: 5th_place_solution_facebook_check_ins, author: aikinogard
def svc_lin_xyatu(df_cell_train_feats, y_train, df_cell_test_feats):
def prepare_feats(df):
df_new = pd.DataFrame()
df_new["x"] = df["x"]
df_new["y"] = df["y"]
df_new["hour"] = df["hour"]
df_new["weekday"] = df["weekday"]
df_new["accuracy"] = df["accuracy"]
return preprocessing.scale(df_new.values)
logging.info("train svc_lin_xyatu model")
clf = SVC(kernel='linear', probability=True, cache_size=3000)
clf.fit(prepare_feats(df_cell_train_feats), y_train)
y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
return y_test_pred
from sklearn.preprocessing import scale

def scaleDataset(data):
    '''
    Standardize the dataset: each column is transformed to zero mean and unit variance.
    '''
    data = scale(data)
    return data
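A quick check of what scale() returns; a minimal standalone sketch (the toy array below is made up for illustration):

import numpy as np

toy = np.array([[1.0, 200.0],
                [2.0, 300.0],
                [4.0, 700.0]])
scaled = scaleDataset(toy)
print(scaled.mean(axis=0))  # approximately [0. 0.]: each column now has zero mean
print(scaled.std(axis=0))   # approximately [1. 1.]: and unit variance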
def get_recommendations():
module_dir = os.path.dirname(__file__)
train_df = build_training_set()
if train_df is None:
return []
x_train = train_df.iloc[:, 5:]
try:
x_train = scale(x_train)
except:
print("First migrations")
y_train = train_df.iloc[:, 3]
x_train_labels = train_df.iloc[:, 0]
target_df = pd.read_csv(os.path.join(module_dir,'data.csv'))
target_df = pd.DataFrame(target_df)
    # Append the training set twice so that every series already seen in training
    # appears more than once and is removed by drop_duplicates(keep=False),
    # leaving only series the model has not been trained on.
    target_df = target_df.append(train_df)
    target_df = target_df.append(train_df)
    target_df = target_df.drop_duplicates('SeriesName', keep=False)
x_target = scale(target_df.iloc[:, 5:])
x_target_labels = target_df.iloc[:, 0]
clf = RandomForestClassifier()
clf.fit(x_train,y_train)
y_target = clf.predict(x_target)
new_df = pd.DataFrame()
new_df['seriesName'] = x_target_labels
new_df['tvdbID'] = target_df.iloc[:, 1]
new_df['PredictedRating'] = y_target
new_df['indicator'] = (target_df.iloc[:, 4]/target_df.iloc[:, 3])*new_df['PredictedRating']
new_df = new_df.sort_values(['indicator'], ascending=False)
initial_list = list(new_df.iloc[:4, 1])
latter_list = list(new_df.iloc[5:15, 1])
shuffle(latter_list)
return list(initial_list + latter_list[:5])
def scale_feature(self, col=None, scaling=None, scaling_parms=None):
    '''
    Scales a given set of numerical columns. This only works for columns
    with numerical values.

    Parameters
    ----------
    col : a string with a single column name, a list of column names, or
        None (default). If col is None, all numerical columns are used.
    scaling : {'zscore', 'minmax_scale' (default), 'scale', 'maxabs_scale',
        'robust_scale'}
        User-defined scaling functions can also be applied through
        self.transform_feature.
    scaling_parms : dictionary
        Any additional parameters to be passed to sklearn's scaling functions.
    '''
    self._validate_params(params_list={'col': col, 'scaling': scaling},
                          expected_types={'col': [str, list, type(None)],
                                          'scaling': [str, type(None)]})
    if scaling is None:
        scaling = 'minmax_scale'
    if scaling == 'zscore':
        scaling = 'lambda x: (x - x.mean()) / x.std()'
    elif scaling == 'minmax_scale':
        if scaling_parms is None:
            scaling_parms = {'feature_range': (0, 1), 'axis': 0}
    elif scaling == 'scale':
        if scaling_parms is None:
            scaling_parms = {'with_mean': True, 'with_std': True, 'axis': 0}
    elif scaling == 'maxabs_scale':
        if scaling_parms is None:
            scaling_parms = {'axis': 0}
    elif scaling == 'robust_scale':
        if scaling_parms is None:
            scaling_parms = {'with_centering': True, 'with_scaling': True, 'axis': 0}  # 'quantile_range': (25.0, 75.0)
    else:
        raise TypeError('Unsupported scaling type: %s' % scaling)
    self.transform_feature(col=col, func_str=scaling, addtional_params=scaling_parms)
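For reference, the option names above map onto sklearn.preprocessing functions that can also be called directly; a minimal standalone sketch with made-up data:

import numpy as np
from sklearn.preprocessing import minmax_scale, scale, maxabs_scale, robust_scale

X = np.array([[1.0, -10.0],
              [2.0,   0.0],
              [4.0,  30.0]])
print(minmax_scale(X, feature_range=(0, 1), axis=0))  # each column mapped to [0, 1]
print(scale(X, with_mean=True, with_std=True, axis=0))  # zero mean, unit variance per column
print(maxabs_scale(X, axis=0))  # each column divided by its maximum absolute value
print(robust_scale(X, with_centering=True, with_scaling=True, axis=0))  # median/IQR based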
def gen_feature_imp_matrix(model_id_list, features_df):
    # Build a matrix with one row of feature importances per model, then
    # standardize each model's importances after transposing (features become rows).
    feature_imp_matrix = None  # placeholder; filled from the first model below
    for model_id in model_id_list[:1]:
        feature_imp_matrix = features_df[features_df.model_id == model_id].sort_values("feature", inplace=False).importance.values
    for model_id in model_id_list[1:]:
        b = features_df[features_df.model_id == model_id].sort_values("feature", inplace=False).importance.values
        feature_imp_matrix = np.vstack((feature_imp_matrix, b))
    feature_imp_matrix_normd = scale(np.transpose(feature_imp_matrix), axis=0, with_mean=True, with_std=True, copy=True)
    return feature_imp_matrix_normd
def spectrogramPower(audio, window_size=0.02, window_stride=0.01):
    """Short-time Fourier transform (log-power spectrogram).

    Details:
        audio - path to the wav file containing the time-domain signal whose
            spectrogram is computed.
        win_length - the signal is decomposed into chunks, and each chunk has a
            specified width; win_length is that width in samples. For a
            discrete-time signal with sampling period Ts, the window size in
            samples is window_samples = window_time / Ts.
        hop_length - analogous to the stride in a convolutional network; it
            controls how much consecutive windows overlap.
    """
    samplingRate, samples = wav.read(audio)
    win_length = int(window_size * samplingRate)
    hop_length = int(window_stride * samplingRate)
    n_fft = win_length
    D = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length,
                          win_length=win_length)
    mag = np.abs(D)
    log_mag = np.log1p(mag)
    # normalization
    log_mag = preprocessing.scale(log_mag)
    # size: frequency_bins * time_len
    return log_mag
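To make the window arithmetic concrete: with the default 0.02 s window and 0.01 s stride, and a 16 kHz sampling rate assumed purely for illustration, the STFT parameters work out as follows:

samplingRate = 16000  # assumed sampling rate for this example
window_size = 0.02    # seconds
window_stride = 0.01  # seconds
win_length = int(window_size * samplingRate)    # 320 samples per window
hop_length = int(window_stride * samplingRate)  # 160 samples, i.e. 50% overlap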
def standardize(data):
numeric_list = ['BsmtFullBath', 'LotArea', 'YearRemodAdd', 'GrLivArea', 'BsmtHalfBath', 'MiscVal', 'YearBuilt',
'WoodDeckSF', 'KitchenAbvGr', 'TotalBsmtSF', 'GarageArea', 'GarageCars', 'OpenPorchSF', 'MoSold',
'LowQualFinSF', 'BedroomAbvGr', 'Fireplaces', '1stFlrSF', 'FullBath', 'BsmtFinSF1', 'BsmtFinSF2',
'HalfBath',
'Porch', '2ndFlrSF', 'MasVnrArea', 'YrSold', 'BsmtUnfSF', 'LotFrontage', 'TotRmsAbvGrd']
data.loc[:, numeric_list] = preprocessing.scale(data.loc[:, numeric_list])
model_single.py source — project: Tencent_Social_Advertising_Algorithm_Competition, author: guicunbin
def get_concat_data(label_csv, label_col, other_csvs, is_rate, important_feats):
print 'important_feats : ',len(important_feats)
rank_feats = [f for f in get_csv_header(dataset1_csv) if 'click' in f]
rank_feats = [f for f in rank_feats if f in important_feats] if important_feats else rank_feats
X = pd.read_csv(label_csv, usecols = rank_feats+[label_col]).apply(small_dtype)
X = X[:1000000] if is_tiny else X
print 'concat csvs ......'
X = pd.concat([X, get_need_feats(other_csvs, is_rate, is_tiny, important_feats)], axis=1)
#if label_csv.split('/')[-1] == 'dataset2.csv':
# for c in X.columns:
# if c.endswith('_fset_total_cnt'):
# X = X.drop(X[X[c]==0].index, axis=0)
feat_cols = [f for f in X.columns if f != label_col]
if is_to_csv:
save_file = label_csv.split('.csv')[0]+'_concat.csv'
if os.path.exists(save_file):
print save_file + " has exists"
else:
print 'to csv ........'
X = X.replace(np.nan, -1)
X = X.replace(np.inf, -2)
X[feat_cols] = scale(X[feat_cols]).astype('float16')
X.to_csv(save_file, index=False, chunksize = 50000)
print X.shape
    cate_feats = [f for f in X.columns if 'click' in f]  # TODO: confirm the categorical feature set
    X, = change_to_category([X], cate_feats)
y = X[label_col].values
X = X[feat_cols]
if label_col == 'label':
print 'positive percent ',y.mean()
return X, y
def windowCharacter(x):
tmp = np.zeros((x.shape[0]))
n=0
for row in x.iterrows():
tmp[n] = signalMag(row[1]['X'],row[1]['Y'],row[1]['Z'])
n=n+1
# if np.std(tmp) > 5:
# return None
# else:
p_25 = np.percentile(tmp,25)
p_75 = np.percentile(tmp,75)
tmp_25 = [each for each in tmp if each < p_25]
tmp_75 = [each for each in tmp if each < p_75]
data_dm = scale(tmp,with_mean=True, with_std=False) # demean data
(freq_1,power_1) = butterFilter(data_dm,lowcut_1,highcut_1)
idx_1 = np.argmax(power_1)
freq_1_sec = np.delete(freq_1,idx_1)
power_1_sec = np.delete(power_1,idx_1)
idx_1_sec = np.argmax(power_1_sec)
(freq_2,power_2) = butterFilter(data_dm,lowcut_2,highcut_2)
idx_2 = np.argmax(power_2)
return np.mean(tmp), np.std(tmp), np.median(tmp), np.linalg.norm(tmp_25), np.linalg.norm(tmp_75),np.sum(power_1), freq_1[idx_1],power_1[idx_1], freq_1_sec[idx_1_sec], power_1_sec[idx_1_sec], freq_2[idx_2],power_2[idx_2],freq_1[idx_1]/np.sum(power_1)
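The demeaning step above relies on scale() with with_std=False, which subtracts the mean but leaves the spread untouched; a minimal standalone illustration (toy values made up):

import numpy as np
from sklearn.preprocessing import scale

sig = np.array([3.0, 5.0, 7.0, 9.0])
demeaned = scale(sig, with_mean=True, with_std=False)
print(demeaned)                   # [-3. -1.  1.  3.]: the mean (6.0) is removed
print(sig.std(), demeaned.std())  # both ~2.236: the standard deviation is unchanged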
def normalize(x, sf, logtrans=True, sfnorm=True, zeromean=True):
if sfnorm:
assert len(sf.shape) == 1
        x = x / (sf[:, None] + 1e-8)  # divide each row (cell) by its size factor
if logtrans:
x = np.log1p(x)
if zeromean:
x = scale(x)
return x
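A minimal usage sketch of normalize() (the count matrix is made up; library-size-derived size factors are one common choice and are assumed here):

import numpy as np

counts = np.array([[10.0, 0.0, 5.0],
                   [30.0, 2.0, 8.0]])
size_factors = counts.sum(axis=1) / counts.sum(axis=1).mean()  # per-row library-size factors
normed = normalize(counts, size_factors, logtrans=True, sfnorm=True, zeromean=True)
print(normed.shape)  # (2, 3); with zeromean=True each column ends up with zero mean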
from sklearn import preprocessing as pre

def test_scale():
    matrix = [[0, 30], [1, 27], [3, 24]]
    scaled = pre.scale(matrix)
    print(scaled)
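Roughly what this prints: each column is standardized using its own mean and population standard deviation, so the first column (mean 4/3, std ≈ 1.247) becomes approximately [-1.069, -0.267, 1.336] and the second column (mean 27, std ≈ 2.449) becomes approximately [1.225, 0.0, -1.225].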