def _validate_sklearn_preprocessing(self):
    '''Validate the "sklearn_preprocessing" dict in the config.'''
self.sklearn_preprocessing = self.config.get('sklearn_preprocessing') or {}
self._validate_type(self.sklearn_preprocessing, 'sklearn_preprocessing', dict)
for k, v in self.sklearn_preprocessing.items():
self._validate_type(v, 'sklearn_preprocessing:{}'.format(k), dict)
        method = v.get('method')
        if method not in dir(skpre) and not callable(method):
            self._validate_custom_callable(method,
                                           True,
                                           'sklearn_preprocessing:{} - method'.format(k))
        if isinstance(v['method'], str) and v['method'].split(':')[-1] == 'FunctionTransformer':
self._validate_custom_callable(v.get('func'),
True,
'sklearn_preprocessing:{} - func passed to FunctionTransformer'.format(k))
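A minimal sketch of a config this validator would accept. The step names, the "module:callable" method strings, and the numpy:log1p func are illustrative assumptions, not taken from the original project:

config = {
    'sklearn_preprocessing': {
        'scale_step': {
            'method': 'sklearn.preprocessing:MinMaxScaler',
        },
        'log_step': {
            'method': 'sklearn.preprocessing:FunctionTransformer',
            # 'func' is required whenever the method resolves to FunctionTransformer
            'func': 'numpy:log1p',
        },
    },
}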
Python preprocessing() example source code
def outofsample_extensions(method=None, dataset=None):
np.random.seed(1)
sklearn.utils.check_random_state(1)
train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)
    # Learn a new space using Isomap
    from sklearn.manifold import Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
train_data_isomap = np.float32(isomap.fit_transform(train_data))
if method == 'linear-regression':
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
train_data = std.fit_transform(train_data)
test_data = std.transform(test_data)
        # Use linear regression to provide baseline out-of-sample extensions
        from sklearn.linear_model import LinearRegression
        proj = LinearRegression()
proj.fit(np.float64(train_data), np.float64(train_data_isomap))
acc = evaluate_svm(proj.predict(train_data), train_labels,
proj.predict(test_data), test_labels)
    elif method in ('c-ISOMAP-10d', 'c-ISOMAP-20d'):
        # Use the SEF to provide out-of-sample extensions
        output_dims = 10 if method == 'c-ISOMAP-10d' else 20
        proj = LinearSEF(train_data.shape[1], output_dimensionality=output_dims)
        proj.cuda()
        proj.fit(data=train_data, target_data=train_data_isomap, target='copy',
                 epochs=50, batch_size=1024, verbose=False, learning_rate=0.001,
                 regularizer_weight=1)
acc = evaluate_svm(proj.transform(train_data), train_labels,
proj.transform(test_data), test_labels)
print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
def vectorize_fold(fold, tags, meta_df, use_metafeats=True):
with time_me('Loaded pdicts'):
scoreses = [common.pdict_for_tag(tag, fold) for tag in tags]
df = meta_df[meta_df['fold']==fold]
assert len(df)
y = df['label']
n_predictors = len(scoreses)
with time_me('Munged scores for {} predictors'.format(n_predictors), mode='print'):
# TODO: could use the logit loading fn added to user_wrapper module
scores = munge_scoreses(scoreses, df)
if not use_metafeats:
X = scores
else:
meta_cols = metavectorize.metafeature_columns
meta = df[meta_cols].values
# Special f_0 dummy meta feature for learning vanilla weight term per predictor
metafeats = np.hstack([np.ones( (len(df), 1) ), meta])
            # Implemented with np.tile/np.repeat rather than np.einsum, which
            # proved hard to get right here. (Worth noting that
            # sklearn.preprocessing has a 'PolynomialFeatures' utility that
            # might have been useful here. But this is fine.)
n_metafeats = metafeats.shape[1]
logging.info('{} predictors x {} metafeatures -> {} coefs'.format(
n_predictors, n_metafeats, n_predictors*n_metafeats))
# X is 'metafeat major'. i.e. the first n_p values for each vector are the
# raw scores for each predictor, they're followed by each predictor's score
# multiplied by the first metafeature and so on.
X = np.tile(scores, n_metafeats) * np.repeat(metafeats, n_predictors, axis=1)
return X, y
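The tile/repeat product above is easiest to see on a tiny example (values made up): with two predictors and two metafeatures, the first being the all-ones dummy, each output row holds every predictor score multiplied by every metafeature, in metafeat-major order:

import numpy as np

scores = np.array([[0.2, 0.9]])     # one row, two predictor scores
metafeats = np.array([[1.0, 3.0]])  # dummy 1.0 plus one real metafeature
X = np.tile(scores, 2) * np.repeat(metafeats, 2, axis=1)
print(X)  # [[0.2 0.9 0.6 2.7]] -> raw scores first, then scores * metafeature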
def mfcc_features(filename):
"""Preprocessing per CTC paper.
(These are not the simpler linear spectrogram features alone as in Deep
Speech).
Properties:
- 10ms frames with 5ms overlap
- 12 MFCCs with 26 filter banks
- replace first MFCC with energy (TODO: log-energy)
- add first-order derivatives for all of the above
- total: 26 coefficients
"""
d, sr = librosa.load(filename)
    frame_length_seconds = 0.010
    frame_overlap_seconds = 0.005
    n_fft = int(frame_length_seconds * sr)        # 10 ms analysis window
    hop_length = int(frame_overlap_seconds * sr)  # 5 ms hop
    mfccs = librosa.feature.mfcc(d, sr, n_mfcc=1 + 12, n_fft=n_fft,
                                 hop_length=hop_length)
    # energy (TODO: log?)
    energy = librosa.feature.rmse(d, n_fft=n_fft, hop_length=hop_length)
    mfccs[0] = energy  # replace first MFCC with energy, per convention
deltas = librosa.feature.delta(mfccs, order=1)
mfccs_plus_deltas = np.vstack([mfccs, deltas])
coeffs = sklearn.preprocessing.scale(mfccs_plus_deltas, axis=1)
return coeffs
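Hypothetical usage; module-level imports of librosa, numpy as np, and sklearn.preprocessing are assumed, and 'speech.wav' is a placeholder path:

coeffs = mfcc_features('speech.wav')
print(coeffs.shape)  # (26, n_frames): 13 MFCCs (energy in slot 0) + 13 deltas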
def _weightProcessing(weightDF):
    from sklearn import preprocessing
    weightDF = weightDF.loc[1:, :]  # skip the first row of weights
    weightDF['coefficient'] = weightDF['coefficient'].abs()
    min_max_scaler = preprocessing.MinMaxScaler()
    weight_scaled = min_max_scaler.fit_transform(weightDF[['coefficient']])
    weightDF['coefficient'] = weight_scaled
    print(weightDF.sort_values('coefficient', ascending=False).to_string(index=False))
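A hypothetical call, assuming the first row holds a term to skip (e.g. an intercept) and a 'coefficient' column exists:

import pandas as pd

df = pd.DataFrame({'feature': ['bias', 'f1', 'f2'],
                   'coefficient': [0.3, -2.0, 1.0]})
_weightProcessing(df)  # prints f1 (scaled to 1.0) above f2 (scaled to 0.0)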
def __init__(self, clf=None, le=None):
# type: (sklearn.model_selection.GridSearchCV, sklearn.preprocessing.LabelEncoder) -> None
"""Construct a new intent classifier using the sklearn framework."""
from sklearn.preprocessing import LabelEncoder
if le is not None:
self.le = le
else:
self.le = LabelEncoder()
self.clf = clf
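For context, a minimal illustration of what the LabelEncoder member does with intent labels:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit_transform(['greet', 'bye', 'greet'])  # -> array([1, 0, 1])
le.inverse_transform([0, 1])                 # -> array(['bye', 'greet'])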
finance_similarity.py, from the project Spark-in-Finance-Quantitative-Investing (author: litaotao)
def cal_minute_bar_similarity(line_data):
"""?????
line_data format: file_path, json_data
???
1. ??????
2. ?????
3. ?????? - ????
4. ????? - ????
Return:
square diff and var diff of two lines.
[diff_square, diff_var, (line_path)]
[diff_square_normalized, diff_var_normalized, (line_path)]
"""
tmp = pd.DataFrame()
import sklearn.preprocessing
scaler = sklearn.preprocessing.MinMaxScaler()
today_data = pd.DataFrame.from_dict(json.loads(df_today_share.value))
today_data_length = today_length_share.value
line_path, line_df = line_data
line_df = pd.DataFrame.from_dict(json.loads(line_df))
    line_df.sort_values(by='barTime', ascending=True, inplace=True)
tmp['first'] = list(today_data[: today_data_length]['ratio'])
tmp['second'] = list(line_df[: today_data_length]['ratio'])
_first, _second = list(tmp['first']), list(tmp['second'])
    tmp['first_normalized'] = scaler.fit_transform(np.array(_first).reshape(-1, 1)).ravel()
    tmp['second_normalized'] = scaler.fit_transform(np.array(_second).reshape(-1, 1)).ravel()
tmp['diff'] = tmp['first'] - tmp['second']
tmp['diff_normalized'] = tmp['first_normalized'] - tmp['second_normalized']
diff_square = sum(tmp['diff'] ** 2)
diff_square_normalized = sum(tmp['diff_normalized'] ** 2)
diff_var = float(tmp['diff'].var())
diff_var_normalized = float(tmp['diff_normalized'].var())
    res_square = [round(diff_square, 5), round(diff_square_normalized, 5), line_path]
    res_var = [round(diff_var, 5), round(diff_var_normalized, 5), line_path]
return res_square + res_var
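The normalization step in isolation (made-up numbers): scaling each series to [0, 1] independently before differencing makes lines at different price levels comparable in shape.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

a = np.array([10.0, 11.0, 12.0]).reshape(-1, 1)
b = np.array([100.0, 105.0, 130.0]).reshape(-1, 1)
scaler = MinMaxScaler()
diff = scaler.fit_transform(a).ravel() - scaler.fit_transform(b).ravel()
print(round((diff ** 2).sum(), 5), round(diff.var(), 5))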