def scale_numeric_data(pandas_data):
# Scaling matters because features with very different ranges can dominate
# the model's objective and distort its fit.
# EX: if one feature averages around 1000 and another around 0.5, the
# larger-magnitude feature will swamp the smaller one and hurt accuracy.
for col in pandas_data.columns:
if pandas_data[col].dtype == np.float64 or pandas_data[col].dtype == np.int64:
pandas_data[col] = preprocessing.scale(pandas_data[col])
return pandas_data
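# Illustrative sketch (not from the original snippet): what preprocessing.scale does
# to two columns with very different magnitudes, assuming numpy and sklearn are installed.
import numpy as np
from sklearn import preprocessing

demo = np.array([[1000.0, 0.5],
                 [2000.0, 0.7],
                 [3000.0, 0.9]])
scaled = preprocessing.scale(demo)
# each column now has (approximately) zero mean and unit variance
print(scaled.mean(axis=0), scaled.std(axis=0))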
# Creates a standard scaler based on the training data and applies it to both train
# and test data.
# Input:
# - Two Pandas DataFrames, same number of columns
# Output:
# - Two Pandas DataFrames, both of which have been scaled based on StandardScaler
# trained on training data.
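# A minimal sketch of the function described above; its implementation is not included
# in this extract, so the name and body below are assumptions, not the author's code.
import pandas as pd
from sklearn import preprocessing

def scale_train_test(train_df, test_df):
    # Fit the scaler on the training data only, then apply it to both sets.
    scaler = preprocessing.StandardScaler().fit(train_df)
    train_scaled = pd.DataFrame(scaler.transform(train_df),
                                index=train_df.index, columns=train_df.columns)
    test_scaled = pd.DataFrame(scaler.transform(test_df),
                               index=test_df.index, columns=test_df.columns)
    return train_scaled, test_scaled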
Python scale() usage examples (source code)
def transform(self, X, STANDARDIZE=True):
if not isinstance(X, np.ndarray):
X = to_array(X)
assert(X.ndim == 2), "Input array must have two dimensions."
if not check_standardized(X):
if STANDARDIZE:
X = preprocessing.scale(X)
print "Standardize input data for transform"
if not self.model:
print "Load or fit a model before performing trsnaformation."
else:
assert(X.shape[1] > self.model.n_components),\
"Input data must have a dimension larger than model components %d."\
% self.model.n_components
xp = self.model.transform(X)
return xp
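# transform() above relies on helpers that are not shown in this extract; the versions
# below are plausible minimal stand-ins (assumptions, not the author's implementations).
import numpy as np

def to_array(X):
    # Coerce lists / DataFrames to a float ndarray.
    return np.asarray(X, dtype=float)

def check_standardized(X, tol=1e-3):
    # True if every column is already roughly zero-mean and unit-variance.
    return bool(np.all(np.abs(X.mean(axis=0)) < tol) and
                np.all(np.abs(X.std(axis=0) - 1.0) < tol))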
def train(self, df, shuffle=True, preprocess=False, *args, **kwargs):
"""
Takes a dataframe of features + a 'label' column and trains the lobe
"""
if self._trained:
logger.warning('Overwriting an already trained brain!')
self._trained = False
# shuffle data for good luck
if shuffle:
df = shuffleDataFrame(df)
# scale train data and fit lobe
x = df.drop('label', axis=1).values
y = df['label'].values
del df
if preprocess:
x = preprocessing.scale(x)
logger.info('Training with %d samples', len(x))
self.lobe.fit(x, y)
self._trained = True
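# Hypothetical usage of train() above: the frame must hold feature columns plus a
# 'label' column; 'brain' stands in for an instance of the surrounding class.
import pandas as pd

df = pd.DataFrame({'f1': [0.1, 0.2, 0.3, 0.4],
                   'f2': [1.0, 0.9, 0.8, 0.7],
                   'label': [0, 1, 0, 1]})
brain.train(df, shuffle=True, preprocess=True)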
def get_sample(self, N=600, scale=False):
all_data = self.pre_process(self.file_name)
#print('data_type: ' + str(all_data.dtypes))
all_data = all_data.values
xs = all_data[:, 2:]
y = all_data[:, 1]
if scale:
xs = preprocessing.scale(xs)
if N != -1:
perm = np.random.permutation(xs.shape[0])
xs = xs[perm]
y = y[perm]
xs_train, xs_test = np.split(xs, [N])
y_train, y_test = np.split(y, [N])
return xs_train, xs_test, y_train, y_test
else:
return xs, y
def get_X_y(self):
"""Builds an X, y feature/target pair from the data.
:returns: a tuple of (feature matrix, labels)
"""
# X
X = np.array(self.data[self.features])
X = scale(X)
# y
stock_change = np.array(self.data["stock_p_change"])
sp500_change = np.array(self.data["sp500_p_change"])
is_above_threshold = stock_change-sp500_change > self.threshold
y = is_above_threshold.astype('i')
return (X, y)
def scale_sets(x_train, x_test, classifier_name):
"""
:param x_train: ndarray, required
- The train data of the feature matrix
:param x_test: ndarray, required
- The test data of the feature matrix
:param classifier_name: string, required
- The name of the selected classifier
:return: (ndarray, ndarray) - the train and test feature matrices, scaled when appropriate
"""
# scaling leads to poorer performance in the case of random forests, xgb, etc.
if classifier_name not in ["random_forests", "XGB", "GBC"]:
# x_train and x_test are expected to be numpy arrays; a plain truthiness check such as `if x_train:` would raise a ValueError, hence the explicit None comparisons.
x_train = scale(x_train) if x_train is not None else x_train
x_test = scale(x_test) if x_test is not None else x_test
return x_train, x_test
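# Hypothetical usage of scale_sets(): scaling is applied for classifiers that are
# sensitive to feature ranges and skipped for the tree/boosting models listed above.
import numpy as np

rng = np.random.RandomState(0)
X_train, X_test = rng.rand(100, 5), rng.rand(20, 5)
X_train_s, X_test_s = scale_sets(X_train, X_test, "SVM")             # standardized
X_train_r, X_test_r = scale_sets(X_train, X_test, "random_forests")  # returned unchanged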
def classify(self, M):
"""
Classify a hyperspectral cube using the clusters defined from the ROIs by the fit method.
Parameters:
M: `numpy array`
A HSI cube (m x n x p).
Returns: `numpy array`
A class map (m x n x 1).
"""
img = self._convert2D(M)
image_scaled = preprocessing.scale(img)
cls = self.clf.predict(image_scaled)
self.cmap = self._convert3d(cls, M.shape[0], M.shape[1])
return self.cmap
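# _convert2D / _convert3d are methods assumed by classify() but not shown here; a
# minimal sketch of the reshaping they presumably perform (assumption, not the author's code):
import numpy as np

def _convert2D(M):
    # (m x n x p) cube -> (m*n) x p matrix of pixel spectra
    m, n, p = M.shape
    return M.reshape(m * n, p)

def _convert3d(cls, m, n):
    # flat class predictions -> (m x n x 1) class map
    return np.asarray(cls).reshape(m, n, 1)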
model_fit_history.py — source from project Exoplanet-Artificial-Intelligence (author: pearsonkyle)
def load_data(fname='transit_data.pkl',categorical=False,whiten=True,DIR='pickle_data/'):
data = pickle.load(open(DIR+fname,'rb'))
# convert to a numpy array of float type from object type
pvals = arr(data['results'][:,0])
transits = arr(data['results'][:,1])
null = arr(data['results'][:,2])
X = np.vstack([transits,null])
y = np.hstack([np.ones(transits.shape[0]), np.zeros(null.shape[0])] )
if categorical: y = np_utils.to_categorical(y, np.unique(y).shape[0] )
if whiten: X = preprocessing.scale(X,axis=1)
return X,y,pvals,data['keys'],data['time']
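# Hypothetical usage of load_data() above (pickle path and layout assumed from the code);
# note that whiten=True standardizes each light curve independently, since axis=1 is passed.
X, y, pvals, keys, time = load_data('transit_data.pkl', categorical=False, whiten=True)
print(X.shape, y.shape)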
generate_data.py — source from project Exoplanet-Artificial-Intelligence (author: pearsonkyle)
def load_data(fname='transit_data_train.pkl',categorical=False,whiten=True,DIR='pickle_data/'):
data = pickle.load(open(DIR+fname,'rb'))
# convert to a numpy array of float type from object type
pvals = arr(data['results'][:,0])
transits = arr(data['results'][:,1])
null = arr(data['results'][:,2])
X = np.vstack([transits,null])
y = np.hstack([np.ones(transits.shape[0]), np.zeros(null.shape[0])] )
if categorical: y = np_utils.to_categorical(y, np.unique(y).shape[0] )
if whiten: X = preprocessing.scale(X,axis=1)
return X,y,pvals,data['keys'],data['time']
def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate , _label):
""" Label? ??? ??? ??? ??? ??? Row ??? ????.
Args:
params:
* _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
* _df_csv_read_ori : pandas dataframe
* _label
Returns:
Preprocessing Dataframe
"""
if _drop_duplicate in (None, 'null', False):
logging.info("No Duplicate")
result_df = _df_csv_read_ori
else :
cell_features = _df_csv_read_ori.columns.tolist()
cell_features.remove(_label)
result_df = _df_csv_read_ori.drop_duplicates(cell_features, keep="first")
logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index)-len(result_df.index)))
temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
result_df.to_csv(self.data_src_path + "/backup/" + temp_duplicate_filename)
return result_df
def compute_preprocessor(self,method):
self.data={}
if method=='none':
self.data=self.orig_data
elif method=='min_max':
transform=preprocessing.MinMaxScaler()
self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
self.data['X_val']=transform.transform(self.orig_data['X_val'])
self.data['X_test']=transform.transform(self.orig_data['X_test'])
elif method=='scaled':
self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
elif method=='normalized':
self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
self.data['y_train']=self.orig_data['y_train']
self.data['y_val']=self.orig_data['y_val']
self.data['y_test']=self.orig_data['y_test']
def compute_preprocessor(self,method):
self.data={}
if method=='min_max':
transform=preprocessing.MinMaxScaler()
self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
self.data['X_val']=transform.transform(self.orig_data['X_val'])
self.data['X_test']=transform.transform(self.orig_data['X_test'])
elif method=='scaled':
self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
elif method=='normalized':
self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
self.data['y_train']=self.orig_data['y_train']
self.data['y_val']=self.orig_data['y_val']
self.data['y_test']=self.orig_data['y_test']
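# Illustrative comparison of the three preprocessing options dispatched above
# (assumes numpy and sklearn.preprocessing are importable):
import numpy as np
from sklearn import preprocessing

X = np.array([[1.0, 100.0], [2.0, 200.0], [3.0, 300.0]])
print(preprocessing.scale(X))                         # per-column zero mean, unit variance
print(preprocessing.normalize(X))                     # per-row unit L2 norm
print(preprocessing.MinMaxScaler().fit_transform(X))  # per-column rescaled to [0, 1]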
def FeatureCombination(Df,s='',num_feature=2):
feature_set = []
for c in Df.columns:
if c.startswith(s): feature_set.append(c)
print('combining', len(feature_set), 'features')
data = Df[feature_set].values
for c in Df.columns:
if Df[c].dtype == 'object':
lbl = preprocessing.LabelEncoder()
lbl.fit(list(Df[c].values))
Df[c] = lbl.transform(list(Df[c].values))
imp = preprocessing.Imputer()
data = imp.fit_transform(data)
data = preprocessing.scale(data)
pca = PCA(num_feature)
pca.fit(data)
print('explained_variance_ratio_:', pca.explained_variance_ratio_)
trans = pca.transform(data)
for i in range(0,num_feature):
Df[s+'_%d'%(i+1)] = trans[:,i]
Df.drop(feature_set,1,inplace=True)
return Df
def get_ind_return(data):
'''
Computes value-weighted monthly industry returns from stock-level data loaded from xlsx.
:param [DataFrame] data: stock-level monthly data read from xlsx (codes, returns, market values)
:return: [DataFrame] ind_ret: month x industry matrix of standardized industry returns
'''
# read the stock-to-industry mapping from stk_ind_pair.xlsx
stk_ind = pd.read_excel('E:\\QuantProject2\\temp_data\\stk_ind_pair.xlsx')
# keep only the first six characters of each stock code
stk_ind.Stkcd = stk_ind.Stkcd.apply(lambda x: x[:6])
# merge the industry mapping into the stock data
data = pd.merge(data, stk_ind, on='Stkcd')
# group by trading month and industry
groups = data.groupby(['Trdmnt', 'ind'])
# total market value per month/industry
total_Ms = groups['Msmvttl'].sum()
# total market-value-weighted return per month/industry
total_Mr = groups['total_Mr'].sum()
# industry return = weighted return / total market value
ind_ret = total_Mr / total_Ms
# pivot the industry level into columns
ind_ret = ind_ret.unstack()
# standardize ind_ret column-wise
ind_ret = pd.DataFrame(scale(ind_ret), columns=ind_ret.columns)
return ind_ret
strategy6_scaling+.py — source from project quantopian-machinelearning (author: arshpreetsingh)
def create_model(context, data):
# Get the relevant daily prices
recent_prices = data.history(context.assets, 'price',context.history_range, '1d')
context.ma_50 =recent_prices.values[-50:].mean()
context.ma_200 = recent_prices.values[-200:].mean()
#print context.ma_50
#print context.ma_200
time_lags = pd.DataFrame(index=recent_prices.index)
time_lags['price']=recent_prices.values
time_lags['returns']=(time_lags['price'].pct_change()).fillna(0.0001)
time_lags['lag1'] = (time_lags['returns'].shift(1)).fillna(0.0001)
time_lags['lag2'] = (time_lags['returns'].shift(2)).fillna(0.0001)
time_lags['direction'] = np.sign(time_lags['returns'])
X = time_lags[['returns','lag2']] # Independent, or input variables
Y = time_lags['direction'] # Dependent, or output variable
X_scaled = preprocessing.scale(X)
context.model.fit(X_scaled, Y) # Generate our model
def __init__(self, data_set_parameters):
OutputLog().write('Loading dataset: ' + data_set_parameters['name'])
self.dataset_path = data_set_parameters['path']
self.trainset = None
self.testset = None
self.tuning = None
self.reduce_val = 0
self.x_y_mapping = {'train': None, 'dev': None, 'test': None}
self.x_reduce = {'train': None, 'dev': None, 'test': None}
self.data_set_parameters = data_set_parameters
self.scale = bool(int(data_set_parameters['scale']))
self.scale_rows = bool(int(data_set_parameters['scale_samples']))
self.whiten = bool(int(data_set_parameters['whiten']))
self.pca = map(int, data_set_parameters['pca'].split())
self.normalize_data = bool(int(data_set_parameters['normalize']))
self.preprocessors = None
def rerun_task(job_id, task_id):
"""
Reruns a specific task from a job. Sets the task status to 'pending' and triggers an asynchronous function to
process the task.
Parameters
----------
job_id: str
task_id: int
Returns
-------
None
"""
job = mongo_no_context_get_job(job_id)
task = mongo_no_context_get_task(job_id, task_id)
k = task['k']
covar_type = task['covar_type']
covar_tied = task['covar_tied']
n_init = task['n_init']
s3_file_key = job['s3_file_key']
columns = job['columns']
scale = job.get('scale', False)
response = mongo_no_context_update_task_status(job_id, task_id, 'pending')
work_task.delay(job_id, task_id, k, covar_type, covar_tied, n_init, s3_file_key, columns, scale)
def train():
os.chdir(dname)
for selected_stock in onlyfiles:
df = pd.read_csv(os.path.join('data_files',selected_stock))
#preprocessing the data
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
#measure of volatility
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
forecast_col = 'Adj. Close'
df.fillna(value=-99999, inplace=True)
forecast_out = int(math.ceil(0.01 * len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)
X = np.array(df.drop(['label'],1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
df.dropna(inplace=True)
y = np.array(df['label'])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
svr = SVR()
pickle.dump(svr,open(join(dname+'/models/svr_unfit/', selected_stock+'svr.sav'),'wb'))
svr.fit(X_train, y_train)
lr = LinearRegression()
pickle.dump(lr,open(join(dname+'/models/lr_unfit/', selected_stock+'lr.sav'),'wb'))
lr.fit(X_train, y_train)
mlp = MLPRegressor()
pickle.dump(mlp,open(join(dname+'/models/mlp_unfit/', selected_stock+'mlp.sav'),'wb'))
mlp.fit(X_train, y_train)
pickle.dump(svr,open(join(dname+'/models/svr_fit/', selected_stock+'svr.sav'),'wb'))
pickle.dump(lr,open(join(dname+'/models/lr_fit/', selected_stock+'lr.sav'),'wb'))
pickle.dump(mlp,open(join(dname+'/models/mlp_fit/', selected_stock+'mlp.sav'),'wb'))
print(selected_stock+" - trained")
def lession_7():
X= np.array([[10,12,2],
[-1,-9,99],
[22,33,11]])
print X
print preprocessing.scale(X)
#X,y=make
def normalise(csv_filepath):
"""
load csv data and normalize it
:param csv_filepath:
:return:
"""
df = pd.read_csv(csv_filepath)[[
'companyScore', 'describeScore', 'comprehensiveScore', 'interviewerScore', 'usefulCount', 'myScore',
'replyCount', 'isAnonymous']][1:]
senti_df = pd.read_csv(csv_filepath)['sentiment'][1:]
labels_ = [1 if _ > 0.9 else 0 for _ in senti_df]
df['isAnonymous'] = [int(_) for _ in df['isAnonymous']]
df_scaled = pd.DataFrame(preprocessing.scale(df))
return df_scaled, labels_
def get_name(self):
return 'unit-scale'
def apply(self, data):
return preprocessing.scale(data, axis=data.ndim-1)
def get_name(self):
return 'unit-scale-feat'
def apply(self, data):
return preprocessing.scale(data, axis=1)
def make_feat(self):
d = self.init_data()
for i in range(1, 100):
d['dif{}'.format(i)] = d.diff(i)
for i in range(0, 9):
d['hc{}'.format(i)] = d.HIGH.shift(i) - d.CLOSE.shift(i)
d['lc{}'.format(i)] = d.LOW.shift(i) - d.CLOSE.shift(i)
d['hl{}'.format(i)] = d.HIGH.shift(i) - d.LOW.shift(i)
d['oc{}'.format(i)] = d.OPEN.shift(i) - d.CLOSE.shift(i)
d['oh{}'.format(i)] = d.OPEN.shift(i) - d.HIGH.shift(i)
d['ol{}'.format(i)] = d.OPEN.shift(i) - d.LOW.shift(i)
d = d.fillna(0)
# preprocessing.scale returns an ndarray, so wrap it back into a DataFrame before calling to_csv
d = pd.DataFrame(preprocessing.scale(d), index=d.index, columns=d.columns)
filename = join(self.out_poath, 'f_{0}.csv'.format(self.struc))
d.to_csv(path_or_buf=filename)
def train(self, data, labels):
"""
Trains current classifier with matrix data and labels, where labels[i]
describes data[:, i].
:param data: Matrix of data, where each column is a separate sample.
:param labels: List of labels, each corresponding to a column of data.
"""
if self.use_pca:
u, s, _ = scipy.sparse.linalg.svds(data)
self.svc.fit(
preprocessing.scale(u[:, :self.rank+1].T.dot(data).T), labels)
else:
self.svc.fit(preprocessing.scale(data.T), labels)
def classify(self, data):
"""
Classifies data based on current model.
:param data: Matrix with each column a different sample.
:returns: List of predictions, where return[i] describes data[:, i].
"""
if self.use_pca:
u, s, _ = scipy.sparse.linalg.svds(data)
return self.svc.predict(
preprocessing.scale(u[:, :self.rank+1].T.dot(data).T))
else:
return self.svc.predict(preprocessing.scale(data.T))
def sk_min_max(X):
min_max_scaler = MinMaxScaler()
# X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
return min_max_scaler.fit_transform(X)
def min_max(X):
min_max_scaler = MinMaxScaler()
X = scale(X, axis=0, with_mean=True, with_std=True, copy=True )
X = min_max_scaler.fit_transform(X)
return X
def sk_scale(X):
return scale(X, axis=0, with_mean=True, with_std=True, copy=True )