from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer  # preprocessing.Imputer was removed in scikit-learn 0.22


def FeatureCombination(Df, s='', num_feature=2):
    """Collapse all columns whose names start with `s` into `num_feature` PCA components."""
    feature_set = [c for c in Df.columns if c.startswith(s)]
    print('combining', len(feature_set), 'features')
    # Label-encode object columns first, so the selected features are numeric
    # before imputation (slicing .values before encoding would break on strings)
    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            Df[c] = lbl.fit_transform(Df[c].values)
    data = Df[feature_set].values
    data = SimpleImputer().fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, axis=1, inplace=True)
    return Df
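A minimal usage sketch; the frame and the 'sensor_' column prefix below are invented for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({'sensor_a': np.random.rand(100),
                   'sensor_b': np.random.rand(100),
                   'sensor_c': np.random.rand(100),
                   'target': np.random.randint(0, 2, 100)})
df = FeatureCombination(df, s='sensor_', num_feature=2)
print(df.columns.tolist())  # ['target', 'sensor__1', 'sensor__2']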
Python LabelEncoder() usage examples (source code)
def create_codes(df, column_name, revive=False, model_code=0):
    print('Encoding', column_name, '...')
    # get unique data
    nms_unique = df[column_name].unique().tolist()
    if not revive:
        # fit a fresh encoder on the unique values
        print('Creating new Label Encoder...')
        le = LabelEncoder()
        le.fit(nms_unique)
    else:
        # reload a previously pickled encoder
        le_file_name = "LE_" + str(model_code)
        le = load_pickle(ROOT_PATH + '\\Data\\PickleJar\\' + le_file_name + '.pkl')
    # transform every row in the column
    nms = df[column_name].tolist()
    return le.transform(nms), le
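A hedged usage sketch with a toy frame; `load_pickle` and `ROOT_PATH` belong to the original project and are only needed on the `revive=True` path:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({'city': ['Oslo', 'Lima', 'Oslo', 'Quito']})
codes, le = create_codes(df, 'city')  # fits a fresh encoder
print(codes)                          # [1 0 1 2]
print(le.classes_)                    # ['Lima' 'Oslo' 'Quito']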
def addDailyReturn(dataset):
    """
    Add a daily-return column and encode it as a binary label
    (up or down relative to the adjacent day's close).
    """
    # LabelEncoder will map the string labels to integers
    le = preprocessing.LabelEncoder()
    dataset['UpDown'] = -(dataset['Adj_Close'] - dataset['Adj_Close'].shift(-1)) / dataset['Adj_Close'].shift(-1)
    print(dataset['UpDown'])
    # 'up' is encoded as 1, 'down' as 0 (classes sort alphabetically)
    returns = dataset.UpDown.fillna(0)  # the last shifted value is NaN; treat it as flat
    dataset.loc[returns >= 0, 'UpDown'] = "up"
    dataset.loc[returns < 0, 'UpDown'] = "down"
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    print(dataset['UpDown'])
def addDailyReturn(dataset):
    """
    Add a daily-return column and encode it as a three-way label
    (buy / hold / sell relative to the previous day's close).
    """
    # LabelEncoder will map the string labels to integers
    le = preprocessing.LabelEncoder()
    #print("dataset['Adj_Close']\n", dataset['Adj_Close'][:5])
    #print("dataset['Adj_Close'].shift(1)\n", dataset['Adj_Close'].shift(1)[:5])
    dataset['UpDown'] = (dataset['Adj_Close'] - dataset['Adj_Close'].shift(1)) / dataset['Adj_Close'].shift(1)
    # Classes sort alphabetically, so 'buy' -> 0, 'hold' -> 1, 'sell' -> 2
    returns = dataset.UpDown.fillna(0)  # the first shifted value is NaN; treat it as flat
    dataset.loc[returns > 0, 'UpDown'] = "sell"
    dataset.loc[returns == 0, 'UpDown'] = "hold"
    dataset.loc[returns < 0, 'UpDown'] = "buy"
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
def addDailyReturn(dataset):
    """
    Add a daily-return column and encode it as a binary label
    (up or down relative to the adjacent day's close).
    """
    # LabelEncoder will map the string labels to integers
    le = preprocessing.LabelEncoder()
    dataset['UpDown'] = -(dataset['Adj_Close'] - dataset['Adj_Close'].shift(-1)) / dataset['Adj_Close'].shift(-1)
    print(dataset['UpDown'][:5])
    # 'up' is encoded as 1, 'down' as 0 (classes sort alphabetically)
    returns = dataset.UpDown.fillna(0)  # the last shifted value is NaN; treat it as flat
    dataset.loc[returns >= 0, 'UpDown'] = "up"
    dataset.loc[returns < 0, 'UpDown'] = "down"
    print(dataset['UpDown'])
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    # print(dataset['UpDown'][:5])
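For reference, a quick check of the integer mapping LabelEncoder actually produces (class order is alphabetical), which is why the corrected comments above read 0/1 and 0/1/2:

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
print(le.fit_transform(["up", "down", "up"]))     # [1 0 1]  ('down' -> 0, 'up' -> 1)
print(le.fit_transform(["sell", "hold", "buy"]))  # [2 1 0]  ('buy' -> 0, 'hold' -> 1, 'sell' -> 2)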
def create_id_df(cls, df, is_train):
    """
    :rtype: DataFrame
    :return: DataFrame sorted by id, with columns
        ["label", "id0", "id", "id_tr", "id_te"]
    """
    df = df[["id0", "label"]].copy()
    df = df.reset_index(drop=True)
    is_train = np.array(is_train)
    # Fit separate encoders on the train and test ids
    le_tr = LabelEncoder().fit(df.id0[is_train])
    le_te = LabelEncoder().fit(df.id0[~is_train])
    df["id_tr"] = np.nan
    df["id_te"] = np.nan
    df.loc[is_train, "id_tr"] = le_tr.transform(df.id0[is_train])
    df.loc[~is_train, "id_te"] = le_te.transform(df.id0[~is_train])
    # Offset test ids by the number of train classes so all ids are distinct
    df["id"] = np.where(np.isnan(df["id_tr"]), len(le_tr.classes_) + df["id_te"], df["id_tr"])
    df = df.fillna(-1)
    df = df.sort_values("id")  # DataFrame.sort() was removed in pandas 0.20
    df = df[["label", "id0", "id", "id_tr", "id_te"]]
    return df
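The core trick is to give train and test rows disjoint integer ids by offsetting the test encoding with the number of train classes; a minimal standalone sketch:

import numpy as np
from sklearn.preprocessing import LabelEncoder

train_ids = np.array(['u1', 'u2', 'u1'])
test_ids = np.array(['u3', 'u4'])
le_tr = LabelEncoder().fit(train_ids)
le_te = LabelEncoder().fit(test_ids)
print(le_tr.transform(train_ids))                       # [0 1 0]
print(len(le_tr.classes_) + le_te.transform(test_ids))  # [2 3]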
random-forest-daily-returns.py (project: quantopian-machinelearning, author: arshpreetsingh)
def create_model(context, data):
    # Get the relevant daily prices
    recent_prices = data.history(context.assets, 'price', context.history_range, '1d')
    context.ma_50 = recent_prices.values[-50:].mean()
    context.ma_200 = recent_prices.values[-200:].mean()
    time_lags = pd.DataFrame(index=recent_prices.index)
    time_lags['price'] = recent_prices.values
    time_lags['daily_returns'] = time_lags['price'].pct_change()
    time_lags['multiple_day_returns'] = time_lags['price'].pct_change(3)
    time_lags['rolling_mean'] = time_lags['daily_returns'].rolling(window=4, center=False).mean()
    time_lags['time_lagged'] = time_lags['price'] - time_lags['price'].shift(-2)
    X = time_lags[['price', 'daily_returns', 'multiple_day_returns', 'rolling_mean']].dropna()
    # Label each day 'up' or 'down', then encode the strings as integers
    time_lags['updown'] = time_lags['daily_returns']
    up = time_lags['daily_returns'] >= 0
    time_lags.loc[up, 'updown'] = 'up'
    time_lags.loc[~up, 'updown'] = 'down'
    le = preprocessing.LabelEncoder()
    time_lags['encoding'] = le.fit(time_lags['updown']).transform(time_lags['updown'])
    # dropna() removed the first 4 rows from X, so skip them in the target too
    context.model.fit(X, time_lags['encoding'][4:])  # generate our model
def deserialise_encoder(
        encoder: acton_pb.Database.LabelEncoder
) -> sklearn.preprocessing.LabelEncoder:
    """Deserialises a LabelEncoder protobuf.

    Parameters
    ----------
    encoder
        LabelEncoder protobuf.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        LabelEncoder (or None if no encodings were specified).
    """
    encodings = []
    for encoding in encoder.encoding:
        encodings.append((encoding.class_int, encoding.class_label))
    encodings.sort()
    encodings = numpy.array([c[1] for c in encodings])
    encoder = SKLabelEncoder()
    encoder.classes_ = encodings
    return encoder
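Setting classes_ by hand, as above, yields a fully working encoder without ever calling fit(); a quick check:

import numpy as np
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
enc.classes_ = np.array(['cat', 'dog'])  # normally produced by fit()
print(enc.transform(['dog', 'cat']))     # [1 0]
print(enc.inverse_transform([0, 1]))     # ['cat' 'dog']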
def fit(self, X, y):
    # xgboost's multi:softprob objective expects class labels 0..num_class-1,
    # so encode whatever labels we were given
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)
    self.num_classes = np.unique(y).shape[0]
    sf = xgb.DMatrix(X, y)
    params = {"objective": 'multi:softprob',
              "eta": self.eta,
              "gamma": self.gamma,
              "max_depth": self.max_depth,
              "min_child_weight": self.min_child_weight,
              "max_delta_step": self.max_delta_step,
              "subsample": self.subsample,
              "silent": self.silent,
              "colsample_bytree": self.colsample_bytree,
              "seed": self.seed,
              "lambda": self.l2_reg,
              "alpha": self.l1_reg,
              "num_class": self.num_classes}
    self.model = xgb.train(params, sf, self.num_round)
    return self
def execute_inplace(self, data):
    df = data.df
    meta = data.metadata
    classes = {}
    cols_to_encode = meta[meta.type == ColType.CATEGORICAL].index
    for col in cols_to_encode:
        enc = LE()
        # Encode only the non-null entries; nulls stay NaN after the float cast
        df.loc[df[col].notnull(), col] = enc.fit_transform(df.loc[df[col].notnull(), col])
        df[col] = df[col].astype(float)
        meta.loc[col, 'type'] = ColType.INT_ENCODING
        meta.loc[col, 'derived_from'] = col
        classes[col] = enc.classes_
        self.logger.info('LabelEncoder: encoded %s', col)
    self.state = {'classes': classes}
def pre_process_data():
    for col in categorical_fields:
        data_frame[col] = data_frame[col].fillna('default')
        data_frame_test[col] = data_frame_test[col].fillna('default')
    for col in numerical_fields:
        data_frame[col] = data_frame[col].fillna(0)
        data_frame_test[col] = data_frame_test[col].fillna(0)
    for col in categorical_fields:
        # Fit one encoder per column on the union of train and test values,
        # so the same category maps to the same integer in both frames
        encode = LabelEncoder()
        encode.fit(pd.concat([data_frame[col], data_frame_test[col]]))
        data_frame[col] = encode.transform(data_frame[col])
        data_frame_test[col] = encode.transform(data_frame_test[col])
    data_frame['SalePrice'] = data_frame['SalePrice'].fillna(0)
def labels_to_categories(y):
    """
    Convert string labels to integer categories.
    :param y: list of labels, e.g. ['positive', 'negative', 'positive', 'neutral', 'positive', ...]
    :return: list of categories, e.g. [2, 0, 2, 1, 2, ...]
    """
    encoder = LabelEncoder()
    encoder.fit(y)
    y_num = encoder.transform(y)
    return y_num
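For instance:

print(labels_to_categories(['positive', 'negative', 'positive', 'neutral']))
# [2 0 2 1]  (classes sort alphabetically: negative=0, neutral=1, positive=2)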
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def label_classes(df, estimated_var):
    le = LabelEncoder()
    le.fit(df[estimated_var].values)
    return le.classes_
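For example, with a toy frame:

import pandas as pd

df = pd.DataFrame({'species': ['cat', 'dog', 'cat']})
print(label_classes(df, 'species'))  # ['cat' 'dog']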
def __init__(self, classifier=None):
    if classifier:
        self.clf = classifier
    else:
        # newer scikit-learn renamed loss='log' to 'log_loss' and n_iter to max_iter
        self.clf = SGDClassifier(loss="log_loss", penalty="l2", shuffle=True, max_iter=2500)
    self.labels = preprocessing.LabelEncoder()
    self.feature_length = -1
import os

import imageio  # replaces the removed scipy.misc.imread
import numpy as np
from sklearn.preprocessing import LabelEncoder


def get_dataset():
    list_folder = os.listdir('data/')
    list_images = []
    for i in range(len(list_folder)):
        images = os.listdir('data/' + list_folder[i])
        for x in range(len(images)):
            image = [list_folder[i] + '/' + images[x], list_folder[i]]
            list_images.append(image)
    list_images = np.array(list_images)
    np.random.shuffle(list_images)
    print("before cleaning got: " + str(list_images.shape[0]) + " data")
    # Drop greyscale images (fewer than 3 channels)
    list_temp = []
    for i in range(list_images.shape[0]):
        image = imageio.imread('data/' + list_images[i, 0])
        if len(image.shape) < 3:
            continue
        list_temp.append(list_images[i, :].tolist())
    list_images = np.array(list_temp)
    print("after cleaning got: " + str(list_images.shape[0]) + " data")
    label = np.unique(list_images[:, 1]).tolist()
    # Encode the folder names (class labels) as integers
    list_images[:, 1] = LabelEncoder().fit_transform(list_images[:, 1])
    return list_images, np.unique(list_images[:, 1]).shape[0], label
def __do_label_encoding(self):
    df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    le = LabelEncoder()
    cross_feature_dict = self.__get_label_encode_dict()
    for _, new_feature_name in cross_feature_dict.items():  # iteritems() is Python 2 only
        # Fit on all three splits together so the encodings stay consistent
        to_be_stacked = [df_train[new_feature_name], df_testset1[new_feature_name], df_testset2[new_feature_name]]
        le.fit(pd.concat(to_be_stacked, axis=0))
        df_train[new_feature_name] = le.transform(df_train[new_feature_name])
        df_testset1[new_feature_name] = le.transform(df_testset1[new_feature_name])
        df_testset2[new_feature_name] = le.transform(df_testset2[new_feature_name])
    return
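The pattern above (fit once on the concatenation, then transform each split) is what keeps codes consistent across files; a minimal standalone sketch:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

train = pd.Series(['red', 'blue', 'red'])
test = pd.Series(['green', 'blue'])
le = LabelEncoder().fit(pd.concat([train, test], axis=0))
print(le.transform(train))  # [2 0 2]
print(le.transform(test))   # [1 0]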
def fit(self, column):
    self.encoder_ = LabelEncoder().fit(h2o_col_to_numpy(column))
    self.classes_ = self.encoder_.classes_
    return self
def __init__(self, multilabel=False):
    self.multilabel = multilabel
    if self.multilabel:
        self.le = MultiLabelBinarizer(sparse_output=True)
    else:
        self.le = LabelEncoder()
    self.from_classes = False
def __init__(self):
    self.label_encoder = preprocessing.LabelEncoder()
def test_autoclean_no_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})
    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])
    hand_cleaned_data = data.copy()
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)
    cleaned_data = autoclean(data)
    assert cleaned_data.equals(hand_cleaned_data)