def dim_red(self, col, method, params, kws, load_fit=None):
    """Reduce the dimensionality of ``self.df[col]`` and store the components.

    Each output component is written back to ``self.df`` under the key
    ``(method, '<n>')`` where ``<n>`` is the 1-based component number as a
    string.

    Args:
        col: Key used to select the input data from ``self.df``.
        method: One of ``'PCA'``, ``'FastICA'``, ``'t-SNE'``, ``'LLE'`` or
            ``'JADE-ICA'``. Also used as the first element of the new
            column keys.
        params: Positional arguments forwarded to the estimator constructor.
        kws: Keyword arguments forwarded to the estimator constructor.
        load_fit: Optional previously fitted estimator. When truthy it is
            used as-is (no new estimator is built or fitted) and must
            provide a ``transform`` method.

    Returns:
        The estimator stored in ``self.do_dim_red``.

    Raises:
        ValueError: If ``load_fit`` is not given and ``method`` is not one
            of the supported names.
    """
    if load_fit:
        # Reuse the supplied fit. The original code stored it but never
        # computed a result, so the column loop below raised NameError.
        self.do_dim_red = load_fit
        dim_red_result = self.do_dim_red.transform(self.df[col])
    else:
        if method == 'PCA':
            self.do_dim_red = PCA(*params, **kws)
        elif method == 'FastICA':
            self.do_dim_red = FastICA(*params, **kws)
        elif method == 't-SNE':
            self.do_dim_red = TSNE(*params, **kws)
        elif method == 'LLE':
            self.do_dim_red = LocallyLinearEmbedding(*params, **kws)
        elif method == 'JADE-ICA':
            self.do_dim_red = JADE(*params, **kws)
        else:
            # Previously an unknown name silently re-used whatever estimator
            # was left over from an earlier call.
            raise ValueError('Unknown dimensionality reduction method: %r' % (method,))

        if method == 't-SNE':
            # t-SNE is fitted and applied in a single step here.
            dim_red_result = self.do_dim_red.fit_transform(self.df[col])
        else:
            self.do_dim_red.fit(self.df[col])
            dim_red_result = self.do_dim_red.transform(self.df[col])

    # Column names are derived from the result width, so they stay sensible
    # for methods that are not configured through n_components.
    for index in range(dim_red_result.shape[1]):
        self.df[(method, str(index + 1))] = dim_red_result[:, index]

    return self.do_dim_red
python类FastICA()的实例源码
def test_ica(eng):
    """Compare the project's ``ICA`` with scikit-learn's ``FastICA``.

    A three-column mixture of two generated signals plus seeded noise is
    decomposed by both implementations; the normalized signals and mixing
    matrices must agree (via ``allclose_sign_permute``) within ``atol=1e-1``.
    """
    from sklearn.decomposition import FastICA

    def normalize_ICA(s, aT):
        # Rescale the signals by the column sums of the mixing matrix and
        # divide the mixing matrix by the same sums.
        mixing = aT.T
        column_sums = mixing.sum(axis=0)
        return s*column_sums, (mixing/column_sums).T

    # Build the mixed input: two source signals and their sum.
    time = linspace(0, 10, 100)
    source_a = sin(time)
    source_b = square(sin(2*time))
    mixed = c_[source_a, source_b, source_a+source_b]

    # Seeded noise keeps the test deterministic.
    random.seed(0)
    mixed += 0.001*random.randn(*mixed.shape)
    mixed = fromarray(mixed, engine=eng)

    # Reference decomposition with scikit-learn.
    reference = FastICA(n_components=2, fun='cube', random_state=0)
    ref_signals = reference.fit_transform(mixed.toarray())
    ref_signals, ref_mixing = normalize_ICA(ref_signals, reference.mixing_.T)

    # Decomposition under test.
    own_signals, own_mixing = ICA(k=2, svd_method='direct', max_iter=200, seed=0).fit(mixed)
    own_signals, own_mixing = normalize_ICA(own_signals, own_mixing)

    tol = 1e-1
    assert allclose_sign_permute(ref_signals, own_signals, atol=tol)
    assert allclose_sign_permute(ref_mixing, own_mixing, atol=tol)
def fit(self, x, y, i=0):
    """Fit ``self.model`` to ``(x, y)`` and record whether training succeeded.

    Args:
        x: Training inputs. When ``self.method[i] == 'GP'`` they are first
            reduced with FastICA or PCA (chosen by ``self.reduce_dim``)
            using ``self.n_components`` components.
        y: Training targets.
        i: Index into ``self.method`` selecting the method being trained.

    Side effects:
        Sets ``self.goodfit``; for GP also sets ``self.do_reduce_dim``; when
        ``self.ransac`` is truthy sets ``self.outliers`` from the model's
        ``inlier_mask_``.
    """
    is_gp = self.method[i] == 'GP'

    # If Gaussian processes are being used, the data dimensionality needs to
    # be reduced before fitting.
    if is_gp:
        if self.reduce_dim == 'FastICA':
            print('Reducing dimensionality with ICA')
            do_ica = FastICA(n_components=self.n_components)
            self.do_reduce_dim = do_ica.fit(x)
        if self.reduce_dim == 'PCA':
            print('Reducing dimensionality with PCA')
            do_pca = PCA(n_components=self.n_components)
            self.do_reduce_dim = do_pca.fit(x)
        x = self.do_reduce_dim.transform(x)

    print('Training model...')
    try:
        self.model.fit(x, y)
        self.goodfit = True
        print(self.model)
    except Exception:
        # Only ordinary errors are treated as a failed fit; KeyboardInterrupt
        # and SystemExit are allowed to propagate.
        self.goodfit = False
        if is_gp:
            print('Model failed to train! (For GP this does not always indicate a problem, especially for low numbers of components.)')
        else:
            print('Model failed to train!')
            # Show the exception that caused the failure (print_stack would
            # only show where this handler is running).
            traceback.print_exc()

    if self.ransac:
        self.outliers = np.logical_not(self.model.inlier_mask_)
        print(str(np.sum(self.outliers)) + ' outliers removed with RANSAC')
def ica(self, col, nc=None, load_fit=None):
    """Apply ICA to ``self.df[col]`` and store the components in ``self.df``.

    Component ``n`` (1-based) is written to ``self.df[('ICA', n)]``.

    Args:
        col: Key used to select the input data from ``self.df``.
        nc: Number of components. When truthy (and no ``load_fit`` is
            given) a new ``FastICA`` is created and fitted to the data.
        load_fit: Optional previously fitted estimator to use instead of
            fitting the current data. Takes precedence over ``nc``.

    If neither ``nc`` nor ``load_fit`` is given, the estimator already
    stored in ``self.do_ica`` is reused.
    """
    if load_fit:
        # Use a previous fit rather than fitting the current data; no need
        # to fit a throw-away estimator first.
        self.do_ica = load_fit
    elif nc:
        self.do_ica = FastICA(n_components=nc)
        self.do_ica.fit(self.df[col])

    ica_result = self.do_ica.transform(self.df[col])

    # Size the loop from the actual result: the estimator's n_components
    # attribute may be None or differ from the number of returned columns.
    for index in range(ica_result.shape[1]):
        self.df[('ICA', index + 1)] = ica_result[:, index]
def generate_icamodel(train_vocabulary='./vocabulary/vocabulary_nv_4w.txt',
                      model_path='./model/ICA/ica_ourword2vec.model'):
    """Fit a 300-component FastICA on word vectors and dump it with joblib.

    Args:
        train_vocabulary: Path to a text file with one vocabulary entry per
            line; the word is the text before the first space.
        model_path: Destination path for the dumped ICA model.

    Words missing from ``word2vec_model`` are printed and their row in the
    training matrix is left as zeros.
    """
    # Read the vocabulary with a context manager so the file is closed.
    with open(train_vocabulary, 'r') as vocab_file:
        train_vocab = [line.strip() for line in vocab_file]

    train_sample = np.zeros([len(train_vocab), 300])
    for row, entry in enumerate(train_vocab):
        word = entry.split(' ')[0]
        try:
            train_sample[row] = word2vec_model[word]
        except KeyError:
            # Out-of-vocabulary word: report it and keep the zero row.
            print(word)

    ica = FastICA(300, max_iter=800)
    ica.fit(train_sample)
    joblib.dump(ica, model_path)
def test_independent_component_analyzer(self):
    """Run this test case's ``standard_check`` against ``FastICA``."""
    self.standard_check(FastICA)
def ICA_results(data, n_comps=None):
    """Fit ``ICA`` on ``data`` and return a labelled result dictionary.

    Args:
        data: Input passed to the estimator's ``fit``.
        n_comps: Value forwarded as ``n_components``.

    Returns:
        The tuple ``('ICA', out_data)`` where ``out_data`` has the keys
        ``'model'`` (the return value of ``fit``) and
        ``'reconstruction error'``.
    """
    estimator = ICA(n_components=n_comps)
    fitted = estimator.fit(data)
    # NOTE(review): the 'reconstruction error' key actually holds the
    # estimator's components_; the key is kept unchanged for callers.
    return 'ICA', {'model': fitted, 'reconstruction error': estimator.components_}
def ReduceDimension(X):
    """Yield one ``('ICA', first, second)`` tuple for a 2-component FastICA.

    ``first`` and ``second`` are columns 0 and 1 of the result of
    ``FastICA(n_components=2).fit_transform(X)``.
    """
    from sklearn.decomposition import FastICA

    embedding = FastICA(n_components=2).fit_transform(X)
    first, second = embedding[:, 0], embedding[:, 1]
    yield 'ICA', first, second
#=================================================
def ReduceDimension(X):
    """Generator producing a single 2-D FastICA projection of ``X``.

    Yields:
        ``('ICA', x_r[:, 0], x_r[:, 1])`` where ``x_r`` is the output of a
        two-component ``FastICA.fit_transform`` on ``X``.
    """
    from sklearn.decomposition import FastICA

    projected = FastICA(n_components=2).fit_transform(X)
    component_columns = (projected[:, 0], projected[:, 1])
    yield ('ICA',) + component_columns
#=================================================
def preprocessing_inputs(strategy_dictionary, fitting_inputs_scaled):
    """Apply the preprocessing named in ``strategy_dictionary['preprocessing']``.

    ``'PCA'`` routes the inputs through ``pca_transform``; ``'FastICA'``
    routes them through ``fast_ica_transform`` (which may also return an
    updated strategy dictionary). Any other value leaves both unchanged.

    Returns:
        The tuple ``(fitting_inputs_scaled, strategy_dictionary)``.
    """
    mode = strategy_dictionary['preprocessing']

    if mode == 'PCA':
        fitting_inputs_scaled = pca_transform(fitting_inputs_scaled)
    elif mode == 'FastICA':
        fitting_inputs_scaled, strategy_dictionary = fast_ica_transform(
            strategy_dictionary, fitting_inputs_scaled)

    return fitting_inputs_scaled, strategy_dictionary
def fast_ica_transform(strategy_dictionary, fitting_inputs_scaled):
    """Transform the inputs with a default ``FastICA``, falling back on error.

    Args:
        strategy_dictionary: Strategy settings; mutated in place on failure.
        fitting_inputs_scaled: Data to fit and transform.

    Returns:
        ``(fitting_inputs_scaled, strategy_dictionary)``. On success the
        inputs are the ICA-transformed data. If fitting or transforming
        raises, the inputs are returned untouched and
        ``strategy_dictionary['preprocessing']`` is set to ``'None'``.
    """
    try:
        ica = FastICA()
        ica.fit(fitting_inputs_scaled)
        fitting_inputs_scaled = ica.transform(fitting_inputs_scaled)
    except Exception:
        # Best-effort: any ordinary failure disables the preprocessing step.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        strategy_dictionary['preprocessing'] = 'None'

    return fitting_inputs_scaled, strategy_dictionary
def _fit_local(self, data):
    """Run scikit-learn's FastICA on ``data``.

    NumPy's global random generator is seeded with ``self.seed`` first, and
    the same seed is passed as ``random_state``. The estimator uses
    ``self.k`` components, the ``"cube"`` function, ``self.max_iter`` and
    ``self.tol``.

    Returns:
        ``(signals, mixing_T)``: the output of ``fit_transform`` and the
        transpose of the fitted ``mixing_`` attribute.
    """
    from numpy import random
    from sklearn.decomposition import FastICA

    random.seed(self.seed)
    estimator = FastICA(
        n_components=self.k,
        fun="cube",
        max_iter=self.max_iter,
        tol=self.tol,
        random_state=self.seed,
    )
    sources = estimator.fit_transform(data)
    mixing_transposed = estimator.mixing_.T
    return sources, mixing_transposed
def __init__(
        self, n_iter=50, rank=None,
        auto_nuisance=True, n_nureg=None, nureg_zscore=True,
        nureg_method='PCA',
        baseline_single=False, logS_range=1.0, SNR_prior='exp',
        SNR_bins=21, rho_bins=20, tol=1e-4, optimizer='BFGS',
        minimize_options={'gtol': 1e-4, 'disp': False,
                          'maxiter': 20}, random_state=None,
        anneal_speed=10):
    """Store the constructor arguments and build the nuisance helpers.

    Most arguments are stored unchanged on the instance. Exceptions:

    * ``self.preprocess_residual`` is a callable: ``_zscore`` when
      ``nureg_zscore`` is truthy, otherwise the identity.
    * ``self.nureg_method`` is replaced by a factory taking the number of
      components and returning ``FactorAnalysis`` ('FA'), whitened ``PCA``
      ('PCA'), ``SparsePCA`` ('SPCA') or whitened ``FastICA`` ('ICA').
    * ``logS_range`` is converted to ``float`` when it is exactly an ``int``.

    Raises:
        AssertionError: If ``auto_nuisance`` is truthy and ``n_nureg`` is
            neither ``None`` nor a positive ``int``, or if ``SNR_prior`` is
            not one of 'unif', 'lognorm', 'exp'.
        ValueError: If ``nureg_method`` is not 'FA', 'PCA', 'SPCA' or 'ICA'.
    """
    # NOTE(review): minimize_options has a mutable default dict that is
    # stored as-is, so instances built with the default share one object.
    # NOTE(review): validation below uses assert, which is skipped under
    # ``python -O``.
    self.n_iter = n_iter
    self.rank = rank
    self.auto_nuisance = auto_nuisance
    self.n_nureg = n_nureg
    self.nureg_zscore = nureg_zscore

    if auto_nuisance:
        n_nureg_valid = (n_nureg is None) \
            or (isinstance(n_nureg, int) and n_nureg > 0)
        assert n_nureg_valid, \
            'n_nureg should be a positive integer or None if auto_nuisance is True.'

    # Residual pre-processing: z-score or pass through untouched.
    self.preprocess_residual = \
        (lambda x: _zscore(x)) if self.nureg_zscore else (lambda x: x)

    # Pick the factory that builds the nuisance-regressor decomposition.
    if nureg_method == 'FA':
        nuisance_factory = lambda x: FactorAnalysis(n_components=x)
    elif nureg_method == 'PCA':
        nuisance_factory = lambda x: PCA(n_components=x, whiten=True)
    elif nureg_method == 'SPCA':
        nuisance_factory = lambda x: SparsePCA(n_components=x,
                                               max_iter=20, tol=tol)
    elif nureg_method == 'ICA':
        nuisance_factory = lambda x: FastICA(n_components=x,
                                             whiten=True)
    else:
        raise ValueError('nureg_method can only be FA, PCA, '
                         'SPCA(for sparse PCA) or ICA')
    self.nureg_method = nuisance_factory

    self.baseline_single = baseline_single
    # Only an exact int is promoted to float; other types are kept.
    self.logS_range = \
        float(logS_range) if type(logS_range) is int else logS_range

    assert SNR_prior in ('unif', 'lognorm', 'exp'), \
        'SNR_prior can only be chosen from unif, lognorm and exp'
    self.SNR_prior = SNR_prior
    self.SNR_bins = SNR_bins
    self.rho_bins = rho_bins
    self.tol = tol
    self.optimizer = optimizer
    self.minimize_options = minimize_options
    self.random_state = random_state
    self.anneal_speed = anneal_speed
Jason_Liu_stack_res.py 文件源码
项目:Kaggle-Mercedes-Benz-Greener-Manufacturing-33th-Solution
作者: arvidzt
项目源码
文件源码
阅读 19
收藏 0
点赞 0
评论 0
def get_additional_features(train, test, magic=False, ID=False, n_comp=12):
    """Append decomposition/projection features to ``train`` and ``test``.

    Five reducers (TruncatedSVD, PCA, FastICA, GaussianRandomProjection,
    SparseRandomProjection) are fitted on ``train`` using the columns of
    ``test`` and applied to both frames. Component ``i`` (1-based) of each
    reducer is stored as ``tsvd_i``, ``pca_i``, ``ica_i``, ``grp_i`` and
    ``srp_i``, added in that interleaved order.

    Args:
        train: Training frame; new columns are added in place.
        test: Test frame; its columns define the reducer inputs.
        magic: When truthy, add a ``magic`` column holding the mean of ``y``
            per ``X0`` value computed on ``train``; ``X0`` values of
            ``test`` not seen in ``train`` get the mean of those group
            means.
        ID: When falsy, the ``ID`` column is excluded from the reducer
            inputs.
        n_comp: Number of components per reducer (default 12, the value
            that was previously hard-coded).

    Returns:
        The tuple ``(train, test)``. With ``magic`` these are new merged
        frames; otherwise they are the frames that were passed in.
    """
    col = list(test.columns)
    # Guard the removal so a frame without an 'ID' column does not raise.
    if not ID and 'ID' in col:
        col.remove('ID')

    reducers = [
        ('tsvd', TruncatedSVD(n_components=n_comp, random_state=420)),
        ('pca', PCA(n_components=n_comp, random_state=420)),
        ('ica', FastICA(n_components=n_comp, random_state=420)),
        ('grp', GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)),
        ('srp', SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)),
    ]

    # Fit every reducer before any column is added to the frames.
    projections = []
    for prefix, reducer in reducers:
        train_part = reducer.fit_transform(train[col])
        test_part = reducer.transform(test[col])
        projections.append((prefix, train_part, test_part))

    # Component index is the outer loop so the column order matches the
    # original interleaving (tsvd_1, pca_1, ..., srp_1, tsvd_2, ...).
    for index in range(n_comp):
        for prefix, train_part, test_part in projections:
            name = prefix + '_' + str(index + 1)
            train[name] = train_part[:, index]
            test[name] = test_part[:, index]

    if magic:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({'X0': magic_mat.index, 'magic': list(magic_mat)})
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        # X0 values absent from train have no group mean; use the fallback.
        test['magic'] = test['magic'].fillna(mean_magic)

    return train, test
## Preparing the stacking functions. Each one takes the out-of-bag values as its input.
## xgb is not used in this case, but it is still included here.
def process(self, obj_data):
    '''
    Perform component analysis on data:

    Results are added to the data wrapper as a dictionary with
    results['start_date'] = start of the analysed slice
    results['end_date'] = end of the analysed slice
    results['CA'] = fitted decomposition object (FastICA or PCA), or None
    results['Projection'] = output of fit_transform on the data, or None

    'CA' and 'Projection' are None when no data was selected.

    @param obj_data: Data wrapper containing the data
    '''
    num_components = self.ap_paramList[0]()
    component_type = self.ap_paramList[1]()
    start_time = self.ap_paramList[2]()
    end_time = self.ap_paramList[3]()

    results = dict()
    results['start_date'] = start_time
    results['end_date'] = end_time

    # The label filter is an optional fifth parameter.
    if len(self.ap_paramList) >= 5:
        label_names = self.ap_paramList[4]()
    else:
        label_names = None

    # Identity test against None: '== None' is evaluated element-wise for
    # array-like label_names and then fails in the boolean expression.
    cut_data = np.array([
        data[start_time:end_time]
        for label, data, err in obj_data.getIterator()
        if label_names is None or label in label_names
    ])

    if len(cut_data) > 0:
        if component_type == 'ICA':
            ca = FastICA(n_components=num_components)
        else:
            ca = PCA(n_components=num_components)

        # The collected series are transposed before being decomposed.
        time_projection = ca.fit_transform(cut_data.T)
        results['CA'] = ca
        results['Projection'] = time_projection
    else:
        results['CA'] = None
        results['Projection'] = None

    obj_data.addResult(self.str_description, results)
def process(self, obj_data):
    '''
    Perform component analysis on data

    Results are added to the data wrapper as a dictionary with
    results['start_date'] = start of the analysed slice
    results['end_date'] = end of the analysed slice
    results['CA'] = fitted decomposition object (FastICA or PCA), or None
    results['Projection'] = output of fit_transform on the data, or None
    results['labels'] = one label per collected column

    @param obj_data: Data wrapper
    '''
    component_type, start_time, end_time = (
        param() for param in self.ap_paramList[:3])
    num_components = self.n_components

    # One (label, series) pair per label and configured column.
    selected = [
        (label, data.loc[start_time:end_time, column])
        for label, data in obj_data.getIterator()
        for column in self.column_names
    ]
    label_list = [label for label, _ in selected]
    cut_data = np.array([series for _, series in selected])

    results = {'start_date': start_time, 'end_date': end_time}

    if len(cut_data) > 0:
        if component_type == 'ICA':
            ca = FastICA(n_components=num_components)
        else:
            ca = PCA(n_components=num_components)
        # The collected series are transposed before being decomposed.
        results['CA'] = ca
        results['Projection'] = ca.fit_transform(cut_data.T)
    else:
        results['CA'] = None
        results['Projection'] = None

    results['labels'] = label_list
    obj_data.addResult(self.str_description, results)