def format_selector(selector, data, target):
    x_train, x_test, y_train, y_test = data_splitting.get_train_test(data, target)
    # Fit the model on the training split only
    data = data.drop(columns=[target])  # Remove the target column from the feature frame
    selector.fit(x_train, y_train)
    # Recover the names of the retained features
    support = selector.get_support(indices=True)  # Indices of the non-removed features
    features = [data.columns[i] for i in support]
    # Transform, format as a DataFrame, and return
    result = pd.DataFrame(selector.transform(data), columns=features)
    return result
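The `data_splitting` module referenced above is not included in these snippets; a minimal hypothetical stand-in, assuming it simply wraps scikit-learn's train_test_split, could look like this:

from sklearn.model_selection import train_test_split

def get_train_test(data, target, test_size=0.25, random_state=0):
    """Hypothetical stand-in: split a DataFrame into train/test X and y."""
    X = data.drop(columns=[target])
    y = data[target]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)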
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
Example source code for the Python class VarianceThreshold()
from sklearn.feature_selection import VarianceThreshold

def test_VarianceThreshold():
    '''
    Test the VarianceThreshold method.
    :return: None
    '''
    X = [[100, 1, 2, 3],
         [100, 4, 5, 6],
         [100, 7, 8, 9],
         [101, 11, 12, 13]]
    selector = VarianceThreshold(1)
    selector.fit(X)
    print("Variances is %s" % selector.variances_)
    print("After transform is %s" % selector.transform(X))
    print("The support is %s" % selector.get_support(True))
    print("After reverse transform is %s" %
          selector.inverse_transform(selector.transform(X)))
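For reference, the variances printed by this test can be checked by hand; VarianceThreshold computes the population variance (ddof=0) of each column:

import numpy as np

X = np.array([[100, 1, 2, 3],
              [100, 4, 5, 6],
              [100, 7, 8, 9],
              [101, 11, 12, 13]])
print(np.var(X, axis=0))  # [ 0.1875  13.6875  13.6875  13.6875]
# Only column 0 falls below the threshold of 1, so the other three columns survive.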
def feature_selection(self, data_set):
    """
    Remove features whose variance is below .5 * (1 - .5).
    :param data_set:
    :return:
    """
    sel = VarianceThreshold(threshold=(.5 * (1 - .5)))
    feature_set = sel.fit_transform(data_set)
    # Recover the original column indices of the surviving features by
    # matching columns between the reduced and the original matrix.
    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)
    check = {}
    for i in fea_index:
        check[attr_list[i]] = data_set[0][i]  # attr_list is assumed to be defined elsewhere
    print(check)
    return data_set
def feature_selection(self, data_set, feature_names):
    """
    Remove features whose variance is below .8 * (1 - .8).
    :param data_set:
    :return:
    """
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    feature_set = sel.fit_transform(data_set)
    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)
    check = {}
    for i in fea_index:
        check[feature_names[i]] = data_set[0][i]
    print(check)
    return feature_set, fea_index
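The p * (1 - p) thresholds used above follow the boolean-feature recipe from the scikit-learn docs: a Bernoulli feature that takes one value with probability p has variance p(1 - p), so VarianceThreshold(.8 * (1 - .8)) removes boolean columns that are nearly constant across more than 80% of samples. A minimal runnable sketch:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

col0 = np.array([0] * 9 + [1])   # same value in 90% of samples: variance .1 * .9 = 0.09
col1 = np.array([0, 1] * 5)      # balanced: variance .5 * .5 = 0.25
X = np.column_stack([col0, col1])

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))  # 0.16
print(sel.fit_transform(X).shape)  # (10, 1) -- only the balanced column survives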
def varianceFilter(train_data, train_classes, threshold):
    '''
    Variance filter
    '''
    #if True:
    #    return frequencyFilter(train_data, train_classes, threshold)
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    #y_train = train_classes
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    x_new = sel.fit_transform(x_train)
    return vectorizer.inverse_transform(sel.inverse_transform(x_new))
def varianceSelection(self, df, threshold=.8):
    if not isinstance(df, pandas.core.frame.DataFrame):
        logger.error('[%s] : [ERROR] Variance selection only possible on DataFrame, not %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
        sys.exit(1)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel.fit_transform(df)
    return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
def __init__(self, conf):
    UnsupervisedFeatureSelection.__init__(self, conf)
    self.projection = VarianceThreshold()

def createPipeline(self):
    # Remove features with null variance
    self.var_filter = VarianceThreshold()
    self.pipeline = Pipeline([
        ('var_filter', self.var_filter),
        ('projection', self.projection)])
def variance_threshold_selector(data, target):
    # Select the model; the threshold defaults to 0.0, i.e. only remove
    # features that have the same value in all samples
    selector = VarianceThreshold(0)
    # Fit, format, and return
    return format_selector(selector, data, target)
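A toy usage of variance_threshold_selector, assuming the hypothetical get_train_test stand-in sketched near the top of this page:

import pandas as pd

df = pd.DataFrame({'a': [1, 1, 1, 1],   # constant column: should be removed
                   'b': [1, 2, 3, 4],
                   'y': [0, 1, 0, 1]})
reduced = variance_threshold_selector(df, target='y')
print(reduced.columns.tolist())  # expected: ['b']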
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
def drop_features(X_train, X_test):
    # Drop some features from the get-go. No idea how these were found.
    X_train = X_train.drop(['F6', 'F26'], axis=1)
    X_test = X_test.drop(['F6', 'F26'], axis=1)
    # Drop additional low-variance features. This *may* be overfitting to the
    # test data, since the hyperparameters are different for train/test.
    X_train = VarianceThreshold(1.3).fit_transform(X_train)
    X_test = VarianceThreshold(1.25).fit_transform(X_test)
    return X_train, X_test
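Note that fitting a separate VarianceThreshold on each split, as above, can leave train and test with different surviving columns. A hedged alternative sketch (function name invented) that fits the selector on the training split only and reuses it for the test split:

from sklearn.feature_selection import VarianceThreshold

def drop_features_no_leak(X_train, X_test, threshold=1.3):
    # Drop the hand-picked columns first, as in the original.
    X_train = X_train.drop(['F6', 'F26'], axis=1)
    X_test = X_test.drop(['F6', 'F26'], axis=1)
    # Fit on train only, then reuse the fitted selector for test,
    # so both splits keep exactly the same columns.
    selector = VarianceThreshold(threshold)
    X_train = selector.fit_transform(X_train)
    X_test = selector.transform(X_test)
    return X_train, X_test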
Source file: synthetic_data_generator.py
Project: Machine-learning-for-cybersecurity
Author: Logo252
def remove_unneeded_features(samples):
    """
    Removes features that have the same value in all given data samples.
    :param samples: data samples
    :return: samples with updated features
    """
    selector = VarianceThreshold()
    reduced = selector.fit_transform(samples)
    # Integer indices of the non-removed features
    features = selector.get_support(indices=True)
    # Names of all non-removed features
    feature_names = [samples.columns[i] for i in features]
    return pd.DataFrame(reduced, columns=feature_names)
def fit_classifier(feat_dicts=None, y_true=None, weights=None):
    # clf = MultinomialNB()
    clf = LogisticRegression(class_weight='balanced')
    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('selection', VarianceThreshold()),
        ('classifier', clf)
    ])
    # cf. http://stackoverflow.com/questions/36205850/sklearn-pipeline-applying-sample-weights-after-applying-a-polynomial-feature-t
    pipeline.fit(feat_dicts, y_true, **{'classifier__sample_weight': weights})
    return pipeline
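A toy invocation of fit_classifier; the feature dicts, labels, and weights below are invented for illustration:

feat_dicts = [{'len': 3, 'caps': 1}, {'len': 7, 'caps': 0},
              {'len': 2, 'caps': 1}, {'len': 8, 'caps': 0}]
y_true = [1, 0, 1, 0]
weights = [1.0, 2.0, 1.0, 1.0]

pipeline = fit_classifier(feat_dicts, y_true, weights)
print(pipeline.predict([{'len': 4, 'caps': 1}]))  # e.g. array([1])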
def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0

    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)

    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']

    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")
        for symptom in symptoms:
            print(symptom)
            s_Y = Y[[x for x in cols if x[1] == symptom]]  # target columns for this symptom
            pipeline = Pipeline([
                ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
                #('standard_scale', StandardScaler()),
                ('estimator', Lasso()),
            ])
            param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
            model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4,
                                 verbose=2)
            model.fit(X, s_Y.values)
            print("dumping...")
            data_dir = 'data'
            cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
            c_length = {k: v for k, v in zip(cycles0.user_id.values, cycles0.expected_cycle_length)}
            dump(symptom, model, X_all, c_length, data['users'].user_id)
def test_zero_variance():
    # Test VarianceThreshold with default setting, zero variance.
    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True))

    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1, 2, 3]])
    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1], [0, 1]])
def test_variance_threshold():
    # Test VarianceThreshold with custom variance.
    for X in [data, csr_matrix(data)]:
        X = VarianceThreshold(threshold=.4).fit_transform(X)
        assert_equal((len(data), 1), X.shape)
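Both tests above reference a module-level `data` fixture omitted from this excerpt; in scikit-learn's own test suite it is, as far as I can tell, the small matrix below (its third column is constant, and only its last column has variance above .4):

data = [[0, 1, 2, 3, 4],
        [0, 2, 2, 3, 5],
        [1, 1, 2, 4, 0]]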
def remove_lv_features(model, X):
    r"""Remove low-variance features.

    Parameters
    ----------
    model : alphapy.Model
        Model specifications for removing features.
    X : numpy array
        The feature matrix.

    Returns
    -------
    X_reduced : numpy array
        The reduced feature matrix.

    References
    ----------
    You can find more information on low-variance feature selection here [LV]_.

    .. [LV] http://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold

    """
    logger.info("Removing Low-Variance Features")

    # Extract model parameters
    lv_remove = model.specs['lv_remove']
    lv_threshold = model.specs['lv_threshold']
    predict_mode = model.specs['predict_mode']

    # Remove low-variance features
    if lv_remove:
        logger.info("Low-Variance Threshold : %.2f", lv_threshold)
        logger.info("Original Feature Count : %d", X.shape[1])
        if not predict_mode:
            selector = VarianceThreshold(threshold=lv_threshold)
            selector.fit(X)
            support = selector.get_support()
            model.feature_map['lv_support'] = support
        else:
            support = model.feature_map['lv_support']
        X_reduced = X[:, support]
        logger.info("Reduced Feature Count : %d", X_reduced.shape[1])
    else:
        X_reduced = X
        logger.info("Skipping Low-Variance Features")

    return X_reduced
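A hedged sketch of driving the fit-then-reuse pattern above, using a hypothetical stand-in for alphapy.Model that exposes only the attributes the function reads (this also assumes the module-level logger is configured):

import numpy as np

class FakeModel:
    """Hypothetical stub exposing just what remove_lv_features uses."""
    def __init__(self):
        self.specs = {'lv_remove': True, 'lv_threshold': 0.1, 'predict_mode': False}
        self.feature_map = {}

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 5))
X_train[:, 2] = 1.0                    # constant column: should be removed
X_test = rng.normal(size=(20, 5))

model = FakeModel()
X_train_red = remove_lv_features(model, X_train)   # fits and stores 'lv_support'
model.specs['predict_mode'] = True
X_test_red = remove_lv_features(model, X_test)     # reuses the stored support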
def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0

    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)

    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']

    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")
        labels = final_labels['labels']
        for symptom in symptoms:
            print(symptom)
            s_Y = Y[[x for x in cols if x[1] == symptom]]
            pipeline = Pipeline([
                ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
                #('standard_scale', StandardScaler()),
                ('estimator', Lasso()),
            ])
            for cluster in range(3):  # number of clusters
                print(cluster)
                param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
                model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4,
                                     verbose=2)
                model.fit(X[labels == cluster], s_Y.values[labels == cluster])
                print("dumping...")
                data_dir = 'data'
                cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
                c_length = {k: v for k, v in zip(cycles0.user_id.values[labels == cluster],
                                                 cycles0.expected_cycle_length[labels == cluster])}
                dump(symptom, model, X_all[labels == cluster], c_length,
                     data['users'].user_id[labels == cluster])