def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Label anomalous samples 1 and normal samples 0, then keep the
    # topk features with the highest chi-squared scores and print their names.
    data = pd.concat([anamolySample, normalSample])  # DataFrame.append was removed in pandas 2.0
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    name = list(data.columns)  # note: this shadows the unused `name` parameter
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
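A minimal usage sketch, assuming pandas is imported as pd and SelectKBest/chi2 come from sklearn.feature_selection; the two frames and their columns are hypothetical:

anomalies = pd.DataFrame({'cpu': [90, 95, 97], 'mem': [10, 12, 11]})
normals = pd.DataFrame({'cpu': [20, 25, 22], 'mem': [11, 13, 12]})
# prints the single column whose distribution differs most between the groups
analyseReasonWithXsqure(anomalies, normals, 1, None)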
Python SelectKBest() usage examples
Source: onlinedetectWithlittleData.py — project onlineDetectForHadoop, author DawnsonLi
def test_SelectKBest():
    '''
    Test the SelectKBest method.
    :return: None
    '''
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [3, 3, 3, 3, 3],
         [1, 1, 1, 1, 1]]
    y = [0, 1, 0, 1]
    print("before transform:", X)
    selector = SelectKBest(score_func=f_classif, k=3)
    selector.fit(X, y)
    print("scores_:", selector.scores_)
    print("pvalues_:", selector.pvalues_)
    print("selected index:", selector.get_support(True))
    print("after transform:", selector.transform(X))
Source: birchForChangeWindowSize.py — project onlineDetectForHadoop, author DawnsonLi
def analyseReasonWithXsqure(anamolySample, normalSample, topk):
    # Same chi-squared selection as above, deriving feature names from the columns.
    data = pd.concat([anamolySample, normalSample])
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    name = list(data.columns)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
Source: feat_regress.py — project Stock-Market-Analysis-and-Prediction, author samshara
def select_kbest_reg(data_frame, target, k=5):
    """
    Selecting K-Best features for regression
    :param data_frame: A pandas DataFrame with the training data
    :param target: target variable name in DataFrame
    :param k: desired number of features from the data
    :returns feature_scores: scores for each feature in the data as a
             pandas DataFrame
    """
    feat_selector = SelectKBest(f_regression, k=k)
    _ = feat_selector.fit(data_frame.drop(target, axis=1), data_frame[target])
    feat_scores = pd.DataFrame()
    feat_scores["F Score"] = feat_selector.scores_
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.drop(target, axis=1).columns
    return feat_scores
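A hedged usage sketch for select_kbest_reg; the DataFrame and the target column 'y' here are made up:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(50, 3), columns=['a', 'b', 'c'])
df['y'] = 2.0 * df['a'] + 0.1 * rng.rand(50)  # 'a' should dominate the F scores
scores = select_kbest_reg(df, 'y', k=2)
print(scores.sort_values('F Score', ascending=False))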
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'],
                   'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
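set_grid and build_param_grid here appear to come from the searchgrid package; under that assumption, the built grid plugs straight into GridSearchCV, reusing the estimator constructed in the test above (X_train and y_train are placeholders):

from sklearn.model_selection import GridSearchCV

search = GridSearchCV(estimator, build_param_grid(estimator), cv=3)
search.fit(X_train, y_train)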
def select_percentile_selector(data, target):
    # Select Model
    selector = SelectPercentile(percentile=75)  # default is 10%
    # Fit, Format, and Return
    return format_selector(selector, data, target)

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
def de_c2(X, y):
    """ chi2 """
    dim = X.shape[1]
    de = min(2000, dim)
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func

# def de_mic(X, y):
#     """ MIC """
#     dim = X.shape[1]
#     de = min(2000, dim)
#     clf = SelectKBest(MIC, k=de)
#     clf.fit(X, y)
#     def _func(X1, X2):
#         return clf.transform(X1), clf.transform(X2)
#     return _func
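A sketch of using the closure returned by de_c2; the data here is synthetic, and the features are made non-negative because chi2 requires it:

import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=50, random_state=0)
X = np.abs(X)  # chi2 rejects negative feature values
transform = de_c2(X[:80], y[:80])       # fit the selector on the training split
X_tr, X_te = transform(X[:80], X[80:])  # apply the same selection to both splits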
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Variant that takes feature names as a parameter and returns a
    # "; "-joined warning string instead of printing each name.
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print('x2:', warnstr)
    return warnstr
Source: feat_select.py — project Stock-Market-Analysis-and-Prediction, author samshara
def select_kbest_clf(data_frame, target, k=4):
    """
    Selecting K-Best features for classification
    :param data_frame: A pandas DataFrame with the training data
    :param target: target variable name in DataFrame
    :param k: desired number of features from the data
    :returns feature_scores: scores for each feature in the data as a
             pandas DataFrame
    """
    feat_selector = SelectKBest(f_classif, k=k)
    _ = feat_selector.fit(data_frame.drop(target, axis=1), data_frame[target])
    feat_scores = pd.DataFrame()
    feat_scores["F Score"] = feat_selector.scores_
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.drop(target, axis=1).columns
    return feat_scores
def reduceDimensionality(X, y, method="selectkbest", targetDim=10):
    """ Reduces the dimensionality of [X] to [targetDim] """
    try:
        # Check for the required methodology first
        if method.lower() == "selectkbest":
            prettyPrint("Selecting %s best features from dataset" % targetDim, "debug")
            kBestSelector = SelectKBest(k=targetDim)
            X_new = kBestSelector.fit_transform(X, y).tolist()
        elif method.lower() == "pca":
            prettyPrint("Extracting %s features from dataset using PCA" % targetDim, "debug")
            pcaExtractor = PCA(n_components=targetDim)
            # Make sure vectors in X are positive
            X_new = pcaExtractor.fit_transform(X, y).tolist()
        else:
            prettyPrint("Unknown dimensionality reduction method \"%s\"" % method, "warning")
            return X
    except Exception as e:
        prettyPrint("Error encountered in \"reduceDimensionality\": %s" % e, "error")
        return X
    # Return the reduced dataset
    return X_new
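A hedged call sketch, assuming the project's prettyPrint helper is importable; the data is synthetic:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(40, 30).tolist()
y = rng.randint(0, 2, 40).tolist()
X_small = reduceDimensionality(X, y, method="selectkbest", targetDim=10)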
def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Create and save vectorizers and feature selectors on given train data.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of vectorizer/selector to save file
        model_file: model filename
        ngram_range_: range of n-grams
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True, analyzer=analyzer_type)
    X_train = vectorizer.fit_transform(train_data)
    if max_num_features < X_train.shape[1]:
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
        print('creating', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    else:
        data_struct = {'vectorizer': vectorizer}
        print('creating', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    return
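A hedged sketch of reading the saved artefact back at prediction time, assuming the same pickle layout written above; model_file and ind mirror the parameters of ngrams_selection, and new_texts is a placeholder list of strings:

import pickle

with open(model_file + '_ngrams_vect_' + ind + '.bin', 'rb') as f:
    data_struct = pickle.load(f)
features = data_struct['vectorizer'].transform(new_texts)
if 'selector' in data_struct:
    features = data_struct['selector'].transform(features)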
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes
    if (x_train.shape[1] < topK):
        topK = x_train.shape[1]
    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)
    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
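A small usage sketch; the dict-encoded features and labels are invented, and chi2 assumes non-negative values:

train_data = [{'len': 4, 'caps': 1}, {'len': 2, 'caps': 0}, {'len': 5, 'caps': 2}]
train_classes = [1, 0, 1]
# returns the surviving features as a list of dicts, one per sample
print(chiSquare(train_data, train_classes, topK=1))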
Source: feature_selection.py — project MultimodalAutoencoder, author natashamjaques
def transform_select_K_best(X_train, Y_train, X_all, K=100):
    """Selects the best K features given the training data.

    Args:
        X_train: A matrix containing training data
        Y_train: Classification labels for the training data
        X_all: A matrix containing all the data
        K: The number of features to select
    """
    skb = SelectKBest(f_classif, k=K)
    skb.fit(X_train, Y_train)
    return skb.transform(X_all)
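A quick sketch, assuming f_classif is imported from sklearn.feature_selection; the data is synthetic:

import numpy as np

rng = np.random.RandomState(0)
X_all = rng.rand(30, 10)
y_train = rng.randint(0, 2, 20)
X_reduced = transform_select_K_best(X_all[:20], y_train, X_all, K=5)
print(X_reduced.shape)  # (30, 5)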
def test_make_pipeline():
    t1 = SelectKBest()
    t2 = SelectKBest()
    t3 = SelectKBest()
    t4 = SelectKBest()
    t5 = SelectPercentile()
    t6 = SelectKBest()
    t7 = SelectKBest()
    t8 = SelectKBest()
    t9 = SelectPercentile()
    in_steps = [[t1, None],
                [t2, t3],
                [t4, t5],  # mixed
                t6,
                [None, t7],
                [t8, None, t9],  # mixed
                None]
    pipe = make_pipeline(*in_steps, memory='/path/to/nowhere')
    union = make_union(*in_steps)
    for est, est_steps in [(pipe, pipe.steps),
                           (union, union.transformer_list)]:
        names, steps = zip(*est_steps)
        assert names == ('selectkbest-1', 'selectkbest-2', 'alt-1',
                         'selectkbest-3', 'selectkbest-4', 'alt-2', 'nonetype')
        assert steps == (t1, t2, t4, t6, None, t8, None)
        assert len(est._param_grid) == 5
        assert est._param_grid[names[0]] == [t1, None]
        assert est._param_grid[names[1]] == [t2, t3]
        assert est._param_grid[names[2]] == [t4, t5]
        assert est._param_grid[names[4]] == [None, t7]
        assert est._param_grid[names[5]] == [t8, None, t9]
    assert type(pipe) is Pipeline
    assert type(union) is FeatureUnion
    assert pipe.memory == '/path/to/nowhere'
Source: analysis_utilities.py — project oss-github-analysis-project, author itu-oss-project-team
def export_best_feature_names(self, df, labels, out_folder_path, k):
    columns, repos, observations = self.decompose_df(df)
    feature_scores = SelectKBest(chi2, k=k).fit(observations, labels).scores_
    feature_scores = np.nan_to_num(feature_scores)
    k_best_features = np.argpartition(feature_scores.ravel(), (-1) * k)[(-1) * k:]
    k_best_feature_names = columns[k_best_features]
    out_file_path = os.path.join(out_folder_path, "feature_selection.txt")
    with open(out_file_path, "w") as output_file:
        for feature_name in k_best_feature_names:
            output_file.write(feature_name + "\n")
def __init__(self, selector, return_array=False):
    '''
    Wrap a feature selector from sklearn so it can be used in this framework.
    selector: a selector from sklearn.feature_selection, e.g.
        sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=4).
    return_array: if True, transform returns a numpy.ndarray; if False, it
        returns the same type as X.
    Attributes:
        selector_: the fitted selector.
    '''
    BaseSelector.__init__(self, return_array=return_array)
    self.selector = selector
def test():
    # Construct test data
    np.random.seed(13)
    X = pd.DataFrame(np.random.randn(20, 10))
    X.columns = ['x%d' % i for i in range(10)]
    y = pd.Series(np.random.choice([0, 1], 20))
    # Wrap selectors taken directly from sklearn
    clf_sklearn = feature_selection.SelectKBest(feature_selection.f_classif, k=4)
    clf = SklearnSelector(estimator=clf_sklearn)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)
    clf_sklearn = SelectFromModel(LogisticRegression())
    clf = SklearnSelector(estimator=clf_sklearn)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)
    # Selection by voting
    clf_selectkbest = feature_selection.SelectKBest(feature_selection.f_classif, k=4)
    clf_selectfrommodel = SelectFromModel(LogisticRegression())
    clf_baseselector = SklearnSelector(clf_selectkbest)
    clf = VotingSelector(selectors=[('clf_selectkbest', clf_selectkbest),
                                    ('clf_selectfrommodel', clf_selectfrommodel),
                                    ('clf_baseselector', clf_baseselector)],
                         threshold=0.5)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)
    print(clf.df_voting)
    print(clf.score)
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(mutual_info_classif, k=conf.num_components)

def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(chi2, k=conf.num_components)

def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(f_classif, k=conf.num_components)
def getFeature():
    # Each line of "data" is expected to look like:
    #   <rating> <col>:<value> <col>:<value> ...
    # Ratings >= 7 become positive samples, <= 4 negative; the rest are skipped.
    fileData = open("data")
    row = []
    col = []
    data = []
    evalRes = []
    rowIndex = -1
    fileList = fileData.readlines()
    random.shuffle(fileList)
    for line in fileList:
        line = line.rstrip('\n')
        dataList = re.split(' |:', line)
        if int(dataList[0]) >= 7:
            evalRes.append(1)
        else:
            if int(dataList[0]) <= 4:
                evalRes.append(-1)
            else:
                continue
        del dataList[0]
        rowIndex = rowIndex + 1
        row.extend([rowIndex] * int(len(dataList) / 2))
        col.extend(map(int, dataList[::2]))
        data.extend(map(int, dataList[1::2]))
    featureMatrix = csr_matrix((data, (row, col)))
    featureMNew = SelectKBest(chi2, k=20000).fit_transform(featureMatrix, evalRes)
    return featureMNew, evalRes
def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]
    # one_hot_encoding because it doesn't work in pipeline for some reason
    # for f in categorical_features:
    #     dummies = pd.get_dummies(df[f], prefix=f)
    #     for dummy in dummies.columns:
    #         df[dummy] = dummies[dummy]
    #         features.append(dummy)
    #     df = df.drop(f, 1)
    #     features.remove(f)
    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),  # Imputer was removed in sklearn 0.22; use SimpleImputer on newer versions
        ('feature_selection', SelectKBest(k=5)),
        ("forest", RandomForestClassifier())])
    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    print("Random Forest score: %f" % score)
    print("confusion_matrix :\n%s" % cm)
    return clf
def make_predictions_random_forest(df, features, target, split=0.70):
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))
    # print("unused features: ", '\n\t\t'.join([f for f in df.columns if f not in features]))
    # print("columns: ", '\n\t\t'.join(df.columns))
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]
    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=200)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1, min_samples_split=10, n_estimators=60, max_depth=None, criterion='gini'))])
    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    # print(classification_report(test[target], predicted))
    return score, cm
# Utility function to report best scores
def select_k_best_selector(data, target):
    # Select Model
    selector = SelectKBest(k=3)  # default is 10 features
    # Fit, Format, and Return
    return format_selector(selector, data, target)
Source: xgb_classification.py — project jingjuSingingPhraseMatching, author ronggong
def buildEstimators(mode):
    if mode == 'train' or mode == 'cv':
        # best parameters found by GridSearchCV, best score: 1
        estimators = [('anova_filter', SelectKBest(f_classif, k='all')),
                      ('xgb', xgb.XGBClassifier(learning_rate=0.1, n_estimators=300, max_depth=3))]
        clf = Pipeline(estimators)
    elif mode == 'test':
        # pickles must be opened in binary mode on Python 3
        clf = pickle.load(open(join(classifier_path, "xgb_classifier.plk"), "rb"))
    return clf
def main():
    from sklearn import svm
    from sklearn.datasets import samples_generator
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.preprocessing import MinMaxScaler
    X, y = samples_generator.make_classification(n_samples=1000, n_informative=5, n_redundant=4,
                                                 random_state=_random_state)
    anova_filter = SelectKBest(f_regression, k=5)
    scaler = MinMaxScaler()
    clf = svm.SVC(kernel='linear')
    steps = [scaler, anova_filter, clf]
    cached_run(steps, X, y)
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()
    data = read_semeval_regression(args.input, encoding='windows-1252')
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])
    test = read_test_data(args.test, encoding='windows-1252')
    regressor = pipeline.fit(data[0], data[1])
    y = regressor.predict(test[2])
    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def select_feats(X, y, nb_features, nb_features_to_keep=2048):
    X, y = preproc_for_sklearn(X, y, nb_features)
    if nb_features < nb_features_to_keep:
        nb_features_to_keep = nb_features_to_keep // 4  # integer division: k must be an int
    feature_selector = SelectKBest(chi2, k=nb_features_to_keep).fit(X, y)
    selected_indices = feature_selector.get_support(indices=True)
    return selected_indices
def predictKFoldKNN(X, y, K=10, kfold=10, selectKBest=0):
    """
    Classifies the data using K-nearest neighbors and k-fold CV
    :param X: The list of feature vectors
    :type X: list
    :param y: The list of labels corresponding to the feature vectors
    :type y: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param kfold: The number of folds in the CV
    :type kfold: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
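A usage sketch with synthetic data; the features are made non-negative since the chi2 branch requires it:

from sklearn.datasets import make_classification
import numpy

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
X = numpy.abs(X)
preds = predictKFoldKNN(X.tolist(), y.tolist(), K=5, kfold=5, selectKBest=10)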