# Common imports assumed by the example snippets collected below.  The snippets
# come from different projects; helpers such as prettyPrint, prettyPrintError,
# verboseON, string_kernel, METHODS, delete_low_freq_words, split_data and
# confuse are project-local utilities that are not shown here.
import numpy
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr
from minepy import MINE
from termcolor import colored
from sklearn import ensemble, neighbors, svm
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def predictAndTestKNN(X, y, Xtest, ytest, K=10, selectKBest=0):
    """
    Trains a K-NN classifier on the training data and evaluates it on both the training and the test data
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param selectKBest: The number of best features to select (0 disables feature selection)
    :type selectKBest: int
    :return: Two lists of the labels predicted for the training and test feature vectors
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K best features if enabled; the selector is fitted on the training
        # data only and then reused on the test data to avoid information leakage
        if selectKBest > 0:
            prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []
    return predicted, predicted_test
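# Usage sketch for predictAndTestKNN (the toy vectors below are hypothetical and
# the prettyPrint/prettyPrintError helpers must be importable from the project):
def _demo_predictAndTestKNN():
    Xtr = [[1, 0, 3], [0, 2, 1], [2, 1, 0], [1, 1, 1]]
    ytr = [0, 1, 0, 1]
    Xte = [[1, 0, 2], [0, 2, 2]]
    yte = [0, 1]
    # Returns the labels predicted for the training and the test vectors
    return predictAndTestKNN(Xtr, ytr, Xte, yte, K=3, selectKBest=2)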
def predictKFoldSVMSSK(X, y, kfold=10, subseqLength=3, selectKBest=0):
    """Classifies the data using Support Vector Machines with the SSK kernel and k-fold CV
    :param X: The list of text documents containing traces
    :type X: list
    :param y: The labels of documents in 'X'
    :type y: list
    :param kfold: The number of folds
    :type kfold: int (default: 10)
    :param subseqLength: Length of subsequence used by the SSK
    :type subseqLength: int (default: 3)
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        predicted = []
        # Retrieve Gram Matrix from string kernel
        if verboseON():
            prettyPrint("Generating Gram Matrix from documents", "debug")
        X_gram = string_kernel(X, X)
        y = numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel="precomputed")
        X_gram_new = SelectKBest(chi2, k=selectKBest).fit_transform(X_gram, y) if selectKBest > 0 else X_gram
        prettyPrint("Performing %s-fold CV on the %s best features" % (kfold, selectKBest))
        predicted = cross_val_predict(clf, X_gram_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
def predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=10):
    """
    Classifies the data using Support Vector Machines and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing the labels corresponding to feature vectors
    :type y: list
    :param kernel: The kernel used to elevate data into higher dimensionalities
    :type kernel: str
    :param C: The penalty parameter of the error term
    :type C: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel=kernel, C=C)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
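# Usage sketch for predictKFoldSVM (toy data is hypothetical; kfold=2 keeps both
# classes in every fold of this tiny set):
def _demo_predictKFoldSVM():
    X = [[1, 0], [0, 1], [2, 0], [0, 2], [3, 0], [0, 3]]
    y = [0, 1, 0, 1, 0, 1]
    return predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=2)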
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10):
    """
    Classifies the data using random forests and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing labels corresponding to the feature vectors
    :type y: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth the trees are allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
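# Usage sketch for predictKFoldRandomForest, mirroring the SVM sketch above
# (toy data is hypothetical; maxdepth=None lets the trees grow fully):
def _demo_predictKFoldRandomForest():
    X = [[1, 0], [0, 1], [2, 0], [0, 2], [3, 0], [0, 3]]
    y = [0, 1, 0, 1, 0, 1]
    return predictKFoldRandomForest(X, y, estimators=10, kfold=2)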
def MIC(x, y):
    # Maximal Information Coefficient of a single feature column x against y
    base = MINE()
    base.compute_score(x, y)
    # SelectKBest expects a (score, p-value) pair per feature, but MINE only
    # yields a score, so a dummy p-value of 0.5 is returned alongside it
    return base.mic(), 0.5
def de_ps(X, y):
    dim = X.shape[1]
    de = min(2000, dim)
    # Score each column by its Pearson correlation with y; the (r, p-value) pairs
    # are transposed into the (scores, p-values) tuple SelectKBest expects
    clf = SelectKBest(lambda X, Y: tuple(np.array([pearsonr(x, Y) for x in X.T]).T), k=de)
    clf.fit(X, y)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
def de_c2(X, y):
    dim = X.shape[1]
    de = min(2000, dim)
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
def de_mic(X, y):
    dim = X.shape[1]
    de = min(2000, dim)
    # MIC() scores a single column, so it is applied column-wise and the resulting
    # (score, p-value) pairs are handed to SelectKBest as a (scores, p-values) tuple
    clf = SelectKBest(lambda X, Y: tuple(np.array([MIC(x, Y) for x in X.T]).T), k=de)
    clf.fit(X, y)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
def de_f_and_p_value(X, y):
    """ F-scores and p-values via the ANOVA F-test (f_classif) """
    dim = X.shape[1]
    de = min(2000, dim)
    clf = SelectKBest(f_classif, k=de)
    clf.fit(X, y)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
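# Minimal usage sketch for the de_* selector factories above (toy data is
# hypothetical; chi2 in de_c2 additionally requires non-negative features):
def _demo_de_selectors():
    X_train = np.abs(np.random.randn(20, 50))
    y_train = np.random.randint(0, 2, size=20)
    X_test = np.abs(np.random.randn(5, 50))
    # Fit the selector on the training split only, then transform both splits
    reduce_fn = de_c2(X_train, y_train)
    X_train_new, X_test_new = reduce_fn(X_train, X_test)
    return X_train_new.shape, X_test_new.shape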
def new(method='centroid', n_features=8):
    # Clustering method
    nc = METHODS[method]
    # Orthogonal feature selector
    if n_features is None:
        n_features = 'all'
    selector = SelectKBest(f_classif, k=n_features)
    # NOTE: The last operation of the list must be a classifier or clustering model
    print(colored('Cluster model created', 'yellow'))
    return [selector, nc]
def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    feature_index = delete_low_freq_words(word_count, threshold)
    print(len(train_seq), len(feature_index))
    # Build a document-by-word frequency matrix over the retained vocabulary
    word_freq_matrix = np.zeros([len(train_seq), len(feature_index)])
    for (seq_idx, seq) in enumerate(train_seq):
        for word in seq:
            if word not in feature_index:
                continue
            word_idx = feature_index[word]
            word_freq_matrix[seq_idx][word_idx] += 1
    # Score every word with chi2; k="all" keeps all columns since only the scores are needed
    sk = SelectKBest(chi2, k="all")
    sk.fit_transform(csr_matrix(word_freq_matrix), y_train)
    score_list = sk.scores_
    word_score = {}
    for (feature, idx) in feature_index.items():
        word_score[feature] = score_list[idx]
    word_score = sorted(word_score.items(), key=lambda x: x[1], reverse=True)
    local_word_list = []
    for (word, score) in word_score[:num_words]:
        local_word_list.append(word)
    del word_freq_matrix
    return local_word_list
def k_best_features(self):
    # get total number of features.
    num_features = self.features.shape[1]
    feature_list = []
    # find k-best features, with k from 1 to num_features.
    for i in range(1, num_features + 1):
        skBest = SelectKBest(k=i)
        skBest.fit_transform(self.features, self.labels)
        # get boolean indices of the best features.
        k_features = skBest.get_support()
        # append the features to the feature list.
        feature_list += self.features.columns[k_features].tolist()
    return feature_list
def selectFeatures(k_features=5, *args):
    """
    Select the k best features using the SelectKBest class in sklearn.
    Inputs: k_features = number of features to select, args = (XTrain, yTrain)
    Returns: np array of the k selected features.
    """
    X, y = args
    skb = SelectKBest(k=k_features)
    return skb.fit_transform(X, y)
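# Usage sketch (hypothetical toy data); the training matrix and labels are passed
# positionally after k_features and unpacked from *args inside the function:
def _demo_selectFeatures():
    XTrain = np.random.rand(10, 6)
    yTrain = np.random.randint(0, 2, size=10)
    return selectFeatures(3, XTrain, yTrain)  # array of shape (10, 3)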
def featuresByChiSq(features, labels, nFeature=5000):
    chi2_model = SelectKBest(chi2, k=nFeature)
    dtm = chi2_model.fit_transform(features, labels)
    return dtm, chi2_model
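# Usage sketch: chi2 expects non-negative features, so term counts from a
# CountVectorizer fit the bill (the tiny corpus below is hypothetical):
def _demo_featuresByChiSq():
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = ["spam spam offer", "meeting agenda notes", "offer discount spam", "project meeting"]
    labels = [1, 0, 1, 0]
    counts = CountVectorizer().fit_transform(corpus)
    dtm, chi2_model = featuresByChiSq(counts, labels, nFeature=3)
    return dtm.shape  # (4, 3)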
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)
    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])
    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())
    clf.fit(X_train, y_train)
    confuse(y_test, clf.predict(X_test))
    print()
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Label anomalous samples 1 and normal samples 0, then print the names of the
    # topk features with the highest chi-squared scores
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Variant of the function above that takes pandas DataFrames and returns the
    # selected feature names as a warning string instead of printing them
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
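# Usage sketch for the DataFrame variant above (hypothetical data; chi2 requires
# non-negative feature values):
def _demo_analyseReasonWithXsqure():
    cols = ["featA", "featB", "featC"]
    anomalies = pd.DataFrame([[5, 0, 1], [6, 1, 0]], columns=cols)
    normals = pd.DataFrame([[0, 3, 1], [1, 4, 0]], columns=cols)
    return analyseReasonWithXsqure(anomalies, normals, topk=1, name=cols)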