import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Label anomalous samples 1 and normal samples 0, then print the topk
    # features with the highest chi-squared scores.
    data = pd.concat([anamolySample, normalSample])
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    name = list(data.columns)  # note: shadows the unused `name` parameter
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
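A minimal invocation sketch for the function above (the toy DataFrames and column values are illustrative assumptions, not part of the original project; chi2 requires non-negative inputs):

anomaly = pd.DataFrame({"cpu": [90, 85, 95], "mem": [70, 75, 80], "io": [1, 2, 1]})
normal = pd.DataFrame({"cpu": [10, 15, 12], "mem": [30, 25, 35], "io": [1, 2, 2]})
analyseReasonWithXsqure(anomaly, normal, 2, None)  # prints the 2 top-scoring feature names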
Python chi2() usage examples (source code)
onlinedetectWithlittleData.py (project: onlineDetectForHadoop, author: DawnsonLi)
birchForChangeWindowSize.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithXsqure(anamolySample, normalSample, topk):
    # Same chi-squared ranking as above, deriving feature names from the columns.
    data = pd.concat([anamolySample, normalSample])
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    name = list(data.columns)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
from sklearn.feature_selection import SelectKBest, chi2

def de_c2(X, y):
    """chi2-based reduction: fit SelectKBest on (X, y) and return a closure
    that applies the same column selection to two new matrices."""
    dim = X.shape[1]
    de = min(2000, dim)  # keep at most 2000 features
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)

    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
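A hypothetical usage of the returned closure (shapes and data are assumptions): fit the selector once on training data, then apply the identical reduction to both splits.

import numpy as np

X_train = np.random.randint(0, 5, size=(100, 3000))
y_train = np.random.randint(0, 2, size=100)
X_test = np.random.randint(0, 5, size=(40, 3000))

transform = de_c2(X_train, y_train)
X_train_red, X_test_red = transform(X_train, X_test)
print(X_train_red.shape)  # (100, 2000): at most min(2000, dim) columns survive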
# def de_mic(X, y):
#     """ MIC """
#     dim = X.shape[1]
#     de = min(2000, dim)
#     clf = SelectKBest(MIC, k=de)
#     clf.fit(X, y)
#     def _func(X1, X2):
#         return clf.transform(X1), clf.transform(X2)
#     return _func
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Variant that reports the selected feature names as a warning string.
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    data = pd.concat([anamolySample, normalSample])
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(len(name)):
        if outcome[i]:
            warnstr += name[i] + " ; "
    print('x2:', warnstr)
    return warnstr
def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Create and save vectorizers and feature selectors on given train data.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of vectorizer/selector to use in the saved filename
        model_file: model filename prefix
        ngram_range_: range of n-grams
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer, 'word' or 'char'

    Returns:
        nothing
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True,
                                 analyzer=analyzer_type)
    X_train = vectorizer.fit_transform(train_data)

    # Only add a chi2 selector when there are more features than requested.
    if max_num_features < X_train.shape[1]:
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
    else:
        data_struct = {'vectorizer': vectorizer}

    print('creating', model_file + '_ngrams_vect_' + ind + '.bin')
    with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
        pickle.dump(data_struct, f)
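A sketch of the matching load/apply step (the filename follows the save path above; the helper name ngrams_transform is an assumption, not part of the source):

import pickle

def ngrams_transform(texts, ind, model_file):
    # Load a saved vectorizer (and selector, if one was stored) and apply both.
    with open(model_file + '_ngrams_vect_' + ind + '.bin', 'rb') as f:
        data_struct = pickle.load(f)
    X = data_struct['vectorizer'].transform(texts)
    if 'selector' in data_struct:
        X = data_struct['selector'].transform(X)
    return X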
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes
    # Clamp topK to the actual number of features.
    if x_train.shape[1] < topK:
        topK = x_train.shape[1]
    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)
    # Map the selected columns back to their original feature names/values.
    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
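Example input for chiSquare (hypothetical data): each sample is a feature dict, which is what DictVectorizer expects.

train_data = [
    {"word=goal": 2, "word=score": 1},
    {"word=rain": 3, "word=cloud": 1},
]
train_classes = ["sport", "weather"]
print(chiSquare(train_data, train_classes, topK=2))  # dicts holding only the 2 selected features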
analysis_utilities.py (project: oss-github-analysis-project, author: itu-oss-project-team)
def export_best_feature_names(self, df, labels, out_folder_path, k):
    columns, repos, observations = self.decompose_df(df)
    feature_scores = SelectKBest(chi2, k=k).fit(observations, labels).scores_
    feature_scores = np.nan_to_num(feature_scores)
    k_best_features = np.argpartition(feature_scores.ravel(), -k)[-k:]
    k_best_feature_names = columns[k_best_features]

    out_file_path = os.path.join(out_folder_path, "feature_selection.txt")
    with open(out_file_path, "w") as output_file:
        for feature_name in k_best_feature_names:
            output_file.write(feature_name + "\n")
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(chi2, k=conf.num_components)
def getFeature():
    # Each line of "data": an integer rating followed by "column:count" pairs.
    row, col, data = [], [], []
    evalRes = []
    rowIndex = -1
    with open("data") as fileData:
        fileList = fileData.readlines()
    random.shuffle(fileList)
    for line in fileList:
        line = line.rstrip('\n')
        dataList = re.split(' |:', line)
        # Ratings >= 7 are positive, <= 4 negative; 5-6 are skipped.
        if int(dataList[0]) >= 7:
            evalRes.append(1)
        elif int(dataList[0]) <= 4:
            evalRes.append(-1)
        else:
            continue
        del dataList[0]
        rowIndex += 1
        row.extend([rowIndex] * (len(dataList) // 2))
        col.extend(map(int, dataList[::2]))
        data.extend(map(int, dataList[1::2]))
    featureMatrix = csr_matrix((data, (row, col)))
    featureMNew = SelectKBest(chi2, k=20000).fit_transform(featureMatrix, evalRes)
    return featureMNew, evalRes
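The "data" file is expected to hold one sample per line: a leading integer rating followed by space-separated column:count pairs (ratings >= 7 become label 1, <= 4 become -1, and 5-6 are skipped). A hypothetical two-line file could be written like this; note that k=20000 in getFeature assumes a real corpus with at least that many feature columns:

with open("data", "w") as f:
    f.write("8 0:1 5:2 17:1\n")  # rating 8 -> positive sample
    f.write("3 0:2 9:1\n")       # rating 3 -> negative sample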
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')
    regressor = pipeline.fit(data[0], data[1])
    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def select_feats(X, y, nb_features, nb_features_to_keep=2048):
    X, y = preproc_for_sklearn(X, y, nb_features)

    # If there are fewer features than the target, keep a quarter of the target.
    if nb_features < nb_features_to_keep:
        nb_features_to_keep = nb_features_to_keep // 4  # integer k for SelectKBest

    feature_selector = SelectKBest(chi2, k=nb_features_to_keep).fit(X, y)
    return feature_selector.get_support(indices=True)
def predictKFoldKNN(X, y, K=10, kfold=10, selectKBest=0):
    """
    Classifies the data using K-nearest neighbors and k-fold CV
    :param X: The list of feature vectors
    :type X: list
    :param y: The list of labels corresponding to the feature vectors
    :type y: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param kfold: The number of folds in the CV
    :type kfold: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
def predictAndTestKNN(X, y, Xtest, ytest, K=10, selectKBest=0):
    """
    Trains a K-NN using the training data and tests it using the test data
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the training (validation) and test predictions
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K Best features if enabled. Note that the selector is refit on
        # the test set here, so train and test may keep different columns.
        prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        Xtest_new = SelectKBest(chi2, k=selectKBest).fit_transform(Xtest, ytest) if selectKBest > 0 else Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []
    return predicted, predicted_test
def predictKFoldSVMSSK(X, y, kfold=10, subseqLength=3, selectKBest=0):
    """
    Classifies the data using Support vector machines with the SSK kernel and k-fold CV
    :param X: The list of text documents containing traces
    :type X: list
    :param y: The labels of documents in 'X'
    :type y: list
    :param kfold: The number of folds
    :type kfold: int (default: 10)
    :param subseqLength: Length of subsequence used by the SSK
    :type subseqLength: int (default: 3)
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        predicted = []
        # Retrieve Gram Matrix from string kernel
        if verboseON():
            prettyPrint("Generating Gram Matrix from documents", "debug")
        X_gram = string_kernel(X, X)
        y = numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel="precomputed")
        X_gram_new = SelectKBest(chi2, k=selectKBest).fit_transform(X_gram, y) if selectKBest > 0 else X_gram
        prettyPrint("Performing %s-fold CV on the %s best features" % (kfold, selectKBest))
        predicted = cross_val_predict(clf, X_gram_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
def predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=10):
    """
    Classifies the data using Support vector machines and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing the labels corresponding to feature vectors
    :type y: list
    :param kernel: The kernel used to elevate data into higher dimensionalities
    :type kernel: str
    :param C: The penalty parameter of the error term
    :type C: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel=kernel, C=C)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10):
    """
    Classifies the data using random forests and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing labels corresponding to the feature vectors
    :type y: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth a tree is allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
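All of the predictKFold* helpers above fit SelectKBest on the full dataset before cross_val_predict runs, so the selected features have already seen every held-out fold. A leakage-free sketch (an alternative, not part of the original module) wraps selection and the classifier in a Pipeline so selection is refit inside each fold:

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy

def predictKFoldKNNPipeline(X, y, K=10, kfold=10, selectKBest=0):
    # Same contract as predictKFoldKNN, but feature selection happens per fold.
    X, y = numpy.array(X), numpy.array(y)
    steps = []
    if selectKBest > 0:
        steps.append(("select", SelectKBest(chi2, k=selectKBest)))
    steps.append(("knn", KNeighborsClassifier(n_neighbors=K)))
    return cross_val_predict(Pipeline(steps), X, y, cv=kfold).tolist()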
def de_c2(X, y):
    dim = X.shape[1]
    de = min(2000, dim)
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)

    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    feature_index = delete_low_freq_words(word_count, threshold)
    print(len(train_seq), len(feature_index))
    # Build a dense document-by-word frequency matrix over the kept vocabulary.
    word_freq_matrix = np.zeros([len(train_seq), len(feature_index)])
    for (seq_idx, seq) in enumerate(train_seq):
        for word in seq:
            if word not in feature_index:
                continue
            word_idx = feature_index[word]
            word_freq_matrix[seq_idx][word_idx] += 1
    # Score every word with chi2 (k="all" keeps all columns) and rank them.
    sk = SelectKBest(chi2, k="all")
    sk.fit_transform(csr_matrix(word_freq_matrix), y_train)
    score_list = sk.scores_
    word_score = {}
    for (feature, idx) in feature_index.items():
        word_score[feature] = score_list[idx]
    word_score = sorted(word_score.items(), key=lambda x: x[1], reverse=True)
    local_word_list = [word for (word, score) in word_score[:num_words]]
    del word_freq_matrix
    return local_word_list
def featuresByChiSq(features, labels, nFeature=5000):
    chi2_model = SelectKBest(chi2, k=nFeature)
    dtm = chi2_model.fit_transform(features, labels)
    return dtm, chi2_model
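An illustrative call (hypothetical corpus): pair the selector with a TfidfVectorizer and reuse the fitted chi2_model to transform unseen documents with the same columns.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["great match today", "stocks fell sharply", "the team won again"]
labels = ["sport", "finance", "sport"]
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(docs)
dtm, chi2_model = featuresByChiSq(features, labels, nFeature=3)
new_dtm = chi2_model.transform(tfidf.transform(["another match"]))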
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)
    confuse(y_test, clf.predict(X_test))
    print()
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # List-based variant: samples are plain Python lists of feature vectors.
    data = list(anamolySample)  # copy so the caller's list is not mutated
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    data.extend(normalSample)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    data = pd.concat([anamolySample, normalSample])
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(len(name)):
        if outcome[i]:
            warnstr += name[i] + " ; "
    return warnstr
def to_weka_arff(ngram, number_of_features):
    count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)

    label_list = get_labels()
    tweet_list = get_labelled_tweets()
    features = count_vect.fit_transform(tweet_list)
    features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
    print(features.shape)

    arff_data = []
    arff_data.append("@RELATION sport")
    for i in range(features.shape[1]):
        arff_data.append("@ATTRIBUTE feature" + str(i) + " REAL")
    arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
    arff_data.append("@DATA")

    array_features = features.toarray()
    for i in range(len(array_features)):
        feature = array_features[i]
        label = label_list[i]
        csv_feature = ",".join(str(x) for x in feature)
        csv_feature = csv_feature + "," + label
        arff_data.append(csv_feature)

    with open('data/sport.arff', 'w') as f:
        for item in arff_data:
            f.write("%s\n" % item)
signal_extractor.py (project: Automatic-feature-extraction-from-signal, author: VVVikulin)
def basic_quality(self, target, feature_vector):
    assert len(target) == len(feature_vector)
    if self.quality == 'NWP':
        # Rank samples by the feature (ascending and descending) and take the
        # better of the two normalized weighted-pair scores.
        sort_data_p = np.array([x for (y, x) in sorted(zip(feature_vector, target), key=lambda x: x[0])])
        sort_data_n = np.array([x for (y, x) in sorted(zip(-1.0 * feature_vector, target), key=lambda x: x[0])])
        p_nwp = QualityMeasure.calc_nwp(sort_data_p)
        n_nwp = QualityMeasure.calc_nwp(sort_data_n)
        return min(n_nwp, p_nwp)
    if self.quality == 'corrcoef':
        return 1 - abs(np.corrcoef(target, feature_vector)[0][1])
    if self.quality == 'mutual_info':
        m = MINE()
        m.compute_score(target, feature_vector)
        return 1.0 - m.mic()
    if self.quality == 'chi2':
        return 1 - chi2(abs(feature_vector.reshape(len(feature_vector), 1)), target)[0][0]
    if self.quality == 'distcorr':
        return 1 - distcorr(target, feature_vector)
    if self.quality == 'distree':
        data = np.column_stack((feature_vector, self.random_feature))
        clf = DecisionTreeClassifier(max_depth=5, random_state=0)
        clf.fit(data, target)
        return 1.0 - clf.feature_importances_[0]
    if self.quality == 'knnscore':
        # Leave-one-out accuracy of a single-feature k-NN classifier.
        errors = []
        data = np.array([feature_vector]).transpose()
        loo = LeaveOneOut()
        for train, test in loo.split(data):
            clf = KNeighborsClassifier()
            clf.fit(data[train], target[train])
            errors.append(accuracy_score(target[test], clf.predict(data[test])))
        return 1.0 - np.mean(errors)
    return 'WRONG QUALITY NAME'
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
feature_selection.py (project: Default-Credit-Card-Prediction, author: AlexPnt)
def chi2_feature_test(X, y, feature_index):
    """
    Performs the chi square test on the desired feature

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    feature_index -- The selected feature (a zero-based index)
    """
    feature_column = X[:, feature_index].reshape(-1, 1)
    min_val = feature_column.min()
    # chi2 requires non-negative values, so shift the column if needed.
    if min_val < 0:
        feature_column = feature_column - min_val + 1
    return chi2(feature_column, y)
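A quick check with toy data (an assumption, not from the project): column 0 separates the classes, so its chi-squared statistic should dominate. chi2 returns (scores, p-values) arrays for the single column passed in.

import numpy as np

X = np.array([[1.0, 3.0], [2.0, 3.1], [9.0, 2.9], [10.0, 3.0]])
y = np.array([0, 0, 1, 1])
scores, p_values = chi2_feature_test(X, y, 0)
print(scores[0], p_values[0])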