import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Label anomalous samples 1 and normal samples 0, then print the topk
    # features with the highest chi-squared scores.
    data = pd.concat([anamolySample, normalSample])
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    name = list(data.columns)  # note: shadows the unused `name` parameter
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
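A minimal invocation sketch for the function above (the toy DataFrames and column values are illustrative assumptions, not part of the original project; chi2 requires non-negative inputs):

anomaly = pd.DataFrame({"cpu": [90, 85, 95], "mem": [70, 75, 80], "io": [1, 2, 1]})
normal = pd.DataFrame({"cpu": [10, 15, 12], "mem": [30, 25, 35], "io": [1, 2, 2]})
analyseReasonWithXsqure(anomaly, normal, 2, None)  # prints the 2 top-scoring feature names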
Python chi2() usage examples (source code)
onlinedetectWithlittleData.py (project: onlineDetectForHadoop, author: DawnsonLi)
birchForChangeWindowSize.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithXsqure(anamolySample, normalSample, topk):
    # Same chi-squared ranking as above, deriving feature names from the columns.
    data = pd.concat([anamolySample, normalSample])
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    name = list(data.columns)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
from sklearn.feature_selection import SelectKBest, chi2

def de_c2(X, y):
    """chi2-based reduction: fit SelectKBest on (X, y) and return a closure
    that applies the same column selection to two new matrices."""
    dim = X.shape[1]
    de = min(2000, dim)  # keep at most 2000 features
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)

    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
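A hypothetical usage of the returned closure (shapes and data are assumptions): fit the selector once on training data, then apply the identical reduction to both splits.

import numpy as np

X_train = np.random.randint(0, 5, size=(100, 3000))
y_train = np.random.randint(0, 2, size=100)
X_test = np.random.randint(0, 5, size=(40, 3000))

transform = de_c2(X_train, y_train)
X_train_red, X_test_red = transform(X_train, X_test)
print(X_train_red.shape)  # (100, 2000): at most min(2000, dim) columns survive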
# def de_mic(X, y):
#     """ MIC """
#     dim = X.shape[1]
#     de = min(2000, dim)
#     clf = SelectKBest(MIC, k=de)
#     clf.fit(X, y)
#     def _func(X1, X2):
#         return clf.transform(X1), clf.transform(X2)
#     return _func
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # Variant that reports the selected feature names as a warning string.
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    data = pd.concat([anamolySample, normalSample])
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(len(name)):
        if outcome[i]:
            warnstr += name[i] + " ; "
    print('x2:', warnstr)
    return warnstr
def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Create and save vectorizers and feature selectors on given train data.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of vectorizer/selector to use in the saved filename
        model_file: model filename prefix
        ngram_range_: range of n-grams
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer, 'word' or 'char'

    Returns:
        nothing
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True,
                                 analyzer=analyzer_type)
    X_train = vectorizer.fit_transform(train_data)

    # Only add a chi2 selector when there are more features than requested.
    if max_num_features < X_train.shape[1]:
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
    else:
        data_struct = {'vectorizer': vectorizer}

    print('creating', model_file + '_ngrams_vect_' + ind + '.bin')
    with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
        pickle.dump(data_struct, f)
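A sketch of the matching load/apply step (the filename follows the save path above; the helper name ngrams_transform is an assumption, not part of the source):

import pickle

def ngrams_transform(texts, ind, model_file):
    # Load a saved vectorizer (and selector, if one was stored) and apply both.
    with open(model_file + '_ngrams_vect_' + ind + '.bin', 'rb') as f:
        data_struct = pickle.load(f)
    X = data_struct['vectorizer'].transform(texts)
    if 'selector' in data_struct:
        X = data_struct['selector'].transform(X)
    return X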
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes
    # Clamp topK to the actual number of features.
    if x_train.shape[1] < topK:
        topK = x_train.shape[1]
    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)
    # Map the selected columns back to their original feature names/values.
    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
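Example input for chiSquare (hypothetical data): each sample is a feature dict, which is what DictVectorizer expects.

train_data = [
    {"word=goal": 2, "word=score": 1},
    {"word=rain": 3, "word=cloud": 1},
]
train_classes = ["sport", "weather"]
print(chiSquare(train_data, train_classes, topK=2))  # dicts holding only the 2 selected features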
analysis_utilities.py (project: oss-github-analysis-project, author: itu-oss-project-team)
def export_best_feature_names(self, df, labels, out_folder_path, k):
    columns, repos, observations = self.decompose_df(df)
    feature_scores = SelectKBest(chi2, k=k).fit(observations, labels).scores_
    feature_scores = np.nan_to_num(feature_scores)
    k_best_features = np.argpartition(feature_scores.ravel(), -k)[-k:]
    k_best_feature_names = columns[k_best_features]

    out_file_path = os.path.join(out_folder_path, "feature_selection.txt")
    with open(out_file_path, "w") as output_file:
        for feature_name in k_best_feature_names:
            output_file.write(feature_name + "\n")
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(chi2, k=conf.num_components)
def getFeature():
    # Each line of "data": an integer rating followed by "column:count" pairs.
    row, col, data = [], [], []
    evalRes = []
    rowIndex = -1
    with open("data") as fileData:
        fileList = fileData.readlines()
    random.shuffle(fileList)
    for line in fileList:
        line = line.rstrip('\n')
        dataList = re.split(' |:', line)
        # Ratings >= 7 are positive, <= 4 negative; 5-6 are skipped.
        if int(dataList[0]) >= 7:
            evalRes.append(1)
        elif int(dataList[0]) <= 4:
            evalRes.append(-1)
        else:
            continue
        del dataList[0]
        rowIndex += 1
        row.extend([rowIndex] * (len(dataList) // 2))
        col.extend(map(int, dataList[::2]))
        data.extend(map(int, dataList[1::2]))
    featureMatrix = csr_matrix((data, (row, col)))
    featureMNew = SelectKBest(chi2, k=20000).fit_transform(featureMatrix, evalRes)
    return featureMNew, evalRes
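The "data" file is expected to hold one sample per line: a leading integer rating followed by space-separated column:count pairs (ratings >= 7 become label 1, <= 4 become -1, and 5-6 are skipped). A hypothetical two-line file could be written like this; note that k=20000 in getFeature assumes a real corpus with at least that many feature columns:

with open("data", "w") as f:
    f.write("8 0:1 5:2 17:1\n")  # rating 8 -> positive sample
    f.write("3 0:2 9:1\n")       # rating 3 -> negative sample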
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')
    regressor = pipeline.fit(data[0], data[1])
    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def select_feats(X, y, nb_features, nb_features_to_keep=2048):
    X, y = preproc_for_sklearn(X, y, nb_features)

    # If there are fewer features than the target, keep a quarter of the target.
    if nb_features < nb_features_to_keep:
        nb_features_to_keep = nb_features_to_keep // 4  # integer k for SelectKBest

    feature_selector = SelectKBest(chi2, k=nb_features_to_keep).fit(X, y)
    return feature_selector.get_support(indices=True)
def predictKFoldKNN(X, y, K=10, kfold=10, selectKBest=0):
    """
    Classifies the data using K-nearest neighbors and k-fold CV
    :param X: The list of feature vectors
    :type X: list
    :param y: The list of labels corresponding to the feature vectors
    :type y: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param kfold: The number of folds in the CV
    :type kfold: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
def predictAndTestKNN(X, y, Xtest, ytest, K=10, selectKBest=0):
    """
    Trains a K-NN using the training data and tests it using the test data
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the training (validation) and test predictions
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K Best features if enabled. Note that the selector is refit on
        # the test set here, so train and test may keep different columns.
        prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        Xtest_new = SelectKBest(chi2, k=selectKBest).fit_transform(Xtest, ytest) if selectKBest > 0 else Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []
    return predicted, predicted_test
def predictKFoldSVMSSK(X, y, kfold=10, subseqLength=3, selectKBest=0):
    """
    Classifies the data using Support vector machines with the SSK kernel and k-fold CV
    :param X: The list of text documents containing traces
    :type X: list
    :param y: The labels of documents in 'X'
    :type y: list
    :param kfold: The number of folds
    :type kfold: int (default: 10)
    :param subseqLength: Length of subsequence used by the SSK
    :type subseqLength: int (default: 3)
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        predicted = []
        # Retrieve Gram Matrix from string kernel
        if verboseON():
            prettyPrint("Generating Gram Matrix from documents", "debug")
        X_gram = string_kernel(X, X)
        y = numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel="precomputed")
        X_gram_new = SelectKBest(chi2, k=selectKBest).fit_transform(X_gram, y) if selectKBest > 0 else X_gram
        prettyPrint("Performing %s-fold CV on the %s best features" % (kfold, selectKBest))
        predicted = cross_val_predict(clf, X_gram_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
def predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=10):
    """
    Classifies the data using Support vector machines and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing the labels corresponding to feature vectors
    :type y: list
    :param kernel: The kernel used to elevate data into higher dimensionalities
    :type kernel: str
    :param C: The penalty parameter of the error term
    :type C: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel=kernel, C=C)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10):
    """
    Classifies the data using random forests and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing labels corresponding to the feature vectors
    :type y: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth a tree is allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []
    return predicted
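All of the predictKFold* helpers above fit SelectKBest on the full dataset before cross_val_predict runs, so the selected features have already seen every held-out fold. A leakage-free sketch (an alternative, not part of the original module) wraps selection and the classifier in a Pipeline so selection is refit inside each fold:

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy

def predictKFoldKNNPipeline(X, y, K=10, kfold=10, selectKBest=0):
    # Same contract as predictKFoldKNN, but feature selection happens per fold.
    X, y = numpy.array(X), numpy.array(y)
    steps = []
    if selectKBest > 0:
        steps.append(("select", SelectKBest(chi2, k=selectKBest)))
    steps.append(("knn", KNeighborsClassifier(n_neighbors=K)))
    return cross_val_predict(Pipeline(steps), X, y, cv=kfold).tolist()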
def de_c2(X, y):
    dim = X.shape[1]
    de = min(2000, dim)
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)

    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    feature_index = delete_low_freq_words(word_count, threshold)
    print(len(train_seq), len(feature_index))
    # Build a dense document-by-word frequency matrix over the kept vocabulary.
    word_freq_matrix = np.zeros([len(train_seq), len(feature_index)])
    for (seq_idx, seq) in enumerate(train_seq):
        for word in seq:
            if word not in feature_index:
                continue
            word_idx = feature_index[word]
            word_freq_matrix[seq_idx][word_idx] += 1
    # Score every word with chi2 (k="all" keeps all columns) and rank them.
    sk = SelectKBest(chi2, k="all")
    sk.fit_transform(csr_matrix(word_freq_matrix), y_train)
    score_list = sk.scores_
    word_score = {}
    for (feature, idx) in feature_index.items():
        word_score[feature] = score_list[idx]
    word_score = sorted(word_score.items(), key=lambda x: x[1], reverse=True)
    local_word_list = [word for (word, score) in word_score[:num_words]]
    del word_freq_matrix
    return local_word_list
def featuresByChiSq(features, labels, nFeature=5000):
    chi2_model = SelectKBest(chi2, k=nFeature)
    dtm = chi2_model.fit_transform(features, labels)
    return dtm, chi2_model
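An illustrative call (hypothetical corpus): pair the selector with a TfidfVectorizer and reuse the fitted chi2_model to transform unseen documents with the same columns.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["great match today", "stocks fell sharply", "the team won again"]
labels = ["sport", "finance", "sport"]
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(docs)
dtm, chi2_model = featuresByChiSq(features, labels, nFeature=3)
new_dtm = chi2_model.transform(tfidf.transform(["another match"]))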
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)
    confuse(y_test, clf.predict(X_test))
    print()
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    # List-based variant: samples are plain Python lists of feature vectors.
    data = list(anamolySample)  # copy so the caller's list is not mutated
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    data.extend(normalSample)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = [1] * len(anamolySample) + [0] * len(normalSample)
    data = pd.concat([anamolySample, normalSample])
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(len(name)):
        if outcome[i]:
            warnstr += name[i] + " ; "
    return warnstr
def to_weka_arff(ngram, number_of_features):
    count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)

    label_list = get_labels()
    tweet_list = get_labelled_tweets()
    features = count_vect.fit_transform(tweet_list)
    features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
    print(features.shape)

    arff_data = []
    arff_data.append("@RELATION sport")
    for i in range(features.shape[1]):
        arff_data.append("@ATTRIBUTE feature" + str(i) + " REAL")
    arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
    arff_data.append("@DATA")

    array_features = features.toarray()
    for i in range(len(array_features)):
        feature = array_features[i]
        label = label_list[i]
        csv_feature = ",".join(str(x) for x in feature)
        csv_feature = csv_feature + "," + label
        arff_data.append(csv_feature)

    with open('data/sport.arff', 'w') as f:
        for item in arff_data:
            f.write("%s\n" % item)
signal_extractor.py (project: Automatic-feature-extraction-from-signal, author: VVVikulin)
def basic_quality(self, target, feature_vector):
    assert len(target) == len(feature_vector)
    if self.quality == 'NWP':
        # Rank samples by the feature (ascending and descending) and take the
        # better of the two normalized weighted-pair scores.
        sort_data_p = np.array([x for (y, x) in sorted(zip(feature_vector, target), key=lambda x: x[0])])
        sort_data_n = np.array([x for (y, x) in sorted(zip(-1.0 * feature_vector, target), key=lambda x: x[0])])
        p_nwp = QualityMeasure.calc_nwp(sort_data_p)
        n_nwp = QualityMeasure.calc_nwp(sort_data_n)
        return min(n_nwp, p_nwp)
    if self.quality == 'corrcoef':
        return 1 - abs(np.corrcoef(target, feature_vector)[0][1])
    if self.quality == 'mutual_info':
        m = MINE()
        m.compute_score(target, feature_vector)
        return 1.0 - m.mic()
    if self.quality == 'chi2':
        return 1 - chi2(abs(feature_vector.reshape(len(feature_vector), 1)), target)[0][0]
    if self.quality == 'distcorr':
        return 1 - distcorr(target, feature_vector)
    if self.quality == 'distree':
        data = np.column_stack((feature_vector, self.random_feature))
        clf = DecisionTreeClassifier(max_depth=5, random_state=0)
        clf.fit(data, target)
        return 1.0 - clf.feature_importances_[0]
    if self.quality == 'knnscore':
        # Leave-one-out accuracy of a single-feature k-NN classifier.
        errors = []
        data = np.array([feature_vector]).transpose()
        loo = LeaveOneOut()
        for train, test in loo.split(data):
            clf = KNeighborsClassifier()
            clf.fit(data[train], target[train])
            errors.append(accuracy_score(target[test], clf.predict(data[test])))
        return 1.0 - np.mean(errors)
    return 'WRONG QUALITY NAME'
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
feature_selection.py (project: Default-Credit-Card-Prediction, author: AlexPnt)
def chi2_feature_test(X, y, feature_index):
    """
    Performs the chi square test on the desired feature

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    feature_index -- The selected feature (a zero-based index)
    """
    feature_column = X[:, feature_index].reshape(-1, 1)
    min_val = feature_column.min()
    # chi2 requires non-negative values, so shift the column if needed.
    if min_val < 0:
        feature_column = feature_column - min_val + 1
    return chi2(feature_column, y)
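A quick check with toy data (an assumption, not from the project): column 0 separates the classes, so its chi-squared statistic should dominate. chi2 returns (scores, p-values) arrays for the single column passed in.

import numpy as np

X = np.array([[1.0, 3.0], [2.0, 3.1], [9.0, 2.9], [10.0, 3.0]])
y = np.array([0, 0, 1, 1])
scores, p_values = chi2_feature_test(X, y, 0)
print(scores[0], p_values[0])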