Source code examples of the Python class SelectKBest()
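The snippets below are collected from several open-source projects. As a baseline, here is a minimal, self-contained sketch of the usual SelectKBest workflow (score every feature with a univariate test, keep the k highest-scoring columns); the iris data and the chi2 scorer are illustrative choices, not taken from any project below.

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

X, y = load_iris(return_X_y=True)            # 150 samples, 4 non-negative features
selector = SelectKBest(chi2, k=2)            # keep the 2 features with the highest chi2 score
X_new = selector.fit_transform(X, y)         # shape (150, 2)
print(selector.get_support(indices=True))    # indices of the selected columns
print(selector.scores_)                      # chi2 score of every original feature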

ScikitLearners.py (project: Aion, author: aleisalem)
def predictAndTestKNN(X, y, Xtest, ytest, K=10, selectKBest=0):
    """
    Trains a K-NN classifier on the training data and evaluates it on both the training and the test data
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of predicted labels: one for the training (validation) data and one for the test data
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier and cross validation iterator
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Start the cross validation learning
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K best features if enabled; fit the selector on the training data only,
        # so the test set keeps exactly the same columns and its labels are not used for selection
        if selectKBest > 0:
            prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)

    except Exception as e:
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
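A hypothetical way to call the function above (the module name ScikitLearners and the toy data are assumptions; chi2 selection requires non-negative features):

from ScikitLearners import predictAndTestKNN

# Toy non-negative count vectors, purely illustrative
X     = [[1, 0, 3], [0, 2, 1], [4, 0, 0], [1, 1, 2]]
y     = [0, 1, 0, 1]
Xtest = [[2, 0, 1], [0, 3, 0]]
ytest = [0, 1]

pred_train, pred_test = predictAndTestKNN(X, y, Xtest, ytest, K=3, selectKBest=2)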
ScikitLearners.py (project: Aion, author: aleisalem)
def predictKFoldSVMSSK(X, y, kfold=10, subseqLength=3, selectKBest=0):
    """Classifies the data using Support vector machines with the SSK kernel and k-fold CV
    :param X: The list of text documents containing traces
    :type X: list
    :param y: The labels of documents in 'X'
    :type y: list
    :param kfold: The number of folds
    :type kfold: int (default: 10)
    :param subseqLength: Length of subsequence used by the SSK
    :type subseqLength: int (default: 3)
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: A list of predicted labels across the k folds
    """
    try:
        predicted = []
        # Retrieve Gram Matrix from string kernel
        if verboseON():
            prettyPrint("Generating Gram Matrix from documents", "debug")
        X_gram = string_kernel(X, X)
        y = numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel="precomputed")
        X_gram_new = SelectKBest(chi2, k=selectKBest).fit_transform(X_gram, y) if selectKBest > 0 else X_gram
        prettyPrint("Performing %s-fold CV on the %s best features" % (kfold, selectKBest))
        predicted = cross_val_predict(clf, X_gram_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
ScikitLearners.py (project: Aion, author: aleisalem)
def predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=10):
    """
    Classifies the data using Support vector machines and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing the labels corresponding to feature vectors
    :type y: list
    :param kernel: The kernel used to elevate data into higher dimensionalities
    :type kernel: str
    :param C: The penalty parameter of the error term
    :type C: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int 
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data 
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel=kernel, C=C)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
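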
ScikitLearners.py (project: Aion, author: aleisalem)
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10):
    """
    Classifies the data using random forests and k-fold CV
    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing labels corresponding to the feature vectors
    :type y: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision tree
    :type criterion: str
    :param maxdepth: The maximum depth the tree is allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
decomposition.py (project: NLPWorks, author: thautwarm)
def MIC(x, y):
    # Maximal Information Coefficient, computed with the MINE estimator
    base = MINE()
    base.compute_score(x, y)
    return base.mic(), 0.5  # MIC gives a single score per feature,
                            # but SelectKBest expects a (score, p-value) pair,
                            # so a constant dummy p-value of 0.5 is returned with it.
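For context (not part of the NLPWorks sources): SelectKBest calls its score function once with the whole matrix X and the labels y, and expects back two array-likes of length n_features, the scores and the p-values. A per-pair scorer such as MIC above therefore normally has to be applied column by column; a sketch of such a wrapper, using a hypothetical name mic_scores, is:

import numpy as np

def mic_scores(X, y):
    # Apply the pairwise MIC(x, y) scorer to every column of X and return
    # (scores, p-values) arrays in the shape SelectKBest expects.
    pairs = np.array([MIC(col, y) for col in np.asarray(X).T])
    return pairs[:, 0], pairs[:, 1]

# selector = SelectKBest(mic_scores, k=de)   # instead of SelectKBest(MIC, k=de) in de_mic below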
decomposition.py (project: NLPWorks, author: thautwarm)
def de_ps(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    # pearsonr returns an (r, p-value) pair per column; wrapping map() in list()
    # keeps this working under Python 3, where map() returns an iterator
    clf = SelectKBest(lambda X, Y: np.array(list(map(lambda x: pearsonr(x, Y), X.T))).T, k=de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
decomposition.py (project: NLPWorks, author: thautwarm)
def de_c2(X,y):
    dim  = X.shape[1]
    de   = min(2000,dim) 
    clf  = SelectKBest(chi2, k = de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
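The de_* helpers above and below all share the same factory pattern: fit SelectKBest on the training data, then return a closure that projects further matrices onto the selected columns. A small self-contained usage sketch for de_c2 (the toy arrays are assumptions, and chi2 requires non-negative features):

import numpy as np

X_train = np.array([[1, 0, 3], [0, 2, 1], [4, 0, 0], [1, 1, 2]])
y_train = np.array([0, 1, 0, 1])
X_test  = np.array([[2, 0, 1], [0, 3, 0]])

reduce_fn = de_c2(X_train, y_train)                    # fit the chi2 selector on the training split
X_train_red, X_test_red = reduce_fn(X_train, X_test)   # both splits keep the same selected columns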
decomposition.py (project: NLPWorks, author: thautwarm)
def de_mic(X,y):
    dim  = X.shape[1]
    de   = min(2000,dim) 
    clf = SelectKBest(MIC, k=de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
decomposition.py (project: NLPWorks, author: thautwarm)
def de_f_and_p_value(X,y):
    dim = X.shape[1]
    de  = min(2000,dim)
    clf = SelectKBest(f_classif,k=de)
    clf.fit(X, y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
decomposition.py (project: NLPWorks, author: thautwarm)
def de_f_and_p_value(X,y):
    """ f&p value """
    dim = X.shape[1]
    de  = min(2000,dim)
    clf = SelectKBest(f_classif,k=de)
    clf.fit(X, y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
cluster.py (project: pantip-libr, author: starcolon)
def new(method='centroid',n_features=8):

  # Clustering method
  nc = METHODS[method]

  # Orthogonal feature selector
  if n_features is None: n_features = 'all'
  selector = SelectKBest(f_classif, k=n_features)

  # NOTE: the last element of the returned list
  # must be a classifier or clustering model
  print(colored('Cluster model created','yellow'))
  return [selector, nc]
preprocess.py (project: mtl, author: zhenhongChen)
def get_local_words(word_count, threshold, y_train, train_seq, num_words):

    feature_index = delete_low_freq_words(word_count, threshold)
    print(len(train_seq), len(feature_index))
    word_freq_matrix = np.zeros([len(train_seq), len(feature_index)])

    for (seq_idx, seq) in enumerate(train_seq):

        for word in seq:
            if (word not in feature_index):
                continue
            else:
                word_idx = feature_index[word]
                word_freq_matrix[seq_idx][word_idx] += 1

    sk = SelectKBest(chi2, k="all")
    sk.fit_transform(csr_matrix(word_freq_matrix), y_train)
    score_list = sk.scores_

    word_score = {}
    for (feature, idx) in feature_index.items():
        word_score[feature] = score_list[idx]

    word_score = sorted(word_score.items(), key=lambda x: x[1], reverse=True)

    local_word_list = []
    for (word, score) in word_score[:num_words]:
        local_word_list.append(word)

    del word_freq_matrix

    return local_word_list
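Since get_local_words only needs the chi2 scores (SelectKBest is created with k="all" purely to obtain scores_), an equivalent and slightly more direct variant would call sklearn's chi2 function itself; a hedged sketch, reusing the word_freq_matrix and y_train from the function above:

from sklearn.feature_selection import chi2
from scipy.sparse import csr_matrix

# chi2() returns (scores, p-values) directly, so the SelectKBest wrapper is optional here
score_list, _ = chi2(csr_matrix(word_freq_matrix), y_train)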
wrangler.py (project: tcsl, author: machinelearningnanodegree)
def k_best_features(self):
    # get total number of features.
    num_features = self.features.shape[1]
    feature_list = []
    # find the k best features, with k ranging from 1 to num_features.
    for i in range(1, num_features + 1):
        skBest = SelectKBest(k=i)
        skBest.fit_transform(self.features, self.labels)
        # get boolean indices of the best features.
        k_features = skBest.get_support()
        # append the selected column names to the feature list.
        feature_list += self.features.columns[k_features].tolist()
    return feature_list
test_dtree.py (project: tcsl, author: machinelearningnanodegree)
def selectFeatures(k_features=5, *args):
    """
    # Select k best features using the SelectKBest class in Sklearn.
    # Inputs: k=no. of features to select, args=(XTrain,yTrain)
    # returns: np array of k features.
    """
    X, y = args
    skb = SelectKBest(k=k_features)
    return skb.fit_transform(X, y)
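Because the data arrives through *args, the feature matrix and labels are passed positionally after k_features; a toy invocation (data made up for illustration, using the default f_classif scorer):

import numpy as np

XTrain = np.array([[0.1, 2.0, 5.0], [1.3, 0.4, 2.2], [0.9, 1.1, 0.3], [2.0, 0.2, 4.1]])
yTrain = np.array([0, 1, 0, 1])

X_reduced = selectFeatures(2, XTrain, yTrain)   # k_features first, then XTrain and yTrain
print(X_reduced.shape)                          # (4, 2)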
classification.py (project: DocumentClassification, author: bahmanh)
def featuresByChiSq(features,labels,nFeature=5000):
    chi2_model = SelectKBest(chi2,k=nFeature)
    dtm = chi2_model.fit_transform(features,labels)
    return dtm,chi2_model
model.py (project: student-performance-prediction, author: sachanganesh)
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)

    confuse(y_test, clf.predict(X_test))
    print()
OD_numpy_buf.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithXsqure(anamolySample,normalSample,topk,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
v1.1.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
v0.3.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithXsqure(anamolySample,normalSample,topk,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
v1.3.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample,normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += "   ;   "
    return warnstr
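A hypothetical driver for the v1.3 variant above (the column names and counts are invented; both samples are assumed to be pandas DataFrames with the same non-negative columns, as chi2 requires):

import pandas as pd

cols    = ["cpu", "mem", "net"]
anomaly = pd.DataFrame([[90, 80, 5], [85, 75, 7]], columns=cols)
normal  = pd.DataFrame([[10, 20, 6], [12, 18, 5], [11, 22, 7]], columns=cols)

# Returns a string naming the topk columns whose values differ most between the samples
print(analyseReasonWithXsqure(anomaly, normal, 2, cols))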

