Example source code for the Python class DictVectorizer()
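
As a primer before the project snippets, here is a minimal, self-contained sketch of what DictVectorizer does: it turns a list of feature dicts into a numeric matrix, expanding string values into one indicator column per (feature, value) pair and passing numeric values through. The toy measurements are illustrative, not from any project below.

from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33.0},
    {'city': 'London', 'temperature': 12.0},
    {'city': 'San Francisco', 'temperature': 18.0},
]

vec = DictVectorizer(sparse=False)   # dense output for readability
X = vec.fit_transform(measurements)  # shape (3, 4): three city indicators + temperature
print(vec.get_feature_names())
# ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
print(vec.inverse_transform(X)[0])
# {'city=Dubai': 1.0, 'temperature': 33.0}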

FeatureSelection.py (project: rdocChallenge, author: Elyne)
def varianceFilter(train_data, train_classes, threshold):
    '''
    Variance filter: drop features whose variance over the training data
    does not exceed threshold * (1 - threshold).
    (train_classes is accepted for interface compatibility but unused.)
    '''
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)

    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    x_new = sel.fit_transform(x_train)
    # Map the reduced matrix back to feature dicts, with removed columns zeroed out.
    return vectorizer.inverse_transform(sel.inverse_transform(x_new))
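
A hypothetical call, assuming DictVectorizer (sklearn.feature_extraction) and VarianceThreshold (sklearn.feature_selection) are imported; the toy feature dicts are made up:

train_data = [{'w1': 1, 'w2': 0},
              {'w1': 0, 'w2': 0},
              {'w1': 1, 'w2': 0}]
filtered = varianceFilter(train_data, None, 0.8)
# 'w2' never varies, so it is dropped; 'w1' (variance ~0.22) survives
# the 0.8 * (1 - 0.8) = 0.16 cutoff.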
modelData.py (project: rdocChallenge, author: Elyne)
def grid_search(estimator, data, featTypes=('BoW',), nFolds=10, random_seed=44, param_grid=()):

    labels = [x.severity for x in data]

    generatePrimaryFeats(data, featTypes)

    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features:
            # if e.g. a concept feature had the same name as a BoW word, one
            # of them would silently overwrite the other.
            instance.update({"{0}-{1}".format(featname, k): v for k, v in values.items()})

        featurized.append(instance)

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(featurized)

    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True, random_state=random_seed)
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1", n_jobs=-1, cv=folds)
    fit_grid = grid.fit(x_train, labels)

    print(fit_grid.best_params_)
    return fit_grid.best_params_
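
A hypothetical invocation, assuming data is the project's list of instances carrying .severity labels and .feats dicts as above; the estimator and grid are illustrative:

from sklearn.svm import LinearSVC

best_params = grid_search(LinearSVC(), data, featTypes=('BoW',),
                          nFolds=10, param_grid={'C': [0.01, 0.1, 1, 10]})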
modelData.py (project: rdocChallenge, author: Elyne)
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data

    generateDataDrivenFeats(trainSet, trainAndBSData, es)

    featurized = featurize(trainAndBSData)

    train_feats = [featurized[idx] for idx in range(len(trainSet))]
    test_feats = [featurized[idx] for idx in range(len(trainSet), len(trainAndBSData))]

    # Feature selection on the training data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(train_feats, y_train, list(range(len(trainSet))), es)

    # Calculate inter-annotator weighting.
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)

    vectorizer = DictVectorizer()   
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scaler = MinMaxScaler()
        x_train = min_max_scaler.fit_transform(x_train.toarray())
        x_test = min_max_scaler.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)

    # Add bootstrap instances the model classifies confidently to the training set.
    y_pred_prob = model.predict_proba(x_test)
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            # argmax is the probability column index; this assumes class labels are 0..n-1
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train)  # TODO: also return a confidence vector
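
A hypothetical call, with the project-specific experiment settings object es assumed to exist and a probability-capable estimator; th_bs is the bootstrap confidence threshold:

from sklearn.linear_model import LogisticRegression

new_train, new_y = get_bootstrapped_trainset(
    trainSet, y_train, bootstrap_data, es,
    LogisticRegression(), th_bs=0.9)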
Vectorizer.py (project: kindred, author: jakelever)
def _vectorize(self, corpus, fit):
        assert isinstance(corpus, kindred.Corpus)

        matrices = []
        for feature in self.chosenFeatures:
            assert feature in self.featureInfo
            featureFunction = self.featureInfo[feature]['func']
            never_tfidf = self.featureInfo[feature]['never_tfidf']
            data = featureFunction(corpus)
            notEmpty = any(len(d) > 0 for d in data)
            if fit:
                if notEmpty:
                    self.dictVectorizers[feature] = DictVectorizer()
                    if self.tfidf and not never_tfidf:
                        self.tfidfTransformers[feature] = TfidfTransformer()
                        intermediate = self.dictVectorizers[feature].fit_transform(data)
                        matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].fit_transform(data))
            else:
                if feature in self.dictVectorizers:
                    if self.tfidf and not never_tfidf:
                        intermediate = self.dictVectorizers[feature].transform(data)
                        matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].transform(data))

        # Horizontally stack the per-feature sparse matrices into one.
        mergedMatrix = hstack(matrices)
        return mergedMatrix
CEP_Exp_One.py (project: dmon-adp, author: igabriel85)
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    # One-hot encode the selected columns and rebuild them as a DataFrame.
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
ACI_Exp_One.py and CEP_Exp_Two.py in the same project (dmon-adp, author: igabriel85) define an identical ohEncoding function.
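
A hypothetical usage sketch; the DataFrame and column names are made up, and pandas as pd is assumed to be imported as in the snippet:

import pandas as pd

df = pd.DataFrame({'proto': ['tcp', 'udp', 'tcp'], 'bytes': [100, 250, 80]})
encoded, onehot, vec = ohEncoding(df, ['proto'], replace=True)
print(encoded.columns.tolist())
# ['bytes', 'proto=tcp', 'proto=udp']: 'proto' is replaced by indicator columns.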
learner.py (project: bionlp17, author: leebird)
def __init__(self, name, warm_start=True):
        self.vocal = DictVectorizer()
        self.model = linear_model.LogisticRegression(warm_start=warm_start,
                                                     solver='sag',
                                                     max_iter=200,
                                                     verbose=0,
                                                     penalty='l2',
                                                     n_jobs=4)
pipeline.py (project: whereami, author: kootenpv)
def get_pipeline(clf=RandomForestClassifier(n_estimators=100, class_weight="balanced")):
    return make_pipeline(DictVectorizer(sparse=False), clf)
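
A hypothetical call with made-up Wi-Fi signal-strength dicts, in the spirit of the whereami project:

pipe = get_pipeline()
X = [{'ap_home': -45, 'ap_cafe': -80}, {'ap_cafe': -40}]
y = ['home', 'cafe']
pipe.fit(X, y)
pred = pipe.predict([{'ap_cafe': -42}])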
knock76.py (project: 100knock2016, author: tmu-nlp)
def predict_function():
    x_list = []
    predict_doc = joblib.load('logreg.pkl')
    # Reuse the vectorizer saved by knock73 so feature indices match the model;
    # fitting a fresh DictVectorizer here would scramble the columns.
    word_vec = joblib.load('word_vec.pkl')
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    X = word_vec.transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    return pred, y_train, prob
knock73.py (project: 100knock2016, author: tmu-nlp)
def log_regression():
    x_list = []
    logreg = LogisticRegression()
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    word_vec = DictVectorizer()
    X = word_vec.fit_transform(x_list)
    logreg.fit(X, y_train)
    joblib.dump(logreg, 'logreg.pkl')
    joblib.dump(word_vec, 'word_vec.pkl')
knock78.py (project: 100knock2016, author: tmu-nlp)
def cv_prediction(feature_dict, feature, polarity, threshold, folds):
    accuracy = 0
    precision = 0
    recall = 0
    f1 = 0
    count = 0
    dicvec = DictVectorizer()
    LR = LogisticRegression()
    kfold = KFold(len(polarity), n_folds=folds)
    for train, test in kfold:
        count += 1
        x = [feature[i] for i in train]
        y = [polarity[i] for i in train]
        # Append the full feature dictionary (label 0) so every feature gets a column.
        x.append(feature_dict)
        y.append(0)
        LR.fit(dicvec.fit_transform(x), y)
        test_label = []
        answer_label = [polarity[j] for j in test]
        for j in test:
            query = fit_feature(feature[j], feature_dict)
            result = -1 if query.shape[1] != len(feature_dict) else prediction(LR, query, threshold)
            test_label.append(result)
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))

    return accuracy, precision, recall, f1
knock74.py (project: 100knock2016, author: tmu-nlp)
def main():
    lr = joblib.load('./lr.pkl')
    dic2vec = DictVectorizer()
    features = list()
    for line in open('sentiment.txt'):
        word_list = line[3:].strip('\n').strip().split()
        features.append(getFeature(word_list))
    # Note: fitting a fresh vectorizer here only lines up with lr's coefficients
    # if it reproduces the training-time vocabulary and column order.
    x = dic2vec.fit_transform(features)
    with open('sentiment_prediction.txt', 'w') as fp:
        for sentiment, prob in zip(lr.predict(x), lr.predict_proba(x)):
            print('{}\t{}'.format(sentiment, prob), file=fp)
custom_transformers.py (project: pandas-pipelines-custom-transformers, author: jem1031)
def fit(self, X, y=None):
        # assumes all columns of X are strings
        Xdict = X.to_dict('records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(Xdict)
        return self
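
The matching transform method is not shown in the snippet; a plausible completion under the same to_dict('records') convention (a sketch, not necessarily the project's code):

def transform(self, X):
        # Convert rows to dicts and reuse the vectorizer fitted in fit().
        Xdict = X.to_dict('records')
        return self.dv.transform(Xdict)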
test_dict_vectorizer.py (project: coremltools, author: apple)
def test_dictvectorizer(self):

        D = [{"foo": 1, "bar": 3},
             {"bar": 4, "baz": 2},
             {"bar": 1, "quux": 1, "quuux": 2}]

        for sparse in (True, False):
            for dtype in (int, np.float32, np.int16):
                for sort in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    v = v.fit(D)
                    self._test_conversion(D, v)
test_dict_vectorizer.py (project: coremltools, author: apple)
def test_unseen_or_no_features(self):
        D1 = [{"camelot": 0, "spamalot": 1}]
        D2 = [{}, {"nothing": 21}]

        for sparse in (True, False):
            for dtype in (int, np.float32, np.int16):
                for sort in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    v = v.fit(D1) 
                    self._test_conversion(D2, v)
test_dict_vectorizer.py (project: coremltools, author: apple)
def test_int_features_in_pipeline(self):

        import numpy.random as rn
        import pandas as pd
        rn.seed(0)

        # 100 training dicts, each mapping 20 random integer feature ids to 1.
        x_train_dict = [dict((rn.randint(100), 1) for i in range(20))
                        for j in range(100)]
        y_train = [0, 1] * 50

        from sklearn.pipeline import Pipeline
        from sklearn.feature_extraction import DictVectorizer
        from sklearn.linear_model import LogisticRegression

        pl = Pipeline([("dv", DictVectorizer()), ("lm", LogisticRegression())])
        pl.fit(x_train_dict, y_train)

        import coremltools

        model = coremltools.converters.sklearn.convert(pl, input_features="features", output_feature_names="target")

        x = pd.DataFrame({"features": x_train_dict,
                          "prediction": pl.predict(x_train_dict)})

        cur_eval_metrics = evaluate_classifier(model, x)
        self.assertEqual(cur_eval_metrics['num_errors'], 0)
predictor.py (project: auto_ml, author: ClimbsRocks)
def _validate_input_col_descriptions(self):
        found_output_column = False
        self.cols_to_ignore = []
        expected_vals = set(['categorical', 'text', 'nlp'])

        for key, value in self.column_descriptions.items():
            value = value.lower()
            self.column_descriptions[key] = value
            if value == 'output':
                self.output_column = key
                found_output_column = True
            elif value == 'date':
                self.date_cols.append(key)
            elif value == 'ignore':
                self.cols_to_ignore.append(key)
            elif value in expected_vals:
                pass
            else:
                raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "text", "nlp", "ignore", or "date".')
        if found_output_column is False:
            print('Here is the column_descriptions that was passed in:')
            print(self.column_descriptions)
            raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')

        # We will be adding one new categorical variable for each date col
        # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column
        for date_col in self.date_cols:
            self.column_descriptions[date_col + '_day_part'] = 'categorical'

        self.cols_to_ignore = set(self.cols_to_ignore)


    # We use _construct_pipeline at both the start and the end of training.
    # At the start, it constructs the pipeline from scratch. At the end, it takes
    # FeatureSelection out (after using it to restrict DictVectorizer) and adds
    # final_model back in if we did a grid search on it.

