import pandas as pd
from sklearn.preprocessing import LabelEncoder

def lbl_encode(df_tr, df_te=None, cols=None, objonly=True):
    """Label-encode columns of df_tr (and optionally df_te) in place."""
    print("label encode ...")
    lbl = LabelEncoder()
    if df_te is not None:
        # Fit on train and test together so both splits share one label space.
        df = pd.concat([df_tr, df_te])  # DataFrame.append was removed in pandas 2.0
        if cols is None:
            cols = set(df_tr.columns.values).intersection(set(df_te.columns.values))
    else:
        df = df_tr
        if cols is None:
            cols = df_tr.columns.values
    encoded = []
    for col in cols:
        if objonly and df[col].dtype != 'object':
            continue
        encoded.append(col)
        lbl.fit(df[col].map(str))
        df_tr[col] = lbl.transform(df_tr[col].map(str))
        if df_te is not None:
            df_te[col] = lbl.transform(df_te[col].map(str))
    print('lbl encode:', encoded)
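# A quick usage sketch of lbl_encode on assumed toy frames (not part of the
# original source): shared object columns get one consistent integer mapping.
train = pd.DataFrame({'city': ['tokyo', 'paris'], 'n': [1, 2]})
test = pd.DataFrame({'city': ['paris', 'oslo'], 'n': [3, 4]})
lbl_encode(train, test)
print(train['city'].tolist(), test['city'].tolist())  # [2, 1] [1, 0] (classes sorted: oslo, paris, tokyo)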
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})
    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])
    training_data = data[:500].copy()
    testing_data = data[500:].copy()
    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)
    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
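# The hand-cleaning above reuses one fitted encoder across splits, which is what
# keeps the integer codes consistent. A compact illustration on assumed toy data:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
train_codes = enc.fit_transform(['apples', 'bananas', 'oranges'])
test_codes = enc.transform(['oranges', 'apples'])
print(train_codes)  # [0 1 2] -- classes_ are sorted alphabetically
print(test_codes)   # [2 0] -- same mapping as the training split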
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})
    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])
    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan
    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)
    cleaned_data = autoclean(data)
    assert cleaned_data.equals(hand_cleaned_data)
def test_autoclean_real_data():
    """Test autoclean() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan
    hand_cleaned_adult_data = adult_data.copy()
    hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
    hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)
    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)
    cleaned_adult_data = autoclean(adult_data)
    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
def initialize_labels(self, Y):
    y_nodes_flat = [y_val for y in Y for y_val in y.nodes]
    y_links_flat = [y_val for y in Y for y_val in y.links]
    self.prop_encoder_ = LabelEncoder().fit(y_nodes_flat)
    self.link_encoder_ = LabelEncoder().fit(y_links_flat)
    self.n_prop_states = len(self.prop_encoder_.classes_)
    self.n_link_states = len(self.link_encoder_.classes_)
    self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_,
                                 dtype=np.double)
    # compute_class_weight takes keyword-only arguments in recent scikit-learn.
    self.link_cw_ = compute_class_weight(self.class_weight,
                                         classes=self.link_encoder_.classes_,
                                         y=y_links_flat)
    self.link_cw_ /= self.link_cw_.min()
    logging.info('Setting node class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(
            self.prop_encoder_.classes_, self.prop_cw_))))
    logging.info('Setting link class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(
            self.link_encoder_.classes_, self.link_cw_))))
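# A minimal sketch of the class-weight computation used above, on assumed toy
# labels: 'balanced' weights are inversely proportional to class frequency,
# which is why initialize_labels then normalises by the minimum.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_links_toy = np.array(['link', 'no-link', 'no-link', 'no-link'])
cw = compute_class_weight(class_weight='balanced',
                          classes=np.unique(y_links_toy), y=y_links_toy)
print(cw)       # [2.         0.66666667]
cw /= cw.min()  # scale so the smallest weight becomes 1.0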
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)
    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(
            sources[0].stream_id, time_interval))
        return
    steps = deserialise_json_pipeline({
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(),
        'classifier': LinearDiscriminantAnalysis(),
        'label_encoder': LabelEncoder()
    }, param_doc.value)
    clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_
    data = sources[1].window(interval, force_calculation=True)
    for tt, dd in data:
        yield StreamInstance(tt, {locations[ii]: pp
                                  for ii, pp in enumerate(clf.predict_proba(dd)[0])})
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse
        matrix"). The default value is True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
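# The __init__ above pairs a DictVectorizer (dict features -> numeric matrix)
# with a LabelEncoder (string labels -> integers). A minimal, self-contained
# sketch of that pattern (hypothetical wrapper, not the original class):
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

class DictClassifier:
    def __init__(self, estimator, dtype=float, sparse=True):
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def fit(self, featuresets, labels):
        X = self._vectorizer.fit_transform(featuresets)  # dicts -> feature matrix
        y = self._encoder.fit_transform(labels)          # strings -> integer codes
        self._clf.fit(X, y)
        return self

    def predict(self, featuresets):
        X = self._vectorizer.transform(featuresets)
        return self._encoder.inverse_transform(self._clf.predict(X))

feats = [{'word': 'good'}, {'word': 'bad'}, {'word': 'great'}]
print(DictClassifier(LogisticRegression()).fit(feats, ['pos', 'neg', 'pos'])
      .predict([{'word': 'good'}]))  # ['pos']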
def test_predict_from_file():
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)
    t = TextModel(corpus)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    c = ClassifierWrapper()
    X = [t[x] for x in corpus]
    c.fit(X, y)
    hy = le.inverse_transform(c.predict(X))
    for i in hy:
        assert i in ['POS', 'NEU', 'NEG']
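# The assertion above works because inverse_transform maps predictions back to
# the original string labels. A standalone round-trip on assumed toy labels:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(['POS', 'NEU', 'NEG', 'POS'])
print(y)                        # [2 1 0 2] -- classes_ are sorted: NEG, NEU, POS
print(le.inverse_transform(y))  # ['POS' 'NEU' 'NEG' 'POS']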
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None,
             score='r2', classifier=RegressorWrapper, random_state=None):
    # check both bounds, as the message promises
    assert 0 < ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
    self.score = score
    self.le = preprocessing.LabelEncoder().fit(y)
    self.create_classifier = classifier
    if test_ratio is None:
        test_ratio = 1.0 - ratio
    I = list(range(len(y)))
    np.random.shuffle(I)
    s = int(np.ceil(len(y) * ratio))
    s_end = int(np.ceil(len(y) * test_ratio))
    y = self.le.transform(y)
    train, test = I[:s], I[s:s + s_end]
    self.train_corpus = [X[i] for i in train]
    self.train_corpus.extend(Xstatic)
    if len(ystatic) > 0:
        ystatic = self.le.transform(ystatic)
        self.train_y = np.hstack((y[train], ystatic))
    else:
        self.train_y = y[train]
    self.test_corpus = [X[i] for i in test]
    self.test_y = y[test]
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None,
             score='macrof1', classifier=ClassifierWrapper, random_state=None):
    # check both bounds, as the message promises
    assert 0 < ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
    self.score = score
    self.le = preprocessing.LabelEncoder().fit(y)
    self.create_classifier = classifier
    if test_ratio is None:
        test_ratio = 1.0 - ratio
    I = list(range(len(y)))
    np.random.shuffle(I)
    s = int(np.ceil(len(y) * ratio))
    s_end = int(np.ceil(len(y) * test_ratio))
    y = self.le.transform(y)
    train, test = I[:s], I[s:s + s_end]
    self.train_corpus = [X[i] for i in train]
    self.train_corpus.extend(Xstatic)
    if len(ystatic) > 0:
        ystatic = self.le.transform(ystatic)
        self.train_y = np.hstack((y[train], ystatic))
    else:
        self.train_y = y[train]
    self.test_corpus = [X[i] for i in test]
    self.test_y = y[test]
def score_model(model, data_test, labeler):
    '''
    Print an evaluation report for a trained model on the test data:
    feature importances, a classification report and a cross-class report.
    Arguments:
        model - trained model
        data_test - test data set
        labeler - LabelEncoder fitted on the target column
    Returns:
        nothing; the report is printed
    '''
    X_test = data_test.drop(["proto"], axis=1)
    y_test = data_test["proto"]
    y_predicted = model.predict(X_test)
    true_labels = labeler.inverse_transform(y_test)
    predicted_labels = labeler.inverse_transform(y_predicted)
    print(feature_importances_report(model, X_test.columns))
    print("\n", classification_report(true_labels, predicted_labels))
    print(cross_class_report(true_labels, predicted_labels))
def doDescartes(X_train, X_test):
    res = X_test[['instanceID']]
    X_test.drop('instanceID', axis=1, inplace=True)
    data = pd.concat([X_train, X_test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    del X_train, X_test
    gc.collect()
    # Cartesian-product features: jointly label-encode each pair of columns.
    for feat_1 in ['maybe_0', 'maybe_2']:
        for feat_2 in ['connectionType', 'creativeID', 'positionID']:
            le = LabelEncoder()
            data[feat_1 + '_' + feat_2] = le.fit_transform(
                data[feat_1].astype('str') + data[feat_2].astype('str'))
    X_train = data.loc[data['label'] != -1, :]
    X_test = data.loc[data['label'] == -1, :].copy()  # copy to avoid SettingWithCopyWarning below
    X_test.loc[:, 'instanceID'] = res.values
    del data
    gc.collect()
    return X_train, X_test
def preprocessData(dataset):
    le = preprocessing.LabelEncoder()
    # guard against divide-by-zero
    dataset.loc[dataset.Open == 0, 'Open'] = 1
    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset['UpDown'] = np.where(dataset['UpDown'] >= threshold, 'Up', 'Down')
    dataset['UpDown'] = le.fit(dataset['UpDown']).transform(dataset['UpDown'])
    dataset['UpDown'] = dataset['UpDown'].shift(-1)  # shift by 1, so y is the next day's up/down
    dataset = dataset.drop(dataset.index[-1])  # drop the last row: it has no next-day value
    return dataset
def generate_test_data():
    with open('./test.csv', 'r') as test_file:
        test_csv = csv.reader(test_file, delimiter=',')
        next(test_csv)
        test_data = list(test_csv)
    test_data = numpy.array(test_data)
    # delete id column
    # test_data = numpy.delete(test_data, 0, 1)
    # integer-encode categorical columns (LabelEncoder gives ordinal codes, not one-of-K)
    encoder = preprocessing.LabelEncoder()
    for j in (1, 2, 3, 4, 5, 6, 7, 8, 9, 14):
        test_data[:, j + 1] = encoder.fit_transform(test_data[:, j + 1])
    # convert numpy strings to floats (numpy.float was removed in NumPy 1.24)
    test_data = test_data.astype(float)
    missValueIndex = 7
    Xy_test = test_data[test_data[:, 3 + 1] == missValueIndex]
    Xy_train = test_data[test_data[:, 3 + 1] != missValueIndex]
    X_train = numpy.delete(Xy_train, 3 + 1, 1)
    y_train = Xy_train[:, 3 + 1]
    X_test = numpy.delete(Xy_test, 3 + 1, 1)
    market_test_data = MarketingData(X_train, y_train, X_test)
    return market_test_data, test_data

# use knn to impute missing values
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)
    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))
    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled to between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)
    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))
    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']  # 'is not' compared identity, not equality
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(),
                    filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
def _fit_targets(self, y, classes=None):
    self.multilabel_ = self._is_multilabel(y)
    # If provided, use classes to fit the encoder and set classes_.
    # Otherwise, find the unique classes in y.
    if classes is not None:
        y = classes
    if self.multilabel_:
        self._enc = None
        self.classes_ = np.arange(y.shape[1])
        self.n_classes_ = y.shape[1]
    else:
        self._enc = LabelEncoder().fit(y)
        self.classes_ = self._enc.classes_
        self.n_classes_ = len(self.classes_)
def label_encoding(self, dataset):
    """
    Integer-encode the categorical columns of dataset in place.
    :param dataset: numpy array whose columns 1, 2 and 3 hold categorical values
    :return: dataset with columns 1-3 label-encoded
    """
    le_1 = preprocessing.LabelEncoder()
    le_2 = preprocessing.LabelEncoder()
    le_3 = preprocessing.LabelEncoder()
    le_1.fit(np.unique(dataset[:, 1]))
    le_2.fit(np.unique(dataset[:, 2]))
    le_3.fit(np.unique(dataset[:, 3]))
    dataset[:, 1] = le_1.transform(dataset[:, 1])
    dataset[:, 2] = le_2.transform(dataset[:, 2])
    dataset[:, 3] = le_3.transform(dataset[:, 3])
    return dataset
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        # fit on train + test together so both splits share one label space
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])
    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo
    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)
    # preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer replaces it
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)
    return train, y, test
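# Why gen_features fits the encoder on train + test combined: a LabelEncoder
# fitted only on train raises on labels it has never seen. Toy illustration
# with assumed values:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(['no', 'yes'])
print(le.transform(['yes', 'no']))  # [1 0]
try:
    le.transform(['maybe'])         # unseen label
except ValueError as e:
    print(e)                        # mentions the previously unseen label 'maybe'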