def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output dimension
        # handling is correct.
        model = Pipeline([("OHE", OneHotEncoder(categorical_features=categorical_features)),
                          ("Normalizer", Normalizer())])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
Python Normalizer() example source code
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
    svd = TruncatedSVD(n_features)
    if normalize:
        if flip:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs)
        return doc_mat
    else:
        if flip:
            doc_mat = svd.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            doc_mat = svd.fit_transform(doc_vecs)
        return doc_mat
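A minimal usage sketch for decompose(); the random matrix below is just an illustrative stand-in for a real document-term matrix, and the shapes are arbitrary.

import numpy as np

doc_vecs = np.random.rand(200, 5000)                      # 200 documents, 5000 terms (illustrative)
reduced = decompose(doc_vecs, n_features=100, normalize=True)
print(reduced.shape)                                      # expected: (200, 100)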
def train(labeled_featuresets, C=1e5):
    """
    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.
    """
    feat = [featureset for featureset, label in labeled_featuresets]
    feature_vectorizer = MVectorizer.DictsVectorizer()
    X = feature_vectorizer.fit_transform(feat)
    X = Normalizer().fit_transform(X)
    label_set = set(label for featureset, label in labeled_featuresets)
    label_vectorizer = dict((label, num) for num, label in enumerate(label_set))
    y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
    # print "Training on %d examples with %d features..." % (X.shape[0], X.shape[1]),
    classifier = OneVsRestClassifier(LinearSVC(loss='squared_hinge', penalty='l2', dual=True, tol=1e-5, C=C))
    classifier.fit(X, y)
    # print "done"
    return scikit_classifier(feature_vectorizer, label_vectorizer, classifier)
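A minimal usage sketch for train(); the featuresets below are purely hypothetical, and MVectorizer.DictsVectorizer / scikit_classifier are assumed to be the project-specific helpers referenced above (not shown in this excerpt).

# Hypothetical (featureset, label) tuples, matching the docstring's input format.
labeled = [({'good': 2, 'great': 1}, 'pos'),
           ({'awful': 3}, 'neg'),
           ({'fine': 1, 'good': 1}, 'pos')]
clf = train(labeled, C=1e5)
print(clf.classify({'good': 1, 'awful': 1}))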
bis_avg.py — from project kaggle-yelp-restaurant-photo-classification (author: u1234x1234)
def pool(biz_dict, vlad_dict, mode):
    if mode == 'train':
        y_dict = read_y()
    y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])

    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
        # vlad = preprocessing.normalize(vlad)
        # print(vlad.shape)
        # feat = np.concatenate([avg, vlad], axis=0)
        # feat = preprocessing.Normalizer().fit_transform(feat)
        # feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad
        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))

    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
def main():
    features = []

    # `list` and `n_clusters` are assumed to be defined at module level:
    # a list of image paths and the desired number of clusters.
    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy=False).fit_transform(features)

    km = KMeans(
        init='k-means++',
        n_clusters=n_clusters,
    )
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0
    for i in list:
        filename = i.split('/')[-1]
        print(filename, km.labels_[cnt])
        shutil.copyfile(i, './result/' + str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
def convert(model, input_features, output_features):
    """Convert a normalizer model to the protobuf spec.

    Parameters
    ----------
    model: Normalizer
        A Normalizer.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Normalizer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'norm'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the normalizer parameters
    _normalizer_spec = spec.normalizer
    if model.norm == 'l1':
        _normalizer_spec.normType = _proto__normalizer.L1
    elif model.norm == 'l2':
        _normalizer_spec.normType = _proto__normalizer.L2
    elif model.norm == 'max':
        _normalizer_spec.normType = _proto__normalizer.LMax

    return _MLModel(spec)
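A minimal usage sketch for convert(); the feature names are arbitrary, and the fitted scikit-learn Normalizer stands in for whatever model would actually be converted (the tests further down exercise the same path).

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.random.random((20, 3))
scikit_model = Normalizer(norm='l2').fit(X)
mlmodel = convert(scikit_model, ['a', 'b', 'c'], 'out')   # returns an _MLModel wrapping the spec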
def truncated_svd(self):
    # https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
    svd = TruncatedSVD(self.dimensions)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    X_reduced = lsa.fit_transform(self.bag_of_words_matrix)
    print(svd.components_[0])
    print(svd.explained_variance_ratio_)
    print(svd.explained_variance_ratio_.sum())
def get_data_preprocessor_rescaling(params):
    dpr = None
    d_rescaling = params['layer_dict_list'][0]

    if params['rescaling'] == str(d_rescaling['None']) or params['rescaling'] == 'None':
        dpr = None
    elif params['rescaling'] == str(d_rescaling['MinMax']) or params['rescaling'] == 'MinMax':
        dpr = MinMaxScaler()
    elif params['rescaling'] == str(d_rescaling['Standardize']) or params['rescaling'] == 'Standardize':
        dpr = StandardScaler()
    elif params['rescaling'] == str(d_rescaling['Normalize']) or params['rescaling'] == 'Normalize':
        dpr = Normalizer()

    return dpr
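A minimal usage sketch for get_data_preprocessor_rescaling(); the params structure is inferred from the lookups above, and the integer codes in layer_dict_list are placeholders.

params = {'layer_dict_list': [{'None': 0, 'MinMax': 1, 'Standardize': 2, 'Normalize': 3}],
          'rescaling': 'Normalize'}
print(get_data_preprocessor_rescaling(params))            # -> Normalizer()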
def test_random(self):
    # Generate some random data
    X = _np.random.random(size=(50, 3))

    for param in ('l1', 'l2', 'max'):
        cur_model = Normalizer(norm=param)
        output = cur_model.fit_transform(X)

        spec = converter.convert(cur_model, ["a", "b", "c"], 'out')

        metrics = evaluate_transformer(spec,
                                       [dict(zip(["a", "b", "c"], row)) for row in X],
                                       [{"out": row} for row in output])
def test_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Normalizer(norm='l2').fit(scikit_data.data)

    spec = converter.convert(scikit_model, scikit_data.feature_names, 'out')

    input_data = [dict(zip(scikit_data.feature_names, row))
                  for row in scikit_data.data]
    output_data = [{"out": row} for row in scikit_model.transform(scikit_data.data)]

    evaluate_transformer(spec, input_data, output_data)
def make_ward_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'WARD/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
    predict_result = ward.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_spectral_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'spectral/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
    predict_result = spectral.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
# aa = Affinity Propagation
def make_aa_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'affinity_propagation/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                           max_iter=self.aa_max_iter,
                                           convergence_iter=self.aa_no_change_stop)
    predict_result = aa_clusterizator.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_birch_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'birch/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    birch = Birch(threshold=self.birch_threshold,
                  branching_factor=self.birch_branching_factor,
                  n_clusters=self.birch_clusters_count)
    predict_result = birch.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)
def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)
def word_bigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    pipeline = Pipeline([('vect', CountVectorizer(preprocessor=preprocessor,
                                                  ngram_range=(2, 2))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_bigrams', pipeline)
def char_ngrams():
    vectorizer = CountVectorizer(min_df=1,
                                 preprocessor=TextCleaner(filter_urls=True,
                                                          filter_mentions=True,
                                                          filter_hashtags=True,
                                                          lowercase=False),
                                 analyzer='char_wb',
                                 ngram_range=(4, 4))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('char_ngrams', pipeline)
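The feature builders above (word_unigrams, word_bigrams, char_ngrams, avg_spelling_error, punctuation_features) each return a (name, pipeline) tuple, which fits scikit-learn's FeatureUnion directly; the sketch below assumes that is how the surrounding project combines them.

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC

features = FeatureUnion([word_unigrams(),
                         word_bigrams(),
                         char_ngrams()])
model = Pipeline([('features', features),
                  ('clf', LinearSVC())])
# model.fit(texts, labels)                                # texts/labels come from the calling code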
def doPCA(X, output_columns_count):
    # Reduce dimensionality with truncated SVD (LSA) and L2-normalize the result.
    # Note: TruncatedSVD does not center the data, so this is not exactly PCA.
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    return X
def normalize(matrix):
    '''Normalize each row (L2 norm) of a CSR sparse matrix (it should work with most sparse matrices, though).'''
    sparsy = matrix.tocoo()
    data = [float(d) for d in sparsy.data]
    return Normalizer().transform(csr_matrix((data, (sparsy.row, sparsy.col))))
#
# Simple tests
#
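A quick check of normalize() on a tiny CSR matrix with arbitrary values; each returned row should have unit L2 norm.

from scipy.sparse import csr_matrix

m = csr_matrix([[3.0, 4.0], [0.0, 2.0]])
print(normalize(m).toarray())                             # [[0.6, 0.8], [0.0, 1.0]]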
def batch_classify(self, featuresets):
    X = self.feature_vectorizer.transform(featuresets)
    X = Normalizer().fit_transform(X)
    y = self.classifier.predict(X)
    return [self.inverse_label_vectorizer[cls] for cls in y]
def classify(self, featureset):
    X = self.feature_vectorizer.transform([featureset])
    X = Normalizer().fit_transform(X)
    y = self.classifier.predict(X)
    assert len(y) == 1
    return self.inverse_label_vectorizer[y[0]]
def l2_norm(dataset, **kwargs):
    return prep.Normalizer(norm='l2', copy=True).fit_transform(dataset)
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples.

    Returns
    -------
    self : detector
        Return self.
    """
    X = check_array(X)

    if not self.assume_normalized:
        self._normalizer = Normalizer().fit(X)
        X = self._normalizer.transform(X)

    mean = np.mean(X, axis=0)
    self.mean_direction_ = mean / np.linalg.norm(mean)

    self.y_score_ = self.anomaly_score(X)
    df, loc, scale = chi2.fit(self.y_score_)
    self.threshold_ = chi2.ppf(1.0 - self.fpr, df, loc, scale)

    return self
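anomaly_score() is not included in this excerpt; under the assumption that it returns one score per sample, the fitted chi-square threshold would typically be used like this (detector is a hypothetical fitted instance).

scores = detector.anomaly_score(X_new)
is_outlier = scores > detector.threshold_                 # True where the score exceeds the 1 - fpr quantile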
def __init__(self):
    self.scaler = preprocessing.StandardScaler()
    self.normer = preprocessing.Normalizer()
def featuresByLSA(features, ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
def test_Normalizer():
    '''
    Test sklearn.preprocessing.Normalizer with the L2 norm.
    :return: None
    '''
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [1, 3, 5, 2, 4],
         [2, 4, 1, 3, 5]]
    print("before transform:", X)
    normalizer = Normalizer(norm='l2')
    print("after transform:", normalizer.transform(X))
def test_normalizer():
    from sklearn.preprocessing import Normalizer
    arr = np.array([[3, -1],
                    [-4, 2]])
    print(Normalizer().fit_transform(arr))
    # [[ 0.9486833  -0.31622777]
    #  [-0.89442719  0.4472136 ]]