Python normalize() usage examples

STFIWF.py (project: 2016_CCFsougou, author: dhdsjy)
def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        excluding any whitespace (operating only inside word boundaries)"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []
        for w in text_document.split():
            w = ' ' + w + ' '
            w_len = len(w)
            for n in range(min_n, max_n + 1):
                offset = 0
                ngrams.append(w[offset:offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams.append(w[offset:offset + n])
                if offset == 0:  # count a short word (w_len < n) only once
                    break
        return ngrams
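For intuition, here is a minimal standalone sketch of the same tokenizer (Python 3; the hypothetical char_wb_ngrams below stands in for the method, with the class attributes self._white_spaces and self.ngram_range replaced by a local regex and plain arguments):

import re

def char_wb_ngrams(text, min_n=2, max_n=3):
    # collapse runs of whitespace, as self._white_spaces.sub(" ", ...) does above
    text = re.sub(r"\s+", " ", text)
    ngrams = []
    for w in text.split():
        w = ' ' + w + ' '
        for n in range(min_n, max_n + 1):
            # slide a window of size n over the space-padded word
            for offset in range(max(len(w) - n + 1, 1)):
                ngrams.append(w[offset:offset + n])
            if len(w) <= n:  # a short word is counted only once
                break
    return ngrams

print(char_wb_ngrams("hi"))  # [' h', 'hi', 'i ', ' hi', 'hi ']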
STFIWF.py (project: 2016_CCFsougou2, author: dhdsjy)
import unicodedata

def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
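For example, on Python 3 (where every str is unicode):

print(strip_accents_unicode('café'))      # 'cafe'
print(strip_accents_unicode('Ångström'))  # 'Angstrom'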
bis_avg.py (project: kaggle-yelp-restaurant-photo-classification, author: u1234x1234)
def pool(biz_dict, vlad_dict, mode):
    # average-pool the photo-level features of each business; stack VLAD vectors alongside
    if mode == 'train':
        y_dict = read_y()
    y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])

    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
#        vlad = preprocessing.normalize(vlad)
#        print(vlad.shape)
#        feat = np.concatenate([avg, vlad], axis=0)
#        feat = preprocessing.Normalizer().fit_transform(feat)
#        feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad

        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))        
    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
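The pooling itself is just a mean over photo-level feature vectors, optionally followed by the L2 normalization that the commented-out lines experiment with; a toy illustration (made-up numbers):

import numpy as np
from sklearn import preprocessing

value = [np.array([1.0, 3.0]), np.array([3.0, 5.0])]  # two photo features for one business
avg = np.array(value).sum(axis=0) / len(value)        # array([2., 4.])
print(preprocessing.normalize(avg.reshape(1, -1)))    # [[0.4472136  0.89442719]]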
encoding.py (project: wi_wacv14, author: VChristlein)
def normalizeEnc(enc, method):
    """
    normalize encoding w. global normalization scheme(s)

    parameters:
        enc: the encoding vector to normalize
        method:
            'ssr': signed square root
            'l2g': global l2 normalization
    """
    # ssr-normalization (kinda hellinger-normalization)
    if 'ssr' in method:
        enc = np.sign(enc) * np.sqrt(np.abs(enc))

    if 'l2g' in method:
        enc = preprocessing.normalize(enc)  # sklearn default: row-wise L2 normalization

    return enc
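Assuming numpy and sklearn.preprocessing are imported at module level as in the original file, a quick check of both schemes combined:

import numpy as np

enc = np.array([[1.0, -4.0]])
print(normalizeEnc(enc, method='ssr+l2g'))
# 'ssr' gives sign(x)*sqrt(|x|) = [[1., -2.]]; 'l2g' then scales it to unit norm:
# [[ 0.4472136 -0.89442719]]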
encoding.py (project: wi_wacv14, author: VChristlein)
def vlad(data, means, assignments, components, 
               normalize=['l2c']):
    """
    compute 'vector of locally aggregated descriptors'
    """
    def encode(k):
        uk_ = assignments[:,k].T.dot(data)        

        clustermass = assignments[:,k].sum()
        if clustermass > 0:
            uk_ -= clustermass * means[k]

        if 'l2c' in normalize:
            n = max(math.sqrt(np.sum(uk_ * uk_)), 1e-12)
            uk_ /= n

        return uk_

    # materialize as a list so np.concatenate works on Python 3 (map returns an iterator)
    uk = [encode(k) for k in range(components)]

    uk = np.concatenate(uk, axis=0).reshape(1, -1)

    return uk
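A toy call with K=2 hard assignments in 2-D (made-up numbers; assumes the module-level imports of the original file, numpy as np and math):

import numpy as np

data = np.array([[1., 0.], [0., 1.], [2., 2.]])
assignments = np.array([[1., 0.], [0., 1.], [1., 0.]])  # N x K, one-hot
means = np.array([[0.5, 0.5], [0., 0.]])
print(vlad(data, means, assignments, components=2))
# cluster 0 residual: [3,2] - 2*[0.5,0.5] = [2,1], then L2-normalized
# cluster 1 residual: [0,1], already unit norm -> output shape (1, 4)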
word_movers_knn.py (project: sentence-classification, author: bgmartins)
def predict(self, X):
        """Predict the class labels for the provided data
        Parameters
        ----------
        X : scipy.sparse matrix, shape (n_test_samples, vocab_size)
            Test samples.

        Returns
        -------
        y : array of shape [n_samples]
            Class labels for each data sample.
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)  # each row becomes a distribution over the vocabulary
        dist = self._pairwise_wmd(sp.sparse.csr_matrix(X))
        return super(WordMoversKNN, self).predict(dist)
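The norm='l1' step is what turns raw counts into the histograms that the word mover's distance expects; standalone:

import numpy as np
from sklearn.preprocessing import normalize

counts = np.array([[1., 3.], [2., 2.]])
print(normalize(counts, norm='l1'))
# [[0.25 0.75]
#  [0.5  0.5 ]]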
neural_network.py (project: FlapPyBio, author: michael-iuzzolino)
def feed_forward(self, X):
        X = np.asarray(X)
        for index, (matrix, b) in enumerate(zip(self.W[:-1], self.b)):

            size_output = self.topology[index+1]

            if index == 0:

                X = normalize(X[:, np.newaxis], axis=0).ravel()  # L2-normalize the 1-D input (sklearn needs 2-D)
                dot_ = np.dot(matrix, X)

            else:
                dot_ = np.dot(matrix, output)

            output = self._activation_(dot_ + b, size_output)

        self.output = output[0]
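The reshape/normalize/ravel idiom is the standard way to unit-scale a 1-D vector with sklearn, which only accepts 2-D input; for example:

import numpy as np
from sklearn.preprocessing import normalize

x = np.array([3., 4.])
print(normalize(x[:, np.newaxis], axis=0).ravel())  # [0.6 0.8], i.e. x / ||x||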
run_ngtm.py (project: neural_topic_models, author: dallascard)
def save_mean_representations(model, model_filename, X, labels, pred_file):
    n_items, dv = X.shape
    n_classes = model.n_classes
    n_topics = model.d_t

    # try normalizing input vectors
    test_X = normalize(np.array(X, dtype='float32'), axis=1)

    model.load_params(model_filename)

    # evaluate bound on test set
    item_mus = []
    for item in range(n_items):
        y = labels[item]

        # save the mean document representation
        r_mu = model.get_mean_doc_rep(test_X[item, :], y)
        item_mus.append(np.array(r_mu))

    # write all the test doc representations to file
    if pred_file is not None and n_topics > 1:
        np.savez_compressed(pred_file, X=np.array(item_mus), y=labels)
edge_detector_cnn.py (project: nn-segmentation-for-lar, author: cvdlab)
def predict_image(self, test_img):
        """
        predicts classes of input image
        :param test_img: filepath to image to predict on
        :param show: displays segmentation results
        :return: segmented result
        """
        img = np.array( rgb2gray( imread( test_img ).astype( 'float' ) ).reshape( 5, 216, 160 )[-2] ) / 256

        plist = []

        # create patches from an entire slice
        img_1 = adjust_sigmoid( img ).astype( float )
        edges_1 = adjust_sigmoid( img, inv=True ).astype( float )
        edges_2 = img_1
        edges_5_n = normalize( laplace( img_1 ) )
        edges_5_n = img_as_float( img_as_ubyte( edges_5_n ) )

        plist.append( extract_patches_2d( edges_1, (23, 23) ) )
        plist.append( extract_patches_2d( edges_2, (23, 23) ) )
        plist.append( extract_patches_2d( edges_5_n, (23, 23) ) )
        patches = np.array(list(zip(plist[0], plist[1], plist[2])))  # list() needed on Python 3

        # predict classes of each pixel based on model
        full_pred = self.model.predict_classes( patches )
        fp1 = full_pred.reshape( 194, 138 )
        return fp1
data_node_frame.py (project: skp_edu_docker, author: TensorMSA)
def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate , _label):
        """ Label? ??? ??? ??? ??? ??? Row ??? ????.
        Args:
          params:
            * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
            * _df_csv_read_ori : pandas dataframe
            * _label
        Returns:
          Preprocessing Dataframe
        """
        if _drop_duplicate is None or _drop_duplicate == 'null' or _drop_duplicate is False:
            logging.info("No Duplicate")
            result_df = _df_csv_read_ori
        else:
            cell_features = _df_csv_read_ori.columns.tolist()
            cell_features.remove(_label)
            result_df = _df_csv_read_ori.drop_duplicates(cell_features, keep="first")
            logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index)-len(result_df.index)))
            temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
            result_df.to_csv(self.data_src_path + "/backup/" + temp_duplicate_filename)
        return result_df
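The core of it is pandas' drop_duplicates restricted to the feature columns; a quick standalone demo (made-up frame):

import pandas as pd

df = pd.DataFrame({'f1': [1, 1, 2], 'f2': [5, 5, 6], 'label': [0, 1, 0]})
print(df.drop_duplicates(['f1', 'f2'], keep='first'))
#    f1  f2  label
# 0   1   5      0
# 2   2   6      0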
vector.py (project: CerebralCortex-2.0-legacy, author: MD2Korg)
def normalize(datastream: DataStream) -> DataStream:
    """

    :param datastream:
    :return:
    """
    result = DataStream.from_datastream(input_streams=[datastream])
    if datastream.data is None or len(datastream.data) == 0:
        result.data = []
        return result

    input_data = np.array([i.sample for i in datastream.data])

    data = preprocessing.normalize(input_data, axis=0)

    result.data = [DataPoint.from_tuple(start_time=v.start_time, sample=data[i])
                   for i, v in enumerate(datastream.data)]

    return result
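Note the axis=0: each feature is scaled to unit norm across the whole stream, unlike sklearn's row-wise default. The difference, standalone:

import numpy as np
from sklearn import preprocessing

x = np.array([[3., 0.],
              [4., 1.]])
print(preprocessing.normalize(x, axis=0))  # per column: [[0.6 0.] [0.8 1.]]
print(preprocessing.normalize(x, axis=1))  # per row (the default)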
random_features_helper.py (project: hyperband_benchmarks, author: lishal)
def compute_preprocessor(self,method):
        self.data={}
        if method=='none':
            self.data=self.orig_data
        elif method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
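The three preprocessors hold different things fixed: min_max maps each feature to [0, 1], scale gives each feature zero mean and unit variance, and normalize gives each sample unit L2 norm. A quick contrast on toy data:

import numpy as np
from sklearn import preprocessing

x = np.array([[0., 10.],
              [2., 20.]])
print(preprocessing.scale(x))      # per-column standardization: [[-1. -1.] [ 1.  1.]]
print(preprocessing.normalize(x))  # per-row unit norm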
kernel_lsqr_helper.py (project: hyperband_benchmarks, author: lishal)
def compute_preprocessor(self,method):
        self.data={}
        if method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
paraphrase.py (project: cluster_paraphrases, author: acocos)
def get_sils_matrix(method, scores, wordlist):
    ''' See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.

    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method == 'direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.matrix(sims), norm='l2')
        sils = 1-sims
    elif method == 'dict_cosine': # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i,{}),scores.get(j,{})) for j in wordlist] for i in wordlist])
    elif method == 'dict_JS': # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i,{}),scores.get(j,{}))[0] for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = list(scores.values())[0].shape[0]  # list() needed on Python 3
        sils = np.array([[cosine(scores.get(i,np.zeros(d)), scores.get(j,np.zeros(d))) for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s\n' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
von_mises_fisher_mixture.py (project: spherecluster, author: clara-labs)
def __init__(self, n_clusters=5, posterior_type='soft', force_weights=None,
                 n_init=10, n_jobs=1, max_iter=300, verbose=False,
                 init='random-class', random_state=None, tol=1e-6,
                 copy_x=True, normalize=True):
        self.n_clusters = n_clusters
        self.posterior_type = posterior_type
        self.force_weights = force_weights
        self.n_init = n_init
        self.n_jobs = n_jobs
        self.max_iter = max_iter
        self.verbose = verbose
        self.init = init
        self.random_state = random_state
        self.tol = tol
        self.copy_x = copy_x
        self.normalize = normalize

        # results from algorithm
        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.weights_ = None
        self.concentrations_ = None
        self.posterior_ = None
von_mises_fisher_mixture.py (project: spherecluster, author: clara-labs)
def fit(self, X, y=None):
        """Compute mixture of von Mises Fisher clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        """
        if self.normalize:
            X = normalize(X)

        self._check_force_weights()
        random_state = check_random_state(self.random_state)
        X = self._check_fit_data(X)

        (self.cluster_centers_, self.labels_, self.inertia_, self.weights_,
         self.concentrations_, self.posterior_) = movMF(
                X, self.n_clusters, posterior_type=self.posterior_type,
                force_weights=self.force_weights, n_init=self.n_init,
                n_jobs=self.n_jobs, max_iter=self.max_iter,
                verbose=self.verbose, init=self.init,
                random_state=random_state,
                tol=self.tol, copy_x=self.copy_x
            )

        return self
von_mises_fisher_mixture.py (project: spherecluster, author: clara-labs)
def transform(self, X, y=None):
        """Transform X to a cluster-distance space.
        In the new space, each dimension is the cosine distance to the cluster
        centers.  Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        if self.normalize:
            X = normalize(X)

        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        return self._transform(X)
von_mises_fisher_mixture.py (project: spherecluster, author: clara-labs)
def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.
        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Note:  Does not check that each point is on the sphere.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        if self.normalize:
            X = normalize(X)

        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return _labels_inertia(X, self.cluster_centers_)[0]
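Putting the four methods together, a minimal usage sketch of the class defined above (hypothetical data; assumes the module's movMF machinery is importable as in the original file):

import numpy as np
from sklearn.preprocessing import normalize

X = normalize(np.random.RandomState(0).randn(100, 16))  # points on the unit sphere
vmf = VonMisesFisherMixture(n_clusters=3, posterior_type='soft')
vmf.fit(X)                 # estimates cluster_centers_, weights_, concentrations_
labels = vmf.predict(X)    # index of the closest center per sample
dists = vmf.transform(X)   # cosine distance to every center, shape (100, 3)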
test_pairwise.py (project: Parallel-SGD, author: angadgill)
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when the
        # data has been L2-normalized beforehand.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
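The identity under test is cos(x, y) = <x/||x||, y/||y||>, so normalizing first reduces the cosine kernel to a plain dot product; a standalone check:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import normalize

X = np.random.RandomState(0).random_sample((5, 4))
assert np.allclose(cosine_similarity(X), linear_kernel(normalize(X)))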

