def _char_wb_ngrams(self, text_document):
    """Whitespace sensitive char-n-gram tokenization.
    Tokenize text_document into a sequence of character n-grams
    excluding any whitespace (operating only inside word boundaries)"""
    # normalize white spaces
    text_document = self._white_spaces.sub(" ", text_document)
    min_n, max_n = self.ngram_range
    ngrams = []
    for w in text_document.split():
        w = ' ' + w + ' '
        w_len = len(w)
        for n in xrange(min_n, max_n + 1):
            offset = 0
            ngrams.append(w[offset:offset + n])
            while offset + n < w_len:
                offset += 1
                ngrams.append(w[offset:offset + n])
            if offset == 0:  # count a short word (w_len < n) only once
                break
    return ngrams
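As context for the snippet above, which appears to be the private analyzer from scikit-learn's text vectorizers, here is a minimal sketch of how the same whitespace-bounded character n-grams are usually obtained through the public API; the example string and parameters are made up:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))
analyzer = vectorizer.build_analyzer()
print(analyzer("hi there"))  # n-grams taken inside space-padded words, e.g. ' h', 'hi', 'i ', ' hi', ...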
Python normalize() usage examples (source code)
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart
    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.
    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
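A quick standalone check of the accent stripping above (assuming the function and its unicodedata import are in scope):

print(strip_accents_unicode(u'élève'))          # -> 'eleve' (combining accents removed after NFKD)
print(strip_accents_unicode('already ascii'))   # returned unchanged via the fast path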
bis_avg.py (source file)
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def pool(biz_dict, vlad_dict, mode):
    if mode == 'train':
        y_dict = read_y()
    y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])
    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
        # vlad = preprocessing.normalize(vlad)
        # print(vlad.shape)
        # feat = np.concatenate([avg, vlad], axis=0)
        # feat = preprocessing.Normalizer().fit_transform(feat)
        # feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad
        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))
    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
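One note on the accumulation pattern above: growing x with repeated np.vstack copies the array on every iteration. A common alternative (a sketch with made-up shapes) collects the rows in a list and stacks once at the end:

import numpy as np

rows = [np.random.rand(9) for _ in range(100)]   # hypothetical per-business feature vectors
x = np.vstack(rows)                              # one allocation instead of one per iteration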
def normalizeEnc(enc, method):
    """
    normalize encoding w. global normalization scheme(s)
    parameters:
        enc: the encoding vector to normalize
        method:
            'ssr': signed square root
            'l2g': global l2 normalization
    """
    # ssr-normalization (kinda hellinger-normalization)
    if 'ssr' in method:
        enc = np.sign(enc) * np.sqrt(np.abs(enc))
    if 'l2g' in method:
        enc = preprocessing.normalize(enc)
    return enc
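A minimal usage sketch, assuming the function and its numpy / sklearn.preprocessing imports are in scope; the encoding is kept as a 1 x d row vector because preprocessing.normalize expects 2-D input:

import numpy as np

enc = np.random.rand(1, 128)                  # hypothetical 1 x d encoding
enc_n = normalizeEnc(enc, method='ssr,l2g')   # signed square root, then global L2 normalization
print(np.linalg.norm(enc_n))                  # ~1.0 after 'l2g'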
def vlad(data, means, assignments, components,
         normalize=['l2c']):
    """
    compute 'vector of locally aggregated descriptors'
    """
    def encode(k):
        uk_ = assignments[:, k].T.dot(data)
        clustermass = assignments[:, k].sum()
        if clustermass > 0:
            uk_ -= clustermass * means[k]
        if 'l2c' in normalize:
            n = max(math.sqrt(np.sum(uk_ * uk_)), 1e-12)
            uk_ /= n
        return uk_
    uk = [encode(k) for k in range(components)]  # list comprehension so the result is concatenable on Python 3 as well
    uk = np.concatenate(uk, axis=0).reshape(1, -1)
    return uk
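A toy invocation of the function above; every shape here is hypothetical (n descriptors of dimension d, k clusters, hard one-hot assignments):

import math
import numpy as np

n, d, k = 100, 16, 8
data = np.random.rand(n, d)
means = np.random.rand(k, d)
assignments = np.eye(k)[np.random.randint(0, k, n)]   # n x k one-hot assignment matrix

v = vlad(data, means, assignments, components=k)
print(v.shape)   # (1, k * d): concatenated, per-cluster L2-normalized residuals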
def predict(self, X):
    """Predict the class labels for the provided data
    Parameters
    ----------
    X : scipy.sparse matrix, shape (n_test_samples, vocab_size)
        Test samples.
    Returns
    -------
    y : array of shape [n_samples]
        Class labels for each data sample.
    """
    X = check_array(X, accept_sparse='csr', copy=True)
    X = normalize(X, norm='l1', copy=False)
    dist = self._pairwise_wmd(sp.sparse.csr_matrix(X))
    return super(WordMoversKNN, self).predict(dist)
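For reference, the L1 row normalization applied before computing Word Mover's Distances turns each bag-of-words row into a distribution that sums to one; a tiny standalone check:

import numpy as np
from sklearn.preprocessing import normalize

X = np.array([[2., 2., 0.],
              [1., 3., 0.]])
print(normalize(X, norm='l1'))   # rows sum to 1: [[0.5 0.5 0. ], [0.25 0.75 0. ]]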
def feed_forward(self, X):
    X = np.asarray(X)
    for index, (matrix, b) in enumerate(zip(self.W[:-1], self.b)):
        size_output = self.topology[index + 1]
        if index == 0:
            X = normalize(X[:, np.newaxis], axis=0).ravel()
            dot_ = np.dot(matrix, X)
        else:
            dot_ = np.dot(matrix, output)
        output = self._activation_(dot_ + b, size_output)
    self.output = output[0]
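A small check of the input normalization used in the first layer above: for a 1-D input vector, normalize(X[:, np.newaxis], axis=0).ravel() is equivalent to dividing by the vector's L2 norm (assuming a non-zero input):

import numpy as np
from sklearn.preprocessing import normalize

X = np.array([3., 4.])
a = normalize(X[:, np.newaxis], axis=0).ravel()
b = X / np.linalg.norm(X)
print(a, b)   # both [0.6 0.8]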
def save_mean_representations(model, model_filename, X, labels, pred_file):
    n_items, dv = X.shape
    n_classes = model.n_classes
    n_topics = model.d_t
    # try normalizing input vectors
    test_X = normalize(np.array(X, dtype='float32'), axis=1)
    model.load_params(model_filename)
    # evaluate bound on test set
    item_mus = []
    for item in range(n_items):
        y = labels[item]
        # save the mean document representation
        r_mu = model.get_mean_doc_rep(test_X[item, :], y)
        item_mus.append(np.array(r_mu))
    # write all the test doc representations to file
    if pred_file is not None and n_topics > 1:
        np.savez_compressed(pred_file, X=np.array(item_mus), y=labels)
def predict_image(self, test_img):
    """
    predicts classes of input image
    :param test_img: filepath to image to predict on
    :return: segmented result
    """
    img = np.array(rgb2gray(imread(test_img).astype('float')).reshape(5, 216, 160)[-2]) / 256
    plist = []
    # create patches from an entire slice
    img_1 = adjust_sigmoid(img).astype(float)
    edges_1 = adjust_sigmoid(img, inv=True).astype(float)
    edges_2 = img_1
    edges_5_n = normalize(laplace(img_1))
    edges_5_n = img_as_float(img_as_ubyte(edges_5_n))
    plist.append(extract_patches_2d(edges_1, (23, 23)))
    plist.append(extract_patches_2d(edges_2, (23, 23)))
    plist.append(extract_patches_2d(edges_5_n, (23, 23)))
    patches = np.array(list(zip(np.array(plist[0]), np.array(plist[1]), np.array(plist[2]))))
    # predict classes of each pixel based on model
    full_pred = self.model.predict_classes(patches)
    fp1 = full_pred.reshape(194, 138)
    return fp1
def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate, _label):
    """ Remove duplicated rows, using every column except the label as the duplicate key.
    Args:
        params:
            * _df_csv_read_ori : pandas dataframe
            * _drop_duplicate : flag controlling whether duplicate removal is applied
            * _label : name of the label column (excluded from the duplicate check)
    Returns:
        Preprocessing Dataframe
    """
    if _drop_duplicate == None or _drop_duplicate == 'null' or _drop_duplicate == False:
        logging.info("No Duplicate")
        result_df = _df_csv_read_ori
    else:
        cell_features = _df_csv_read_ori.columns.tolist()
        cell_features.remove(_label)
        result_df = _df_csv_read_ori.drop_duplicates(cell_features, keep="first")
        logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index) - len(result_df.index)))
        temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
        result_df.to_csv(self.data_src_path + "/backup/" + temp_duplicate_filename)
    return result_df
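A minimal standalone sketch of the pandas call this method wraps; the column names here are made up:

import pandas as pd

df = pd.DataFrame({'f1': [1, 1, 2], 'f2': [5, 5, 6], 'label': [0, 1, 0]})
deduped = df.drop_duplicates(['f1', 'f2'], keep='first')   # label column excluded from the key
print(len(df) - len(deduped))   # 1 duplicated row removed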
def normalize(datastream: DataStream) -> DataStream:
    """
    Normalize the samples of a datastream column-wise (each feature is scaled to unit L2 norm).
    :param datastream: input DataStream
    :return: DataStream with normalized samples
    """
    result = DataStream.from_datastream(input_streams=[datastream])
    if datastream.data is None or len(datastream.data) == 0:
        result.data = []
        return result
    input_data = np.array([i.sample for i in datastream.data])
    data = preprocessing.normalize(input_data, axis=0)
    result.data = [DataPoint.from_tuple(start_time=v.start_time, sample=data[i])
                   for i, v in enumerate(datastream.data)]
    return result
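Note that axis=0 normalizes each feature column (not each sample) to unit L2 norm; a tiny check of that behaviour:

import numpy as np
from sklearn import preprocessing

X = np.array([[3., 0.],
              [4., 2.]])
print(preprocessing.normalize(X, axis=0))   # columns have unit norm: [[0.6 0.], [0.8 1.]]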
def compute_preprocessor(self, method):
    self.data = {}
    if method == 'none':
        self.data = self.orig_data
    elif method == 'min_max':
        transform = preprocessing.MinMaxScaler()
        self.data['X_train'] = transform.fit_transform(self.orig_data['X_train'])
        self.data['X_val'] = transform.transform(self.orig_data['X_val'])
        self.data['X_test'] = transform.transform(self.orig_data['X_test'])
    elif method == 'scaled':
        self.data['X_train'] = preprocessing.scale(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.scale(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.scale(self.orig_data['X_test'])
    elif method == 'normalized':
        self.data['X_train'] = preprocessing.normalize(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.normalize(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.normalize(self.orig_data['X_test'])
    self.data['y_train'] = self.orig_data['y_train']
    self.data['y_val'] = self.orig_data['y_val']
    self.data['y_test'] = self.orig_data['y_test']
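For reference, a compact sketch of what the three non-trivial branches above do to the same toy matrix:

import numpy as np
from sklearn import preprocessing

X = np.array([[1., 10.], [2., 20.], [3., 30.]])
print(preprocessing.MinMaxScaler().fit_transform(X))   # 'min_max': each column rescaled to [0, 1]
print(preprocessing.scale(X))                          # 'scaled': each column zero mean, unit variance
print(preprocessing.normalize(X))                      # 'normalized': each row rescaled to unit L2 norm

Note that in the method above the MinMaxScaler is fitted on the training split only and then applied to the validation and test splits, which avoids leaking their statistics into the transform.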
def compute_preprocessor(self, method):
    self.data = {}
    if method == 'min_max':
        transform = preprocessing.MinMaxScaler()
        self.data['X_train'] = transform.fit_transform(self.orig_data['X_train'])
        self.data['X_val'] = transform.transform(self.orig_data['X_val'])
        self.data['X_test'] = transform.transform(self.orig_data['X_test'])
    elif method == 'scaled':
        self.data['X_train'] = preprocessing.scale(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.scale(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.scale(self.orig_data['X_test'])
    elif method == 'normalized':
        self.data['X_train'] = preprocessing.normalize(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.normalize(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.normalize(self.orig_data['X_test'])
    self.data['y_train'] = self.orig_data['y_train']
    self.data['y_val'] = self.orig_data['y_val']
    self.data['y_test'] = self.orig_data['y_test']
def get_sils_matrix(method, scores, wordlist):
    ''' See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.
    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method == 'direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.matrix(sims), norm='l2')
        sils = 1 - sims
    elif method == 'dict_cosine':  # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i, {}), scores.get(j, {})) for j in wordlist] for i in wordlist])
    elif method == 'dict_JS':  # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i, {}), scores.get(j, {}))[0] for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = next(iter(scores.values())).shape[0]  # dimensionality of the word vectors
        sils = np.array([[cosine(scores.get(i, np.zeros(d)), scores.get(j, np.zeros(d))) for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
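The vec_cosine branch relies on a cosine distance (1 minus cosine similarity), presumably scipy.spatial.distance.cosine given the unqualified name; a tiny standalone check:

import numpy as np
from scipy.spatial.distance import cosine

v1, v2 = np.array([1., 0.]), np.array([1., 1.])
print(cosine(v1, v2))   # ~0.293, i.e. 1 - cos(45 degrees)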
def __init__(self, n_clusters=5, posterior_type='soft', force_weights=None,
             n_init=10, n_jobs=1, max_iter=300, verbose=False,
             init='random-class', random_state=None, tol=1e-6,
             copy_x=True, normalize=True):
    self.n_clusters = n_clusters
    self.posterior_type = posterior_type
    self.force_weights = force_weights
    self.n_init = n_init
    self.n_jobs = n_jobs
    self.max_iter = max_iter
    self.verbose = verbose
    self.init = init
    self.random_state = random_state
    self.tol = tol
    self.copy_x = copy_x
    self.normalize = normalize
    # results from algorithm (attribute names match those set in fit)
    self.cluster_centers_ = None
    self.labels_ = None
    self.inertia_ = None
    self.weights_ = None
    self.concentrations_ = None
    self.posterior_ = None
def fit(self, X, y=None):
    """Compute mixture of von Mises Fisher clustering.
    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
    """
    if self.normalize:
        X = normalize(X)
    self._check_force_weights()
    random_state = check_random_state(self.random_state)
    X = self._check_fit_data(X)
    (self.cluster_centers_, self.labels_, self.inertia_, self.weights_,
     self.concentrations_, self.posterior_) = movMF(
        X, self.n_clusters, posterior_type=self.posterior_type,
        force_weights=self.force_weights, n_init=self.n_init,
        n_jobs=self.n_jobs, max_iter=self.max_iter,
        verbose=self.verbose, init=self.init,
        random_state=random_state,
        tol=self.tol, copy_x=self.copy_x
    )
    return self
def transform(self, X, y=None):
    """Transform X to a cluster-distance space.
    In the new space, each dimension is the cosine distance to the cluster
    centers. Note that even if X is sparse, the array returned by
    `transform` will typically be dense.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to transform.
    Returns
    -------
    X_new : array, shape [n_samples, k]
        X transformed in the new space.
    """
    if self.normalize:
        X = normalize(X)
    check_is_fitted(self, 'cluster_centers_')
    X = self._check_test_data(X)
    return self._transform(X)
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.
    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.
    Note: Does not check that each point is on the sphere.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.
    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    if self.normalize:
        X = normalize(X)
    check_is_fitted(self, 'cluster_centers_')
    X = self._check_test_data(X)
    return _labels_inertia(X, self.cluster_centers_)[0]
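Putting the estimator methods above together, a hedged end-to-end sketch: the samples are projected onto the unit sphere exactly as the normalize=True option does, and the enclosing class name is a guess (these methods look like a spherecluster-style von Mises-Fisher mixture), so the fit/predict lines are left as comments:

import numpy as np
from sklearn.preprocessing import normalize

X = np.random.rand(20, 3)
X_unit = normalize(X)                          # rows projected onto the unit sphere
print(np.linalg.norm(X_unit, axis=1))          # all ones

# clf = VonMisesFisherMixture(n_clusters=5)    # hypothetical class name for the methods above
# clf.fit(X_unit)
# labels = clf.predict(X_unit)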
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)
    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when the data
        # has been previously normalized by its L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
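The identity the test exercises can also be checked directly with the public pairwise helpers; a short sketch:

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

X = np.random.RandomState(0).random_sample((5, 4))
print(np.allclose(cosine_similarity(X), linear_kernel(normalize(X))))   # True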