def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output dimension
        # handling is correct.
        model = Pipeline([("OHE", OneHotEncoder(categorical_features=categorical_features)),
                          ("Normalizer", Normalizer())])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
Python Normalizer() example source code
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
    svd = TruncatedSVD(n_features)
    if normalize:
        if flip:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs)
        return doc_mat
    else:
        if flip:
            doc_mat = svd.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            doc_mat = svd.fit_transform(doc_vecs)
        return doc_mat
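A minimal usage sketch for decompose(); the random matrix below is just an illustrative stand-in for a real document-term matrix, and the shapes are arbitrary.

import numpy as np

doc_vecs = np.random.rand(200, 5000)                      # 200 documents, 5000 terms (illustrative)
reduced = decompose(doc_vecs, n_features=100, normalize=True)
print(reduced.shape)                                      # expected: (200, 100)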
def train(labeled_featuresets, C=1e5):
    """
    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.
    """
    feat = [featureset for featureset, label in labeled_featuresets]
    feature_vectorizer = MVectorizer.DictsVectorizer()
    X = feature_vectorizer.fit_transform(feat)
    X = Normalizer().fit_transform(X)
    label_set = set(label for featureset, label in labeled_featuresets)
    label_vectorizer = dict((label, num) for num, label in enumerate(label_set))
    y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
    # print "Training on %d examples with %d features..." % (X.shape[0], X.shape[1]),
    classifier = OneVsRestClassifier(LinearSVC(loss='squared_hinge', penalty='l2', dual=True, tol=1e-5, C=C))
    classifier.fit(X, y)
    # print "done"
    return scikit_classifier(feature_vectorizer, label_vectorizer, classifier)
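A minimal usage sketch for train(); the featuresets below are purely hypothetical, and MVectorizer.DictsVectorizer / scikit_classifier are assumed to be the project-specific helpers referenced above (not shown in this excerpt).

# Hypothetical (featureset, label) tuples, matching the docstring's input format.
labeled = [({'good': 2, 'great': 1}, 'pos'),
           ({'awful': 3}, 'neg'),
           ({'fine': 1, 'good': 1}, 'pos')]
clf = train(labeled, C=1e5)
print(clf.classify({'good': 1, 'awful': 1}))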
bis_avg.py — from project kaggle-yelp-restaurant-photo-classification (author: u1234x1234)
def pool(biz_dict, vlad_dict, mode):
    if mode == 'train':
        y_dict = read_y()
    y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])

    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
        # vlad = preprocessing.normalize(vlad)
        # print(vlad.shape)
        # feat = np.concatenate([avg, vlad], axis=0)
        # feat = preprocessing.Normalizer().fit_transform(feat)
        # feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad
        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))

    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
def main():
    features = []

    # `list` and `n_clusters` are assumed to be defined at module level:
    # a list of image paths and the desired number of clusters.
    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy=False).fit_transform(features)

    km = KMeans(
        init='k-means++',
        n_clusters=n_clusters,
    )
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0
    for i in list:
        filename = i.split('/')[-1]
        print(filename, km.labels_[cnt])
        shutil.copyfile(i, './result/' + str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
def convert(model, input_features, output_features):
    """Convert a normalizer model to the protobuf spec.

    Parameters
    ----------
    model: Normalizer
        A Normalizer.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Normalizer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'norm'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the normalizer parameters
    _normalizer_spec = spec.normalizer
    if model.norm == 'l1':
        _normalizer_spec.normType = _proto__normalizer.L1
    elif model.norm == 'l2':
        _normalizer_spec.normType = _proto__normalizer.L2
    elif model.norm == 'max':
        _normalizer_spec.normType = _proto__normalizer.LMax

    return _MLModel(spec)
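A minimal usage sketch for convert(); the feature names are arbitrary, and the fitted scikit-learn Normalizer stands in for whatever model would actually be converted (the tests further down exercise the same path).

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.random.random((20, 3))
scikit_model = Normalizer(norm='l2').fit(X)
mlmodel = convert(scikit_model, ['a', 'b', 'c'], 'out')   # returns an _MLModel wrapping the spec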
def truncated_svd(self):
    # https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
    svd = TruncatedSVD(self.dimensions)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    X_reduced = lsa.fit_transform(self.bag_of_words_matrix)
    print(svd.components_[0])
    print(svd.explained_variance_ratio_)
    print(svd.explained_variance_ratio_.sum())
def get_data_preprocessor_rescaling(params):
    dpr = None
    d_rescaling = params['layer_dict_list'][0]

    if params['rescaling'] == str(d_rescaling['None']) or params['rescaling'] == 'None':
        dpr = None
    elif params['rescaling'] == str(d_rescaling['MinMax']) or params['rescaling'] == 'MinMax':
        dpr = MinMaxScaler()
    elif params['rescaling'] == str(d_rescaling['Standardize']) or params['rescaling'] == 'Standardize':
        dpr = StandardScaler()
    elif params['rescaling'] == str(d_rescaling['Normalize']) or params['rescaling'] == 'Normalize':
        dpr = Normalizer()

    return dpr
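A minimal usage sketch for get_data_preprocessor_rescaling(); the params structure is inferred from the lookups above, and the integer codes in layer_dict_list are placeholders.

params = {'layer_dict_list': [{'None': 0, 'MinMax': 1, 'Standardize': 2, 'Normalize': 3}],
          'rescaling': 'Normalize'}
print(get_data_preprocessor_rescaling(params))            # -> Normalizer()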
def test_random(self):
    # Generate some random data
    X = _np.random.random(size=(50, 3))

    for param in ('l1', 'l2', 'max'):
        cur_model = Normalizer(norm=param)
        output = cur_model.fit_transform(X)

        spec = converter.convert(cur_model, ["a", "b", "c"], 'out')

        metrics = evaluate_transformer(spec,
                                       [dict(zip(["a", "b", "c"], row)) for row in X],
                                       [{"out": row} for row in output])
def test_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Normalizer(norm='l2').fit(scikit_data.data)

    spec = converter.convert(scikit_model, scikit_data.feature_names, 'out')

    input_data = [dict(zip(scikit_data.feature_names, row))
                  for row in scikit_data.data]
    output_data = [{"out": row} for row in scikit_model.transform(scikit_data.data)]

    evaluate_transformer(spec, input_data, output_data)
def make_ward_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'WARD/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
    predict_result = ward.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_spectral_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'spectral/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
    predict_result = spectral.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
# aa = Affinity Propagation
def make_aa_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'affinity_propagation/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                           max_iter=self.aa_max_iter,
                                           convergence_iter=self.aa_no_change_stop)
    predict_result = aa_clusterizator.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_birch_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'birch/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    birch = Birch(threshold=self.birch_threshold,
                  branching_factor=self.birch_branching_factor,
                  n_clusters=self.birch_clusters_count)
    predict_result = birch.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)
def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)
def word_bigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    pipeline = Pipeline([('vect', CountVectorizer(preprocessor=preprocessor,
                                                  ngram_range=(2, 2))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_bigrams', pipeline)
def char_ngrams():
    vectorizer = CountVectorizer(min_df=1,
                                 preprocessor=TextCleaner(filter_urls=True,
                                                          filter_mentions=True,
                                                          filter_hashtags=True,
                                                          lowercase=False),
                                 analyzer='char_wb',
                                 ngram_range=(4, 4))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('char_ngrams', pipeline)
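The feature builders above (word_unigrams, word_bigrams, char_ngrams, avg_spelling_error, punctuation_features) each return a (name, pipeline) tuple, which fits scikit-learn's FeatureUnion directly; the sketch below assumes that is how the surrounding project combines them.

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC

features = FeatureUnion([word_unigrams(),
                         word_bigrams(),
                         char_ngrams()])
model = Pipeline([('features', features),
                  ('clf', LinearSVC())])
# model.fit(texts, labels)                                # texts/labels come from the calling code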
def doPCA(X, output_columns_count):
    # Reduce dimensionality with truncated SVD (LSA) and L2-normalize the result.
    # Note: TruncatedSVD does not center the data, so this is not exactly PCA.
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    return X
def normalize(matrix):
    '''Normalize each row (L2 norm) of a CSR sparse matrix (it should work with most sparse matrices, though).'''
    sparsy = matrix.tocoo()
    data = [float(d) for d in sparsy.data]
    return Normalizer().transform(csr_matrix((data, (sparsy.row, sparsy.col))))
#
# Simple tests
#
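A quick check of normalize() on a tiny CSR matrix with arbitrary values; each returned row should have unit L2 norm.

from scipy.sparse import csr_matrix

m = csr_matrix([[3.0, 4.0], [0.0, 2.0]])
print(normalize(m).toarray())                             # [[0.6, 0.8], [0.0, 1.0]]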
def batch_classify(self, featuresets):
    X = self.feature_vectorizer.transform(featuresets)
    X = Normalizer().fit_transform(X)
    y = self.classifier.predict(X)
    return [self.inverse_label_vectorizer[cls] for cls in y]
def classify(self, featureset):
    X = self.feature_vectorizer.transform([featureset])
    X = Normalizer().fit_transform(X)
    y = self.classifier.predict(X)
    assert len(y) == 1
    return self.inverse_label_vectorizer[y[0]]
def l2_norm(dataset, **kwargs):
    return prep.Normalizer(norm='l2', copy=True).fit_transform(dataset)
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples.

    Returns
    -------
    self : detector
        Return self.
    """
    X = check_array(X)

    if not self.assume_normalized:
        self._normalizer = Normalizer().fit(X)
        X = self._normalizer.transform(X)

    mean = np.mean(X, axis=0)
    self.mean_direction_ = mean / np.linalg.norm(mean)

    self.y_score_ = self.anomaly_score(X)
    df, loc, scale = chi2.fit(self.y_score_)
    self.threshold_ = chi2.ppf(1.0 - self.fpr, df, loc, scale)

    return self
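anomaly_score() is not included in this excerpt; under the assumption that it returns one score per sample, the fitted chi-square threshold would typically be used like this (detector is a hypothetical fitted instance).

scores = detector.anomaly_score(X_new)
is_outlier = scores > detector.threshold_                 # True where the score exceeds the 1 - fpr quantile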
def __init__(self):
    self.scaler = preprocessing.StandardScaler()
    self.normer = preprocessing.Normalizer()
def featuresByLSA(features, ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
def test_Normalizer():
    '''
    Test sklearn.preprocessing.Normalizer with the L2 norm.
    :return: None
    '''
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [1, 3, 5, 2, 4],
         [2, 4, 1, 3, 5]]
    print("before transform:", X)
    normalizer = Normalizer(norm='l2')
    print("after transform:", normalizer.transform(X))
def test_normalizer():
    from sklearn.preprocessing import Normalizer
    arr = np.array([[3, -1],
                    [-4, 2]])
    print(Normalizer().fit_transform(arr))
    # [[ 0.9486833  -0.31622777]
    #  [-0.89442719  0.4472136 ]]