from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import Binarizer
from termcolor import colored

def new(n_feature=128):
    vectorizer = CountVectorizer(
        encoding='utf-8',
        ngram_range=(1, 1),  # unigrams only
        max_features=n_feature,
        binary=True
    )
    # Fill the gap (missing expected tags)
    # ---
    # Hypothesis: some tags are related to each other, so we smooth
    # the missing values with matrix factorisation.
    smoother = NMF(n_components=n_feature)
    # Binarise the vector's individual values
    binariser = Binarizer(copy=True)
    # Pipeline: CountVectorizer => NMF smoother => Binarizer
    print(colored('Taghasher model created', 'yellow'))
    return [vectorizer, smoother, binariser]
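A minimal usage sketch of the returned [vectorizer, smoother, binariser] pipeline, assuming the tags arrive as whitespace-joined strings; the tag_docs data and n_feature=2 below are made-up illustration, not part of the original module.

# Hypothetical usage (not from the original source): chain the three steps manually.
vectorizer, smoother, binariser = new(n_feature=2)
tag_docs = ['rock indie live', 'indie folk', 'rock metal live', 'folk acoustic']  # made-up tags
counts = vectorizer.fit_transform(tag_docs)            # binary tag/document matrix
smoothed = smoother.fit_transform(counts.toarray())    # NMF imputes related (missing) tags
hashed = binariser.fit_transform(smoothed)             # threshold back to 0/1 codes
print(hashed.shape)                                    # (4, 2)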
Example source code for the Python class Binarizer()
def sklearn_one_hot_vectorize(corpus):
    # The sklearn one-hot vectorize method
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer
    freq = CountVectorizer()
    vectors = freq.fit_transform(corpus)
    print(len(vectors.toarray()[0]))
    onehot = Binarizer()
    vectors = onehot.fit_transform(vectors.toarray())
    print(len(vectors[0]))
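A possible call with a toy corpus (the documents are made up); both printed numbers equal the vocabulary size, since binarising only clips counts above 1 down to 1.

corpus = [
    'the cat sat on the mat',
    'the dog sat',
    'cats and dogs',
]
sklearn_one_hot_vectorize(corpus)  # prints the vocabulary size twice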
def fit_voting(self):
    voting = 'soft'
    names = [
        # 'svm(word_n_grams,char_n_grams,all_caps,hashtags,punctuations,punctuation_last,emoticons,emoticon_last,'
        # 'elongated,negation_count)',
        # 'logreg(w2v_doc)',
        # 'logreg(w2v_word_avg_google)',
        'word2vec_bayes',
        'cnn_word(embedding=google)',
        'rnn_word(embedding=google)',
    ]
    classifiers = [ExternalModel({
        self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
        self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
    }) for name in names]
    all_scores = []
    for classifier in classifiers:
        scores = classifier.predict_proba(self.val_docs)
        if voting == 'hard':
            # turn class probabilities into 0/1 votes
            scores = Binarizer(threshold=1 / 3).transform(scores)
        all_scores.append(scores)
    all_scores = np.array(all_scores)
    all_scores_first, all_scores_rest = all_scores[0], all_scores[1:]
    le = LabelEncoder().fit(self.classes_)
    val_label_indexes = le.transform(self.val_labels())
    # assume w_0 = 1, as w is invariant to scaling
    w = basinhopping(
        lambda w_: -(val_label_indexes == np.argmax((
            all_scores_first + all_scores_rest * w_.reshape((len(w_), 1, 1))
        ).sum(axis=0), axis=1)).sum(), np.ones(len(classifiers) - 1), niter=1000,
        minimizer_kwargs=dict(method='L-BFGS-B', bounds=[(0, None)] * (len(classifiers) - 1))
    ).x
    w = np.hstack([[1], w])
    w /= w.sum()
    logging.info('w: {}'.format(w))
    estimator = VotingClassifier(list(zip(names, classifiers)), voting=voting, weights=w)
    estimator.le_ = le
    estimator.estimators_ = classifiers
    return 'vote({})'.format(','.join(names)), estimator
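The 'hard' branch above relies on Binarizer turning class probabilities into 0/1 votes; a standalone sketch with made-up probabilities for three documents over three classes (the 1/3 threshold is the one used in the snippet):

import numpy as np
from sklearn.preprocessing import Binarizer

scores = np.array([[0.2, 0.5, 0.3],   # made-up class probabilities
                   [0.6, 0.3, 0.1],
                   [0.1, 0.2, 0.7]])
votes = Binarizer(threshold=1 / 3).transform(scores)
print(votes)
# [[0. 1. 0.]
#  [1. 0. 0.]
#  [0. 0. 1.]]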
import numpy as np
from sklearn import preprocessing

def transform_data(x_i, le, with_fit=True):
    if isinstance(le, (preprocessing.MinMaxScaler, preprocessing.Binarizer)):
        x_i = x_i.astype(float)
        if with_fit:
            le.fit(x_i.reshape((-1, 1)))
        x_i = le.transform(x_i.reshape((-1, 1)))
    elif isinstance(le, preprocessing.LabelEncoder):
        if with_fit:
            le.fit(x_i)
        # LabelEncoder expects a 1-D array of labels
        x_i = le.transform(x_i)
    else:
        raise ValueError("unknown transform")
    return x_i.reshape(-1), le
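Hypothetical calls of the helper above, assuming a numeric column scaled with MinMaxScaler and a categorical column encoded with LabelEncoder (both arrays are made up):

ages = np.array([18, 35, 52, 70])
ages_scaled, scaler = transform_data(ages, preprocessing.MinMaxScaler())
print(ages_scaled)     # rescaled into [0, 1]

colours = np.array(['red', 'green', 'red', 'blue'])
colour_ids, encoder = transform_data(colours, preprocessing.LabelEncoder())
print(colour_ids)      # [2 1 2 0] -- classes sorted alphabetically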
def test_Binarizer():
    '''
    test Binarizer method
    :return: None
    '''
    from sklearn.preprocessing import Binarizer
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [3, 3, 3, 3, 3],
         [1, 1, 1, 1, 1]]
    print("before transform:", X)
    binarizer = Binarizer(threshold=2.5)
    print("after transform:", binarizer.transform(X))
def test_binarizer():
    import numpy as np
    from sklearn.preprocessing import Binarizer
    arr = np.array([[0, 1, 2, 3, 4]])
    print(Binarizer(threshold=2).fit_transform(arr))
    # [[0 0 0 1 1]]
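Both tests use the same rule: values strictly greater than threshold map to 1, everything else to 0, so the transform is equivalent to a plain numpy comparison (illustrative check, not from the original):

import numpy as np
from sklearn.preprocessing import Binarizer

X = np.array([[1., 2., 3., 4., 5.]])
t = 2.5
assert np.array_equal(Binarizer(threshold=t).transform(X), (X > t).astype(float))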
def main():
    t = time.time()
    img = imread(args.img_file_path)
    imgs = [img, watermark(img), rotate(img), crop(img), mirror(img)]
    imgs_norm = image_normalize(imgs)
    dataset_features = np.load('fc6.npy')
    query_start = time.time()
    query_features = extract_feature(imgs_norm)
    binarizer = preprocessing.Binarizer().fit(query_features)
    query_features = binarizer.transform(query_features)
    print(dataset_features)
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    cosine = distance.cdist(dataset_features, query_features, 'cosine')
    print(cosine.shape)
    dis = cosine
    # rank dataset images per query column, smallest distance first
    # https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
    inds_all = argsort(dis, axis=0)
    print('query cost: %f, dataset: %d, query: %d' % (time.time() - query_start, len(dataset_features), len(imgs)))
    img_names = load_image_names()
    fig, axes = plt.subplots(5, 11, figsize=(22, 10), subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.15, wspace=0.01, left=.02, right=.98, top=.92, bottom=.08)
    titles = ['original', 'watermark', 'rotate', 'crop', 'mirror']
    for i in range(len(imgs)):
        topK = []
        inds = inds_all[:, i]
        # print(inds)
        for k in range(10):
            topK.append(img_names[inds[k]])
            print(inds[k], dis[inds[k], i], img_names[inds[k]])
        original = axes[i, 0]
        original.set_title(titles[i])
        img = imgs[i]
        original.imshow(img)
        for j in range(10):
            ax = axes[i, j + 1]
            img = imread(topK[j])
            ax.imshow(img)
            title = '%d : %f' % (j + 1, dis[inds[j], i])
            ax.set_title(title)
    savePath = args.img_file_path + '_search_result.jpg'
    plt.savefig(savePath)
    print(time.time() - t)
    # os.system('open -a Preview.app -F ' + savePath)
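The core of the ranking step above, isolated with random stand-in data instead of the saved fc6 features (the array shapes and 4096-dimensional feature size are illustrative assumptions):

import numpy as np
from scipy.spatial import distance
from sklearn import preprocessing

rng = np.random.RandomState(0)
dataset_features = np.maximum(rng.randn(100, 4096), 0)  # stand-in for the saved gallery features
query_features = np.maximum(rng.randn(5, 4096), 0)      # stand-in for the 5 query variants
binarizer = preprocessing.Binarizer().fit(query_features)
query_features = binarizer.transform(query_features)    # 0/1 presence codes, as in main()
dis = distance.cdist(dataset_features, query_features, 'cosine')
inds_all = np.argsort(dis, axis=0)                       # column i: gallery indices ranked for query i
print(inds_all[:10, 0])                                  # top-10 matches for the first query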
def main():
    x, fc6 = initModel()
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    img_names = load_image_names(args.input_data_dir)
    with open(args.output_image_name_file, 'w') as img_names_file:
        for img_name in img_names:
            img_names_file.write(img_name + '\n')
    t = time.time()
    # extract fc6 features in batches
    batch_size = 100
    features = []
    with open(args.output_feature_file, 'wb') as output_file:  # np.save needs a binary file handle
        for i in range(0, int(math.ceil(len(img_names) / (batch_size * 1.0)))):
            print('batch: %d' % i)
            if (i + 1) * batch_size < len(img_names):
                img_names_batch = img_names[i * batch_size:(i + 1) * batch_size]
            else:
                img_names_batch = img_names[i * batch_size:len(img_names)]
            img_batch = load_images(img_names_batch)
            output = sess.run(fc6, feed_dict={x: img_batch})
            features.append(output)
        features = np.vstack(features)
        # binarizer = preprocessing.Binarizer().fit(features)
        # features = binarizer.transform(features)
        np.save(output_file, features)
    # with open('fc6.npy', 'w') as output_file:
    #     for i in range(0, int(math.ceil(len(imgs) / (batch_size * 1.0)))):
    #         print('batch: %d' % i)
    #         if (i + 1) * batch_size < len(imgs):
    #             img_batch = imgs[i * batch_size:(i + 1) * batch_size]
    #         else:
    #             img_batch = imgs[i * batch_size: len(imgs)]
    #         output = sess.run(fc6, feed_dict={x: img_batch})
    #         features.append(output)
    #     features = np.vstack(features)
    #     np.save(output_file, features)
    print(time.time() - t)
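The commented-out binarizer lines above suggest the saved features could also be binarised offline; a minimal sketch of doing that after extraction (the output file name is hypothetical):

import numpy as np
from sklearn import preprocessing

features = np.load('fc6.npy')                        # features written by the extraction script
binarizer = preprocessing.Binarizer().fit(features)
np.save('fc6_binary.npy', binarizer.transform(features))  # hypothetical output path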