def trainPCA(dstFolder, BATCH_SIZE = 50):
ipca = IncrementalPCA(n_components=10, batch_size=BATCH_SIZE)
for batch_isomaps_path in glob.glob(os.path.join(dstFolder, "*_isomaps_centered.p")):
batch_num = int(os.path.basename(batch_isomaps_path).split("_")[0])
isomaps = pickle.load(open(batch_isomaps_path, "rb"))
# remove the illumination and count channel
isomaps_just_color = isomaps[:,:,:,1:3]
#flatten to data
X = isomaps_just_color.reshape(isomaps_just_color.shape[0], -1) # ,isomaps_just_color.shape[3])
ipca.partial_fit(X)
with open(os.path.join(dstFolder, "pca.p"), "wb") as file:
pickle.dump(ipca, file)
python类IncrementalPCA()的实例源码
def fixed_batch_size_comparison(data):
all_features = [i.astype(int) for i in np.linspace(data.shape[1] // 10,
data.shape[1], num=5)]
batch_size = 1000
# Compare runtimes and error for fixed batch size
all_times = defaultdict(list)
all_errors = defaultdict(list)
for n_components in all_features:
pca = PCA(n_components=n_components)
rpca = RandomizedPCA(n_components=n_components, random_state=1999)
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
('ipca', ipca),
('rpca', rpca)]}
for k in sorted(results_dict.keys()):
all_times[k].append(results_dict[k]['time'])
all_errors[k].append(results_dict[k]['error'])
plot_feature_times(all_times, batch_size, all_features, data)
plot_feature_errors(all_errors, batch_size, all_features, data)
def test_incremental_pca():
# Incremental PCA on dense arrays.
X = iris.data
batch_size = X.shape[0] // 3
ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
pca = PCA(n_components=2)
pca.fit_transform(X)
X_transformed = ipca.fit_transform(X)
np.testing.assert_equal(X_transformed.shape, (X.shape[0], 2))
assert_almost_equal(ipca.explained_variance_ratio_.sum(),
pca.explained_variance_ratio_.sum(), 1)
for n_components in [1, 2, X.shape[1]]:
ipca = IncrementalPCA(n_components, batch_size=batch_size)
ipca.fit(X)
cov = ipca.get_covariance()
precision = ipca.get_precision()
assert_array_almost_equal(np.dot(cov, precision),
np.eye(X.shape[1]))
def test_incremental_pca_check_projection():
# Test that the projection of data is correct.
rng = np.random.RandomState(1999)
n, p = 100, 3
X = rng.randn(n, p) * .1
X[:10] += np.array([3, 4, 5])
Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])
# Get the reconstruction of the generated data X
# Note that Xt has the same "components" as X, just separated
# This is what we want to ensure is recreated correctly
Yt = IncrementalPCA(n_components=2).fit(X).transform(Xt)
# Normalize
Yt /= np.sqrt((Yt ** 2).sum())
# Make sure that the first element of Yt is ~1, this means
# the reconstruction worked as expected
assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
def test_incremental_pca_set_params():
# Test that components_ sign is stable over batch sizes.
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 20
X = rng.randn(n_samples, n_features)
X2 = rng.randn(n_samples, n_features)
X3 = rng.randn(n_samples, n_features)
ipca = IncrementalPCA(n_components=20)
ipca.fit(X)
# Decreasing number of components
ipca.set_params(n_components=10)
assert_raises(ValueError, ipca.partial_fit, X2)
# Increasing number of components
ipca.set_params(n_components=15)
assert_raises(ValueError, ipca.partial_fit, X3)
# Returning to original setting
ipca.set_params(n_components=20)
ipca.partial_fit(X)
def test_incremental_pca_partial_fit():
# Test that fit and partial_fit get equivalent results.
rng = np.random.RandomState(1999)
n, p = 50, 3
X = rng.randn(n, p) # spherical data
X[:, 1] *= .00001 # make middle component relatively small
X += [5, 4, 3] # make a large mean
# same check that we can find the original data from the transformed
# signal (since the data is almost of rank n_components)
batch_size = 10
ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X)
pipca = IncrementalPCA(n_components=2, batch_size=batch_size)
# Add one to make sure endpoint is included
batch_itr = np.arange(0, n + 1, batch_size)
for i, j in zip(batch_itr[:-1], batch_itr[1:]):
pipca.partial_fit(X[i:j, :])
assert_almost_equal(ipca.components_, pipca.components_, decimal=3)
def test_whitening():
# Test that PCA and IncrementalPCA transforms match to sign flip.
X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
effective_rank=2, random_state=1999)
prec = 3
n_samples, n_features = X.shape
for nc in [None, 9]:
pca = PCA(whiten=True, n_components=nc).fit(X)
ipca = IncrementalPCA(whiten=True, n_components=nc,
batch_size=250).fit(X)
Xt_pca = pca.transform(X)
Xt_ipca = ipca.transform(X)
assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
Xinv_ipca = ipca.inverse_transform(Xt_ipca)
Xinv_pca = pca.inverse_transform(Xt_pca)
assert_almost_equal(X, Xinv_ipca, decimal=prec)
assert_almost_equal(X, Xinv_pca, decimal=prec)
assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
def compute_reduced_representation(self):
if not self.compute_reduced:
return None
config_score = simple_config.load()["score"]
f_db = os.path.join(
config_score["output_data_directory"],
config_score["document_scores"]["f_db"]
)
h5 = touch_h5(f_db)
g = h5[self.method]
keys = g.keys()
V = np.vstack([g[x]["V"][:] for x in keys])
sizes = [g[x]["_ref"].shape[0] for x in keys]
nc = self.reduced_n_components
clf = IncrementalPCA(n_components=nc)
msg = "Performing PCA on {}, ({})->({})"
print(msg.format(self.method, V.shape[1], nc))
VX = clf.fit_transform(V)
EVR = clf.explained_variance_ratio_
COMPONENTS = clf.components_
for key, size in zip(keys, sizes):
# Take slices equal to the size
vx, VX = VX[:size,:], VX[size:, :]
evr, EVR = EVR[:size], EVR[size:]
com, COMPONENTS = COMPONENTS[:size,:], COMPONENTS[size:, :]
g[key].create_dataset("VX", data=vx, **self.h5py_args)
g[key].create_dataset("VX_explained_variance_ratio_", data=evr)
g[key].create_dataset("VX_components_", data=com)
h5.close()
def plot_feature_times(all_times, batch_size, all_components, data):
plt.figure()
plot_results(all_components, all_times['pca'], label="PCA")
plot_results(all_components, all_times['ipca'],
label="IncrementalPCA, bsize=%i" % batch_size)
plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
plt.legend(loc="upper left")
plt.suptitle("Algorithm runtime vs. n_components\n \
LFW, size %i x %i" % data.shape)
plt.xlabel("Number of components (out of max %i)" % data.shape[1])
plt.ylabel("Time (seconds)")
def plot_feature_errors(all_errors, batch_size, all_components, data):
plt.figure()
plot_results(all_components, all_errors['pca'], label="PCA")
plot_results(all_components, all_errors['ipca'],
label="IncrementalPCA, bsize=%i" % batch_size)
plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
plt.legend(loc="lower left")
plt.suptitle("Algorithm error vs. n_components\n"
"LFW, size %i x %i" % data.shape)
plt.xlabel("Number of components (out of max %i)" % data.shape[1])
plt.ylabel("Mean absolute error")
def plot_batch_times(all_times, n_features, all_batch_sizes, data):
plt.figure()
plot_results(all_batch_sizes, all_times['pca'], label="PCA")
plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
plt.legend(loc="lower left")
plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n \
LFW, size %i x %i" % (
n_features, data.shape[0], data.shape[1]))
plt.xlabel("Batch size")
plt.ylabel("Time (seconds)")
def plot_batch_errors(all_errors, n_features, all_batch_sizes, data):
plt.figure()
plot_results(all_batch_sizes, all_errors['pca'], label="PCA")
plot_results(all_batch_sizes, all_errors['ipca'], label="IncrementalPCA")
plt.legend(loc="lower left")
plt.suptitle("Algorithm error vs. batch_size for n_components %i\n \
LFW, size %i x %i" % (
n_features, data.shape[0], data.shape[1]))
plt.xlabel("Batch size")
plt.ylabel("Mean absolute error")
def test_incremental_pca_inverse():
# Test that the projection of data can be inverted.
rng = np.random.RandomState(1999)
n, p = 50, 3
X = rng.randn(n, p) # spherical data
X[:, 1] *= .00001 # make middle component relatively small
X += [5, 4, 3] # make a large mean
# same check that we can find the original data from the transformed
# signal (since the data is almost of rank n_components)
ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
Y = ipca.transform(X)
Y_inverse = ipca.inverse_transform(Y)
assert_almost_equal(X, Y_inverse, decimal=3)
def test_incremental_pca_validation():
# Test that n_components is >=1 and <= n_features.
X = [[0, 1], [1, 0]]
for n_components in [-1, 0, .99, 3]:
assert_raises(ValueError, IncrementalPCA(n_components,
batch_size=10).fit, X)
def test_incremental_pca_batch_signs():
# Test that components_ sign is stable over batch sizes.
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 3
X = rng.randn(n_samples, n_features)
all_components = []
batch_sizes = np.arange(10, 20)
for batch_size in batch_sizes:
ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
all_components.append(ipca.components_)
for i, j in zip(all_components[:-1], all_components[1:]):
assert_almost_equal(np.sign(i), np.sign(j), decimal=6)
def test_incremental_pca_batch_values():
# Test that components_ values are stable over batch sizes.
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 3
X = rng.randn(n_samples, n_features)
all_components = []
batch_sizes = np.arange(20, 40, 3)
for batch_size in batch_sizes:
ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
all_components.append(ipca.components_)
for i, j in zip(all_components[:-1], all_components[1:]):
assert_almost_equal(i, j, decimal=1)
def test_incremental_pca_against_pca_iris():
# Test that IncrementalPCA and PCA are approximate (to a sign flip).
X = iris.data
Y_pca = PCA(n_components=2).fit_transform(X)
Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X)
assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def test_incremental_pca_against_pca_random_data():
# Test that IncrementalPCA and PCA are approximate (to a sign flip).
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 3
X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features)
Y_pca = PCA(n_components=3).fit_transform(X)
Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X)
assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def new(stop_words=[],decomposition='SVD',n_components=5):
# Prepare vectoriser engines
idf = TfidfVectorizer(
ngram_range=(1,3), #Unigram,bigram,& trigram
stop_words=stop_words
)
# Prepare normaliser
norm = Normalizer(norm='max')
print(colored('Texthasher model created','yellow'))
# Prepare dimensionality reduction
if decomposition and n_components:
if decomposition=='LDA': # Results in Non-negative matrix
reducer = LatentDirichletAllocation( # TFIDF --> Topic term
n_topics=n_components,
max_doc_update_iter=20,
max_iter=8
)
return [idf,norm,reducer]
elif decomposition=='SVD':
reducer = TruncatedSVD( # Best for small dataset,
n_components, # nightmare for large dataset
n_iter=8) # Damn slow
return [idf,norm,reducer]
elif decomposition=='PCA':
# When using IPCA, remember to always keep:
# n_samples > n_components > batch_size
# reducer = IncrementalPCA(n_components)
# Sparse -> Dense greedily consumes large amount of mem
# to_dense = SparseToDense()
# return [idf,norm,to_dense,reducer]
reducer = SparsePCA(n_components)
return [idf,norm,reducer]
return [idf,norm]
else:
return [idf,norm]