import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import RandomizedPCA

def visualize_data(data, labels):
    # Project the data down to 2 components for plotting.
    pca = RandomizedPCA(n_components=2)
    reshaped = pca.fit_transform(data)
    df = pd.DataFrame({'x': reshaped[:, 0], 'y': reshaped[:, 1],
                       'label': np.where(labels == 1, 'Positive',
                                         np.where(labels == 0, 'Neutral',
                                                  'Negative'))})
    colors = ['yellow', 'red', 'blue']
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        plt.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    plt.legend()
    plt.title('PCA Decomposition of Image Data')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.show()
    # plt.savefig('PCA_plot.png')
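Note that RandomizedPCA was deprecated in scikit-learn 0.18 and removed in 0.20; on current releases the same decomposition is exposed through PCA with the randomized SVD solver. A minimal sketch of the drop-in replacement:

from sklearn.decomposition import PCA

# Equivalent to RandomizedPCA(n_components=2) on scikit-learn >= 0.18:
# PCA with the randomized (Halko et al.) SVD solver.
pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
reshaped = pca.fit_transform(data)  # same (n_samples, 2) projection as above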
def plot_clusters_pca(responsibilities, color_groups):
    from sklearn.decomposition import RandomizedPCA
    from itertools import count
    from random import shuffle
    import pylab as pl
    colors = list(colors_dict.values())  # colors_dict: module-level name -> color mapping
    shuffle(colors)
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(responsibilities)
    # print(pca.explained_variance_ratio_, file=stderr)
    pl.figure()
    pl.scatter(X[:, 0], X[:, 1], c="grey", label="unknown")
    for c, sub, i in zip(colors, color_groups, count(0)):
        pl.scatter(X[sub, 0], X[sub, 1], c=c, label=str(i))
    pl.legend()
    pl.title("PCA responsibility matrix")
    pl.show()
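Neither responsibilities, color_groups, nor the module-level colors_dict is shown in this excerpt. A hypothetical call, assuming the responsibilities come from a Gaussian mixture and color_groups holds index arrays for samples with known labels (all names below are illustrative stand-ins):

import numpy as np
from sklearn.mixture import GaussianMixture

colors_dict = {'a': 'red', 'b': 'blue', 'c': 'green'}  # stand-in for the module's mapping

X = np.random.RandomState(0).rand(200, 8)              # toy data
gmm = GaussianMixture(n_components=4, random_state=0).fit(X)
resp = gmm.predict_proba(X)                            # (n_samples, n_components) responsibilities
groups = [np.arange(50), np.arange(50, 100)]           # known-label index groups
plot_clusters_pca(resp, groups)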
def fixed_batch_size_comparison(data):
    all_features = [i.astype(int) for i in np.linspace(data.shape[1] // 10,
                                                       data.shape[1], num=5)]
    batch_size = 1000
    # Compare runtimes and error for fixed batch size
    all_times = defaultdict(list)
    all_errors = defaultdict(list)
    for n_components in all_features:
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('ipca', ipca),
                                                               ('rpca', rpca)]}
        for k in sorted(results_dict.keys()):
            all_times[k].append(results_dict[k]['time'])
            all_errors[k].append(results_dict[k]['error'])
    plot_feature_times(all_times, batch_size, all_features, data)
    plot_feature_errors(all_errors, batch_size, all_features, data)
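The benchmark helper these comparisons rely on is not part of the excerpt. In the scikit-learn IncrementalPCA benchmark this code resembles, it times the fit and measures mean absolute reconstruction error, which matches the 'time' and 'error' keys and the "Mean absolute error" axis label used below. A sketch under that assumption:

import time

def benchmark(estimator, data):
    # Time the fit, then score mean absolute reconstruction error.
    start = time.time()
    estimator.fit(data)
    fit_time = time.time() - start
    reconstruction = estimator.inverse_transform(estimator.transform(data))
    error = np.mean(np.abs(data - reconstruction))
    return {'time': fit_time, 'error': error}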
MLNPCapstone.py, from the project machine-learning-nanodegree-program-capstone by harrylippy.
def pca_analysis(self):
    if not self._use_pca:
        return
    print("done.\n + Using PCA to analyze the data...", end='')
    stdout.flush()  # stdout from `from sys import stdout` at module level
    cols = self._get_columns()
    (X_train, _) = self._train_data
    if not self._pca:
        self._pca = RandomizedPCA(
            n_components=self._pca_max_n,
            whiten=True,
            random_state=42)
    self._pca.fit(X_train)
    # NOTE: plot code adapted from the sklearn example at http://bit.ly/1X8ZsUw
    fig = plt.figure(self._fig_count, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(self._pca.explained_variance_ratio_)
    fig.suptitle('RandomizedPCA Analysis')
    plt.axis('tight')
    plt.xlabel('Component')
    plt.ylabel('Explained Variance Ratio')
    plt.show()
    self._fig_count += 1
    # Reset the PCA object, since we will need to set the exact number
    # of components we want to use if and when we use it again
    self._pca = None
# Train a classifier pipeline that may or may not use PCA or other
# feature selection methods
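The function that comment introduces is not included in the excerpt. A minimal sketch of one way such a pipeline could look with scikit-learn's Pipeline, where the estimator choices are illustrative rather than the project's own:

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Hypothetical pipeline: PCA feature reduction followed by a classifier.
clf = Pipeline([
    ('pca', PCA(n_components=50, whiten=True, random_state=42)),
    ('clf', LogisticRegression(max_iter=1000)),
])
clf.fit(X_train, y_train)  # X_train, y_train as held by the surrounding class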
def plot_feature_times(all_times, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_times['pca'], label="PCA")
    plot_results(all_components, all_times['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
    plt.legend(loc="upper left")
    plt.suptitle("Algorithm runtime vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Time (seconds)")
def plot_feature_errors(all_errors, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_errors['pca'], label="PCA")
    plot_results(all_components, all_errors['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm error vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Mean absolute error")
def plot_batch_times(all_times, n_features, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_times['pca'], label="PCA")
    plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
    plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n"
                 "LFW, size %i x %i" % (n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Time (seconds)")
def variable_batch_size_comparison(data):
    batch_sizes = [i.astype(int) for i in np.linspace(data.shape[0] // 10,
                                                      data.shape[0], num=10)]
    for n_components in [i.astype(int) for i in
                         np.linspace(data.shape[1] // 10,
                                     data.shape[1], num=4)]:
        all_times = defaultdict(list)
        all_errors = defaultdict(list)
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('rpca', rpca)]}
        # Create flat baselines to compare the variation over batch size
        all_times['pca'].extend([results_dict['pca']['time']] *
                                len(batch_sizes))
        all_errors['pca'].extend([results_dict['pca']['error']] *
                                 len(batch_sizes))
        all_times['rpca'].extend([results_dict['rpca']['time']] *
                                 len(batch_sizes))
        all_errors['rpca'].extend([results_dict['rpca']['error']] *
                                  len(batch_sizes))
        for batch_size in batch_sizes:
            ipca = IncrementalPCA(n_components=n_components,
                                  batch_size=batch_size)
            results_dict = {k: benchmark(est, data) for k, est in [('ipca',
                                                                    ipca)]}
            all_times['ipca'].append(results_dict['ipca']['time'])
            all_errors['ipca'].append(results_dict['ipca']['error'])
        plot_batch_times(all_times, n_components, batch_sizes, data)
        # RandomizedPCA error is always worse (approx 100x) than other PCA
        # tests
        plot_batch_errors(all_errors, n_components, batch_sizes, data)
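plot_batch_errors is called above but not shown. By symmetry with plot_batch_times and plot_feature_errors, it presumably plots error against batch size; a sketch under that assumption:

def plot_batch_errors(all_errors, n_features, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_errors['pca'], label="PCA")
    plot_results(all_batch_sizes, all_errors['rpca'], label="RandomizedPCA")
    plot_results(all_batch_sizes, all_errors['ipca'], label="IncrementalPCA")
    plt.legend(loc="upper left")
    plt.suptitle("Algorithm error vs. batch_size for n_components %i\n"
                 "LFW, size %i x %i" % (n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Mean absolute error")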