python类IncrementalPCA()的实例源码-面圈网

isomapPCA.py 文件源码项目：FaceAnalysis 作者: ElliotSalisbury 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def trainPCA(dstFolder, BATCH_SIZE = 50):
    """Fit an IncrementalPCA on pickled isomap batches and pickle the model.

    Every file in ``dstFolder`` matching ``*_isomaps_centered.p`` is loaded,
    reduced to indices 1 and 2 of its last axis, flattened to one row per
    leading-axis entry and fed to ``partial_fit``. The estimator is then
    written to ``pca.p`` inside ``dstFolder``.

    Batches are visited in ascending order of the integer prefix of their
    file name, so repeated runs feed the data in the same order. A matching
    file whose name does not start with ``<int>_`` raises ``ValueError``.

    NOTE(review): batch files are read with ``pickle.load``; only point this
    at folders whose contents are trusted.
    """
    ipca = IncrementalPCA(n_components=10, batch_size=BATCH_SIZE)

    def batch_order(path):
        # Numeric batch prefix first, full path as a tie-breaker.
        return int(os.path.basename(path).split("_")[0]), path

    pattern = os.path.join(dstFolder, "*_isomaps_centered.p")
    for batch_isomaps_path in sorted(glob.glob(pattern), key=batch_order):
        # Close the batch file as soon as it has been unpickled.
        with open(batch_isomaps_path, "rb") as batch_file:
            isomaps = pickle.load(batch_file)

        # Keep indices 1 and 2 of the last axis (per the original note this
        # drops the illumination and count channels -- TODO confirm layout).
        isomaps_just_color = isomaps[:, :, :, 1:3]

        # Flatten every isomap into a single feature row.
        X = isomaps_just_color.reshape(isomaps_just_color.shape[0], -1)

        ipca.partial_fit(X)

    with open(os.path.join(dstFolder, "pca.p"), "wb") as out_file:
        pickle.dump(ipca, out_file)

bench_plot_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def fixed_batch_size_comparison(data):
    """Benchmark PCA, IncrementalPCA and RandomizedPCA at a fixed batch size.

    Five component counts, evenly spaced between a tenth of the feature
    count and the full feature count, are benchmarked. ``benchmark`` is
    expected to return a mapping with ``'time'`` and ``'error'`` entries;
    the collected values are handed to the two feature plots.
    """
    n_columns = data.shape[1]
    component_counts = [
        value.astype(int)
        for value in np.linspace(n_columns // 10, n_columns, num=5)
    ]
    batch_size = 1000

    # Per-estimator series, one entry appended per component count.
    all_times = defaultdict(list)
    all_errors = defaultdict(list)

    for n_components in component_counts:
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)

        results = {}
        for name, estimator in (('pca', pca), ('ipca', ipca), ('rpca', rpca)):
            results[name] = benchmark(estimator, data)

        for name in sorted(results):
            outcome = results[name]
            all_times[name].append(outcome['time'])
            all_errors[name].append(outcome['error'])

    plot_feature_times(all_times, batch_size, component_counts, data)
    plot_feature_errors(all_errors, batch_size, component_counts, data)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def test_incremental_pca():
    # IncrementalPCA on the dense iris array: output shape, explained
    # variance close to PCA, and covariance * precision == identity.
    data = iris.data
    n_rows, n_cols = data.shape[0], data.shape[1]
    chunk = n_rows // 3

    incremental = IncrementalPCA(n_components=2, batch_size=chunk)
    reference = PCA(n_components=2)
    reference.fit_transform(data)

    projected = incremental.fit_transform(data)

    np.testing.assert_equal(projected.shape, (n_rows, 2))
    incremental_ratio = incremental.explained_variance_ratio_.sum()
    reference_ratio = reference.explained_variance_ratio_.sum()
    assert_almost_equal(incremental_ratio, reference_ratio, 1)

    for k in (1, 2, n_cols):
        model = IncrementalPCA(k, batch_size=chunk)
        model.fit(data)
        covariance = model.get_covariance()
        precision = model.get_precision()
        assert_array_almost_equal(np.dot(covariance, precision),
                                  np.eye(n_cols))

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def test_incremental_pca_check_projection():
    # Check that a held-out point projects almost entirely onto the first
    # component.
    rng = np.random.RandomState(1999)
    n_samples, n_features = 100, 3
    shift = np.array([3, 4, 5])

    train = rng.randn(n_samples, n_features) * .1
    train[:10] += shift
    # The held-out point is displaced by the same shift as the first ten
    # training rows.
    held_out = 0.1 * rng.randn(1, n_features) + shift

    model = IncrementalPCA(n_components=2)
    model.fit(train)
    projection = model.transform(held_out)

    # Scale the projection to unit length.
    projection /= np.sqrt((projection ** 2).sum())

    # After normalisation the leading coordinate should be ~1 in magnitude.
    assert_almost_equal(np.abs(projection[0][0]), 1., 1)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def test_incremental_pca_set_params():
    # Test that partial_fit raises ValueError once n_components has been
    # changed via set_params after fitting, and works again when restored.
    rng = np.random.RandomState(1999)
    shape = (100, 20)
    first = rng.randn(*shape)
    second = rng.randn(*shape)
    third = rng.randn(*shape)

    model = IncrementalPCA(n_components=20)
    model.fit(first)

    # First a smaller, then a larger-than-that component count; both differ
    # from the fitted value of 20.
    for changed_count, batch in ((10, second), (15, third)):
        model.set_params(n_components=changed_count)
        assert_raises(ValueError, model.partial_fit, batch)

    # Back to the fitted setting: partial_fit must be accepted.
    model.set_params(n_components=20)
    model.partial_fit(first)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def test_incremental_pca_partial_fit():
    # fit() and a manual sequence of partial_fit() calls over the same
    # batches must produce equivalent components.
    rng = np.random.RandomState(1999)
    n_samples, n_features = 50, 3
    data = rng.randn(n_samples, n_features)  # spherical data
    data[:, 1] *= .00001  # shrink the middle column
    data += [5, 4, 3]  # add a large mean

    chunk = 10
    fitted_whole = IncrementalPCA(n_components=2, batch_size=chunk).fit(data)
    fitted_pieces = IncrementalPCA(n_components=2, batch_size=chunk)

    # Walk the rows in consecutive windows of `chunk` samples.
    for start in range(0, n_samples, chunk):
        fitted_pieces.partial_fit(data[start:start + chunk, :])

    assert_almost_equal(fitted_whole.components_, fitted_pieces.components_,
                        decimal=3)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def test_whitening():
    # Whitened PCA and IncrementalPCA transforms must agree up to sign, and
    # both inverse transforms must recover the input.
    data = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
                                         effective_rank=2, random_state=1999)
    decimals = 3
    for n_comp in (None, 9):
        full = PCA(whiten=True, n_components=n_comp).fit(data)
        incremental = IncrementalPCA(whiten=True, n_components=n_comp,
                                     batch_size=250).fit(data)

        projected_full = full.transform(data)
        projected_inc = incremental.transform(data)
        assert_almost_equal(np.abs(projected_full), np.abs(projected_inc),
                            decimal=decimals)

        restored_inc = incremental.inverse_transform(projected_inc)
        restored_full = full.inverse_transform(projected_full)
        comparisons = (
            (data, restored_inc),
            (data, restored_full),
            (restored_full, restored_inc),
        )
        for expected, actual in comparisons:
            assert_almost_equal(expected, actual, decimal=decimals)

document_scores.py 文件源码项目：word2vec_pipeline 作者: NIHOPA 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def compute_reduced_representation(self):
        """Project the stored document vectors with IncrementalPCA and save them.

        Returns None immediately when ``self.compute_reduced`` is falsy.
        Otherwise the ``V`` datasets of every sub-group of the
        ``self.method`` group in the score database are stacked, projected
        onto ``self.reduced_n_components`` components, and each sub-group
        receives three new datasets:

        * ``VX`` -- that sub-group's rows of the projected matrix (row count
          taken from its ``_ref`` dataset), created with ``self.h5py_args``;
        * ``VX_explained_variance_ratio_`` and ``VX_components_`` -- the
          fitted model's attributes, stored whole. They are indexed by
          component, not by document, so they must not be sliced by the
          per-group row counts.

        The database handle is closed even if an error occurs part-way.
        """
        if not self.compute_reduced:
            return None

        config_score = simple_config.load()["score"]
        f_db = os.path.join(
            config_score["output_data_directory"],
            config_score["document_scores"]["f_db"]
        )

        h5 = touch_h5(f_db)
        try:
            g = h5[self.method]

            # Snapshot the key order once; it is reused to split VX below.
            keys = list(g.keys())
            V = np.vstack([g[x]["V"][:] for x in keys])
            sizes = [g[x]["_ref"].shape[0] for x in keys]

            nc = self.reduced_n_components
            clf = IncrementalPCA(n_components=nc)

            msg = "Performing PCA on {}, ({})->({})"
            print(msg.format(self.method, V.shape[1], nc))

            VX = clf.fit_transform(V)
            EVR = clf.explained_variance_ratio_
            COMPONENTS = clf.components_

            start = 0
            for key, size in zip(keys, sizes):
                stop = start + size

                # Only the projected rows are per-group; the model
                # attributes are shared by every group.
                g[key].create_dataset(
                    "VX", data=VX[start:stop, :], **self.h5py_args)
                g[key].create_dataset(
                    "VX_explained_variance_ratio_", data=EVR)
                g[key].create_dataset("VX_components_", data=COMPONENTS)

                start = stop
        finally:
            h5.close()

bench_plot_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def plot_feature_times(all_times, batch_size, all_components, data):
    """Plot runtime against n_components for the three PCA variants.

    ``all_times`` is indexed with the keys ``'pca'``, ``'ipca'`` and
    ``'rpca'``; each series is drawn against ``all_components`` via
    ``plot_results``. ``batch_size`` appears in the IncrementalPCA legend
    label and ``data.shape`` (two entries) in the title.
    """
    plt.figure()
    plot_results(all_components, all_times['pca'], label="PCA")
    plot_results(all_components, all_times['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
    plt.legend(loc="upper left")
    # Adjacent literals instead of a backslash continuation inside the
    # string, which would embed the source indentation in the title.
    plt.suptitle("Algorithm runtime vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Time (seconds)")

bench_plot_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def plot_feature_errors(all_errors, batch_size, all_components, data):
    """Plot error against n_components for the three PCA variants.

    ``all_errors`` is indexed with the keys ``'pca'``, ``'ipca'`` and
    ``'rpca'``; each series is drawn against ``all_components`` via
    ``plot_results``.
    """
    plt.figure()

    series = (
        ('pca', "PCA"),
        ('ipca', "IncrementalPCA, bsize=%i" % batch_size),
        ('rpca', "RandomizedPCA"),
    )
    for key, legend_label in series:
        plot_results(all_components, all_errors[key], label=legend_label)

    plt.legend(loc="lower left")
    plt.suptitle(
        "Algorithm error vs. n_components\nLFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Mean absolute error")

bench_plot_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def plot_batch_times(all_times, n_features, all_batch_sizes, data):
    """Plot runtime against batch size for the three PCA variants.

    ``all_times`` is indexed with the keys ``'pca'``, ``'rpca'`` and
    ``'ipca'``; each series is drawn against ``all_batch_sizes`` via
    ``plot_results``. ``n_features`` is reported in the title as the
    n_components value, followed by the two dimensions of ``data``.
    """
    plt.figure()
    plot_results(all_batch_sizes, all_times['pca'], label="PCA")
    plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
    plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    # Adjacent literals instead of a backslash continuation inside the
    # string, which would embed the source indentation in the title.
    plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n"
                 "LFW, size %i x %i" % (
                     n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Time (seconds)")

bench_plot_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def plot_batch_errors(all_errors, n_features, all_batch_sizes, data):
    """Plot error against batch size for PCA and IncrementalPCA.

    ``all_errors`` is indexed with the keys ``'pca'`` and ``'ipca'``; each
    series is drawn against ``all_batch_sizes`` via ``plot_results``.
    ``n_features`` is reported in the title as the n_components value,
    followed by the two dimensions of ``data``.
    """
    plt.figure()
    plot_results(all_batch_sizes, all_errors['pca'], label="PCA")
    plot_results(all_batch_sizes, all_errors['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    # Adjacent literals instead of a backslash continuation inside the
    # string, which would embed the source indentation in the title.
    plt.suptitle("Algorithm error vs. batch_size for n_components %i\n"
                 "LFW, size %i x %i" % (
                     n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Mean absolute error")

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def test_incremental_pca_inverse():
    # Round trip: transform followed by inverse_transform must give back
    # the input to three decimals.
    rng = np.random.RandomState(1999)
    n_samples, n_features = 50, 3
    data = rng.randn(n_samples, n_features)  # spherical data
    data[:, 1] *= .00001  # shrink the middle column
    data += [5, 4, 3]  # add a large mean

    # With one column nearly constant the data is almost of rank 2, so two
    # components are enough to reconstruct it.
    model = IncrementalPCA(n_components=2, batch_size=10)
    model.fit(data)
    restored = model.inverse_transform(model.transform(data))
    assert_almost_equal(data, restored, decimal=3)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def test_incremental_pca_validation():
    # fit() must raise ValueError for n_components below 1 or above the
    # number of features (2 here).
    samples = [[0, 1], [1, 0]]
    for bad_count in (-1, 0, .99, 3):
        estimator = IncrementalPCA(bad_count, batch_size=10)
        assert_raises(ValueError, estimator.fit, samples)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def test_incremental_pca_batch_signs():
    # The signs of components_ must not change between neighbouring batch
    # sizes.
    rng = np.random.RandomState(1999)
    data = rng.randn(100, 3)

    fitted_components = [
        IncrementalPCA(n_components=None, batch_size=size).fit(data).components_
        for size in np.arange(10, 20)
    ]

    # Compare each fit with the one for the next batch size.
    for current, following in zip(fitted_components, fitted_components[1:]):
        assert_almost_equal(np.sign(current), np.sign(following), decimal=6)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def test_incremental_pca_batch_values():
    # The values of components_ must stay close (one decimal) between
    # neighbouring batch sizes.
    rng = np.random.RandomState(1999)
    data = rng.randn(100, 3)

    fitted_components = [
        IncrementalPCA(n_components=None, batch_size=size).fit(data).components_
        for size in np.arange(20, 40, 3)
    ]

    # Compare each fit with the one for the next batch size.
    for current, following in zip(fitted_components, fitted_components[1:]):
        assert_almost_equal(current, following, decimal=1)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def test_incremental_pca_against_pca_iris():
    # On iris, IncrementalPCA and PCA projections must match to one decimal
    # once signs are ignored.
    data = iris.data

    reference = PCA(n_components=2)
    projected_reference = reference.fit_transform(data)
    incremental = IncrementalPCA(n_components=2, batch_size=25)
    projected_incremental = incremental.fit_transform(data)

    assert_almost_equal(np.abs(projected_reference),
                        np.abs(projected_incremental), 1)

test_incremental_pca.py 文件源码项目：Parallel-SGD 作者: angadgill 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def test_incremental_pca_against_pca_random_data():
    # On shifted Gaussian data, IncrementalPCA and PCA projections must
    # match to one decimal once signs are ignored.
    rng = np.random.RandomState(1999)
    n_samples, n_features = 100, 3

    # Draw the noise before the offset so the random stream is consumed in
    # the same order as a single left-to-right expression.
    noise = rng.randn(n_samples, n_features)
    offset = 5 * rng.rand(1, n_features)
    data = noise + offset

    reference = PCA(n_components=3)
    projected_reference = reference.fit_transform(data)
    incremental = IncrementalPCA(n_components=3, batch_size=25)
    projected_incremental = incremental.fit_transform(data)

    assert_almost_equal(np.abs(projected_reference),
                        np.abs(projected_incremental), 1)

texthasher.py 文件源码项目：pantip-libr 作者: starcolon 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def new(stop_words=None, decomposition='SVD', n_components=5):
  """Build the list of stages of a text-hashing model.

  The list always starts with a ``TfidfVectorizer`` (unigrams to trigrams,
  using ``stop_words``) followed by a max-norm ``Normalizer``. When both
  ``decomposition`` and ``n_components`` are truthy and ``decomposition`` is
  one of ``'LDA'``, ``'SVD'`` or ``'PCA'``, the matching reducer is appended.
  Any other ``decomposition`` value yields just the first two stages.

  ``stop_words`` defaults to no stop words. A fresh list is created per call
  so vectorizers built with the default never share one list object.
  """
  if stop_words is None:
    stop_words = []

  # Prepare vectoriser engine
  idf = TfidfVectorizer(
    ngram_range=(1,3), # Unigram, bigram & trigram
    stop_words=stop_words
  )

  # Prepare normaliser
  norm = Normalizer(norm='max')

  print(colored('Texthasher model created','yellow'))

  stages = [idf, norm]

  # No dimensionality reduction requested
  if not (decomposition and n_components):
    return stages

  if decomposition == 'LDA': # Results in a non-negative matrix
    reducer = LatentDirichletAllocation( # TFIDF --> topic term
      n_topics=n_components,
      max_doc_update_iter=20,
      max_iter=8
    )
  elif decomposition == 'SVD':
    # Per the original author: fine for small datasets, very slow on
    # large ones.
    reducer = TruncatedSVD(n_components, n_iter=8)
  elif decomposition == 'PCA':
    # Per the original notes, IncrementalPCA was considered here but needs
    # a sparse -> dense conversion that consumes a large amount of memory,
    # so SparsePCA is used instead.
    reducer = SparsePCA(n_components)
  else:
    # Unrecognised decomposition: deliberately fall back to no reducer.
    return stages

  stages.append(reducer)
  return stages