def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print("Loading embeddings for year", year)
        # sleep a random interval (up to two minutes), presumably to stagger I/O across workers
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print(len(valid_words))
        words = list(valid_words.intersection(words[year][:num_words]))
        print(len(words))
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(words, restrict_context=True)
        print("SVD for year", year)
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print("Saving year", year)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-u.npy", u)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-v.npy", v)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-s.npy", s)
        write_pickle(base_embed.iw, (out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-vocab.pkl")
Example source code for Python randomized_svd()
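All of the snippets on this page use sklearn.utils.extmath.randomized_svd, which computes an approximate truncated SVD via randomized projections. A minimal self-contained example of the call shared by every snippet below:

import numpy as np
from sklearn.utils.extmath import randomized_svd

M = np.random.RandomState(0).randn(100, 50)
U, s, Vt = randomized_svd(M, n_components=5, n_iter=5, random_state=0)
# U: (100, 5), s: (5,), Vt: (5, 50); U @ np.diag(s) @ Vt approximates M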
def ksvd(Y, D, X, n_cycles=1, verbose=True):
    n_atoms = D.shape[1]
    n_features, n_samples = Y.shape
    unused_atoms = []
    R = Y - np.dot(D, X)  # residual (fast_dot was removed from recent sklearn; np.dot is equivalent)
    for c in range(n_cycles):
        for k in range(n_atoms):
            if verbose:
                sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100))
                sys.stdout.flush()
            # find all the datapoints that use the kth atom
            omega_k = X[k, :] != 0
            if not np.any(omega_k):
                unused_atoms.append(k)
                continue
            # the residual due to all the other atoms but k
            Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k])
            U, S, V = randomized_svd(Rk, n_components=1, n_iter=10, flip_sign=False)
            D[:, k] = U[:, 0]
            X[k, omega_k] = V[0, :] * S[0]
            # update the residual
            R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k])
    print("")
    return D, X, unused_atoms
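A minimal usage sketch for the dictionary update above (the shapes and the toy sparse codes are assumptions for illustration; in practice X would come from a sparse-coding step such as OMP):

import numpy as np

rng = np.random.RandomState(0)
n_features, n_atoms, n_samples = 64, 32, 500
Y = rng.randn(n_features, n_samples)            # one sample per column
D = rng.randn(n_features, n_atoms)
D /= np.linalg.norm(D, axis=0)                  # unit-norm atoms
X = np.zeros((n_atoms, n_samples))
X[rng.randint(0, n_atoms, n_samples), np.arange(n_samples)] = 1.0  # toy codes
D, X, unused = ksvd(Y, D, X, n_cycles=1)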
Source file: sparse_to_dense.py — project: Class_Evaluation_Summarization, author: arunrm87
def sparse_dense(summary):
    text_copy = copy.deepcopy(summary)
    # Find a suitable value for the hyperparameter: some fixed value like 0.5,
    # or a heuristic such as (rank of the original matrix / 10), or
    # (max singular value of the original matrix / 20).
    _, s, _ = randomized_svd(summary, 1, n_iter=5)
    hyperparameter = s[0] / 50
    term_document_matrix_rank = np.linalg.matrix_rank(summary)
    iterations = int(term_document_matrix_rank / 10)
    A_new = dense(text_copy, hyperparameter, 0.02, iterations)
    return A_new
def _init_svd(self, dictionary, definitions):
    self.td_matrix = lil_matrix((len(dictionary), self.n_terms))
    for i, defn in enumerate(definitions):
        if i % 100 == 0:
            print("Building term-document matrix: {} / {}".format(i, len(dictionary)), end="\r")
        self.td_matrix[i, :] = self.compute_freq_vec(dictionary[defn])
    self.td_matrix = self.td_matrix.transpose().tocsr()
    print()
    for i in range(self.n_terms):
        n = float(self.td_matrix[i, :].getnnz())
        if i % 100 == 0:
            print("Applying tf-idf: {} / {}".format(i, self.n_terms), end="\r")
        if n > 0:
            self.td_matrix[i, :] *= np.log(len(dictionary) / n)
    print()
    print("Performing rank reduction...")
    self.u, self.s, self.vt = randomized_svd(self.td_matrix, 50, transpose=False)
    self.doc_matrix = np.matmul(np.diag(self.s), self.vt).transpose()
def svd_timing(X, n_comps, n_iter, n_oversamples,
               power_iteration_normalizer='auto', method=None):
    """
    Measure time for decomposition
    """
    print("... running SVD ...")
    if method != 'fbpca':  # `is not` compares identity, not string equality
        gc.collect()
        t0 = time()
        U, mu, V = randomized_svd(X, n_comps, n_oversamples, n_iter,
                                  power_iteration_normalizer,
                                  random_state=random_state, transpose=False)
        call_time = time() - t0
    else:
        gc.collect()
        t0 = time()
        # There is a different convention for l here
        U, mu, V = fbpca.pca(X, n_comps, raw=True, n_iter=n_iter,
                             l=n_oversamples + n_comps)
        call_time = time() - t0
    return U, mu, V, call_time
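A hedged invocation sketch comparing the two backends on the same matrix; it assumes fbpca is installed and that a module-level random_state is defined, as the function body implies:

import numpy as np

X = np.random.RandomState(0).randn(2000, 500)
U, s, V, t_sk = svd_timing(X, n_comps=20, n_iter=4, n_oversamples=10)
U, s, V, t_fb = svd_timing(X, n_comps=20, n_iter=4, n_oversamples=10, method='fbpca')
print("sklearn: %.3fs, fbpca: %.3fs" % (t_sk, t_fb))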
def test_svd(eng):
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)
    from sklearn.utils.extmath import randomized_svd
    u1, s1, v1 = randomized_svd(x.toarray(), n_components=2, random_state=0)
    u2, s2, v2 = SVD(k=2, method='direct').fit(x)
    assert allclose_sign(u1, u2)
    assert allclose(s1, s2)
    assert allclose_sign(v1.T, v2.T)
    u2, s2, v2 = SVD(k=2, method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(u1, u2, atol=tol)
    assert allclose(s1, s2, atol=tol)
    assert allclose_sign(v1.T, v2.T, atol=tol)
def gsvd(X, M, A, n_comps=10):
    """
    Generalized SVD
    :param X: data matrix to decompose
    :param M: row weight (metric) matrix
    :param A: column weight (metric) matrix
    :param n_comps: number of components to keep
    :return: factor scores P and Q, singular values D, eigenvalues ev
    """
    print("GSVD")
    print("GSVD: Weights... ", end='')
    Xw = np.dot(np.sqrt(M), np.dot(X, np.sqrt(A)))
    print("Done!")
    print("GSVD: SVD... ", end='')
    [P_, D, Q_] = randomized_svd(Xw, n_comps)
    #P_ = P_[:,0:n_comps]
    #D = D[0:n_comps]
    #Q_ = Q_[0:n_comps,:]
    print('Done!')
    print("GSVD: Factor scores and eigenvalues... ", end='')
    Mp = np.power(np.diag(M), -0.5)
    Ap = np.power(np.diag(A), -0.5)
    P = np.dot(np.diag(Mp), P_)
    Q = np.dot(np.diag(Ap), Q_.T)
    ev = np.power(D, 2)
    print('Done!')
    return P, D, Q, ev
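A quick sanity check (my own sketch, not part of the project): with identity weight matrices the generalized SVD reduces to an ordinary truncated SVD, so P and Q are the usual singular vectors and ev = D**2:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(20, 10)
M = np.eye(20)   # row metric
A = np.eye(10)   # column metric
P, D, Q, ev = gsvd(X, M, A, n_comps=3)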
def randomizedSVD(self):
    # http://scikit-learn.org/stable/modules/decomposition.html#truncated-singular-value-decomposition-and-latent-semantic-analysis
    # http://stackoverflow.com/questions/31523575/get-u-sigma-v-matrix-from-truncated-svd-in-scikit-learn
    U, S, V = randomized_svd(self.bag_of_words_matrix.T,
                             n_components=self.dimensions,
                             n_iter=5,
                             random_state=None)
    self.U = U
    self.S = S
    self.V = V
    self.tokens_representation = np.matrix(U) * np.diag(S)
    self.documents_representation = (np.diag(S) * np.matrix(V)).T
def _svd(self, X, max_rank=None):
    if max_rank:
        # if we have a max rank then perform the faster randomized SVD
        return randomized_svd(
            X,
            max_rank,
            n_iter=self.n_power_iterations)
    else:
        # perform a full (non-truncated) SVD using LAPACK
        return np.linalg.svd(
            X,
            full_matrices=False,
            compute_uv=True)
def __init__(self, X, kern, M):
    super(SVD, self).__init__("SVD")
    start = time.time()
    self.X = X
    self.kern = kern
    K = kern.K(X, X)
    N = np.shape(X)[0]
    #(self.U, self.Sigma, self.VT) = fb.pca(K, M)#, n_iter=1, l=M)
    self.U, self.Sigma, self.VT = randomized_svd(K, M)
    self.precon = np.dot(self.U, np.dot(np.diag(self.Sigma), self.VT)) + self.kern.noise * np.identity(N)
    self.duration = time.time() - start
def apply_uv_decomposition(self):
    U, Sigma, VT = randomized_svd(self.behaviour_matrix,
                                  n_components=15,
                                  n_iter=10,
                                  random_state=None)
    print(U.shape)
    print(VT.shape)
    self.X_hat = np.dot(U, VT)  # note: Sigma is dropped here, so this is not U @ diag(Sigma) @ VT
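As the trailing comment notes, Sigma is discarded above, so X_hat is not the usual rank-15 approximation of the behaviour matrix. The standard reconstruction (an alternative sketch, not the author's code) would keep the singular values:

X_hat = np.dot(U, np.dot(np.diag(Sigma), VT))  # U @ diag(Sigma) @ VT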
def fit(self, train_input, train):
    U, sigma, VT = randomized_svd(train, self.nfactor)
    sigma = scipy.sparse.diags(sigma, 0)
    self.U = U * sigma  # fold the singular values into the left factors
    self.V = VT.T
def compute_bench(samples_range, features_range, n_iter=3, rank=50):
    it = 0
    results = defaultdict(lambda: [])
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples, n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)
            gc.collect()
            print("benchmarking scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)
            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=0")
            tstart = time()
            randomized_svd(X, rank, n_iter=0)
            results['scikit-learn randomized_svd (n_iter=0)'].append(
                time() - tstart)
            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=%d "
                  % n_iter)
            tstart = time()
            randomized_svd(X, rank, n_iter=n_iter)
            results['scikit-learn randomized_svd (n_iter=%d)'
                    % n_iter].append(time() - tstart)
    return results
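A hedged invocation sketch (the ranges are arbitrary choices for illustration):

results = compute_bench(samples_range=[1000, 2000], features_range=[1000, 2000])
for name, times in results.items():
    print(name, times)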
def test_randomized_svd_low_rank():
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))
    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        assert_equal(Ua.shape, (n_samples, k))
        assert_equal(sa.shape, (k,))
        assert_equal(Va.shape, (k, n_features))
        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa)
        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))
        # check the sparse matrix representation
        X = sparse.csr_matrix(X)
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        assert_almost_equal(s[:rank], sa[:rank])
def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    # generate a matrix X with structured approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.1,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))
    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)
        # the approximation does not tolerate the noise:
        assert_greater(np.abs(s[:k] - sa).max(), 0.01)
        # compute the singular values of X using the fast approximate
        # method with the iterated power method
        _, sap, _ = randomized_svd(X, k,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)
        # the iterated power method helps get rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    # let us try again without the 'low_rank component': just regularly but
    # slowly decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))
    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer)
        # the approximation does not tolerate the noise:
        assert_greater(np.abs(s[:k] - sa).max(), 0.1)
        # compute the singular values of X using the fast approximate method
        # with the iterated power method
        _, sap, _ = randomized_svd(X, k, n_iter=5,
                                   power_iteration_normalizer=normalizer)
        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalizer='none' diverges for
    # a large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50
    # Check that it diverges with many (non-normalized) power iterations
    U, s, V = randomized_svd(X, n_components, n_iter=2,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_2 = linalg.norm(A, ord='fro')
    U, s, V = randomized_svd(X, n_components, n_iter=20,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_20 = linalg.norm(A, ord='fro')
    assert_greater(np.abs(error_2 - error_20), 100)
    for normalizer in ['LU', 'QR', 'auto']:
        U, s, V = randomized_svd(X, n_components, n_iter=2,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        error_2 = linalg.norm(A, ord='fro')
        for i in [5, 10, 50]:
            U, s, V = randomized_svd(X, n_components, n_iter=i,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            A = X - U.dot(np.diag(s).dot(V))
            error = linalg.norm(A, ord='fro')
            assert_greater(15, np.abs(error_2 - error))
def test_randomized_svd_sign_flip():
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        assert_almost_equal(u1, u2)
        assert_almost_equal(v1, v2)
        assert_almost_equal(np.dot(u2 * s2, v2), a)
        assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
        assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
def test_randomized_svd_sign_flip_with_transpose():
    # Check if the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        returns bool tuple indicating if the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)
    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert_true(u_based)
    assert_false(v_based)
    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True)
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose)
    assert_true(u_based)
    assert_false(v_based)
def sv_thresh(X, t, k):
    m, n = X.shape
    U, s, V = randomized_svd(X, k)  # pca(X, raw=True, k=25)
    # Number of singular values greater than `t`
    greater_sv = np.sum(s > t)
    s = soft_thresh(s, t)
    S = np.diag(s)
    ret = np.dot(U, np.dot(S, V))
    assert ret.shape == X.shape
    return ret, greater_sv
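sv_thresh calls a soft_thresh helper that is not shown on this page. A standard definition (my assumption of what the project uses) is elementwise soft-thresholding, the proximal operator of the L1 norm:

import numpy as np

def soft_thresh(x, t):
    # shrink each entry toward zero by t, clipping at zero
    return np.sign(x) * np.maximum(np.abs(x) - t, 0.0)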
def _fit_local(self, mat):
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(mat, n_components=self.k, n_iter=self.max_iter, random_state=self.seed)
    return U, S, V
def nn_ksvd(Y, D, X, n_cycles=1, verbose=True):
    # the non-negative variant
    n_atoms = D.shape[1]
    n_features, n_samples = Y.shape
    unused_atoms = []
    R = Y - np.dot(D, X)  # residual (fast_dot was removed from recent sklearn; np.dot is equivalent)
    for k in range(n_atoms):
        if verbose:
            sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100))
            sys.stdout.flush()
        # find all the datapoints that use the kth atom
        omega_k = X[k, :] != 0
        if not np.any(omega_k):
            unused_atoms.append(k)
            continue
        # the residual due to all the other atoms but k
        Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k])
        try:
            U, S, V = randomized_svd(Rk, n_components=1, n_iter=50, flip_sign=False)
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            warnings.warn('SVD error')
            continue
        d = U[:, 0]
        x = V[0, :] * S[0]
        # projection to the constraint set
        d[d < 0] = 0
        x[x < 0] = 0
        dTd = np.dot(d, d)
        xTx = np.dot(x, x)
        if dTd <= np.finfo('float').eps or xTx <= np.finfo('float').eps:
            continue
        # alternating non-negative refinement of the rank-1 pair (d, x)
        for j in range(n_cycles):
            d = np.dot(Rk, x) / np.dot(x, x)
            d[d < 0] = 0
            x = np.dot(d.T, Rk) / np.dot(d, d)
            x[x < 0] = 0
        _norm = norm(d)
        d = d / _norm
        x = x * _norm
        D[:, k] = d
        X[k, omega_k] = x
        # update the residual
        R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k])
    print("")
    return D, X, unused_atoms