import numpy as np
import scipy.sparse
from scipy.sparse import csc_matrix
from tqdm import tqdm

def theta_matrix(coord, adj, preload=True, train=True):
    print("creating adjacent theta matrix ...")
    if preload is True:
        if train is True:
            theta_matrix = np.load('../data/theta_matrix_train_n_100.npy')
        else:
            theta_matrix = np.load('../data/theta_matrix_test_n_100.npy')
    else:
        theta_matrix = []
        for i in tqdm(range(coord.shape[0])):
            for j in range(coord.shape[1]):
                # angle() is a helper defined elsewhere in the project; it returns one
                # angle per neighbour of node j, given the coordinate differences
                theta_row = angle(coord[i, adj[i][j].nonzero()[1], :] - coord[i, j, :])
                col_indice = adj[i][j].nonzero()[1]
                row_indice = np.zeros(col_indice.shape[0]).astype(np.int32)
                if j == 0:
                    theta_matrix_tmp = csc_matrix((theta_row, (row_indice, col_indice)), shape=(1, coord.shape[1]))
                else:
                    theta_matrix_tmp = scipy.sparse.vstack((theta_matrix_tmp, csc_matrix((theta_row, (row_indice, col_indice)), shape=(1, coord.shape[1]))))
            theta_matrix.append(theta_matrix_tmp)
        theta_matrix = np.array(theta_matrix)
    return theta_matrix
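The inner step above packs one node's neighbour angles into a 1 x n sparse row; a minimal standalone sketch of that csc_matrix call, with made-up shapes and values:

import numpy as np
from scipy.sparse import csc_matrix

n_nodes = 6                                  # hypothetical number of nodes per sample
col_indice = np.array([1, 3, 4])             # columns of the neighbours of node j
theta_row = np.array([0.5, -1.2, 2.7])       # one angle per neighbour
row_indice = np.zeros(col_indice.shape[0], dtype=np.int32)

row = csc_matrix((theta_row, (row_indice, col_indice)), shape=(1, n_nodes))
print(row.toarray())   # [[ 0.   0.5  0.  -1.2  2.7  0. ]]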
def iter_chunks(self, chunksize=None):
    """
    Iteratively yield the index as chunks of documents, each of size <= chunksize.
    The chunk is returned in its raw form (matrix or sparse matrix slice).
    The size of the chunk may be smaller than requested; it is up to the caller
    to check the result for real length, using `chunk.shape[0]`.
    """
    self.close_shard()
    if chunksize is None:
        # if not explicitly specified, use the chunksize from the constructor
        chunksize = self.chunksize
    for shard in self.shards:
        query = shard.get_index().index
        for chunk_start in xrange(0, query.shape[0], chunksize):
            # scipy.sparse doesn't allow slicing beyond real size of the matrix
            # (unlike numpy). so, clip the end of the chunk explicitly to make
            # scipy.sparse happy
            chunk_end = min(query.shape[0], chunk_start + chunksize)
            chunk = query[chunk_start: chunk_end]  # create a view
            yield chunk
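A standalone sketch of the chunking pattern, with an arbitrary CSR matrix standing in for a shard's index; the end of each slice is clipped to the matrix's real size exactly as in the loop above:

import scipy.sparse

index = scipy.sparse.random(10, 4, density=0.3, format='csr', random_state=0)
chunksize = 3
for chunk_start in range(0, index.shape[0], chunksize):
    chunk_end = min(index.shape[0], chunk_start + chunksize)
    chunk = index[chunk_start:chunk_end]
    print(chunk_start, chunk.shape[0])   # the last chunk has only 1 row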
def close_shard(self):
    """
    Force the latest shard to close (be converted to a matrix and stored
    to disk). Do nothing if no new documents added since last call.
    **NOTE**: the shard is closed even if it is not full yet (its size is smaller
    than `self.shardsize`). If documents are added later via `add_documents()`,
    this incomplete shard will be loaded again and completed.
    """
    if not self.fresh_docs:
        return
    shardid = len(self.shards)
    # consider the shard sparse if its density is < 30%
    issparse = 0.3 > 1.0 * self.fresh_nnz / (len(self.fresh_docs) * self.num_features)
    if issparse:
        index = SparseMatrixSimilarity(self.fresh_docs, num_terms=self.num_features,
                                       num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz)
    else:
        index = MatrixSimilarity(self.fresh_docs, num_features=self.num_features)
    logger.info("creating %s shard #%s" % ('sparse' if issparse else 'dense', shardid))
    shard = Shard(self.shardid2filename(shardid), index)
    shard.num_best = self.num_best
    shard.num_nnz = self.fresh_nnz
    self.shards.append(shard)
    self.fresh_docs, self.fresh_nnz = [], 0
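The sparse-or-dense decision above is just a density threshold; with made-up numbers:

fresh_nnz = 12000          # hypothetical non-zero count in the pending shard
num_docs = 500
num_features = 400

density = 1.0 * fresh_nnz / (num_docs * num_features)    # 0.06
issparse = density < 0.3
print(density, 'sparse' if issparse else 'dense')         # 0.06 sparse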
def get_contexts_rank(targets, cooc_mat, target_index):
    """
    Return a dictionary in which each key is a target word and the value is a list of
    context columns sorted in descending order of co-occurrence value
    :param targets: the target words
    :param cooc_mat: the co-occurrence matrix (targets x contexts)
    :param target_index: a map from target word to its row index in cooc_mat
    :return: the contexts_rank dictionary
    """
    contexts_rank = {}
    for target in targets:
        index = target_index.get(target, -1)
        if index == -1:
            # out-of-vocabulary target: no ranked contexts
            contexts_rank[target] = []
            continue
        row = cooc_mat[index, :]
        contexts_rank[target] = sort_by_value_get_col(scipy.sparse.coo_matrix(row.mat))  # tuples of (row, col, value)
    return contexts_rank
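sort_by_value_get_col is defined elsewhere in the project; a plausible stand-in, assuming it returns the column indices of a COO row sorted by descending value:

import numpy as np
import scipy.sparse

def sort_by_value_get_col(coo_row):
    # sort the non-zero entries by value, largest first, and return their columns
    order = np.argsort(coo_row.data)[::-1]
    return list(coo_row.col[order])

row = scipy.sparse.coo_matrix(np.array([[0.0, 3.0, 0.5, 7.0]]))
print(sort_by_value_get_col(row))   # [3, 1, 2]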
def APSyn(x_row, y_row, N):
    r"""
    APSyn(x, y) = \sum_{f \in N(f_x) \cap N(f_y)} \frac{1}{(rank_x(f) + rank_y(f)) / 2}
    :param x_row: the co-occurrence row of the target word x
    :param y_row: the co-occurrence row of the target word y
    :param N: the number of top contexts to consider
    :return: the APSyn similarity score
    """
    # Sort y's contexts
    y_contexts_cols = sort_by_value_get_col(scipy.sparse.coo_matrix(y_row.mat))  # tuples of (row, col, value)
    y_contexts_cols = y_contexts_cols[:N]
    y_context_rank = {c: i + 1 for i, c in enumerate(y_contexts_cols)}
    # Sort x's contexts
    x_contexts_cols = sort_by_value_get_col(scipy.sparse.coo_matrix(x_row.mat))
    x_contexts_cols = x_contexts_cols[:N]
    x_context_rank = {c: i + 1 for i, c in enumerate(x_contexts_cols)}
    # Sum of 1/((rank(w1)+rank(w2))/2) for every intersected feature among the top N contexts
    intersected_context = set(y_contexts_cols).intersection(set(x_contexts_cols))
    score = sum([1.0 / ((x_context_rank[c] + y_context_rank[c]) / 2.0) for c in intersected_context])
    # score *= (1.0 / N)
    return score
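A tiny worked example of the score with N=3: if x ranks its top contexts a, b, c (ranks 1, 2, 3) and y ranks b, a, d, the shared contexts are a and b, so the score is 1/((1+2)/2) + 1/((2+1)/2) ≈ 1.33:

x_context_rank = {'a': 1, 'b': 2, 'c': 3}   # hypothetical top-3 contexts of x
y_context_rank = {'b': 1, 'a': 2, 'd': 3}   # hypothetical top-3 contexts of y
shared = set(x_context_rank) & set(y_context_rank)
score = sum(1.0 / ((x_context_rank[c] + y_context_rank[c]) / 2.0) for c in shared)
print(score)   # 1.333...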
def make_G_matrix(T, g):
    '''Create a matrix of autoregression to enforce indicator dynamics

    Inputs:
    T: positive integer
        number of time-bins
    g: np.ndarray, vector p x 1
        discrete time constants

    Output:
    G: sparse diagonal matrix
        matrix of autoregression
    '''
    if type(g) is np.ndarray:
        if len(g) == 1 and g < 0:
            g = 0
        # gs=np.matrix(np.hstack((-np.flipud(g[:]).T,1)))
        gs = np.matrix(np.hstack((1, -(g[:]).T)))
        ones_ = np.matrix(np.ones((T, 1)))
        G = spdiags((ones_ * gs).T, range(0, -len(g) - 1, -1), T, T)
        return G
    else:
        raise Exception('g must be an array')
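To see the structure G takes, here is the same spdiags construction for T=5 and a single illustrative AR coefficient g=[0.5]: ones on the main diagonal and -0.5 on the first sub-diagonal, so G.dot(c) computes c[t] - 0.5*c[t-1].

import numpy as np
from scipy.sparse import spdiags

T, g = 5, np.array([0.5])
gs = np.matrix(np.hstack((1, -g[:].T)))
ones_ = np.matrix(np.ones((T, 1)))
G = spdiags((ones_ * gs).T, range(0, -len(g) - 1, -1), T, T)
print(G.toarray())
# [[ 1.   0.   0.   0.   0. ]
#  [-0.5  1.   0.   0.   0. ]
#  [ 0.  -0.5  1.   0.   0. ]
#  [ 0.   0.  -0.5  1.   0. ]
#  [ 0.   0.   0.  -0.5  1. ]]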
#%%
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
    # choosing the particular flavor of vectorizer
    if method == 'counts':
        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
    elif method == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')
    # fitting the vectorizer and converting the counts to an array
    full_fit = vectorizer.fit_transform(df[x_name])
    full_counts = full_fit.toarray()
    self.vocabulary_ = vectorizer.vocabulary_
    # passing the attributes up to the class instance
    self.data = df
    if sparse:
        full_counts = csr_matrix(full_counts)
    self.X = full_counts
    if y_name is not None:
        self.y = np.array(df[y_name])
    return

# splits the data into training and test sets; either called from process()
# or on its own when your text is already vectorized and divided into x and y
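A minimal standalone sketch of the vectorizing step on a toy DataFrame (the surrounding class isn't shown here, so this just exercises the same CountVectorizer call):

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer

df = pd.DataFrame({'text': ['the cat sat', 'the cat ran', 'a dog barked'],
                   'label': [0, 0, 1]})
vectorizer = CountVectorizer(max_features=35000, ngram_range=(1, 2),
                             decode_error='replace', binary=True)
X = csr_matrix(vectorizer.fit_transform(df['text']).toarray())
y = np.array(df['label'])
print(X.shape, len(vectorizer.vocabulary_))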
From scipy_sparse.py in the project PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ),
                          sort_labels=False):
    """ Convert a SparseSeries to a scipy.sparse.coo_matrix using index
    levels row_levels, column_levels as the row and column
    labels respectively. Returns the sparse_matrix, row and column labels.
    """
    import scipy.sparse

    if ss.index.nlevels < 2:
        raise ValueError('to_coo requires MultiIndex with nlevels >= 2')
    if not ss.index.is_unique:
        raise ValueError('Duplicate index entries are not allowed in to_coo '
                         'transformation.')

    # to keep things simple, only rely on integer indexing (not labels)
    row_levels = [ss.index._get_level_number(x) for x in row_levels]
    column_levels = [ss.index._get_level_number(x) for x in column_levels]

    v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels,
                                     column_levels=column_levels,
                                     sort_labels=sort_labels)
    sparse_matrix = scipy.sparse.coo_matrix(
        (v, (i, j)), shape=(len(rows), len(columns)))
    return sparse_matrix, rows, columns
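The same idea stripped of the pandas internals: factorize two MultiIndex levels into row and column codes and hand them to coo_matrix (a rough sketch, not the library's own _to_ijv):

import pandas as pd
import scipy.sparse

s = pd.Series([3.0, 1.0, 2.0],
              index=pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'x')]))
i, rows = pd.factorize(s.index.get_level_values(0))
j, cols = pd.factorize(s.index.get_level_values(1))
m = scipy.sparse.coo_matrix((s.values, (i, j)), shape=(len(rows), len(cols)))
print(m.toarray())   # [[3. 1.] [2. 0.]]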
From testing.py in the same project (PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, SignalMedia):
def _create_missing_idx(nrows, ncols, density, random_state=None):
    if random_state is None:
        random_state = np.random
    else:
        random_state = np.random.RandomState(random_state)

    # below is cribbed from scipy.sparse
    size = int(np.round((1 - density) * nrows * ncols))
    # generate a few more to ensure unique values
    min_rows = 5
    fac = 1.02
    extra_size = min(size + min_rows, fac * size)

    def _gen_unique_rand(rng, _extra_size):
        ind = rng.rand(int(_extra_size))
        return np.unique(np.floor(ind * nrows * ncols))[:size]

    ind = _gen_unique_rand(random_state, extra_size)
    while ind.size < size:
        extra_size *= 1.05
        ind = _gen_unique_rand(random_state, extra_size)

    j = np.floor(ind * 1. / nrows).astype(int)
    i = (ind - j * nrows).astype(int)
    return i.tolist(), j.tolist()
def jw_number_restrict_operator(operator, n_electrons, n_qubits=None):
    """Restrict a Jordan-Wigner encoded operator to a given particle number

    Args:
        operator(ndarray or sparse): Numpy operator acting on
            the space of n_qubits.
        n_electrons(int): Number of particles to restrict the operator to
        n_qubits(int): Number of qubits defining the total state

    Returns:
        new_operator(ndarray or sparse): Numpy operator restricted to
            acting on states with the same particle number.
    """
    if n_qubits is None:
        n_qubits = int(numpy.log2(operator.shape[0]))

    select_indices = jw_number_indices(n_electrons, n_qubits)
    return operator[numpy.ix_(select_indices, select_indices)]
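The restriction itself is plain fancy indexing with numpy.ix_; a small sketch with an arbitrary 4 x 4 matrix and a hypothetical index set:

import numpy

operator = numpy.arange(16).reshape(4, 4)
select_indices = [1, 2]   # e.g. the basis states with the right particle number
restricted = operator[numpy.ix_(select_indices, select_indices)]
print(restricted)   # [[ 5  6] [ 9 10]]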
def get_ground_state(sparse_operator):
    """Compute lowest eigenvalue and eigenstate.

    Returns:
        eigenvalue: The lowest eigenvalue, a float.
        eigenstate: The lowest eigenstate in scipy.sparse csc format.
    """
    if not is_hermitian(sparse_operator):
        raise ValueError('sparse_operator must be Hermitian.')

    values, vectors = scipy.sparse.linalg.eigsh(
        sparse_operator, 2, which='SA', maxiter=1e7)

    eigenstate = scipy.sparse.csc_matrix(vectors[:, 0])
    eigenvalue = values[0]
    return eigenvalue, eigenstate.getH()
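A quick standalone sketch of the same eigsh call on a small Hermitian sparse matrix (a stand-in, not an actual qubit operator):

import numpy
import scipy.sparse
import scipy.sparse.linalg

h = scipy.sparse.csc_matrix(numpy.array([[2.0, -1.0, 0.0, 0.0],
                                         [-1.0, 2.0, -1.0, 0.0],
                                         [0.0, -1.0, 2.0, -1.0],
                                         [0.0, 0.0, -1.0, 2.0]]))
values, vectors = scipy.sparse.linalg.eigsh(h, 2, which='SA')
print(values[0])       # lowest eigenvalue, ~0.382 for this matrix
print(vectors[:, 0])   # the corresponding eigenstate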
def _getCscMatrix(self):  # compressed sparse column matrix
    if self._cscMatrix is not None:
        return self._cscMatrix

    # data and indices are parallel arrays: data stores values (i.e. tf*idf)
    # and indices stores the term (row) index of each value
    num_nnz, data, indices, indptr = 0, [], [], [0]
    for termVector in self._termVectors:
        newIndices = [i for i in termVector[1].keys()]
        newValues = [v for v in termVector[1].values()]
        indices.extend(newIndices)
        data.extend(newValues)
        num_nnz += len(newValues)
        indptr.append(num_nnz)
    data = numpy.asarray(data)
    indices = numpy.asarray(indices)

    # compressed sparse column matrix
    # rows are terms, columns are docs
    #
    #        doc1 doc2 doc3
    # 'the'    1    1    1
    # 'cat'    1    0    2
    self._cscMatrix = scipy.sparse.csc_matrix((data, indices, indptr),
                                              shape=(self.numTerms, self.numDocs))
    return self._cscMatrix
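The toy term/document table from the comment above, built directly from a (data, indices, indptr) CSC triple:

import numpy
import scipy.sparse

#            doc1  doc2  doc3
# 'the'        1     1     1
# 'cat'        1     0     2
data = numpy.array([1, 1, 1, 1, 2])      # column-by-column non-zero values
indices = numpy.array([0, 1, 0, 0, 1])   # term (row) index of each value
indptr = numpy.array([0, 2, 3, 5])       # where each document's column starts in data
m = scipy.sparse.csc_matrix((data, indices, indptr), shape=(2, 3))
print(m.toarray())   # [[1 1 1] [1 0 2]]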
def svd_recommender(self):
    userid, itemid, contextid, values = self.fields
    test_idx = (self.test.testset[userid].values,
                self.test.testset[itemid].values)

    if contextid:
        # TODO: refactor it! need to think about dependence on self.arrange_by and contextid
        # values are contextualized already
        test_val = self.test.testset[values].values
    else:
        test_val = self.test.testset[values].values

    v = self._items_factors
    test_shp = (self.test.testset[userid].max() + 1,
                v.shape[1])

    test_matrix = sp.sparse.coo_matrix((test_val, test_idx),
                                       shape=test_shp,
                                       dtype=np.float64).tocsr()

    svd_scores = (test_matrix.dot(v.T)).dot(v)
    return svd_scores
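The final two dot products are a rank-reduced projection of the test interactions onto the item factors and back; a rough sketch with random stand-in data (v plays the role of self._items_factors, stored as factors x items):

import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(0)
test_matrix = sp.random(8, 20, density=0.2, format='csr', random_state=rng)  # users x items
v = rng.rand(5, 20)                                                          # factors x items
svd_scores = (test_matrix.dot(v.T)).dot(v)                                   # users x items
print(svd_scores.shape)   # (8, 20)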
def test_serialize(self):
    from scipy.sparse import lil_matrix

    lil = lil_matrix((4, 1))
    lil[1, 0] = 1
    lil[3, 0] = 2
    sv = SparseVector(4, {1: 1, 3: 2})
    self.assertEqual(sv, _convert_to_vector(lil))
    self.assertEqual(sv, _convert_to_vector(lil.tocsc()))
    self.assertEqual(sv, _convert_to_vector(lil.tocoo()))
    self.assertEqual(sv, _convert_to_vector(lil.tocsr()))
    self.assertEqual(sv, _convert_to_vector(lil.todok()))

    def serialize(l):
        return ser.loads(ser.dumps(_convert_to_vector(l)))

    self.assertEqual(sv, serialize(lil))
    self.assertEqual(sv, serialize(lil.tocsc()))
    self.assertEqual(sv, serialize(lil.tocsr()))
    self.assertEqual(sv, serialize(lil.todok()))
def _matvec(self, x_vec):
    '''
    dot operation provided for scipy.sparse.linalg
    wrapper of self.forward()
    '''
    x2 = numpy.reshape(x_vec, self.st['Nd'], order='F')
    return self.forward(x2)

def _matvec(self, x_vec):
    """
    (To be tested): dot operation provided for scipy.sparse.linalg
    wrapper of self.forward()
    """
    x2 = numpy.reshape(x_vec, self.Nd, order='F')
    return self.forward(x2)
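These _matvec wrappers exist so the object can be used through scipy.sparse.linalg's LinearOperator interface; a minimal sketch of that wiring, with a plain doubling function standing in for self.forward():

import numpy
from scipy.sparse.linalg import LinearOperator

Nd = (4, 4)   # hypothetical image shape

def forward(x2):
    # stand-in for self.forward(): any linear map from an Nd array back to a vector
    return (2.0 * x2).ravel(order='F')

def matvec(x_vec):
    x2 = numpy.reshape(x_vec, Nd, order='F')
    return forward(x2)

A = LinearOperator((16, 16), matvec=matvec)
print(A.matvec(numpy.ones(16)))   # every entry doubled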
def asmatrix(self):
    """
    Return the sparse matrix representation of the separable filter.
    """
    h_matrix = NP.array([1])
    for i in range(self.ndim):
        if self.mode == 'circ':
            h_i = Convmtx([self.k[i]], self.h_list[i], mode=self.mode)
        else:
            h_i = Convmtx([self.n[i]], self.h_list[i], mode=self.mode)
        h_matrix = scipy.sparse.kron(h_matrix, h_i)
    return h_matrix
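The loop above builds the separable filter's matrix as a Kronecker product of 1-D convolution matrices; a small illustration with two arbitrary sparse factors standing in for the Convmtx outputs:

import numpy as np
import scipy.sparse

h_rows = scipy.sparse.csr_matrix(np.array([[1.0, -1.0], [0.0, 2.0]]))   # stand-in 1-D operator
h_cols = scipy.sparse.csr_matrix(np.array([[3.0, 0.0], [1.0, 1.0]]))    # stand-in 1-D operator

h_matrix = np.array([1])
for h_i in (h_rows, h_cols):
    h_matrix = scipy.sparse.kron(h_matrix, h_i)
print(h_matrix.toarray().shape)   # (4, 4): the separable 2-D operator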