def get_sparse_matrix(self, chunk_size=1000):
    """Fetch the time-series data matrix in compressed sparse row (CSR) format.

    The three CSR component arrays ("timeseries/data", "timeseries/indices"
    and "timeseries/indptr" in ``self.h5_table``) are read in slices of at
    most ``chunk_size`` elements to prevent memory usage issues.

    Parameters
    ----------
    chunk_size : int
        The number of items to fetch at one time. Default is 1000.

    Returns
    -------
    scipy.sparse.csr_matrix
        csr matrix object containing sequences/time-series as rows,
        samples/time-points as columns. No explicit shape is passed, so
        the dimensions are whatever scipy infers from ``indptr`` and
        ``indices``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    def read_chunked(name, dtype=None):
        # Copy one 1-D dataset into a freshly allocated array, one slice of
        # at most ``chunk_size`` elements at a time.
        dataset = self.h5_table[name]
        if dtype is None:
            dtype = dataset.dtype
        buffer = np.empty(dataset.shape, dtype=dtype)
        total = buffer.shape[0]
        # range() yields nothing for an empty dataset, so the loop is simply
        # skipped instead of indexing into an empty list of boundaries.
        for start in range(0, total, chunk_size):
            stop = min(start + chunk_size, total)
            window = np.s_[start:stop]
            dataset.read_direct(buffer, window, window)
        return buffer

    # Values stay float64 so the dtype of the returned matrix is stable
    # regardless of how the dataset is stored.
    data = read_chunked("timeseries/data", dtype=np.float64)
    # Index arrays keep the dataset's own dtype: staging them in float64
    # would force an int -> float -> int round trip before scipy can use them.
    indices = read_chunked("timeseries/indices")
    indptr = read_chunked("timeseries/indptr")
    return csr_matrix((data, indices, indptr))
评论列表
文章目录