_database.py 文件源码-python代码片段

def get_sparse_matrix(self, chunk_size=1000):
        """Fetch the time-series data matrix in compressed sparse row (csr)
        format.

        The three CSR component arrays ("timeseries/data",
        "timeseries/indices" and "timeseries/indptr") are copied out of
        ``self.h5_table`` in slices of at most ``chunk_size`` items. Each
        array is allocated with the dtype of the dataset it is read from, so
        the stored value dtype is preserved and the index arrays are not
        round-tripped through floating point.

        Parameters
        ----------
        chunk_size: int
            the number of items to fetch at one time. Default is 1000.
            Must be at least 1.

        Returns
        -------
        scipy.sparse.csr_matrix
            csr matrix object containing sequences/time-series as rows, samples
            /time-points as columns. The number of columns is inferred by
            scipy from the largest stored column index; when there are no
            stored values at all the result has zero columns.

        Raises
        ------
        ValueError
            If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        def read_in_chunks(name):
            # Copy one dataset into memory, slicing along its first axis.
            dataset = self.h5_table[name]
            n_items = dataset.shape[0]
            # Use the dataset's own dtype; np.empty would default to float64.
            out = np.empty(dataset.shape, dtype=dataset.dtype)
            # range() yields nothing for an empty dataset, so no read is
            # attempted in that case.
            for start in range(0, n_items, chunk_size):
                stop = min(start + chunk_size, n_items)
                dataset.read_direct(out, np.s_[start:stop], np.s_[start:stop])
            return out

        data = read_in_chunks("timeseries/data")
        indices = read_in_chunks("timeseries/indices")
        indptr = read_in_chunks("timeseries/indptr")

        if indices.size == 0 and indptr.size > 0:
            # scipy cannot infer the column count from an empty index array,
            # so spell the shape out: one row per indptr interval, no columns.
            return csr_matrix((data, indices, indptr),
                              shape=(indptr.size - 1, 0))
        return csr_matrix((data, indices, indptr))