docsim.py 文件源码-python代码片段

docsim.py 文件源码

python

阅读 33 收藏 0 点赞 0 评论 0

项目：paragraph2vec 作者: thunlp 项目源码文件源码

def iter_chunks(self, chunksize=None):
        """
        Iteratively yield the index as chunks of documents, each of size <= chunksize.

        The chunk is returned in its raw form (matrix or sparse matrix slice).
        The size of the chunk may be smaller than requested; it is up to the caller
        to check the result for real length, using `chunk.shape[0]`.
        """
        self.close_shard()

        if chunksize is None:
            # if not explicitly specified, use the chunksize from the constructor
            chunksize = self.chunksize

        for shard in self.shards:
            query = shard.get_index().index
            for chunk_start in xrange(0, query.shape[0], chunksize):
                # scipy.sparse doesn't allow slicing beyond real size of the matrix
                # (unlike numpy). so, clip the end of the chunk explicitly to make
                # scipy.sparse happy
                chunk_end = min(query.shape[0], chunk_start + chunksize)
                chunk = query[chunk_start: chunk_end] # create a view
                yield chunk