matutils.py 文件源码-python代码片段

matutils.py 文件源码

python

阅读 28 收藏 0 点赞 0 评论 0

项目：paragraph2vec 作者: thunlp 项目源码文件源码

def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32):
    """
    Convert corpus into a dense numpy array (documents will be columns). You
    must supply the number of features `num_terms`, because dimensionality
    cannot be deduced from the sparse vectors alone.

    You can optionally supply `num_docs` (=the corpus length) as well, so that
    a more memory-efficient code path is taken: the output matrix is allocated
    once and filled in place, instead of collecting every column first and
    stacking them afterwards.

    This is the mirror function to `Dense2Corpus`.

    Parameters
    ----------
    corpus : iterable
        Iterable of documents; each document is passed unchanged to
        `sparse2full(doc, num_terms)` to obtain one dense column.
    num_terms : int
        Number of rows (features) of the resulting matrix.
    num_docs : int, optional
        Number of documents in `corpus`. If given, it must match the actual
        number of documents yielded by `corpus`.
    dtype : numpy dtype, optional
        Data type of the returned array.

    Returns
    -------
    numpy.ndarray
        Array of shape `(num_terms, number of documents)` and type `dtype`.

    Raises
    ------
    AssertionError
        If `num_docs` is given and `corpus` yields fewer documents than that.
    IndexError
        If `num_docs` is given and `corpus` yields more documents than that
        (the extra column does not fit into the preallocated matrix).

    """
    if num_docs is not None:
        # we know the number of documents => don't bother column_stacking
        # docno starts at -1 so that an empty corpus is consistent with num_docs == 0
        docno, result = -1, numpy.empty((num_terms, num_docs), dtype=dtype)
        for docno, doc in enumerate(corpus):
            result[:, docno] = sparse2full(doc, num_terms)
        # numpy.empty() leaves unfilled columns uninitialized, so a short
        # corpus must not go unnoticed
        assert docno + 1 == num_docs, (
            "corpus contains %i documents, but num_docs=%i was given" % (docno + 1, num_docs)
        )
    else:
        # column_stack needs a real sequence: passing a generator has been
        # deprecated since NumPy 1.16 and is rejected by recent releases
        columns = [sparse2full(doc, num_terms) for doc in corpus]
        if columns:
            result = numpy.column_stack(columns)
        else:
            # column_stack cannot stack zero arrays; return the same empty
            # matrix that the `num_docs=0` path produces
            result = numpy.empty((num_terms, 0), dtype=dtype)
    # copy=False: skip a full copy of the matrix when it already has `dtype`
    return result.astype(dtype, copy=False)