def _getCscMatrix(self):
    """Return the term-by-document matrix in compressed sparse column form.

    The matrix is built once from ``self._termVectors`` and cached on
    ``self._cscMatrix``; later calls return the cached object.

    Layout (rows are terms, columns are documents)::

                doc1  doc2  doc3
        'the'     1     1     1
        'cat'     1     0     2

    Each entry of ``self._termVectors`` contributes one column; its element
    at index 1 is read as a mapping of row (term) index -> stored value
    (e.g. a tf*idf weight).

    Returns:
        scipy.sparse.csc_matrix of shape ``(self.numTerms, self.numDocs)``.
    """
    if self._cscMatrix is not None:
        return self._cscMatrix

    # CSC storage uses three arrays:
    #   data    - the non-zero values, column after column
    #   indices - the row (term) index of each value in ``data``
    #   indptr  - indptr[j]:indptr[j + 1] is the slice of data/indices
    #             that belongs to column (document) j
    data, indices, indptr = [], [], [0]
    for termVector in self._termVectors:
        weights = termVector[1]
        # keys() and values() of an unmodified dict iterate in the same
        # order, so the two lists stay parallel.
        indices.extend(weights.keys())
        data.extend(weights.values())
        # Running count of stored values marks where this column ends.
        indptr.append(len(data))

    data = numpy.asarray(data)
    indices = numpy.asarray(indices)
    self._cscMatrix = scipy.sparse.csc_matrix(
        (data, indices, indptr),
        shape=(self.numTerms, self.numDocs),
    )
    return self._cscMatrix
评论列表
文章目录