def getWordTypeCooccurPieces(self, dtype=np.float32):
    """ Calculate building blocks for word-word cooccur calculation

    These pieces can be used for incremental construction.

    For every document d with total token count N_d, each word count
    c_dw is rescaled by 1 / sqrt(N_d * (N_d - 1)) and stored in a sparse
    D x W matrix X that reuses self.word_id / self.doc_range as its CSR
    indices / indptr. Q is the dense product X.T * X.

    Documents with N_d * (N_d - 1) <= 0 (empty or single-token documents)
    contain no word pairs. They are skipped and contribute zeros, instead
    of dividing by zero and spreading inf/nan through the results.

    Parameters
    ----------
    dtype : numpy dtype, optional
        Element type of the scaled document-word matrix (and hence of Q).

    Returns
    -------
    Q : 2D array, W x W (where W is vocab_size)
        Q[i, j] = sum_d c_di * c_dj / (N_d * (N_d - 1))
    sameWordVec : 1D array, size W
        sameWordVec[w] = sum_d c_dw / (N_d * (N_d - 1))
    nDoc : scalar
        self.nDoc, unchanged (skipped documents are still counted).
    """
    sameWordVec = np.zeros(self.vocab_size)
    data = np.zeros(self.word_count.shape, dtype=dtype)

    # `range` instead of the Python-2-only `xrange`: works on 2 and 3.
    for docID in range(self.nDoc):
        start = self.doc_range[docID]
        stop = self.doc_range[docID + 1]
        docCounts = self.word_count[start:stop]
        N = docCounts.sum()
        NNm1 = N * (N - 1)
        if NNm1 <= 0:
            # No word pairs in this document; its `data` entries stay 0.
            continue
        # NOTE(review): fancy-index `+=` applies once per distinct index,
        # so this assumes a word id occurs at most once per document.
        sameWordVec[self.word_id[start:stop]] += docCounts / NNm1
        data[start:stop] = docCounts / np.sqrt(NNm1)

    # Now, create a sparse matrix that's D x V
    sparseDocWordMat = scipy.sparse.csr_matrix(
        (data, self.word_id, self.doc_range),
        shape=(self.nDoc, self.vocab_size),
        dtype=dtype)
    # Q : V x V. Sparse-sparse product, then densified; this is what
    # sklearn's safe_sparse_dot(..., dense_output=True) does for two
    # sparse operands, without needing the extra import.
    Q = sparseDocWordMat.T.dot(sparseDocWordMat).toarray()
    return Q, sameWordVec, self.nDoc
评论列表
文章目录