import numpy as np


def calc_underprediction_scores_per_word(model, Data, LP=None, **kwargs):
    ''' Find scalar score for each vocab word. Larger => worse prediction.
    '''
    if LP is None:
        LP = model.calc_local_params(Data)
    # Under-prediction error: how much the empirical per-doc word frequency
    # exceeds the model's predicted frequency, clipped at zero.
    DocWordFreq_emp = calcWordFreqPerDoc_empirical(Data)
    DocWordFreq_model = calcWordFreqPerDoc_model(model, LP)
    uError = np.maximum(DocWordFreq_emp - DocWordFreq_model, 0)
    # For each word, identify the set of relevant documents
    DocWordMat = Data.to_sparse_docword_matrix().toarray()
    score = np.zeros(Data.vocab_size)
    # TODO: only consider words with many docs overall
    for vID in range(Data.vocab_size):
        countPerDoc = DocWordMat[:, vID]
        if not np.any(countPerDoc > 0):
            continue  # word never occurs; median of empty slice is NaN
        typicalWordCount = np.median(countPerDoc[countPerDoc > 0])
        candidateDocs = np.flatnonzero(countPerDoc > typicalWordCount)
        # Skip words that appear prominently in too few documents
        if len(candidateDocs) < 10:
            continue
        score[vID] = np.mean(uError[candidateDocs, vID])
    # Only give positive probability to words with above-average score
    score = score - np.mean(score)
    score = np.maximum(score, 0)
    score = score * score  # square to make the distribution more peaked
    score /= score.sum()
    return score
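

# --- Hypothetical helper sketches ---------------------------------------
# The two frequency helpers called above are not defined in this snippet.
# Below is a minimal sketch of what they might look like. It assumes that
# Data.to_sparse_docword_matrix() yields a docs-x-vocab count matrix, that
# LP carries per-document topic counts under the key 'DocTopicCount', and
# that the observation model exposes a topics-x-vocab word-probability
# matrix via a getWordProbs() accessor. These names are illustrative
# assumptions, not the library's confirmed API.

def calcWordFreqPerDoc_empirical(Data):
    # Row-normalize the doc-word count matrix so each row becomes the
    # observed word-frequency distribution for one document.
    DocWordMat = Data.to_sparse_docword_matrix().toarray()
    rowSums = DocWordMat.sum(axis=1, keepdims=True)
    rowSums[rowSums == 0] = 1.0  # guard against empty documents
    return DocWordMat / rowSums


def calcWordFreqPerDoc_model(model, LP):
    # Predicted frequencies: mix the topic-word distributions by each
    # document's normalized topic usage.
    TopicWordProbs = model.obsModel.getWordProbs()  # hypothetical accessor
    DocTopicCount = LP['DocTopicCount']  # assumed key, docs x topics
    DocTopicFreq = DocTopicCount / DocTopicCount.sum(axis=1, keepdims=True)
    return np.dot(DocTopicFreq, TopicWordProbs)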