import numpy as np


def calc_underprediction_scores_per_word(model, Data, LP=None, **kwargs):
    ''' Find scalar score for each vocab word. Larger => worse prediction.
    '''
    if LP is None:
        LP = model.calc_local_params(Data)
    # Under-prediction error: how much the empirical per-doc word frequency
    # exceeds the model's predicted frequency, clipped at zero.
    DocWordFreq_emp = calcWordFreqPerDoc_empirical(Data)
    DocWordFreq_model = calcWordFreqPerDoc_model(model, LP)
    uError = np.maximum(DocWordFreq_emp - DocWordFreq_model, 0)
    # For each word, identify the set of relevant documents
    DocWordMat = Data.to_sparse_docword_matrix().toarray()
    score = np.zeros(Data.vocab_size)
    # TODO: only consider words with many docs overall
    for vID in range(Data.vocab_size):
        countPerDoc = DocWordMat[:, vID]
        if not np.any(countPerDoc > 0):
            continue  # word never occurs; median of empty slice is NaN
        typicalWordCount = np.median(countPerDoc[countPerDoc > 0])
        candidateDocs = np.flatnonzero(countPerDoc > typicalWordCount)
        # Skip words that appear prominently in too few documents
        if len(candidateDocs) < 10:
            continue
        score[vID] = np.mean(uError[candidateDocs, vID])
    # Only give positive probability to words with above-average score
    score = score - np.mean(score)
    score = np.maximum(score, 0)
    score = score * score  # square to make the distribution more peaked
    score /= score.sum()
    return score
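

# --- Hypothetical helper sketches ---------------------------------------
# The two frequency helpers called above are not defined in this snippet.
# Below is a minimal sketch of what they might look like. It assumes that
# Data.to_sparse_docword_matrix() yields a docs-x-vocab count matrix, that
# LP carries per-document topic counts under the key 'DocTopicCount', and
# that the observation model exposes a topics-x-vocab word-probability
# matrix via a getWordProbs() accessor. These names are illustrative
# assumptions, not the library's confirmed API.

def calcWordFreqPerDoc_empirical(Data):
    # Row-normalize the doc-word count matrix so each row becomes the
    # observed word-frequency distribution for one document.
    DocWordMat = Data.to_sparse_docword_matrix().toarray()
    rowSums = DocWordMat.sum(axis=1, keepdims=True)
    rowSums[rowSums == 0] = 1.0  # guard against empty documents
    return DocWordMat / rowSums


def calcWordFreqPerDoc_model(model, LP):
    # Predicted frequencies: mix the topic-word distributions by each
    # document's normalized topic usage.
    TopicWordProbs = model.obsModel.getWordProbs()  # hypothetical accessor
    DocTopicCount = LP['DocTopicCount']  # assumed key, docs x topics
    DocTopicFreq = DocTopicCount / DocTopicCount.sum(axis=1, keepdims=True)
    return np.dot(DocTopicFreq, TopicWordProbs)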