langid.py 文件源码-python代码片段

langid.py 文件源码

python

阅读 35 收藏 0 点赞 0 评论 0

项目：witch-language 作者: jonsafari 项目源码文件源码

def get_test_probs(cmd_args, ngrams_test, corpus_files, model):
    """ Get sum of probabilities for ngrams of test data. """
    # Initialize probs
    sumprobs = {}
    for lang in corpus_files:
        sumprobs[lang] = 0.0

    for ngram in ngrams_test:
        for lang in corpus_files:
            sumprobs[lang] += ngrams_test[ngram] * probability.LaplaceProbDist.logprob(model.smoothed[lang], ngram)

    # The population prior is mostly useful for really small test snippets
    if not cmd_args.no_prior:
        for lang in corpus_files:
            # Strip trailing .txt, and check if it's in the population statistics dict
            lang_prefix = lang[:-4]
            if lang_prefix in model.stats:
                # Normalize population counts by approximate total number of people on earth
                sumprobs[lang] += math.log(model.stats[lang_prefix] / 8e9)
            else:
                # If language isn't in the language population statistics,
                # assume median value of all langs, which is about 500K
                sumprobs[lang] += math.log(500000 / 8e9)

    return sumprobs