def get_test_probs(cmd_args, ngrams_test, corpus_files, model):
    """Get the summed log-probability score of the test n-grams per language.

    For every entry in ``corpus_files`` the score is the sum, over all test
    n-grams, of ``count * logprob``, where ``logprob`` comes from
    ``probability.LaplaceProbDist.logprob`` applied to ``model.smoothed[lang]``.
    Unless ``cmd_args.no_prior`` is set, a log population prior is added on
    top of that score.

    Args:
        cmd_args: Object with a boolean ``no_prior`` attribute; when true the
            population prior is skipped.
        ngrams_test: Mapping from n-gram to its count in the test data.
        corpus_files: Iterable of language keys (corpus file names such as
            ``"en.txt"``); each one must be a key of ``model.smoothed``.
        model: Object exposing ``smoothed`` (per-language distributions) and
            ``stats`` (population count per language name, keyed without the
            ``.txt`` suffix).

    Returns:
        dict mapping each entry of ``corpus_files`` to its float score.

    Note:
        ``math.log`` raises ``ValueError`` for a non-positive population
        count in ``model.stats``.
    """
    # Approximate total number of people on earth, used to normalize counts.
    world_population = 8e9
    # Fallback for languages missing from the population statistics: the
    # median value of all languages, which is about 500K.
    median_population = 500000
    corpus_suffix = ".txt"

    # Initialize probs
    sumprobs = {lang: 0.0 for lang in corpus_files}

    # Accumulate with explicit += (not sum()) so the floating-point addition
    # order per language stays exactly ngram-by-ngram.
    for ngram, count in ngrams_test.items():
        for lang in corpus_files:
            sumprobs[lang] += count * probability.LaplaceProbDist.logprob(
                model.smoothed[lang], ngram)

    # The population prior is mostly useful for really small test snippets
    if cmd_args.no_prior:
        return sumprobs

    # Constant for every unknown language, so compute it once.
    default_prior = math.log(median_population / world_population)
    for lang in corpus_files:
        # Strip a trailing .txt only when it is really there; blindly cutting
        # four characters would mangle keys that carry no such suffix.
        if lang.endswith(corpus_suffix):
            lang_prefix = lang[:-len(corpus_suffix)]
        else:
            lang_prefix = lang
        if lang_prefix in model.stats:
            sumprobs[lang] += math.log(
                model.stats[lang_prefix] / world_population)
        else:
            sumprobs[lang] += default_prior
    return sumprobs
评论列表
文章目录