def info_content(lookup_word):
"""
Uses the Brown corpus available in NLTK to calculate a Laplace
smoothed frequency distribution of words, then uses this information
to compute the information content of the lookup_word.
"""
global N
if N == 0:
# poor man's lazy evaluation
for sent in brown.sents():
for word in sent:
word = word.lower()
if not word in brown_freqs:
brown_freqs[word] = 0
brown_freqs[word] = brown_freqs[word] + 1
N = N + 1
lookup_word = lookup_word.lower()
n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
return 1.0 - (math.log(n + 1) / math.log(N + 1))
short_sentence_similarity.py 文件源码
python
阅读 29
收藏 0
点赞 0
评论 0
评论列表
文章目录