def statistics_by_aspect():
    """Print per-aspect word-frequency and sample-count statistics.

    Reads labelled samples from ``aspects_train.csv`` via
    ``get_samples_stream`` (presumably yielding ``(aspect, words)`` pairs --
    TODO confirm against its definition), then reports for each aspect the
    20 most common words, and finally each aspect's sample count and
    percentage share of the corpus.
    """
    filename = "aspects_train.csv"
    words_dist = nltk.ConditionalFreqDist()
    sample_sizes = nltk.FreqDist()
    samples_stream = get_samples_stream(filename)
    for aspect, words in samples_stream:
        sample_sizes[aspect] += 1
        for word in words:
            words_dist[aspect][word] += 1

    # .iteritems() / print-statements are Python-2 only; .items() and
    # print(...) behave the same here and also run under Python 3.
    for category, dist in words_dist.items():
        print("\n------- Category: {}".format(category))
        print(dist.most_common(20))

    total_samples = sample_sizes.N()
    print("\ntotally {} samples".format(total_samples))
    for aspect, count in sample_sizes.items():
        print("aspect[{}] has {} samples, {:.2f}%".format(
            aspect, count, count * 100.0 / total_samples))
Examples using the Python class nltk.ConditionalFreqDist()
def findtags(self, tag_prefix, tagged_text):
    """Collect the most common words for every tag matching a prefix.

    :param tag_prefix: The tag prefix
    :type tag_prefix: ``str``
    :param tagged_text: The text to search, as ``(word, tag)`` pairs
    :type tagged_text: ``list``
    :return: mapping of each matching tag to its 50 most common
        ``(word, count)`` pairs
    """
    # Swap each pair to (tag, word) so the tag becomes the CFD condition,
    # keeping only tags that start with the requested prefix.
    matching_pairs = ((tag, word) for (word, tag) in tagged_text
                      if tag.startswith(tag_prefix))
    freq_by_tag = nltk.ConditionalFreqDist(matching_pairs)
    return {tag: freq_by_tag[tag].most_common(50)
            for tag in freq_by_tag.conditions()}
classifier.py — file source code
Project: Neural-Learner-for-English-Language-Test
Author: taineleau
Project source
File source
Views: 23
Favorites: 0
Likes: 0
Comments: 0
def ngram_baseline(text):
    """Build a conditional frequency distribution over filtered bigrams.

    Consecutive token pairs are taken from *text*; a bigram
    ``(first, second)`` is kept only when ``second[1] == 1`` (each token is
    assumed to be an indexable pair such as ``(word, flag)`` -- TODO confirm
    against the caller).  The resulting CFD conditions on ``first[0]`` and
    counts occurrences of ``second[0]``.

    :param text: iterable of indexable 2-element tokens
    :return: ``nltk.ConditionalFreqDist`` over the kept word pairs
    """
    # Dead commented-out debug loops and unused counters removed;
    # the filter-and-project loop is now a single comprehension.
    refine = [(first[0], second[0])
              for first, second in ngrams(text, 2)
              if second[1] == 1]
    return nltk.ConditionalFreqDist(refine)
def calc_cfd(doc):
    """Conditional frequency distribution of word bigrams in *doc*.

    Tokenises *doc* with Mecab POS tagging, keeps only the surface words
    (dropping the tags), and counts each bigram (w1 -> w2).
    """
    surface_words = (word for word, _tag in Mecab().pos(doc))
    return nltk.ConditionalFreqDist(nltk.bigrams(surface_words))
def generate_from_trigrams(lm, start_words, n_words):
    """Generate text with a back-off trigram language model.

    Falls back trigram -> bigram -> unigram whenever the current context
    has not been seen.

    :param lm: language model; ``lm.lowercase_tokens`` must be nonempty
    :param start_words: list of two seed strings
    :param n_words: integer >= 0, number of words to generate,
        not including ``start_words``
    :return: the seed words plus generated words, joined by single spaces
    """
    # Probability maps for each back-off level.
    trigram_prob = trigram_prob_map(Counter(ngrams(lm.lowercase_tokens, 3)))
    bigram_prob = bigram_prob_map(
        nltk.ConditionalFreqDist(ngrams(lm.lowercase_tokens, 2)))
    unigram_prob = unigram_prob_map(Counter(lm.lowercase_tokens))

    def _draw(prob_map):
        # Sample one word from a {word: probability} map.  numpy's choice
        # needs a real sequence: under Python 3, dict.keys() is a view and
        # raises "a must be 1-dimensional", so materialize it first.
        candidates = list(prob_map)
        return choice(candidates, p=[prob_map[w] for w in candidates])

    w1, w2 = start_words[0], start_words[1]
    words = [w1, w2]
    for _ in range(n_words):
        if (w1, w2) in trigram_prob:      # trigram context was seen
            next_word = _draw(trigram_prob[(w1, w2)])
        elif w2 in bigram_prob:           # back off to the bigram model
            next_word = _draw(bigram_prob[w2])
        else:                             # back off to unigrams
            next_word = _draw(unigram_prob)
        # Slide the context window and record the new word.
        w1, w2 = w2, next_word
        words.append(w2)
    return ' '.join(words)