def compare_omission(mt_para_corpus, si_para_corpus, lang):
    """Print an omission comparison between the MT and SI parallel corpora.

    Omission weights are obtained from ``get_omission_weights``. Sentence
    pairs are then masked so that only positions where *both* corpora report
    ``good_alignment`` are selected, and ``count_omission`` is run on each
    corpus with that shared mask. The report written to stdout contains:

    * an overall SI-vs-MT test (delegated to ``ttest``),
    * per-tag omission rows for each corpus (positive-weight tags only),
    * the first ``top_k`` token omission rows for each corpus
      (positive-weight tokens only),
    * per-tag rows for tags whose paired t-test p-value is below 0.05.

    Args:
        mt_para_corpus: corpus object exposing ``sent_pairs``; each pair
            exposes ``good_alignment``.
        si_para_corpus: corpus object with the same attributes, iterated in
            lockstep with ``mt_para_corpus``.
        lang: passed through unchanged to ``get_omission_weights`` and
            ``count_omission``.

    Returns:
        None. All results are printed.
    """
    def _format_rows(entries, weights):
        # One "<key>\t<value>" line per entry whose key has a positive
        # weight, joined with newlines and UTF-8 encoded for printing.
        kept = []
        for entry in entries:
            if weights[entry[0]] > 0:
                kept.append('%s\t%f' % (entry[0], entry[1]))
        return u'\n'.join(kept).encode('utf-8')

    tag_weights, tok_weights = get_omission_weights(
        mt_para_corpus, si_para_corpus, lang)

    # A position is kept only when both corpora aligned that sentence well.
    usable = [
        bool(mt_pair.good_alignment and si_pair.good_alignment)
        for mt_pair, si_pair in zip(mt_para_corpus.sent_pairs,
                                    si_para_corpus.sent_pairs)
    ]

    mt_tags, mt_by_tag, mt_toks, mt_overall = count_omission(
        usable, mt_para_corpus, tag_weights, tok_weights, lang)
    si_tags, si_by_tag, si_toks, si_overall = count_omission(
        usable, si_para_corpus, tag_weights, tok_weights, lang)
    top_k = 10

    # Each print below takes a single parenthesised expression, so it emits
    # the same output as a print statement.
    print('overall omission (si vs mt):')
    ttest(si_overall, mt_overall)

    print('MT tag omissions:')
    print(_format_rows(mt_tags, tag_weights))
    print(u'MT tok omissions:')
    print(_format_rows(mt_toks[:top_k], tok_weights))
    print('SI tag omissions:')
    print(_format_rows(si_tags, tag_weights))
    print('SI tok omissions:')
    print(_format_rows(si_toks[:top_k], tok_weights))

    print('Sentence omission stats:')
    for tag in tag_weights:
        if not tag_weights[tag] > 0:
            continue
        # NOTE(review): these are sums over the per-sentence values, not
        # means -- confirm which one the report is meant to show.
        mt_total = sum(mt_by_tag[tag])
        si_total = sum(si_by_tag[tag])
        t_stat, p_value = stats.ttest_rel(mt_by_tag[tag], si_by_tag[tag])
        if not p_value < 0.05:
            continue
        print((u'%s\t%f\t%f\t%f\t%f'
               % (tag, mt_total, si_total, t_stat, p_value)).encode('utf8'))
评论列表
文章目录