def test_homonym(H, sent, features, C=1.0):
    """Cross-validate a logistic-regression classifier on a pair of words.

    Feature rows are built separately for ``H[0]`` and ``H[1]`` by calling
    ``features(matching(sent, H[i]))``. Rows for ``H[0]`` are labelled 0 and
    rows for ``H[1]`` are labelled 1. The stacked rows are passed through
    ``normalize(..., norm='l2')`` and a ``LogisticRegression`` is fitted and
    scored on each fold produced by ``StratifiedKFold`` (10 folds requested).

    Args:
        H: Indexable whose first two entries are each passed to ``matching``
            together with ``sent``.
        sent: Forwarded unchanged to ``matching`` (presumably the sentence
            collection to search; confirm against ``matching``).
        features: Callable applied to the output of ``matching``; its result
            must support ``len()`` and be stackable with ``numpy.vstack``.
        C: Forwarded to ``LogisticRegression`` as its ``C`` argument.

    Returns:
        dict with the keys
        ``'word1_count'``: number of rows labelled 0,
        ``'word2_count'``: number of rows labelled 1,
        ``'majority'``: size of the larger class divided by the total,
        ``'kfold_acc'``: numpy float array with one accuracy value per fold.

    NOTE(review): ``StratifiedKFold(labels, n_folds=10)`` looks like the
    legacy scikit-learn cross-validation signature; confirm against this
    file's imports before upgrading the library.
    """
    first_rows = features(matching(sent, H[0]))
    second_rows = features(matching(sent, H[1]))
    n_first = len(first_rows)
    n_second = len(second_rows)

    data = normalize(numpy.vstack([first_rows, second_rows]), norm='l2')
    labels = numpy.hstack([numpy.zeros(n_first), numpy.ones(n_second)])

    # A single estimator is refitted on every fold.
    model = LogisticRegression(C=C)

    hits = []   # correctly predicted test rows, per fold
    sizes = []  # number of test rows, per fold
    for train_idx, test_idx in StratifiedKFold(labels, n_folds=10):
        model.fit(data[train_idx], labels[train_idx])
        expected = labels[test_idx]
        predicted = model.predict(data[test_idx])
        hits.append(sum(predicted == expected))
        sizes.append(len(expected))

    hits = numpy.array(hits, dtype='float')
    sizes = numpy.array(sizes, dtype='float')

    return {
        'word1_count': n_first,
        'word2_count': n_second,
        # Share of the larger class: the accuracy of always guessing it.
        'majority': 1.0 * max(n_first, n_second) / len(labels),
        'kfold_acc': hits / sizes,
    }
评论列表
文章目录