def train_and_score(max_movie_id, training, testset, model_sizes):
extractors = dict()
models = dict()
print "Creating models"
for model_size in model_sizes:
extractors[model_size] = FeatureHasher(n_features=2**model_size)
models[model_size] = SGDClassifier(loss="log", penalty="L2")
print "Training"
for i, (user_id, seen_movies) in enumerate(training):
print "Training on user", i, user_id
labels, (seen_pairs, unseen_pairs) = generate_features(max_movie_id, seen_movies)
for model_size, extractor in extractors.iteritems():
seen_features = extractor.transform(seen_pairs)
unseen_features = extractor.transform(unseen_pairs)
features = sp.vstack([seen_features, unseen_features])
model = models[model_size]
model.partial_fit(features, labels, classes=[0, 1])
print "Testing"
all_labels = []
all_predicted_labels = defaultdict(list)
all_predicted_prob = defaultdict(list)
for i, (user_id, seen_movies) in enumerate(testset):
print "Testing on user", i, user_id
labels, (seen_pairs, unseen_pairs) = generate_features(max_movie_id, seen_movies)
all_labels.extend(labels)
for model_size, extractor in extractors.iteritems():
seen_features = extractor.transform(seen_pairs)
unseen_features = extractor.transform(unseen_pairs)
features = sp.vstack([seen_features, unseen_features])
model = models[model_size]
predicted_labels = model.predict(features)
predicted_prob = model.predict_proba(features)
all_predicted_labels[model_size].extend(predicted_labels)
# Probabilities for positive class
all_predicted_prob[model_size].extend(predicted_prob[:, 1])
print "Scoring"
aucs = []
nnz_features = []
for model_size, model in models.iteritems():
pred_log_prob = all_predicted_prob[model_size]
auc = roc_auc_score(all_labels, pred_log_prob)
cm = confusion_matrix(all_labels, all_predicted_labels[model_size])
print "Model size", model_size, "auc", auc
print cm
print
aucs.append(auc)
nnz_features.append(np.count_nonzero(model.coef_))
return aucs, nnz_features
# (removed stray web-scrape residue that was not valid Python:
#  "评论列表" = "comment list", "文章目录" = "article table of contents")