def compare_with_children(
        self, idea_id, post_ids, post_clusters, remainder, labels):
    """Compare a clustering of posts with their classification by child idea.

    Every post is mapped to the child ideas of ``idea_id`` it belongs to
    (as reported by ``self.get_posts_of_idea``); posts of ``post_ids`` that
    belong to no child are attributed to ``idea_id`` itself.  When the idea
    has children, each post's idea list is ordered so that the idea most
    represented in the post's cluster comes first, and that first idea is
    scored against ``labels``.

    :param idea_id: id of the parent idea; used as a key into
        ``self.idea_children``.
    :param post_ids: sequence of post ids.  Presumably aligned
        element-by-element with ``labels`` -- confirm against the caller.
    :param post_clusters: iterable of clusters, each a sized iterable of
        post ids.
    :param remainder: sized iterable of post ids; it is processed as one
        extra cluster after ``post_clusters``.
    :param labels: labels passed, together with the per-post idea, to the
        scoring functions of ``metrics``.
    :returns: a 4-tuple ``(compare_with_ideas, all_idea_scores,
        ideas_of_post, children_remainder)`` where

        * ``compare_with_ideas`` is a dict of named scores, or ``None`` when
          the idea has no children;
        * ``all_idea_scores`` holds one ``{idea id: post count}`` mapping
          per cluster, the remainder being last;
        * ``ideas_of_post`` maps a post id to its list of idea ids;
        * ``children_remainder`` is the set of ``post_ids`` found in no
          child idea.
    """
    compare_with_ideas = None
    all_idea_scores = []
    ideas_of_post = defaultdict(list)
    children_remainder = set(post_ids)
    children_ids = self.idea_children[idea_id]
    if len(children_ids):
        posts_of_children = {
            child_id: self.get_posts_of_idea(child_id)
            for child_id in children_ids}
        # The loop variables are deliberately NOT named ``idea_id``:
        # rebinding the parameter here made the posts left over below get
        # attributed to the last child instead of the parent idea.
        for child_id, child_post_ids in posts_of_children.items():
            for post_id in child_post_ids:
                ideas_of_post[post_id].append(child_id)
            children_remainder -= set(child_post_ids)
        # Posts in none of the children belong to the parent idea only.
        for post_id in children_remainder:
            ideas_of_post[post_id] = [idea_id]
        # If many ideas to a post, choose the one with the most posts in the
        # same cluster. A bit arbitrary, but a single idea is needed.
        for cluster in chain(post_clusters, (remainder,)):
            idea_score = defaultdict(int)
            all_idea_scores.append(idea_score)
            for post_id in cluster:
                for candidate_id in ideas_of_post[post_id]:
                    idea_score[candidate_id] += 1
            for post_id in cluster:
                candidates = ideas_of_post[post_id]
                if len(candidates) > 1:
                    # Highest score first; ties go to the larger idea id.
                    ranked = [(idea_score[candidate_id], candidate_id)
                              for candidate_id in candidates]
                    ranked.sort(reverse=True)
                    ideas_of_post[post_id] = [pair[1] for pair in ranked]
        # One idea per post, in ``post_ids`` order.
        idea_of_index = [ideas_of_post[post_id][0] for post_id in post_ids]
        compare_with_ideas = {
            "Homogeneity": metrics.homogeneity_score(idea_of_index, labels),
            "Completeness": metrics.completeness_score(idea_of_index, labels),
            "V-measure": metrics.v_measure_score(idea_of_index, labels),
            "Adjusted Rand Index": metrics.adjusted_rand_score(
                idea_of_index, labels),
            "Adjusted Mutual Information": metrics.adjusted_mutual_info_score(
                idea_of_index, labels)}
    else:
        # No children: every post belongs to the parent idea, so each
        # cluster's score is simply its size.
        for post_id in children_remainder:
            ideas_of_post[post_id] = [idea_id]
        for cluster in chain(post_clusters, (remainder,)):
            all_idea_scores.append({idea_id: len(cluster)})
    return (compare_with_ideas, all_idea_scores, ideas_of_post,
            children_remainder)
评论列表
文章目录