Python examples of v_measure_score()
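These examples are collected from open-source projects. sklearn.metrics.v_measure_score(labels_true, labels_pred) returns the V-measure, the harmonic mean of homogeneity and completeness; it is symmetric in its arguments and invariant to permutations of the cluster ids, which is why the tests below use it to assert that two labelings describe the same partition. A minimal usage sketch:

from sklearn.metrics import v_measure_score

# Identical partitions score 1.0 even when the cluster ids differ:
print(v_measure_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0
# A clustering that mixes both true classes in every cluster scores 0.0:
print(v_measure_score([0, 0, 1, 1], [0, 1, 0, 1]))  # 0.0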

test_birch.py (project: FreeDiscovery, author: FreeDiscovery)
def test_birch_predict():
    # Test the predict method predicts the nearest centroid.
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3,
                                n_samples_per_cluster=10)

    # n_clusters * n_samples_per_cluster = 30 samples in total
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
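This test originates in scikit-learn's own test suite and relies on module-level imports the excerpt omits. A sketch of what they plausibly look like (the FreeDiscovery copy may substitute its patched Birch):

import numpy as np
from numpy.testing import assert_array_equal, assert_almost_equal
from sklearn.cluster import Birch
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics import pairwise_distances_argmin, v_measure_score

Note that predict() need not reuse the same ids as the centroid indices, so the final assertion checks partition equality with v_measure_score rather than exact array equality.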

test_birch.py (project: Parallel-SGD, author: angadgill)
def test_birch_predict():
    # Test the predict method predicts the nearest centroid.
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3,
                                n_samples_per_cluster=10)

    # n_clusters * n_samples_per_cluster = 30 samples in total
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)

Ex06.py (project: ml-deti, author: mariolpantunes)
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
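This benchmark helper reads the module-level globals labels and sample_size, which the excerpt omits. In the scikit-learn plot_kmeans_digits.py example this snippet derives from, they come from the digits dataset; a plausible setup, for context:

from time import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

digits = load_digits()
data = scale(digits.data)   # feature matrix passed in as `data`
labels = digits.target      # ground-truth labels used by the metrics
sample_size = 300           # subsample size for silhouette_score

# bench_k_means(KMeans(init='k-means++', n_clusters=10, n_init=10),
#               name='k-means++', data=data)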

utils.py (project: wikipedia_multilang, author: ivanvladimir)
def bench_k_means(labels, labels_, name, data):
    print('%20s  %.3f   %.3f   %.3f   %.3f   %.3f'
          % ( name,
             metrics.homogeneity_score(labels,   labels_),
             metrics.completeness_score(labels,  labels_),
             metrics.v_measure_score(labels,     labels_),
             metrics.adjusted_rand_score(labels, labels_),
             metrics.adjusted_mutual_info_score(labels, labels_)))
    nbins = len(set(labels_))
    vals, bins = np.histogram(labels_, bins=nbins)
    print(20 * ' ', 'hist-min,max', np.min(vals), np.max(vals))

test_cluster.py (project: FreeDiscovery, author: FreeDiscovery)
def test_dbscan_noisy_utils():
    from freediscovery.cluster.utils import (_dbscan_noisy2unique,
                                             _dbscan_unique2noisy)
    from sklearn.metrics import v_measure_score

    x_ref = np.array([-1, 0, -1,  1, 1, -1,  0])
    y_ref = np.array([2, 0, 3, 1, 1, 4, 0])

    y = _dbscan_noisy2unique(x_ref)
    assert v_measure_score(y, y_ref) == 1

    # check inverse transform
    x = _dbscan_unique2noisy(y_ref)
    assert v_measure_score(x, x_ref) == 1
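DBSCAN marks outliers with the label -1; these two FreeDiscovery helpers convert between that convention and one where every noise point forms its own singleton cluster. A hypothetical re-implementation of the forward direction, for illustration (the real helper may assign ids in a different order, which is exactly why the test compares with v_measure_score rather than exact equality):

import numpy as np

def dbscan_noisy2unique_sketch(labels):
    """Replace each -1 (DBSCAN noise) with a fresh singleton cluster id."""
    labels = np.asarray(labels).copy()
    next_id = labels.max() + 1
    for idx in np.where(labels == -1)[0]:
        labels[idx] = next_id
        next_id += 1
    return labels

# dbscan_noisy2unique_sketch([-1, 0, -1, 1, 1, -1, 0])
# -> [2, 0, 3, 1, 1, 4, 0], matching y_ref above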

test_cluster.py (project: FreeDiscovery, author: FreeDiscovery)
def test_binary_linkage2clusters():
    from freediscovery.cluster.utils import _binary_linkage2clusters
    from sklearn.metrics import v_measure_score
    n_samples = 10
    linkage = np.array([[1, 2],
                        [2, 3],
                        [5, 7],
                        [6, 9]])

    cluster_id = _binary_linkage2clusters(linkage, n_samples)

    cluster_id_ref = np.array([0, 1, 1, 1, 2, 3, 4, 3, 5, 4])

    assert cluster_id.shape == cluster_id_ref.shape
    # i.e. same clusters
    assert v_measure_score(cluster_id, cluster_id_ref) == 1.0
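_binary_linkage2clusters merges samples connected by pairwise links into flat clusters, with unlinked samples becoming singletons. A sketch of the same idea via graph connected components (a hypothetical re-implementation; its cluster ids may differ from FreeDiscovery's, hence the v_measure_score check above):

import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

def binary_linkage2clusters_sketch(linkage, n_samples):
    """Label connected components of the linkage graph; samples that
    appear in no link end up in singleton components."""
    graph = coo_matrix((np.ones(len(linkage)),
                        (linkage[:, 0], linkage[:, 1])),
                       shape=(n_samples, n_samples))
    _, labels = connected_components(graph, directed=False)
    return labels

# With the linkage [[1,2],[2,3],[5,7],[6,9]] and n_samples=10 this
# yields [0, 1, 1, 1, 2, 3, 4, 3, 5, 4], matching cluster_id_ref.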

classifier.py (project: Clustering, author: Ram81)
def analyze_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print(' %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
          % (name, time() - t0, estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=samples)))

cluster.py (project: fitr, author: abrahamnunes)
def performance(self, group_labels=None):
        """
        Computes performance metrics for clustering algorithm

        Parameters
        ----------
        group_labels : (optional) ndarray(shape=nsubjects)
            Labels for subject groups
        """
        n_samples = len(self.algorithm.labels_)

        if group_labels is None:
            # No ground-truth groups supplied: fall back to one dummy group
            truelab = np.zeros(n_samples)
            self.clusters["true_int"] = truelab
        else:
            truelab = np.zeros(n_samples)
            unique_labels = np.unique(group_labels)

            for i, label_i in enumerate(unique_labels):
                truelab[group_labels == label_i] = i

            self.clusters["true"] = group_labels
            self.clusters["true_int"] = truelab

        lab = self.algorithm.labels_
        self.results["homogeneity"] = homogeneity_score(truelab, lab)
        self.results["completeness"] = completeness_score(truelab, lab)
        self.results["v_measure"] = v_measure_score(truelab, lab)
        self.results["adj_rand"] = adjusted_rand_score(truelab, lab)
        self.results["adj_MI"] = adjusted_mutual_info_score(truelab, lab)

score.py (project: cluster_paraphrases, author: acocos)
def score_clustering_solution(tgt, sol, gold, tempdir='eval/semeval_unsup_eval/keys', use_sklearn_vmeas=False, semeval_root='eval/semeval_unsup_eval'):
    '''
    Score clustering solution sol against gold classes.
    Both the sol and gold are passed as dictionaries with integer keys (value
    is unimportant) and sets of paraphrases in each cluster as values.
    Returns (fscore, precision, recall, vmeasure, homogeneity, completeness)
    :param tgt: str (target word you're clustering)
    :param sol: dict {int -> set}
    :param gold: dict {int -> set}
    :param tempdir: str (temporary directory to store scoring key files)
    :param use_sklearn_vmeas: boolean (setting true will use SKLearn version of V-Measure instead of semeval script)
    :param semeval_root: str (path to semeval root directory)
    :return: FScore, precision, recall, V-Measure, homogeneity, completeness (all floats)
    '''
    ## Verify set of paraphrases in gold and sol are the same
    assert set.union(*sol.values()) == set.union(*gold.values())

    ## Write temporary key files
    tempsolkey = os.path.join(tempdir, 'sol_temp.key')
    tempgoldkey = os.path.join(tempdir, 'gld_temp.key')
    write_key(tempsolkey, tgt, sol)
    write_key(tempgoldkey, tgt, gold)

    ## Call scoring script
    tempscorefile = os.path.join(tempdir, 'scorestemp')
    tempscores = open(tempscorefile, 'w')
    score_semeval(tempsolkey, tempgoldkey, tempscores, semeval_root=semeval_root)
    tempscores.close()
    fscore, prec, rec, vmeas, hom, comp = read_scoring_soln(tempscorefile, tgt)

    ## Delete temporary key files
    # os.remove(tempsolkey)
    # os.remove(tempgoldkey)
    # os.remove(tempscorefile)
    if use_sklearn_vmeas:
        goldlab, sollab, words = get_labels(gold, sol)
        vmeas = metrics.v_measure_score(goldlab, sollab)
        hom = metrics.homogeneity_score(goldlab, sollab)
        comp = metrics.completeness_score(goldlab, sollab)

    return fscore, prec, rec, vmeas, hom, comp
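get_labels is defined elsewhere in the project; from the call site it must flatten the two {cluster_id -> set(paraphrases)} dicts into parallel label lists over a shared word order. A hypothetical sketch consistent with that usage:

def get_labels_sketch(gold, sol):
    # Map each word to the id of the cluster containing it, then emit
    # labels in a fixed word order so the two lists align position-wise.
    word2gold = {w: k for k, ws in gold.items() for w in ws}
    word2sol = {w: k for k, ws in sol.items() for w in ws}
    words = sorted(word2gold)  # same vocabulary in both, per the assert above
    goldlab = [word2gold[w] for w in words]
    sollab = [word2sol[w] for w in words]
    return goldlab, sollab, words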

measure.py (project: mnogoznal, author: nlpub)
def evaluate(path):
    system = systems[path]

    measure, scores, clusters_gold, clusters_system = 0., OrderedDict(), [], []

    for lemma in lemmas:
        instances = sorted(gold[lemma].keys())

        senses_gold   = {sid: i for i, sid in enumerate(sorted(set(gold[lemma].values())))}
        senses_system = {sid: i for i, sid in enumerate(sorted(set(system[lemma].values())))}

        clusters_gold   = [senses_gold[gold[lemma][instance]]     for instance in instances]
        clusters_system = [senses_system[system[lemma][instance]] for instance in instances]

        if 'vmeasure' == args.measure:
            if 'instances' == args.average:
                measure += v_measure_score(clusters_gold, clusters_system) * len(instances) / total
            else:
                measure += v_measure_score(clusters_gold, clusters_system)

            scores[lemma] = (
                homogeneity_score(clusters_gold, clusters_system),
                completeness_score(clusters_gold, clusters_system),
                v_measure_score(clusters_gold, clusters_system)
            )
        else:
            scores[lemma] = adjusted_rand_score(clusters_gold, clusters_system)

            if 'instances' == args.average:
                measure += scores[lemma] * len(instances) / total
            else:
                measure += scores[lemma]

    if 'words' == args.average:
        measure /= len(lemmas)

    return measure, scores
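The V-measure this script aggregates is, by definition, the harmonic mean of homogeneity and completeness; a quick standalone check of that identity (not part of the original script):

from sklearn.metrics import (completeness_score, homogeneity_score,
                             v_measure_score)

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]
h = homogeneity_score(y_true, y_pred)
c = completeness_score(y_true, y_pred)
# v = 2hc / (h + c), i.e. the harmonic mean of h and c
assert abs(v_measure_score(y_true, y_pred) - 2 * h * c / (h + c)) < 1e-12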

plot_kmeans_digits.py (project: Parallel-SGD, author: angadgill)
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))

test_birch.py (project: FreeDiscovery, author: FreeDiscovery)
def test_birch_hierarchy():
    X, y = make_blobs(random_state=40)
    brc = Birch(n_clusters=None, branching_factor=5,
                compute_sample_indices=True)
    brc.fit(X)

    # make sure that leaf nodes contain all the samples
    n_leaves = 1
    sample_id = []
    current_leaf = brc.dummy_leaf_.next_leaf_
    while current_leaf:
        subclusters = current_leaf.subclusters_
        for sc in subclusters:
            assert sc.n_samples_ == len(sc.samples_id_)
            sample_id += sc.samples_id_
        current_leaf = current_leaf.next_leaf_
        n_leaves += 1
    assert_array_equal(np.sort(sample_id), np.arange(X.shape[0]))

    # Verify that the resulting hierarchical tree is deeper than 1 level
    # (i.e. subclusters of the root node are not tree leaves)
    assert len(brc.root_.subclusters_) < n_leaves

    # Make sure that subclusters of the root_ node contain all the samples
    sample_id = []
    for sc in brc.root_.subclusters_:
        sample_id += sc.samples_id_
        assert sc.n_samples_ == len(sc.samples_id_)
    assert_array_equal(np.sort(sample_id), np.arange(X.shape[0]))

    # Pick a sample at random and make sure that reported samples_id_
    # matches with the subcluster the sample is closest to
    document_id = 45
    document_in_subcluster = []
    distance_to_centroid = []
    for sc in brc.root_.subclusters_:
        centroid = X[sc.samples_id_, :].mean(axis=0)
        distance_to_centroid.append(((X[[document_id]] - centroid)**2).sum())
        document_in_subcluster.append(document_id in sc.samples_id_)

    assert np.argmin(distance_to_centroid) == \
        np.nonzero(document_in_subcluster)[0][0]

    # Make sure that we can recompute labels from tree leaves
    labels2 = np.zeros(X.shape[0], dtype=int)
    cluster_id = 0
    for current_leaf in brc._get_leaves():
        subclusters = current_leaf.subclusters_
        for sc in subclusters:
            labels2[list(sc.samples_id_)] = cluster_id
            cluster_id += 1

    assert np.unique(brc.labels_).shape == np.unique(labels2).shape
    # The two methods yield approximately equal labels
    assert v_measure_score(brc.labels_, labels2) > 0.95

AffinityPropagation.py (project: ProjectOfDataMining, author: IljaNovo)
def compute_affinity_propagation(preference_, X):
    # DATA FILLING
    #text = io.Input.local_read_text_file(inputFilePath)
    #input_array = text.split('\n')
    centers = [[1, 1], [-1, -1], [1, -1]]
    n_samples = 300
    #Make Blobs used for generating of labels_true array
    # `X == None` compares elementwise on arrays; `is None` is the correct test
    if X is None:
        X, labels_true = make_blobs(n_samples=n_samples, centers=centers, cluster_std=1, random_state=0)
        print("Data is None!")
        print("Generating " + str(n_samples) + " samples")
    else:
        data, labels_true = make_blobs(n_samples=len(X), centers=centers, cluster_std=1, random_state=0)
    #slist = list()
    #for line in X:
    #    slist.append(line)
    #io.Output.write_array_to_txt_file("clustering\\Affinity_Propagation\\input_data1.txt", slist)
    #float_array = []
    #for line in input_array:
    #    float_line = [float(i) for i in line.split(' ')]
    #    float_array.append(float_line)
    #X = array(float_array)

    af = AffinityPropagation(preference=preference_).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
#    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print("Fowlkes Mallows Score: %0.3f" % metrics.fowlkes_mallows_score(labels_true, labels))

    plt.close('all')
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()

clusters.py (project: idealoom, author: conversence)
def compare_with_children(
            self, idea_id, post_ids, post_clusters, remainder, labels):
        # Compare to children classification
        compare_with_ideas = None
        all_idea_scores = []
        ideas_of_post = defaultdict(list)
        children_remainder = set(post_ids)
        children_ids = self.idea_children[idea_id]
        if len(children_ids):
            posts_of_children = {
                child_id: self.get_posts_of_idea(child_id)
                for child_id in children_ids}
            # use child_id here: the original loop variable shadowed the
            # parent idea_id, which the children_remainder loop below needs
            for child_id, c_post_ids in posts_of_children.items():
                for post_id in c_post_ids:
                    ideas_of_post[post_id].append(child_id)
                children_remainder -= set(c_post_ids)
            for post_id in children_remainder:
                ideas_of_post[post_id] = [idea_id]
            # if many ideas to a post, choose one with the most ideas in same cluster.
            # A bit arbitrary but I need a single idea.
            for cluster in chain(post_clusters, (remainder,)):
                idea_score = defaultdict(int)
                all_idea_scores.append(idea_score)
                for post_id in cluster:
                    for idea_id in ideas_of_post[post_id]:
                        idea_score[idea_id] += 1
                for post_id in cluster:
                    if len(ideas_of_post[post_id]) > 1:
                        scores = [(idea_score[idea_id], idea_id)
                                  for idea_id in ideas_of_post[post_id]]
                        scores.sort(reverse=True)
                        ideas_of_post[post_id] = [score[1] for score in scores]
            # index_by_post_id = {v: k for (k, v) in post_id_by_index.iteritems()}
            idea_of_index = [ideas_of_post[post_id][0] for post_id in post_ids]
            compare_with_ideas = {
                "Homogeneity": metrics.homogeneity_score(idea_of_index, labels),
                "Completeness": metrics.completeness_score(idea_of_index, labels),
                "V-measure": metrics.v_measure_score(idea_of_index, labels),
                "Adjusted Rand Index": metrics.adjusted_rand_score(
                    idea_of_index, labels),
                "Adjusted Mutual Information": metrics.adjusted_mutual_info_score(
                    idea_of_index, labels)}
        else:
            for post_id in children_remainder:
                ideas_of_post[post_id] = [idea_id]
            for cluster in chain(post_clusters, (remainder,)):
                all_idea_scores.append({idea_id: len(cluster)})
        return (compare_with_ideas, all_idea_scores, ideas_of_post,
                children_remainder)

