def clusterMalwareNames(malwareNames):
# strictly lexical clustering over malware-names
wordCount = {}
# create a distance matrix
matrix = np.zeros((len(malwareNames), len(malwareNames)))
for i in range(len(malwareNames)):
for j in range(len(malwareNames)):
if matrix[i, j] == 0.0:
matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
matrix[j, i] = matrix[i, j]
# Scikit-Learn's DBSCAN implementation to cluster the malware-names
clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
clust.fit(matrix)
preds = clust.labels_
clabels = np.unique(preds)
# create Word-Count Map
for i in range(clabels.shape[0]):
if clabels[i] < 0:
continue
cmem_ids = np.where(preds == clabels[i])[0]
cmembers = []
for cmem_id in cmem_ids:
cmembers.append(malwareNames[cmem_id])
wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
return wordCount
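
# The snippet above calls two helpers that are not shown: computeSimilarity()
# and uniqueList(). The sketch below is an assumption about what they might
# look like, not the original implementations: computeSimilarity() is taken to
# return a normalized lexical *distance* in [0, 1] (DBSCAN with
# metric="precomputed" expects distances, not similarities), and uniqueList()
# de-duplicates while preserving order.
import difflib

def computeSimilarity(name_a, name_b):
    # Assumed helper: 1 - SequenceMatcher ratio gives a distance in [0, 1].
    return 1.0 - difflib.SequenceMatcher(None, name_a.lower(), name_b.lower()).ratio()

def uniqueList(items):
    # Assumed helper: drop duplicates while keeping first-seen order.
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]
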
Example source code for Python's DBSCAN class
def test_DBSCAN(*data):
'''
test the DBSCAN method
:param data: train, target
:return: None
'''
X,labels_true=data
clst=cluster.DBSCAN()
predicted_labels=clst.fit_predict(X)
print("ARI:%s"% adjusted_rand_score(labels_true,predicted_labels))
print("Core sample num:{0}".format(len(clst.core_sample_indices_)))
def test_DBSCAN_epsilon(*data):
'''
test the score with different eps
:param data: train, target
:return: None
'''
X,labels_true=data
epsilons=np.logspace(-1,1.5)
ARIs=[]
Core_nums=[]
for epsilon in epsilons:
clst=cluster.DBSCAN(eps=epsilon)
predicted_labels=clst.fit_predict(X)
ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
Core_nums.append(len(clst.core_sample_indices_))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,2,1)
ax.plot(epsilons,ARIs,marker='+')
ax.set_xscale('log')
ax.set_xlabel(r"$\epsilon$")
ax.set_ylim(0,1)
ax.set_ylabel('ARI')
ax=fig.add_subplot(1,2,2)
ax.plot(epsilons,Core_nums,marker='o')
ax.set_xscale('log')
ax.set_xlabel(r"$\epsilon$")
ax.set_ylabel('Core_Nums')
fig.suptitle("DBSCAN")
plt.show()
def test_DBSCAN_min_samples(*data):
'''
    test the score with different min_samples values
:param data: train, target
:return: None
'''
X,labels_true=data
min_samples=range(1,100)
ARIs=[]
Core_nums=[]
for num in min_samples:
clst=cluster.DBSCAN(min_samples=num)
predicted_labels=clst.fit_predict(X)
ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
Core_nums.append(len(clst.core_sample_indices_))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,2,1)
ax.plot(min_samples,ARIs,marker='+')
ax.set_xlabel( "min_samples")
ax.set_ylim(0,1)
ax.set_ylabel('ARI')
ax=fig.add_subplot(1,2,2)
ax.plot(min_samples,Core_nums,marker='o')
ax.set_xlabel( "min_samples")
ax.set_ylabel('Core_Nums')
fig.suptitle("DBSCAN")
plt.show()
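
# The three test_* functions above unpack their argument as (X, labels_true)
# and assume numpy, matplotlib, sklearn.cluster and adjusted_rand_score are
# already imported. A minimal, self-contained driver might look like this;
# the make_blobs() parameters are illustrative assumptions, not taken from
# the original project.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

if __name__ == '__main__':
    # Three well-separated Gaussian blobs give DBSCAN something easy to find.
    X, labels_true = make_blobs(n_samples=1000, centers=3, cluster_std=0.5,
                                random_state=0)
    test_DBSCAN(X, labels_true)              # single run with default parameters
    test_DBSCAN_epsilon(X, labels_true)      # sweep eps on a log scale
    test_DBSCAN_min_samples(X, labels_true)  # sweep min_samples from 1 to 99
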
def runClustering(ssearch, eps, min_samples):
"""
Run DBSCAN with the determined eps and MinPts values.
"""
print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))
# Initialize DBSCAN with parameters.
# I forgot to use cosine at first!
db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')
# Time this step.
t0 = time.time()
# Cluster the LSI vectors.
db.fit(ssearch.index.index)
# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)
# Get the set of unique IDs.
cluster_ids = set(db.labels_)
# Show the number of clusters (don't include noise label)
print('Number of clusters (excluding "noise"): %d' % (len(cluster_ids) - 1))
# For each of the clusters...
for cluster_id in cluster_ids:
# Get the list of all doc IDs belonging to this cluster.
cluster_doc_ids = []
for doc_id in range(0, len(db.labels_)):
if db.labels_[doc_id] == cluster_id:
cluster_doc_ids.append(doc_id)
# Get the top words in this cluster
top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)
print(' Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
def main():
"""
Entry point for the script.
"""
###########################################################################
# Load the corpus
###########################################################################
# Load the pre-built corpus.
print('Loading the saved SimSearch and corpus...')
(ksearch, ssearch) = SimSearch.load(save_dir='./mhc_corpus/')
    print(' %d documents.' % len(ssearch.index.index))
# Step 1: Run a technique to find a good 'eps' value.
#findEps(ssearch)
#eps = 0.5
eps = 0.44
# Step 2: Run a technique to find a good 'MinPts' value.
# TODO - This took ~17 min. on my desktop!
#findMinPts(ssearch, eps)
#min_samples = 8
min_samples = 4
# Step 3: Run DBSCAN
runClustering(ssearch, eps, min_samples)
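
# findEps() and findMinPts() are referenced above but not included in this
# snippet. A common heuristic for choosing eps is the k-distance plot: sort
# every point's distance to its k-th nearest neighbour and read eps off the
# "knee" of the curve. The function below is an assumed implementation of that
# idea for the LSI vectors in ssearch.index.index, not the original code.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

def findEps(ssearch, k=8, metric='cosine'):
    """Plot sorted k-th nearest-neighbour distances to help pick eps."""
    X = np.asarray(ssearch.index.index)
    nn = NearestNeighbors(n_neighbors=k + 1, metric=metric).fit(X)
    dists, _ = nn.kneighbors(X)
    # Column 0 is each point's distance to itself, so take column k.
    kth = np.sort(dists[:, k])
    plt.plot(kth)
    plt.xlabel('points sorted by %d-NN distance' % k)
    plt.ylabel('%s distance' % metric)
    plt.title('k-distance plot: pick eps near the knee')
    plt.show()
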
def dbscan(userid,X):
db = DBSCAN(eps=0.15,min_samples=4).fit(X)
# print db.labels_ zeros_like
core_samples_mask = np.zeros_like(db.labels_,dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    labels_list = list(labels)
    # print labels_list.count(-1)
    out_user.setdefault(userid, 0)
    out_user[userid] = labels_list.count(-1)
    print out_user
    # print labels_list.index(-1)
    print labels
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    unique_labels = set(labels)
    cols = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    # center_points = []
    for k, col in zip(unique_labels, cols):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
k_x = X[class_member_mask & core_samples_mask]
plt.plot(k_x[:,0],k_x[:,1],'o',markerfacecolor = col,
markeredgecolor = 'k' , markersize = 5)
center_points.append([np.mean(k_x[:,1]),np.mean(k_x[:,0])])
plt.title('DBSCAN :Estimated number of clusters: %d' % n_clusters_)
# plt.show()
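
# dbscan() above mutates out_user and center_points without defining them, so
# they are presumably module-level globals in the original script. A minimal
# assumed setup would be:
out_user = {}       # userid -> number of points DBSCAN labelled as noise (-1)
center_points = []  # per-cluster [mean_y, mean_x] pairs accumulated across calls
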
def __init__(self, filterer=PCA(n_components=2),
coverer=HyperRectangleCoverer(),
clusterer=DBSCAN(),
params=None):
self.filterer = filterer
self.coverer = coverer
self.clusterer = clusterer
if params is not None:
self.set_params(**params)
def set_random_state(estimator, random_state=0):
"""Set random state of an estimator if it has the `random_state` param.
    Classes for which random_state is deprecated are ignored. Currently DBSCAN
is one such class.
"""
if isinstance(estimator, DBSCAN):
return
if "random_state" in estimator.get_params():
estimator.set_params(random_state=random_state)
def train(self, data):
"""
:type data: pyspark.RDD
:param data: (key, k-dim vector like)
Train the model using a (key, vector) RDD
"""
parts = KDPartitioner(data, self.max_partitions)
self.data = data
self.bounding_boxes = parts.bounding_boxes
self.expanded_boxes = {}
self._create_neighborhoods()
# repartition data set on the partition label
self.data = self.data.map(lambda ((k, p), v): (p, (k, v))) \
.partitionBy(len(parts.partitions)) \
.map(lambda (p, (k, v)): ((k, p), v))
# create parameters for sklearn DBSCAN
params = {'eps': self.eps, 'min_samples': self.min_samples,
'metric': self.metric}
# perform dbscan on each part
self.data = self.data.mapPartitions(
lambda iterable: dbscan_partition(iterable, params))
self.data.cache()
self._remap_cluster_ids()
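
# train() depends on KDPartitioner and dbscan_partition() from elsewhere in
# the project, and its lambdas use Python 2 tuple-parameter unpacking, so the
# method as written runs only under Python 2. The sketch below is a guess at
# what dbscan_partition() does (run scikit-learn's DBSCAN over the records of
# one Spark partition and yield a label per key); the exact record layout and
# return format are assumptions.
import numpy as np
from sklearn.cluster import DBSCAN

def dbscan_partition(iterable, params):
    """Cluster the ((key, partition_id), vector) records of a single partition."""
    records = list(iterable)
    if not records:
        return
    keys = [k for k, _ in records]                      # (key, partition_id) pairs
    X = np.array([np.asarray(v) for _, v in records])   # stacked feature vectors
    labels = DBSCAN(**params).fit_predict(X)
    for (key, part_id), label in zip(keys, labels):
        # Emit a partition-local cluster id; noise points keep the label -1.
        yield (key, part_id), label
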
def assignments(self):
"""
:rtype: list
:return: list of (key, cluster_id)
Retrieve the results of the DBSCAN
"""
return self.result.collect()
def makeClusterers(X, k=2):
return [('MiniBatchKMeans', makeKMeans(X, k)),
('AffinityPropagation', makeAffinityProp()),
('MeanShift', makeMeanShift(X)),
('SpectralClustering', makeSpectral(X, k)),
('Ward', makeWard(X, k)),
('AgglomerativeAvg', makeAvgLinkage(X, k)),
('AgglomerativeMax', makeMaxLinkage(X, k)),
('AgglomerativeWard', makeWardLinkage(X, k)),
('DBSCAN', makeDBScan())]
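
# makeClusterers() relies on a family of make*() factory helpers defined
# elsewhere in the project. Only the DBSCAN factory matters here; an assumed
# minimal version (the default parameters are a guess) would be:
from sklearn.cluster import DBSCAN

def makeDBScan():
    return DBSCAN(eps=0.5, min_samples=5)
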
def cluster(X, eps=1, min_pts=30, algorithm='DBSCAN', n_clusters=10):
if algorithm == 'DBSCAN':
cluster_result = DBSCAN(eps=eps, min_samples=min_pts).fit(X)
elif algorithm == 'KMeans':
        cluster_result = KMeans(n_clusters=n_clusters).fit(X)
labels = cluster_result.labels_
return labels
def update_location_centroid(point, cluster, max_distance, min_samples):
""" Updates the centroid of a location cluster with another point
Args:
point (:obj:`Point`): Point to add to the cluster
cluster (:obj:`list` of :obj:`Point`): Location cluster
max_distance (float): Max neighbour distance
min_samples (int): Minimum number of samples
Returns:
(:obj:`Point`, :obj:`list` of :obj:`Point`): Tuple with the location centroid
and new point cluster (given cluster + given point)
"""
cluster.append(point)
points = [p.gen2arr() for p in cluster]
# Estimates the epsilon
eps = estimate_meters_to_deg(max_distance, precision=6)
p_cluster = DBSCAN(eps=eps, min_samples=min_samples)
p_cluster.fit(points)
clusters = {}
for i, label in enumerate(p_cluster.labels_):
if label in clusters.keys():
clusters[label].append(points[i])
else:
clusters[label] = [points[i]]
centroids = []
biggest_centroid_l = -float("inf")
biggest_centroid = None
for label, n_cluster in clusters.items():
centroid = compute_centroid(n_cluster)
centroids.append(centroid)
if label >= 0 and len(n_cluster) >= biggest_centroid_l:
biggest_centroid_l = len(n_cluster)
biggest_centroid = centroid
if biggest_centroid is None:
biggest_centroid = compute_centroid(points)
return biggest_centroid, cluster
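
# update_location_centroid() leans on two helpers that are not shown here. The
# sketches below assume the simplest plausible behaviour: metres are converted
# to degrees with the ~111,320 m-per-degree-of-latitude approximation, and the
# centroid is the arithmetic mean of the cluster's coordinates. The original
# implementations may well differ (e.g. compute_centroid() probably returns a
# Point object rather than a bare array).
import numpy as np

METERS_PER_DEGREE = 111320.0  # rough length of one degree of latitude, in metres

def estimate_meters_to_deg(meters, precision=6):
    """Convert a distance in metres to an approximate angular distance in degrees."""
    return round(meters / METERS_PER_DEGREE, precision)

def compute_centroid(points):
    """Arithmetic mean of a list of coordinate pairs."""
    return np.mean(np.asarray(points), axis=0)
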
def classify_user():
new_df_log_scaled = get_scaled_user()
c = DBSCAN(eps=90,min_samples=50,metric='manhattan').fit(new_df_log_scaled.T)
pd.value_counts(c.labels_)
d = c.labels_
types = pd.DataFrame(d,index=new_df_log_scaled.columns)[0]
types[types == -1] = 2
return types
def detect(self, method, model, data):
'''
:param method: -> method name
:param model: -> trained clusterer
:param data: -> dataframe with data
:return: -> dictionary that contains the list of anomalous timestamps
'''
smodel = self.__loadClusterModel(method, model)
anomalieslist = []
if not smodel:
dpredict = 0
else:
if data.shape[0]:
if isinstance(smodel, IsolationForest):
print "Detected IsolationForest model"
print "Contamination -> %s" % smodel.contamination
print "Max_Features -> %s" % smodel.max_features
print "Max_Samples -> %s" % smodel.max_samples_
print "Threashold -> %s " % smodel.threshold_
try:
dpredict = smodel.predict(data)
print "IsolationForest Prediction Array -> %s" %str(dpredict)
except Exception as inst:
logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
dpredict = 0
elif isinstance(smodel, DBSCAN):
print "Detected DBSCAN model"
print "Leaf_zise -> %s" % smodel.leaf_size
print "Algorithm -> %s" % smodel.algorithm
print "EPS -> %s" % smodel.eps
print "Min_Samples -> %s" % smodel.min_samples
print "N_jobs -> %s" % smodel.n_jobs
try:
dpredict = smodel.fit_predict(data)
except Exception as inst:
logger.error('[%s] : [ERROR] Error while fitting sDBSCAN model to event with %s and %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
inst.args)
dpredict = 0
else:
dpredict = 0
logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape[0]),
str(data.shape[1]))
print "Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]),
str(data.shape[1]))
print "dpredict type is %s" % (type(dpredict))
if type(dpredict) is not int:
anomalyarray = np.argwhere(dpredict == -1)
for an in anomalyarray:
anomalies = {}
anomalies['utc'] = int(data.iloc[an[0]]['key'])
anomalies['hutc'] = ut2hum(int(data.iloc[an[0]]['key']))
anomalieslist.append(anomalies)
anomaliesDict = {}
anomaliesDict['anomalies'] = anomalieslist
logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
return anomaliesDict
cluster-triples.py (project: information-extraction-PT, author: davidsbatista)
def main():
"""
compute_embeddings_vectors()
print "Reading embedding vectors"
with open('triples_vectors.pkl', 'r') as in_file:
triples = pickle.load(in_file)
vectors = []
for t in triples:
vectors.append(t.vector)
"""
text = []
triples = []
with open('triples.csv', 'r') as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for t in reader:
e1, e1_type, rel, e2, e2_type = t[0], t[1], t[2], t[3], t[4]
t = Triple(e1, e1_type, rel, e2, e2_type)
text.append(rel)
triples.append(t)
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(text)
print "Clustering"
dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine', algorithm='brute',
leaf_size=30, p=None, n_jobs=1)
labels = dbscan.fit_predict(tfidf_matrix)
with open('triples_labels.txt', 'w') as out_file:
for l in labels:
out_file.write(str(l) + '\n')
print "Reading cluster labels"
labels = []
with open('triples_labels.txt', 'r') as in_file:
for label in in_file:
labels.append(int(label.strip()))
for i in range(len(triples)):
triples[i].label = labels[i]
clusters = dict()
for t in triples:
try:
clusters[t.label] += 1
except KeyError:
clusters[t.label] = 1
print clusters
exit(-1)
# print len(clusters)
# top-terms for each cluster
    for x in sorted(clusters):
        print x, clusters[x]
        for t in triples:
            if t.label == x:
print t.rel
print
print
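
# The script builds Triple(e1, e1_type, rel, e2, e2_type) objects and later
# attaches .vector and .label attributes to them, but the class itself is not
# part of this snippet. A minimal stand-in consistent with that usage might be:
class Triple(object):
    def __init__(self, e1, e1_type, rel, e2, e2_type):
        self.e1 = e1
        self.e1_type = e1_type
        self.rel = rel
        self.e2 = e2
        self.e2_type = e2_type
        self.vector = None   # filled in by compute_embeddings_vectors()
        self.label = None    # DBSCAN cluster id, assigned after clustering
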
def detect_match_chunks(self, max_error=.06):
percent = cv2.imread("assets/pct.png")
corr_series = []
for (time, scene) in self.sample_frames(interval=self.polling_interval):
cv2.imwrite("scene.png", scene)
scene = cv2.imread("scene.png")
scaled_percent = cv2.resize(
percent, (0, 0), fx=self.scale, fy=self.scale)
scaled_percent = cv2.Canny(scaled_percent, 50, 200)
percent_corrs = []
for port_number, roi in enumerate(self.ports):
if roi is not None:
scene_roi = scene[roi.top:(roi.top + roi.height), roi.left:(roi.left + roi.width)]
scene_roi = cv2.Canny(scene_roi, 50, 200)
corr_map = cv2.matchTemplate(scene_roi, scaled_percent, cv2.TM_CCOEFF_NORMED)
_, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
percent_corrs.append(max_corr)
point = [time, max(percent_corrs)]
corr_series.append(point)
corr_series = np.array(corr_series)
medians = pd.rolling_median(corr_series[:, 1], self.min_gap //
self.polling_interval, center=True)[2:-2]
clusters = DBSCAN(eps=0.03, min_samples=10).fit(medians.reshape(-1, 1))
dataframe = list(zip(corr_series[:, 0][2:-2], medians, clusters.labels_))
labels = list(set(x[2] for x in dataframe))
cluster_means = [sum(cluster) / len(cluster) for cluster in [[x[1] for x in dataframe if x[2] == label] for label in labels]]
cluster_means = list(zip(labels, cluster_means))
game_label = max(cluster_means, key=lambda x: x[1])[0]
game_groups = [(k, list(v)) for k, v in groupby(dataframe, lambda pt: pt[2])]
games = [[v[0][0], v[-1][0]] for k, v in game_groups if k == game_label]
return games
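
# pd.rolling_median() above exists only in older pandas releases (it was
# deprecated in 0.18 and removed in later versions). On current pandas the
# same centered rolling median can be computed as sketched below; the [2:-2]
# trimming afterwards is unchanged.
import numpy as np
import pandas as pd

def rolling_median_compat(values, window):
    """Centered rolling median, equivalent to the old pd.rolling_median(center=True)."""
    return pd.Series(np.asarray(values)).rolling(window, center=True).median().values
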
def __detect_match_chunks(self, max_error=.04):
percent = cv2.imread("assets/pct.png")
corr_series = []
for (time, scene) in spaced_frames(self, interval=self.polling_interval):
cv2.imwrite("scene.png", scene)
scene = cv2.imread("scene.png")
scaled_percent = cv2.resize(
percent, (0, 0), fx=self.scale, fy=self.scale)
scaled_percent = cv2.Canny(scaled_percent, 50, 200)
percent_corrs = []
for port_number, roi in enumerate(self.ports):
if roi is not None:
scene_roi = scene[roi.top:roi.bottom, roi.left:roi.right]
scene_roi = cv2.Canny(scene_roi, 50, 200)
corr_map = cv2.matchTemplate(
scene_roi, scaled_percent, cv2.TM_CCOEFF_NORMED)
_, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
percent_corrs.append(max_corr)
point = [time, max(percent_corrs)]
corr_series.append(point)
corr_series = np.array(corr_series)
def moving_average(series, n=5):
return np.convolve(series, np.ones((n,)) / n, mode='valid')
medians = rolling_median(corr_series[:, 1], self.min_gap // self.polling_interval, center=True)[2:-2]
    clusters = DBSCAN(eps=0.05, min_samples=10).fit(medians.reshape(-1, 1))
    labels = clusters.labels_
    # Pair each (trimmed) sample time with its cluster label.
    times = [time + (self.min_gap / 2) for time, corr in corr_series][2:-2]
    points = list(zip(times, labels))
    # Keep the cluster whose median correlation is highest; as in
    # detect_match_chunks above, that cluster corresponds to actual gameplay.
    cluster_means = {label: np.mean(medians[labels == label]) for label in set(labels)}
    game_label = max(cluster_means, key=cluster_means.get)
    groups = [(k, list(v)) for k, v in groupby(points, lambda pt: pt[1] == game_label)]
    games = [[v[0][0], v[-1][0]] for k, v in groups if k]
    return games
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
method='ap'):
"""Define clusters given the similarity matrix and the threshold."""
n, labels = connected_components(similarity_matrix, directed=False)
prev_max_clust = 0
print("connected components: %d" % n)
clusters = labels.copy()
if method == 'dbscan':
ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
if method == 'ap':
ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
preference='median')
for i in range(n):
idxs = np.where(labels == i)[0]
if idxs.shape[0] > 1:
sm = similarity_matrix[idxs][:, idxs]
sm += sm.T + scipy.sparse.eye(sm.shape[0])
# Hierarchical clustering
if method == 'hc':
dists = squareform(1 - sm.toarray())
links = fastcluster.linkage(dists, method='ward')
try:
clusters_ = fcluster(links, threshold, 'distance')
except ValueError as err:
logging.critical(err)
clusters_ = np.zeros(1, dtype=int)
# DBSCAN
elif method == 'dbscan':
db = ap.fit(1. - sm.toarray())
# Number of clusters in labels, ignoring noise if present.
clusters_ = db.labels_
# n_clusters_ = len(set(clusters_)) - int(0 in clusters_)
# AffinityPropagation
# ap = AffinityPropagation(affinity='precomputed')
elif method == 'ap':
db = ap.fit(sm)
clusters_ = db.labels_
else:
raise ValueError("clustering method %s unknown" % method)
if np.min(clusters_) == 0:
clusters_ += 1
clusters_ += prev_max_clust
clusters[idxs] = clusters_
prev_max_clust = max(clusters_)
else: # connected component contains just 1 element
prev_max_clust += 1
clusters[idxs] = prev_max_clust
return np.array(extra.flatten(clusters))