def _step5(arr):
    # load the trained k-means model
    kmeans = pickle.loads(open("kmeans.model", "rb").read())
    key, lines, tipe = arr
    print(key)
    # dump the batch of lines to a temp file and run fastText to get sentence vectors
    open("./tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key), "w").write("\n".join(lines))
    res = os.popen("./fasttext print-sentence-vectors ./models/model.bin < tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key)).read()
    w = open("tmp/tmp.{tipe}.{key}.json".format(tipe=tipe, key=key), "w")
    for line in res.split("\n"):
        # the last 100 fields are the sentence vector, the rest is the tokenized text
        try:
            vec = list(map(float, line.split()[-100:]))
        except:
            print(line)
            print(res)
            continue
        x = np.array(vec)
        if np.isnan(x).any():
            continue
        cluster = kmeans.predict([vec])
        txt = line.split()[:-100]
        obj = {"txt": txt, "cluster": cluster.tolist()}
        data = json.dumps(obj, ensure_ascii=False)
        w.write(data + "\n")
def step6():
    for tipe in ["news", "nocturne"]:
        names = [name for name in reversed(sorted(glob.glob("./tmp/tmp.{tipe}.*.json".format(tipe=tipe))))]
        size = len(names)
        for en, name in enumerate(names):
            term_clus = {}
            oss = []
            with open(name) as f:
                for line in f:
                    line = line.strip()
                    oss.append(json.loads(line))
            # for each term, count the cluster ids of the 3 sentences before and after it
            for i in range(3, len(oss) - 3):
                terms = set(oss[i]["txt"])
                for term in terms:
                    if term_clus.get(term) is None:
                        term_clus[term] = [0.0] * 128
                    cd = [oss[i + d]["cluster"][0] for d in [-3, -2, -1, 1, 2, 3]]
                    for c in cd:
                        term_clus[term][c] += 1.0
            print("{}/{} finished {}".format(en, size, name))
        open("{tipe}.term_clus.pkl".format(tipe=tipe), "wb").write(pickle.dumps(term_clus))
def do_kmeans(data, k):
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(data)
    means = km.cluster_centers_.reshape((-1,))
    # initialize standard deviations with distances between random cluster centers
    sds = []
    for i in range(means.shape[0]):
        # choose any 2 means and take half the distance between them
        x, y = np.random.choice(means, 2, replace=False)
        sds.append((x - y) / 2)
    sds = np.abs(np.array(sds))
    return (means, sds)
# expectation maximization for gmm
# use_kmeans: whether to initialize using kmeans or randomly
# use_priors: whether to model the prior distribution;
# this attaches a weight to each distribution that tells us
# the percentage of points generated from that distribution
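# NOTE: the EM routine described by the comments above is not part of this snippet, so
# what follows is only a hedged, minimal 1-D sketch of what such a function could look
# like. The name `em_gmm` and every implementation detail below are assumptions, not the
# original code; it merely reuses `do_kmeans` above for the use_kmeans initialization path.
import numpy as np
import scipy.stats

def em_gmm(data, k, n_iter=100, use_kmeans=True, use_priors=True):
    data = np.asarray(data, dtype=float).reshape(-1)
    if use_kmeans:
        # initialize means/sds from k-means (see do_kmeans above)
        means, sds = do_kmeans(data.reshape(-1, 1), k)
    else:
        # random initialization
        means = np.random.choice(data, k, replace=False)
        sds = np.full(k, data.std())
    priors = np.full(k, 1.0 / k)
    for _ in range(n_iter):
        # E-step: responsibility of each component for each point
        dens = np.stack([scipy.stats.norm.pdf(data, means[j], sds[j]) for j in range(k)], axis=1)
        if use_priors:
            dens = dens * priors
        resp = dens / dens.sum(axis=1, keepdims=True)
        # M-step: re-estimate means, standard deviations and (optionally) priors
        nk = resp.sum(axis=0)
        means = (resp * data[:, None]).sum(axis=0) / nk
        sds = np.sqrt((resp * (data[:, None] - means) ** 2).sum(axis=0) / nk)
        if use_priors:
            priors = nk / len(data)
    return means, sds, priors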
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    computes the macro F1 score
    confusion matrix : requires permutation
    matching according to which matrix must be permuted
    """
    ##Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = confusion_matrix[:, matched_cluster]
    ##Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score
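# Hypothetical usage sketch for computeF1_macro above (this snippet assumes Python 2 and
# that `stats` is scipy.stats); the confusion-matrix values are made up for illustration.
# Rows are true segments, columns are predicted clusters; matching=[0, 1] means no permutation.
example_cm = np.array([[8., 2.],
                       [1., 9.]])
example_macro_f1 = computeF1_macro(example_cm, [0, 1], 2)   # roughly 0.85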
def process_options(args):
    options = argparser().parse_args(args)
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()
    words, vectors = wv.words(), wv.vectors()
    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)
    return words, vectors, options
def split_into_intervals(data, n):
    """
    Split time series into n minute intervals
    """
    # Throw away time, bid/ask numbers
    prices = [x[1] for x in data]
    # create a len n-1 array of price differences (10 second increments)
    price_diffs = np.diff(prices)
    # m = interval length in terms of data points (6*~10sec = 1 minute)
    m = n * 6
    # each datapoint we're trying to cluster will be of the form:
    # (xi,yi) = (time series of prices, price change after series)
    intervals = np.zeros((len(prices) - 1, m + 1))
    for i in range(0, len(prices) - m - 1):
        intervals[i, 0:m] = prices[i:i + m]
        intervals[i, m] = price_diffs[i + m]
    return intervals
def plot_data(*data):
    '''
    graph the dataset
    :param data: data, target
    :return: None
    '''
    X, labels_true = data
    labels = np.unique(labels_true)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    colors = 'rgbyckm'
    for i, label in enumerate(labels):
        position = labels_true == label
        ax.scatter(X[position, 0], X[position, 1], label="cluster {0}".format(label),
                   color=colors[i % len(colors)])
    ax.legend(loc="best", framealpha=0.5)
    ax.set_xlabel("X[0]")
    ax.set_ylabel("X[1]")
    ax.set_title("data")
    plt.show()
def spectral_clustering(messages, dist_func=combined, num_clusters=3):
    '''
    takes a list of conversation messages and returns `num_clusters` threads.
    '''
    m = len(messages)
    affinity = np.zeros((m, m))
    # extract message features.
    for (mi, message) in enumerate(messages):
        if type(message) != dict:
            message = {
                'text': message
            }
        if 'feat' not in message:  # extract on the fly.
            message['feat'] = extract_all(parse_body(message['text']))
        messages[mi] = message  # write back.
    # build affinity matrix.
    for mi in range(m):
        for mj in range(m):
            affinity[mi, mj] = np.exp(-1.0 * keywords_l0(
                messages[mi]['feat'],
                messages[mj]['feat']
            ))
    # run clustering.
    print affinity
    labels = sklearn.cluster.spectral_clustering(affinity, n_clusters=num_clusters, eigen_solver='arpack')
    return labels
def adhoc_clustering(messages, dist_func=combined):
    ''' an adhoc method for clustering messages '''
    m = len(messages)
    # extract message features.
    for (mi, message) in enumerate(messages):
        if type(message) != dict:
            message = {
                'text': message
            }
        message.update(extract_all(parse_body(message['text'])))
    # run clustering (ad hoc).
    max_label = 0
    bias = 600
    labels = []
    for (mi, message) in enumerate(messages):
        min_mj = -1
        min_dist = float('inf')
        for mj in range(mi - 1, -1, -1):
            dist = dist_func(messages[mi], messages[mj])
            if dist < min_dist:
                min_dist = dist
                min_mj = mj
        if (bias - 100 * worth(messages[mi])) < min_dist:  # create new cluster.
            labels.append(max_label)
            max_label += 1
        else:  # assign to an old cluster.
            labels.append(labels[min_mj])
    return labels
def updateClusters(LLE_node_vals, switch_penalty=1):
    """
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path
    Note the LLE's are negative of the true LLE's actually!!!!!
    Note: switch penalty > 0
    """
    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)
    ##compute future costs
    for i in xrange(T - 2, -1, -1):
        j = i + 1
        indicator = np.zeros(num_clusters)
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        for cluster in xrange(num_clusters):
            total_vals = future_costs + lle_vals + switch_penalty
            total_vals[cluster] -= switch_penalty
            future_cost_vals[i, cluster] = np.min(total_vals)
    ##compute the best path
    path = np.zeros(T)
    ##the first location
    curr_location = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])
    path[0] = curr_location
    DP_start2 = time.time()
    ##compute the path
    for i in xrange(T - 1):
        j = i + 1
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        total_vals = future_costs + lle_vals + switch_penalty
        total_vals[int(path[i])] -= switch_penalty
        path[i + 1] = np.argmin(total_vals)
    ##return the computed path
    return path
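# Hypothetical usage sketch for updateClusters above (Python 2 / xrange assumed);
# the LLE values are made up. Lower LLE_node_vals means a better fit for that cluster.
example_lle = np.array([[0.1, 5.0],
                        [4.0, 0.2],
                        [0.3, 6.0]])
# With switch_penalty=1 the middle step keeps cluster 1, because its fit advantage
# outweighs the two switch penalties, giving the path [0, 1, 0].
example_path = updateClusters(example_lle, switch_penalty=1)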
def computeF1Score_delete(num_cluster, matching_algo, actual_clusters, threshold_algo, save_matrix=False):
    """
    computes the F1 scores and returns a list of values
    """
    F1_score = np.zeros(num_cluster)
    for cluster in xrange(num_cluster):
        matched_cluster = matching_algo[cluster]
        true_matrix = actual_clusters[cluster]
        estimated_matrix = threshold_algo[matched_cluster]
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for i in xrange(num_stacked * n):
            for j in xrange(num_stacked * n):
                if estimated_matrix[i, j] == 1 and true_matrix[i, j] != 0:
                    TP += 1.0
                elif estimated_matrix[i, j] == 0 and true_matrix[i, j] == 0:
                    TN += 1.0
                elif estimated_matrix[i, j] == 1 and true_matrix[i, j] == 0:
                    FP += 1.0
                else:
                    FN += 1.0
        precision = (TP) / (TP + FP)
        print "cluster #", cluster
        print "TP,TN,FP,FN---------->", (TP, TN, FP, FN)
        recall = TP / (TP + FN)
        f1 = (2 * precision * recall) / (precision + recall)
        F1_score[cluster] = f1
    return F1_score
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo):
    """
    computes a confusion matrix and returns it
    """
    seg_len = 50
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point in xrange(len(clustered_points_algo)):
        cluster = clustered_points_algo[point]
        #CASE E : ABCABC
        num = (int(sorted_indices_algo[point] / seg_len) % num_clusters)
        true_confusion_matrix[num, cluster] += 1
    return true_confusion_matrix
############
##The basic folder to be created
def computeNetworkAccuracy(matching, train_cluster_inverse, num_clusters):
    """
    Takes in the matching for the clusters
    takes the computed clusters
    computes the average F1 score over the network
    """
    threshold = 1e-2
    f1 = 0
    for cluster in xrange(num_clusters):
        true_cluster_cov = np.loadtxt("Inverse Covariance cluster =" + str(cluster) + ".csv", delimiter=",")
        matched_cluster = matching[cluster]
        matched_cluster_cov = train_cluster_inverse[matched_cluster]
        (nrow, ncol) = true_cluster_cov.shape
        out_true = np.zeros([nrow, ncol], dtype=int)
        for i in xrange(nrow):
            for j in xrange(ncol):
                if np.abs(true_cluster_cov[i, j]) > threshold:
                    out_true[i, j] = 1
        out_matched = np.zeros([nrow, ncol], dtype=int)
        for i in xrange(nrow):
            for j in xrange(ncol):
                if np.abs(matched_cluster_cov[i, j]) > threshold:
                    out_matched[i, j] = 1
        np.savetxt("Network_true_cluster=" + str(cluster) + ".csv", true_cluster_cov, delimiter=",")
        np.savetxt("Network_matched_cluster=" + str(matched_cluster) + ".csv", matched_cluster_cov, delimiter=",")
        ##compute the confusion matrix
        confusion_matrix = np.zeros([2, 2])
        for i in xrange(nrow):
            for j in xrange(ncol):
                confusion_matrix[out_true[i, j], out_matched[i, j]] += 1
        f1 += computeF1_macro(confusion_matrix, [0, 1], 2)
    return f1 / num_clusters
############
def computeF1Score_delete(num_cluster, matching_algo, actual_clusters, threshold_algo, save_matrix=False):
    """
    computes the F1 scores and returns a list of values
    """
    F1_score = np.zeros(num_cluster)
    for cluster in xrange(num_cluster):
        matched_cluster = matching_algo[cluster]
        true_matrix = actual_clusters[cluster]
        estimated_matrix = threshold_algo[matched_cluster]
        if save_matrix:
            np.savetxt("estimated_matrix_cluster=" + str(cluster) + ".csv", estimated_matrix, delimiter=",", fmt="%1.4f")
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for i in xrange(num_stacked * n):
            for j in xrange(num_stacked * n):
                if estimated_matrix[i, j] == 1 and true_matrix[i, j] != 0:
                    TP += 1.0
                elif estimated_matrix[i, j] == 0 and true_matrix[i, j] == 0:
                    TN += 1.0
                elif estimated_matrix[i, j] == 1 and true_matrix[i, j] == 0:
                    FP += 1.0
                else:
                    FN += 1.0
        precision = (TP) / (TP + FP)
        recall = TP / (TP + FN)
        f1 = (2 * precision * recall) / (precision + recall)
        F1_score[cluster] = f1
    return F1_score
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""
    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'
    if out is None:
        out = sys.stdout
    for word, cid in izip(words, cluster_ids):
        print >> out, '%s\t%d' % (word, cid)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        words, vectors, options = process_options(argv[1:])
    except Exception, e:
        if str(e):
            print >> sys.stderr, 'Error: %s' % str(e)
            return 1
        else:
            raise
    dbscan = sklearn.cluster.DBSCAN(eps=options.eps, metric=options.metric)
    dbscan.fit(numpy.array(vectors))
    noisy = sum(1 for l in dbscan.labels_ if l == -1)
    unique = len(set(dbscan.labels_))
    logging.info('%d clusters, %d noisy, %d vectors' % (unique, noisy,
                                                        len(vectors)))
    if noisy >= len(vectors) / 4:
        logging.warning('%d/%d noisy (-1) labels (try higher eps?)' %
                        (noisy, len(vectors)))
    elif unique < (len(vectors) / 2) ** 0.5:
        logging.warning('only %d clusters (try lower eps?)' % unique)
    write_cluster_ids(words, dbscan.labels_)
    return 0
def process_options(args):
    options = argparser().parse_args(args)
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')
    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS
    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
    if options.k is None:
        options.k = int(math.ceil((len(wv.words()) / 2) ** 0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))
    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()
    words, vectors = wv.words(), wv.vectors()
    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)
    return words, vectors, options
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_
def create_data(centers, num=100, std=0.7):
    '''
    generate data
    :param centers: cluster centers (array of coordinates, or the number of centers)
    :param num: number of samples
    :param std: std of each cluster
    :return: data, target
    '''
    X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std)
    return X, labels_true
def runClustering(ssearch, eps, min_samples):
    """
    Run DBSCAN with the determined eps and MinPts values.
    """
    print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))
    # Initialize DBSCAN with parameters.
    # I forgot to use cosine at first!
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')
    # Time this step.
    t0 = time.time()
    # Cluster the LSI vectors.
    db.fit(ssearch.index.index)
    # Calculate the elapsed time (in seconds)
    elapsed = (time.time() - t0)
    print("  done in %.3fsec" % elapsed)
    # Get the set of unique IDs.
    cluster_ids = set(db.labels_)
    # Show the number of clusters (don't include noise label)
    print('Number of clusters (excluding "noise"): %d' % (len(cluster_ids) - 1))
    # For each of the clusters...
    for cluster_id in cluster_ids:
        # Get the list of all doc IDs belonging to this cluster.
        cluster_doc_ids = []
        for doc_id in range(0, len(db.labels_)):
            if db.labels_[doc_id] == cluster_id:
                cluster_doc_ids.append(doc_id)
        # Get the top words in this cluster
        top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)
        print('  Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
def updateClusters(LLE_node_vals, switch_penalty=1):
    """
    Uses the Viterbi path dynamic programming algorithm
    to compute the optimal cluster assignments
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path
    Note the LLE's are negative of the true LLE's actually!!!!!
    Note: switch penalty > 0
    """
    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)
    ##compute future costs
    for i in xrange(T - 2, -1, -1):
        j = i + 1
        indicator = np.zeros(num_clusters)
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        for cluster in xrange(num_clusters):
            total_vals = future_costs + lle_vals + switch_penalty
            total_vals[cluster] -= switch_penalty
            future_cost_vals[i, cluster] = np.min(total_vals)
    ##compute the best path
    path = np.zeros(T)
    ##the first location
    curr_location = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])
    path[0] = curr_location
    ##compute the path
    for i in xrange(T - 1):
        j = i + 1
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        total_vals = future_costs + lle_vals + switch_penalty
        total_vals[int(path[i])] -= switch_penalty
        path[i + 1] = np.argmin(total_vals)
    ##return the computed path
    return path
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo):
    """
    computes a confusion matrix and returns it
    """
    seg_len = 200
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point in xrange(len(clustered_points_algo)):
        cluster = int(clustered_points_algo[point])

        ##CASE G: ABBACCCA
        # num = (int(sorted_indices_algo[point]/seg_len))
        # if num in [0,3,7]:
        #     true_confusion_matrix[0,cluster] += 1
        # elif num in [1,2]:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[2,cluster] += 1

        ##CASE F: ABCBA
        # num = (int(sorted_indices_algo[point]/seg_len))
        # num = min(num, 4-num)
        # true_confusion_matrix[num,cluster] += 1

        #CASE E : ABCABC
        num = (int(sorted_indices_algo[point] / seg_len) % num_clusters)
        true_confusion_matrix[num, cluster] += 1

        ##CASE D : ABABABAB
        # num = (int(sorted_indices_algo[point]/seg_len) %2)
        # true_confusion_matrix[num,cluster] += 1

        ##CASE C:
        # num = (sorted_indices_algo[point]/seg_len)
        # if num < 15:
        #     true_confusion_matrix[0,cluster] += 1
        # elif num < 20:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[0,cluster] += 1

        ##CASE B :
        # if num > 4:
        #     num = 9 - num
        # true_confusion_matrix[num,cluster] += 1

        ##CASE A : ABA
        # if sorted_indices_algo[point] < seg_len:
        #     true_confusion_matrix[0,cluster] += 1
        # elif sorted_indices_algo[point] < 3*seg_len:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[0,cluster] += 1
    return true_confusion_matrix
def cluster(data):
    """
    Use k-means clustering on training data to find profitable patterns
    we can exploit
    """
    num_clusters = 100
    num_selected_clusters = 20
    # Split into 30, 60, and 120 min time intervals, cluster each
    split = lambda n: split_into_intervals(data, n)
    kmeans30 = sklearn.cluster.k_means(split(30), num_clusters)
    kmeans60 = sklearn.cluster.k_means(split(60), num_clusters)
    kmeans120 = sklearn.cluster.k_means(split(120), num_clusters)
    # Sort the clusters by performance (the last centroid column is the follow-up price move)
    hp30, hp60, hp120 = [], [], []
    for i in range(0, num_clusters):
        hp30.append((i, kmeans30[0][i, -1]))
        hp60.append((i, kmeans60[0][i, -1]))
        hp120.append((i, kmeans120[0][i, -1]))
    hp30 = sorted(hp30, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    hp60 = sorted(hp60, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    hp120 = sorted(hp120, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    # Select the highest performing clusters
    top30 = np.zeros((num_selected_clusters, 181))
    top60 = np.zeros((num_selected_clusters, 361))
    top120 = np.zeros((num_selected_clusters, 721))
    for i in range(0, num_selected_clusters):
        top30[i, 0:181] = kmeans30[0][hp30[i][0], 0:181]
        top60[i, 0:361] = kmeans60[0][hp60[i][0], 0:361]
        top120[i, 0:721] = kmeans120[0][hp120[i][0], 0:721]
    # Then normalize the clusters so we can use the faster similarity function
    # from S&Z to compare instead of L2 norm
    # (each row is reshaped to a column for current scikit-learn, then flattened back)
    scaler = sklearn.preprocessing.StandardScaler()
    for i in range(0, num_selected_clusters):
        top30[i, 0:180] = scaler.fit_transform(top30[i, 0:180].reshape(-1, 1)).ravel()
        top60[i, 0:360] = scaler.fit_transform(top60[i, 0:360].reshape(-1, 1)).ravel()
        top120[i, 0:720] = scaler.fit_transform(top120[i, 0:720].reshape(-1, 1)).ravel()
    return [top30, top60, top120]
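# Hypothetical end-to-end sketch for split_into_intervals and cluster above.
# The synthetic ~10-second (timestamp, price) feed below is an assumption, purely for illustration.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    n_ticks = 5000                                        # roughly 14 hours of 10-second ticks
    prices = 100 + np.cumsum(rng.randn(n_ticks) * 0.05)
    ticks = [(i * 10, p) for i, p in enumerate(prices)]   # (seconds, price) pairs
    top30, top60, top120 = cluster(ticks)
    # Each row holds a normalized price pattern; the untouched last column is the
    # historical price move that followed that pattern.
    print(top30.shape, top60.shape, top120.shape)         # (20, 181) (20, 361) (20, 721)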