def clusterMalwareNames(malwareNames):
# strictly lexical clustering over malware-names
wordCount = {}
# create a distance matrix
matrix = np.zeros((len(malwareNames), len(malwareNames)))
for i in range(len(malwareNames)):
for j in range(len(malwareNames)):
if matrix[i, j] == 0.0:
matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
matrix[j, i] = matrix[i, j]
# Scikit-Learn's DBSCAN implementation to cluster the malware-names
clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
clust.fit(matrix)
preds = clust.labels_
clabels = np.unique(preds)
# create Word-Count Map
for i in range(clabels.shape[0]):
if clabels[i] < 0:
continue
cmem_ids = np.where(preds == clabels[i])[0]
cmembers = []
for cmem_id in cmem_ids:
cmembers.append(malwareNames[cmem_id])
wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
return wordCount
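
# The snippet above calls two helpers that are not shown: computeSimilarity()
# and uniqueList(). The sketch below is an assumption about what they might
# look like, not the original implementations: computeSimilarity() is taken to
# return a normalized lexical *distance* in [0, 1] (DBSCAN with
# metric="precomputed" expects distances, not similarities), and uniqueList()
# de-duplicates while preserving order.
import difflib

def computeSimilarity(name_a, name_b):
    # Assumed helper: 1 - SequenceMatcher ratio gives a distance in [0, 1].
    return 1.0 - difflib.SequenceMatcher(None, name_a.lower(), name_b.lower()).ratio()

def uniqueList(items):
    # Assumed helper: drop duplicates while keeping first-seen order.
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]
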
Example source code for Python's DBSCAN class
def test_DBSCAN(*data):
'''
test the DBSCAN method
:param data: train, target
:return: None
'''
X,labels_true=data
clst=cluster.DBSCAN()
predicted_labels=clst.fit_predict(X)
print("ARI:%s"% adjusted_rand_score(labels_true,predicted_labels))
print("Core sample num:{0}".format(len(clst.core_sample_indices_)))
def test_DBSCAN_epsilon(*data):
'''
test the score with different eps
:param data: train, target
:return: None
'''
X,labels_true=data
epsilons=np.logspace(-1,1.5)
ARIs=[]
Core_nums=[]
for epsilon in epsilons:
clst=cluster.DBSCAN(eps=epsilon)
predicted_labels=clst.fit_predict(X)
ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
Core_nums.append(len(clst.core_sample_indices_))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,2,1)
ax.plot(epsilons,ARIs,marker='+')
ax.set_xscale('log')
ax.set_xlabel(r"$\epsilon$")
ax.set_ylim(0,1)
ax.set_ylabel('ARI')
ax=fig.add_subplot(1,2,2)
ax.plot(epsilons,Core_nums,marker='o')
ax.set_xscale('log')
ax.set_xlabel(r"$\epsilon$")
ax.set_ylabel('Core_Nums')
fig.suptitle("DBSCAN")
plt.show()
def test_DBSCAN_min_samples(*data):
'''
    test the score with different min_samples values
:param data: train, target
:return: None
'''
X,labels_true=data
min_samples=range(1,100)
ARIs=[]
Core_nums=[]
for num in min_samples:
clst=cluster.DBSCAN(min_samples=num)
predicted_labels=clst.fit_predict(X)
ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
Core_nums.append(len(clst.core_sample_indices_))
## graph
fig=plt.figure()
ax=fig.add_subplot(1,2,1)
ax.plot(min_samples,ARIs,marker='+')
ax.set_xlabel( "min_samples")
ax.set_ylim(0,1)
ax.set_ylabel('ARI')
ax=fig.add_subplot(1,2,2)
ax.plot(min_samples,Core_nums,marker='o')
ax.set_xlabel( "min_samples")
ax.set_ylabel('Core_Nums')
fig.suptitle("DBSCAN")
plt.show()
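
# The three test_* functions above unpack their argument as (X, labels_true)
# and assume numpy, matplotlib, sklearn.cluster and adjusted_rand_score are
# already imported. A minimal, self-contained driver might look like this;
# the make_blobs() parameters are illustrative assumptions, not taken from
# the original project.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

if __name__ == '__main__':
    # Three well-separated Gaussian blobs give DBSCAN something easy to find.
    X, labels_true = make_blobs(n_samples=1000, centers=3, cluster_std=0.5,
                                random_state=0)
    test_DBSCAN(X, labels_true)              # single run with default parameters
    test_DBSCAN_epsilon(X, labels_true)      # sweep eps on a log scale
    test_DBSCAN_min_samples(X, labels_true)  # sweep min_samples from 1 to 99
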
def runClustering(ssearch, eps, min_samples):
"""
Run DBSCAN with the determined eps and MinPts values.
"""
print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))
# Initialize DBSCAN with parameters.
# I forgot to use cosine at first!
db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')
# Time this step.
t0 = time.time()
# Cluster the LSI vectors.
db.fit(ssearch.index.index)
# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)
# Get the set of unique IDs.
cluster_ids = set(db.labels_)
# Show the number of clusters (don't include noise label)
print('Number of clusters (excluding "noise"): %d' % (len(cluster_ids) - 1))
# For each of the clusters...
for cluster_id in cluster_ids:
# Get the list of all doc IDs belonging to this cluster.
cluster_doc_ids = []
for doc_id in range(0, len(db.labels_)):
if db.labels_[doc_id] == cluster_id:
cluster_doc_ids.append(doc_id)
# Get the top words in this cluster
top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)
print(' Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
def main():
"""
Entry point for the script.
"""
###########################################################################
# Load the corpus
###########################################################################
# Load the pre-built corpus.
print('Loading the saved SimSearch and corpus...')
(ksearch, ssearch) = SimSearch.load(save_dir='./mhc_corpus/')
    print(' %d documents.' % len(ssearch.index.index))
# Step 1: Run a technique to find a good 'eps' value.
#findEps(ssearch)
#eps = 0.5
eps = 0.44
# Step 2: Run a technique to find a good 'MinPts' value.
# TODO - This took ~17 min. on my desktop!
#findMinPts(ssearch, eps)
#min_samples = 8
min_samples = 4
# Step 3: Run DBSCAN
runClustering(ssearch, eps, min_samples)
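
# findEps() and findMinPts() are referenced above but not included in this
# snippet. A common heuristic for choosing eps is the k-distance plot: sort
# every point's distance to its k-th nearest neighbour and read eps off the
# "knee" of the curve. The function below is an assumed implementation of that
# idea for the LSI vectors in ssearch.index.index, not the original code.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

def findEps(ssearch, k=8, metric='cosine'):
    """Plot sorted k-th nearest-neighbour distances to help pick eps."""
    X = np.asarray(ssearch.index.index)
    nn = NearestNeighbors(n_neighbors=k + 1, metric=metric).fit(X)
    dists, _ = nn.kneighbors(X)
    # Column 0 is each point's distance to itself, so take column k.
    kth = np.sort(dists[:, k])
    plt.plot(kth)
    plt.xlabel('points sorted by %d-NN distance' % k)
    plt.ylabel('%s distance' % metric)
    plt.title('k-distance plot: pick eps near the knee')
    plt.show()
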
def dbscan(userid,X):
db = DBSCAN(eps=0.15,min_samples=4).fit(X)
# print db.labels_ zeros_like
core_samples_mask = np.zeros_like(db.labels_,dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    labels_list = list(labels)
    # print labels_list.count(-1)
    out_user.setdefault(userid, 0)
    out_user[userid] = labels_list.count(-1)
    print out_user
    # print labels_list.index(-1)
    print labels
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    unique_labels = set(labels)
    cols = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    # center_points = []
    for k, col in zip(unique_labels, cols):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
k_x = X[class_member_mask & core_samples_mask]
plt.plot(k_x[:,0],k_x[:,1],'o',markerfacecolor = col,
markeredgecolor = 'k' , markersize = 5)
center_points.append([np.mean(k_x[:,1]),np.mean(k_x[:,0])])
plt.title('DBSCAN :Estimated number of clusters: %d' % n_clusters_)
# plt.show()
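
# dbscan() above mutates out_user and center_points without defining them, so
# they are presumably module-level globals in the original script. A minimal
# assumed setup would be:
out_user = {}       # userid -> number of points DBSCAN labelled as noise (-1)
center_points = []  # per-cluster [mean_y, mean_x] pairs accumulated across calls
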
def __init__(self, filterer=PCA(n_components=2),
coverer=HyperRectangleCoverer(),
clusterer=DBSCAN(),
params=None):
self.filterer = filterer
self.coverer = coverer
self.clusterer = clusterer
if params is not None:
self.set_params(**params)
def set_random_state(estimator, random_state=0):
"""Set random state of an estimator if it has the `random_state` param.
    Classes for which random_state is deprecated are ignored. Currently DBSCAN
is one such class.
"""
if isinstance(estimator, DBSCAN):
return
if "random_state" in estimator.get_params():
estimator.set_params(random_state=random_state)
def train(self, data):
"""
:type data: pyspark.RDD
:param data: (key, k-dim vector like)
Train the model using a (key, vector) RDD
"""
parts = KDPartitioner(data, self.max_partitions)
self.data = data
self.bounding_boxes = parts.bounding_boxes
self.expanded_boxes = {}
self._create_neighborhoods()
# repartition data set on the partition label
self.data = self.data.map(lambda ((k, p), v): (p, (k, v))) \
.partitionBy(len(parts.partitions)) \
.map(lambda (p, (k, v)): ((k, p), v))
# create parameters for sklearn DBSCAN
params = {'eps': self.eps, 'min_samples': self.min_samples,
'metric': self.metric}
# perform dbscan on each part
self.data = self.data.mapPartitions(
lambda iterable: dbscan_partition(iterable, params))
self.data.cache()
self._remap_cluster_ids()
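
# train() depends on KDPartitioner and dbscan_partition() from elsewhere in
# the project, and its lambdas use Python 2 tuple-parameter unpacking, so the
# method as written runs only under Python 2. The sketch below is a guess at
# what dbscan_partition() does (run scikit-learn's DBSCAN over the records of
# one Spark partition and yield a label per key); the exact record layout and
# return format are assumptions.
import numpy as np
from sklearn.cluster import DBSCAN

def dbscan_partition(iterable, params):
    """Cluster the ((key, partition_id), vector) records of a single partition."""
    records = list(iterable)
    if not records:
        return
    keys = [k for k, _ in records]                      # (key, partition_id) pairs
    X = np.array([np.asarray(v) for _, v in records])   # stacked feature vectors
    labels = DBSCAN(**params).fit_predict(X)
    for (key, part_id), label in zip(keys, labels):
        # Emit a partition-local cluster id; noise points keep the label -1.
        yield (key, part_id), label
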
def assignments(self):
"""
:rtype: list
:return: list of (key, cluster_id)
Retrieve the results of the DBSCAN
"""
return self.result.collect()
def makeClusterers(X, k=2):
return [('MiniBatchKMeans', makeKMeans(X, k)),
('AffinityPropagation', makeAffinityProp()),
('MeanShift', makeMeanShift(X)),
('SpectralClustering', makeSpectral(X, k)),
('Ward', makeWard(X, k)),
('AgglomerativeAvg', makeAvgLinkage(X, k)),
('AgglomerativeMax', makeMaxLinkage(X, k)),
('AgglomerativeWard', makeWardLinkage(X, k)),
('DBSCAN', makeDBScan())]
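
# makeClusterers() relies on a family of make*() factory helpers defined
# elsewhere in the project. Only the DBSCAN factory matters here; an assumed
# minimal version (the default parameters are a guess) would be:
from sklearn.cluster import DBSCAN

def makeDBScan():
    return DBSCAN(eps=0.5, min_samples=5)
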
def cluster(X, eps=1, min_pts=30, algorithm='DBSCAN', n_clusters=10):
if algorithm == 'DBSCAN':
cluster_result = DBSCAN(eps=eps, min_samples=min_pts).fit(X)
elif algorithm == 'KMeans':
        cluster_result = KMeans(n_clusters=n_clusters).fit(X)
labels = cluster_result.labels_
return labels
def update_location_centroid(point, cluster, max_distance, min_samples):
""" Updates the centroid of a location cluster with another point
Args:
point (:obj:`Point`): Point to add to the cluster
cluster (:obj:`list` of :obj:`Point`): Location cluster
max_distance (float): Max neighbour distance
min_samples (int): Minimum number of samples
Returns:
(:obj:`Point`, :obj:`list` of :obj:`Point`): Tuple with the location centroid
and new point cluster (given cluster + given point)
"""
cluster.append(point)
points = [p.gen2arr() for p in cluster]
# Estimates the epsilon
eps = estimate_meters_to_deg(max_distance, precision=6)
p_cluster = DBSCAN(eps=eps, min_samples=min_samples)
p_cluster.fit(points)
clusters = {}
for i, label in enumerate(p_cluster.labels_):
if label in clusters.keys():
clusters[label].append(points[i])
else:
clusters[label] = [points[i]]
centroids = []
biggest_centroid_l = -float("inf")
biggest_centroid = None
for label, n_cluster in clusters.items():
centroid = compute_centroid(n_cluster)
centroids.append(centroid)
if label >= 0 and len(n_cluster) >= biggest_centroid_l:
biggest_centroid_l = len(n_cluster)
biggest_centroid = centroid
if biggest_centroid is None:
biggest_centroid = compute_centroid(points)
return biggest_centroid, cluster
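
# update_location_centroid() leans on two helpers that are not shown here. The
# sketches below assume the simplest plausible behaviour: metres are converted
# to degrees with the ~111,320 m-per-degree-of-latitude approximation, and the
# centroid is the arithmetic mean of the cluster's coordinates. The original
# implementations may well differ (e.g. compute_centroid() probably returns a
# Point object rather than a bare array).
import numpy as np

METERS_PER_DEGREE = 111320.0  # rough length of one degree of latitude, in metres

def estimate_meters_to_deg(meters, precision=6):
    """Convert a distance in metres to an approximate angular distance in degrees."""
    return round(meters / METERS_PER_DEGREE, precision)

def compute_centroid(points):
    """Arithmetic mean of a list of coordinate pairs."""
    return np.mean(np.asarray(points), axis=0)
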
def classify_user():
new_df_log_scaled = get_scaled_user()
c = DBSCAN(eps=90,min_samples=50,metric='manhattan').fit(new_df_log_scaled.T)
pd.value_counts(c.labels_)
d = c.labels_
types = pd.DataFrame(d,index=new_df_log_scaled.columns)[0]
types[types == -1] = 2
return types
def detect(self, method, model, data):
'''
:param method: -> method name
:param model: -> trained clusterer
:param data: -> dataframe with data
:return: -> dictionary that contains the list of anomalous timestamps
'''
smodel = self.__loadClusterModel(method, model)
anomalieslist = []
if not smodel:
dpredict = 0
else:
if data.shape[0]:
if isinstance(smodel, IsolationForest):
print "Detected IsolationForest model"
print "Contamination -> %s" % smodel.contamination
print "Max_Features -> %s" % smodel.max_features
print "Max_Samples -> %s" % smodel.max_samples_
print "Threashold -> %s " % smodel.threshold_
try:
dpredict = smodel.predict(data)
print "IsolationForest Prediction Array -> %s" %str(dpredict)
except Exception as inst:
logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
dpredict = 0
elif isinstance(smodel, DBSCAN):
print "Detected DBSCAN model"
print "Leaf_zise -> %s" % smodel.leaf_size
print "Algorithm -> %s" % smodel.algorithm
print "EPS -> %s" % smodel.eps
print "Min_Samples -> %s" % smodel.min_samples
print "N_jobs -> %s" % smodel.n_jobs
try:
dpredict = smodel.fit_predict(data)
except Exception as inst:
logger.error('[%s] : [ERROR] Error while fitting sDBSCAN model to event with %s and %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
inst.args)
dpredict = 0
else:
dpredict = 0
logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape[0]),
str(data.shape[1]))
print "Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]),
str(data.shape[1]))
print "dpredict type is %s" % (type(dpredict))
if type(dpredict) is not int:
anomalyarray = np.argwhere(dpredict == -1)
for an in anomalyarray:
anomalies = {}
anomalies['utc'] = int(data.iloc[an[0]]['key'])
anomalies['hutc'] = ut2hum(int(data.iloc[an[0]]['key']))
anomalieslist.append(anomalies)
anomaliesDict = {}
anomaliesDict['anomalies'] = anomalieslist
logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
return anomaliesDict
cluster-triples.py (project: information-extraction-PT, author: davidsbatista)
def main():
"""
compute_embeddings_vectors()
print "Reading embedding vectors"
with open('triples_vectors.pkl', 'r') as in_file:
triples = pickle.load(in_file)
vectors = []
for t in triples:
vectors.append(t.vector)
"""
text = []
triples = []
with open('triples.csv', 'r') as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for t in reader:
e1, e1_type, rel, e2, e2_type = t[0], t[1], t[2], t[3], t[4]
t = Triple(e1, e1_type, rel, e2, e2_type)
text.append(rel)
triples.append(t)
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(text)
print "Clustering"
dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine', algorithm='brute',
leaf_size=30, p=None, n_jobs=1)
labels = dbscan.fit_predict(tfidf_matrix)
with open('triples_labels.txt', 'w') as out_file:
for l in labels:
out_file.write(str(l) + '\n')
print "Reading cluster labels"
labels = []
with open('triples_labels.txt', 'r') as in_file:
for label in in_file:
labels.append(int(label.strip()))
for i in range(len(triples)):
triples[i].label = labels[i]
clusters = dict()
for t in triples:
try:
clusters[t.label] += 1
except KeyError:
clusters[t.label] = 1
print clusters
exit(-1)
# print len(clusters)
# top-terms for each cluster
    for x in sorted(clusters):
        print x, clusters[x]
        for t in triples:
            if t.label == x:
print t.rel
print
print
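
# The script builds Triple(e1, e1_type, rel, e2, e2_type) objects and later
# attaches .vector and .label attributes to them, but the class itself is not
# part of this snippet. A minimal stand-in consistent with that usage might be:
class Triple(object):
    def __init__(self, e1, e1_type, rel, e2, e2_type):
        self.e1 = e1
        self.e1_type = e1_type
        self.rel = rel
        self.e2 = e2
        self.e2_type = e2_type
        self.vector = None   # filled in by compute_embeddings_vectors()
        self.label = None    # DBSCAN cluster id, assigned after clustering
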
def detect_match_chunks(self, max_error=.06):
percent = cv2.imread("assets/pct.png")
corr_series = []
for (time, scene) in self.sample_frames(interval=self.polling_interval):
cv2.imwrite("scene.png", scene)
scene = cv2.imread("scene.png")
scaled_percent = cv2.resize(
percent, (0, 0), fx=self.scale, fy=self.scale)
scaled_percent = cv2.Canny(scaled_percent, 50, 200)
percent_corrs = []
for port_number, roi in enumerate(self.ports):
if roi is not None:
scene_roi = scene[roi.top:(roi.top + roi.height), roi.left:(roi.left + roi.width)]
scene_roi = cv2.Canny(scene_roi, 50, 200)
corr_map = cv2.matchTemplate(scene_roi, scaled_percent, cv2.TM_CCOEFF_NORMED)
_, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
percent_corrs.append(max_corr)
point = [time, max(percent_corrs)]
corr_series.append(point)
corr_series = np.array(corr_series)
medians = pd.rolling_median(corr_series[:, 1], self.min_gap //
self.polling_interval, center=True)[2:-2]
clusters = DBSCAN(eps=0.03, min_samples=10).fit(medians.reshape(-1, 1))
dataframe = list(zip(corr_series[:, 0][2:-2], medians, clusters.labels_))
labels = list(set(x[2] for x in dataframe))
cluster_means = [sum(cluster) / len(cluster) for cluster in [[x[1] for x in dataframe if x[2] == label] for label in labels]]
cluster_means = list(zip(labels, cluster_means))
game_label = max(cluster_means, key=lambda x: x[1])[0]
game_groups = [(k, list(v)) for k, v in groupby(dataframe, lambda pt: pt[2])]
games = [[v[0][0], v[-1][0]] for k, v in game_groups if k == game_label]
return games
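
# pd.rolling_median() above exists only in older pandas releases (it was
# deprecated in 0.18 and removed in later versions). On current pandas the
# same centered rolling median can be computed as sketched below; the [2:-2]
# trimming afterwards is unchanged.
import numpy as np
import pandas as pd

def rolling_median_compat(values, window):
    """Centered rolling median, equivalent to the old pd.rolling_median(center=True)."""
    return pd.Series(np.asarray(values)).rolling(window, center=True).median().values
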
def __detect_match_chunks(self, max_error=.04):
percent = cv2.imread("assets/pct.png")
corr_series = []
for (time, scene) in spaced_frames(self, interval=self.polling_interval):
cv2.imwrite("scene.png", scene)
scene = cv2.imread("scene.png")
scaled_percent = cv2.resize(
percent, (0, 0), fx=self.scale, fy=self.scale)
scaled_percent = cv2.Canny(scaled_percent, 50, 200)
percent_corrs = []
for port_number, roi in enumerate(self.ports):
if roi is not None:
scene_roi = scene[roi.top:roi.bottom, roi.left:roi.right]
scene_roi = cv2.Canny(scene_roi, 50, 200)
corr_map = cv2.matchTemplate(
scene_roi, scaled_percent, cv2.TM_CCOEFF_NORMED)
_, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
percent_corrs.append(max_corr)
point = [time, max(percent_corrs)]
corr_series.append(point)
corr_series = np.array(corr_series)
def moving_average(series, n=5):
return np.convolve(series, np.ones((n,)) / n, mode='valid')
medians = rolling_median(corr_series[:, 1], self.min_gap // self.polling_interval, center=True)[2:-2]
    clusters = DBSCAN(eps=0.05, min_samples=10).fit(medians.reshape(-1, 1))
    labels = clusters.labels_
    # Pair each (trimmed) sample time with its cluster label.
    times = [time + (self.min_gap / 2) for time, corr in corr_series][2:-2]
    points = list(zip(times, labels))
    # Keep the cluster whose median correlation is highest; as in
    # detect_match_chunks above, that cluster corresponds to actual gameplay.
    cluster_means = {label: np.mean(medians[labels == label]) for label in set(labels)}
    game_label = max(cluster_means, key=cluster_means.get)
    groups = [(k, list(v)) for k, v in groupby(points, lambda pt: pt[1] == game_label)]
    games = [[v[0][0], v[-1][0]] for k, v in groups if k]
    return games
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
method='ap'):
"""Define clusters given the similarity matrix and the threshold."""
n, labels = connected_components(similarity_matrix, directed=False)
prev_max_clust = 0
print("connected components: %d" % n)
clusters = labels.copy()
if method == 'dbscan':
ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
if method == 'ap':
ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
preference='median')
for i in range(n):
idxs = np.where(labels == i)[0]
if idxs.shape[0] > 1:
sm = similarity_matrix[idxs][:, idxs]
sm += sm.T + scipy.sparse.eye(sm.shape[0])
# Hierarchical clustering
if method == 'hc':
dists = squareform(1 - sm.toarray())
links = fastcluster.linkage(dists, method='ward')
try:
clusters_ = fcluster(links, threshold, 'distance')
except ValueError as err:
logging.critical(err)
clusters_ = np.zeros(1, dtype=int)
# DBSCAN
elif method == 'dbscan':
db = ap.fit(1. - sm.toarray())
# Number of clusters in labels, ignoring noise if present.
clusters_ = db.labels_
# n_clusters_ = len(set(clusters_)) - int(0 in clusters_)
# AffinityPropagation
# ap = AffinityPropagation(affinity='precomputed')
elif method == 'ap':
db = ap.fit(sm)
clusters_ = db.labels_
else:
raise ValueError("clustering method %s unknown" % method)
if np.min(clusters_) == 0:
clusters_ += 1
clusters_ += prev_max_clust
clusters[idxs] = clusters_
prev_max_clust = max(clusters_)
else: # connected component contains just 1 element
prev_max_clust += 1
clusters[idxs] = prev_max_clust
return np.array(extra.flatten(clusters))