def assignClasses(self):
    """Assign a cluster label to every train, dev and test sample.

    The ``lat``/``lon`` columns of ``self.df_train`` are clustered with
    ``kdtree.KDTreeClustering`` (bucket size ``self.bucket_size``).  The
    median coordinate of each cluster is stored in ``self.cluster_median``,
    an ``OrderedDict`` keyed by cluster label in sorted order.  Each dev and
    test sample then receives the label of the cluster whose median is
    nearest, using the ``haversine`` callable as the distance metric.

    Sets ``self.cluster_median``, ``self.train_classes``,
    ``self.dev_classes`` and ``self.test_classes``.  When
    ``self.one_hot_labels`` is truthy, the three label attributes are
    replaced by float32 one-hot matrices with ``max(train label) + 1``
    columns.
    """
    clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
    train_locs = self.df_train[['lat', 'lon']].values
    clusterer.fit(train_locs)
    clusters = clusterer.get_clusters()

    # Group the training coordinates by the cluster they were assigned to.
    cluster_points = dd(list)
    for cluster, loc in zip(clusters, train_locs):
        cluster_points[cluster].append(loc)
    logging.info('#labels: %d', len(cluster_points))

    # One (median_lat, median_lon) pair per cluster, in sorted label order.
    self.cluster_median = OrderedDict()
    for cluster in sorted(cluster_points):
        median_lat, median_lon = np.median(
            np.asarray(cluster_points[cluster]), axis=0)
        self.cluster_median[cluster] = (median_lat, median_lon)

    # Materialise the dict views: on Python 3, np.array(dict.values())
    # yields a 0-d object array instead of a (num_clusters, 2) matrix.
    cluster_ids = np.array(list(self.cluster_median.keys()))
    median_locs = np.array(list(self.cluster_median.values()))

    nnbr = NearestNeighbors(n_neighbors=1, algorithm='brute', leaf_size=1,
                            metric=haversine, n_jobs=4)
    nnbr.fit(median_locs)

    def nearest_cluster(locs):
        # kneighbors returns row indices into median_locs; translate them
        # back to cluster labels so the result stays correct even if the
        # labels are not exactly 0..K-1 (identity mapping when they are).
        rows = nnbr.kneighbors(locs, n_neighbors=1,
                               return_distance=False)[:, 0]
        return cluster_ids[rows]

    dev_locs = self.df_dev[['lat', 'lon']].values
    test_locs = self.df_test[['lat', 'lon']].values
    self.dev_classes = nearest_cluster(dev_locs)
    self.test_classes = nearest_cluster(test_locs)
    self.train_classes = clusters

    if self.one_hot_labels:
        # Width is fixed by the largest training label.
        num_labels = int(np.max(self.train_classes)) + 1

        def one_hot(labels):
            encoded = np.zeros((len(labels), num_labels), dtype=np.float32)
            encoded[np.arange(len(labels)), labels] = 1
            return encoded

        self.train_classes = one_hot(self.train_classes)
        self.dev_classes = one_hot(self.dev_classes)
        self.test_classes = one_hot(self.test_classes)
# Comment list
# Table of contents