def bic_gaussian_kernel(chain_fname, data_length):
    """
    Bayesian information criterion (BIC) computed from an MCMC chain.
    Only valid if data_length >> n_params.
    Note that the kernel bandwidth is fixed at 1 for every parameter.
    """
chain = np.load(chain_fname + '.npy')
n_params = np.shape(chain)[-1]
samples = chain.reshape((-1,n_params))
kde = KernelDensity(kernel='gaussian',bandwidth=1).fit(samples)
# Best fit = medians of the distribution
medians = np.median(samples,axis=0) # shape = (n_params,)
medians = medians.reshape(1,-1) # Reshape to (1,n_params)
    # score_samples() returns the natural-log density, so no base conversion is needed
    lnL = float(kde.score_samples(medians)[0])
return -2*lnL + n_params*np.log(data_length)
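# A minimal usage sketch, assuming two saved chains for the same 500-point
# dataset (the file names and data length are hypothetical; the function
# itself appends the '.npy' extension). Lower BIC indicates the preferred model.
bic_simple = bic_gaussian_kernel('chain_2param', data_length=500)
bic_complex = bic_gaussian_kernel('chain_5param', data_length=500)
print('Delta BIC (complex - simple): %.2f' % (bic_complex - bic_simple))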
###############################################################################
# Process chain
###############################################################################
def kde_sklearn(data, grid, **kwargs):
"""
Kernel Density Estimation with Scikit-learn
Parameters
----------
data : numpy.array
Data points used to compute a density estimator. It
has `n x p` dimensions, representing n points and p
variables.
grid : numpy.array
        Data points at which the density will be estimated. It
has `m x p` dimensions, representing m points and p
variables.
Returns
-------
out : numpy.array
Density estimate. Has `m x 1` dimensions
"""
kde_skl = KernelDensity(**kwargs)
kde_skl.fit(data)
# score_samples() returns the log-likelihood of the samples
log_pdf = kde_skl.score_samples(grid)
return np.exp(log_pdf)
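# A hedged example evaluating a 1-D density on a grid (the data, grid, and
# bandwidth here are made up; **kwargs are forwarded to KernelDensity).
rng = np.random.RandomState(0)
data = rng.randn(200, 1)                  # n x p = 200 x 1
grid = np.linspace(-4, 4, 100)[:, None]   # m x p = 100 x 1
density = kde_sklearn(data, grid, kernel='gaussian', bandwidth=0.5)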
def calculate_densities(target_feature_id, feature_id):
feature = Feature.objects.get(pk=feature_id)
target_feature = Feature.objects.get(pk=target_feature_id)
df = _get_dataframe(feature.dataset.id)
target_col = df[target_feature.name]
categories = target_feature.categories
def calc_density(category):
kde = KernelDensity(kernel='gaussian', bandwidth=0.75)
X = df[target_col == category][feature.name]
# Fitting requires expanding dimensions
X = np.expand_dims(X, axis=1)
kde.fit(X)
        # Evaluate the density at 100 evenly spaced points across the
        # feature's range (this is evaluation, not sampling)
        X_plot = np.linspace(feature.min, feature.max, 100)
        # score_samples() expects a 2-D array, so add the feature axis back
        X_plot = np.expand_dims(X_plot, axis=1)
log_dens = kde.score_samples(X_plot)
return np.exp(log_dens).tolist()
return [{'target_class': category, 'density_values': calc_density(category)} for category in categories]
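# For illustration, the same per-category computation decoupled from the
# Django models: a sketch assuming a plain DataFrame with one numeric feature
# column and one categorical target column (all names below are hypothetical).
def densities_by_class(df, feature_col, target_col, bandwidth=0.75, n_points=100):
    grid = np.linspace(df[feature_col].min(), df[feature_col].max(), n_points)
    grid = np.expand_dims(grid, axis=1)
    result = []
    for category in df[target_col].unique():
        X = df.loc[df[target_col] == category, feature_col].values[:, None]
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X)
        result.append({'target_class': category,
                       'density_values': np.exp(kde.score_samples(grid)).tolist()})
    return result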
def kde_opt1(df_cell_train_feats, y_train, df_cell_test_feats):
def prepare_feats(df):
df_new = pd.DataFrame()
df_new["hour"] = (1 + df["hour"]) * 3.92105
df_new["weekday"] = (1 + df["weekday"]) * 4.28947
df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x)) * 9.44736
df_new["x"] = df["x"] * 424.489
df_new["y"] = df["y"] * 959.183
return df_new
logging.info("train kde_opt1 model")
df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
n_class = len(np.unique(y_train))
y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
Xte = df_cell_test_feats_kde.values
for i in range(n_class):
X = df_cell_train_feats_kde[y_train == i].values
cstd = np.std(np.sum(np.abs(X), axis=1))
        gridcv = GridSearchCV(
            KernelDensity(kernel='gaussian', metric='manhattan'),
            {'bandwidth': cstd * np.logspace(-1, 1, 10)}, cv=5)
gridcv.fit(X)
y_test_pred[:, i] += np.exp(gridcv.best_estimator_.score_samples(Xte))
return y_test_pred
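# The returned y_test_pred holds unnormalized per-class densities. To read
# the scores as class probabilities, one could row-normalize afterwards
# (a sketch, not part of the original pipeline; y_test_pred here is assumed
# to be the array returned by kde_opt1):
probs = y_test_pred / y_test_pred.sum(axis=1, keepdims=True)
pred_class = probs.argmax(axis=1)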
def getKernelDensityEstimation(nodes, metric='euclidean', metric_params=None, bbox=None, bandwidth=0.002, optimizeBandwidth=False, bwmin=0.0001, bwmax=0.01, crossValidation=20):
lon = []
lat = []
for nlon,nlat in nodes:
lon.append(nlon)
lat.append(nlat)
lon = np.array(lon)
lat = np.array(lat)
if bbox is None:
xmin, xmax = min(lon), max(lon)
ymin, ymax = min(lat), max(lat)
bbox = [xmin, xmax, ymin, ymax]
else:
xmin, ymin, xmax, ymax = bbox[0],bbox[1],bbox[2],bbox[3]
bbox = [xmin, xmax, ymin, ymax]
x, y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([x.ravel(), y.ravel()])
values = np.vstack([lon, lat])
    if optimizeBandwidth:
        grid = GridSearchCV(KernelDensity(kernel='gaussian', metric=metric,
                                          metric_params=metric_params,
                                          algorithm='ball_tree'),
                            {'bandwidth': np.linspace(bwmin, bwmax, 30)},
                            cv=crossValidation)  # crossValidation-fold CV (20 by default)
        # zip(*values) is a lazy iterator in Python 3; pass an array instead
        grid.fit(values.T)
        bandwidth = grid.best_params_['bandwidth']
        kernel = grid.best_estimator_
    else:
        kernel = KernelDensity(kernel='gaussian', metric=metric,
                               metric_params=metric_params,
                               algorithm='ball_tree', bandwidth=bandwidth)
        kernel.fit(values.T)
return kernel, positions, x, y, bbox, bandwidth
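# A hedged usage sketch: evaluate the fitted density over the returned grid
# for plotting (the node coordinates are made up). score_samples() expects
# an (n_points, 2) array, while positions has shape (2, 10000).
nodes = [(13.40, 52.52), (13.41, 52.53), (13.39, 52.51)]
kernel, positions, x, y, bbox, bw = getKernelDensityEstimation(nodes, bandwidth=0.005)
density = np.exp(kernel.score_samples(positions.T)).reshape(x.shape)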
def __init__(self, hyperparameter, param_name, data, oob_strategy='resample', bandwidth=0.4):
    if oob_strategy not in ['resample', 'round', 'ignore']:
        raise ValueError('oob_strategy must be one of resample, round or ignore')
    self.oob_strategy = oob_strategy
    self.param_name = param_name
    self.hyperparameter = hyperparameter
    reshaped = np.reshape(data, (len(data), 1))
    if self.hyperparameter.log:
        if isinstance(self.hyperparameter, UniformIntegerHyperparameter):
            # self.probabilities = {val: self.distrib.pdf(np.log2(val)) for val in range(self.hyperparameter.lower, self.hyperparameter.upper)}
            raise ValueError('Log integer hyperparameter not supported: %s' % param_name)
        # self.distrib = gaussian_kde(np.log2(data))
        self.distrib = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(np.log2(reshaped))
    else:
        # self.distrib = gaussian_kde(data)
        self.distrib = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(reshaped)
def test_kernel_density(n_samples=100, n_features=3):
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
Y = rng.randn(n_samples, n_features)
for kernel in ['gaussian', 'tophat', 'epanechnikov',
'exponential', 'linear', 'cosine']:
for bandwidth in [0.01, 0.1, 1]:
dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)
def check_results(kernel, bandwidth, atol, rtol):
kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
atol=atol, rtol=rtol)
log_dens = kde.fit(X).score_samples(Y)
assert_allclose(np.exp(log_dens), dens_true,
atol=atol, rtol=max(1E-7, rtol))
assert_allclose(np.exp(kde.score(Y)),
np.prod(dens_true),
atol=atol, rtol=max(1E-7, rtol))
for rtol in [0, 1E-5]:
for atol in [1E-6, 1E-2]:
for breadth_first in (True, False):
yield (check_results, kernel, bandwidth, atol, rtol)
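# compute_kernel_slow is referenced above but not shown. A plausible
# brute-force reference for the Gaussian case only (the real helper in the
# scikit-learn test suite covers all of the listed kernels) might look like:
def compute_kernel_slow_gaussian(Y, X, bandwidth):
    d = X.shape[1]
    # pairwise squared distances between query points Y and training points X
    sq_dists = ((Y[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    norm = (2 * np.pi * bandwidth ** 2) ** (d / 2.0)
    return np.exp(-sq_dists / (2 * bandwidth ** 2)).mean(axis=1) / norm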
def test_kde_algorithm_metric_choice():
# Smoke test for various metrics and algorithms
rng = np.random.RandomState(0)
X = rng.randn(10, 2) # 2 features required for haversine dist.
Y = rng.randn(10, 2)
for algorithm in ['auto', 'ball_tree', 'kd_tree']:
for metric in ['euclidean', 'minkowski', 'manhattan',
'chebyshev', 'haversine']:
if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:
assert_raises(ValueError, KernelDensity,
algorithm=algorithm, metric=metric)
else:
kde = KernelDensity(algorithm=algorithm, metric=metric)
kde.fit(X)
y_dens = kde.score_samples(Y)
assert_equal(y_dens.shape, Y.shape[:1])
def __init__(self, masks):
n_class = 10
    self.maps_with_class = [[] for _ in range(n_class)]
self.kde_samplers = []
self.class_probs = np.ones(n_class) / n_class
# self.class_probs = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5])
self.mask_size = None
ts = time.time()
for mask_i, mask in enumerate(masks):
assert mask.shape[2] == n_class
if not self.mask_size:
self.mask_size = mask.shape[1]
samplers = []
for class_i in range(n_class):
X = np.nonzero(mask[:, :, class_i])
X = np.stack(X, axis=1)
# np.random.shuffle(X)
# X = X[:50000]
if not X.size:
samplers.append(None)
else:
self.maps_with_class[class_i].append(mask_i)
                sampler = neighbors.KernelDensity(bandwidth=self.mask_size * 0.02).fit(X)
samplers.append(sampler)
assert len(samplers) == n_class
self.kde_samplers.append(samplers)
print('sampler init time: {}'.format(time.time() - ts))
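# A hedged sketch of downstream use: draw pixel coordinates for one class
# from one mask (`gen` is an assumed instance of the class above, and the
# class/mask indices are hypothetical).
class_i = 3
mask_i = gen.maps_with_class[class_i][0]   # a mask that contains class 3
sampler = gen.kde_samplers[mask_i][class_i]
coords = sampler.sample(5)                 # float (row, col) positions
coords = np.clip(np.round(coords), 0, gen.mask_size - 1).astype(int)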
def display(self, output_filename):
fig, (self.ax) = plt.subplots(1, 1)
    self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
for i in range(len(self.datasets)):
self.displayDataset(self.datasets[i])
if self.title is not None:
self.ax.set_xlabel(self.title)
self.ax.set_ylabel('Density')
    self.ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
                   ncol=3, mode='expand', borderaxespad=0.)
fig.savefig(output_filename)
plt.close(fig)
def test_gridsearch_no_predict():
# test grid-search with an estimator without predict.
# slight duplication of a test from KDE
def custom_scoring(estimator, X):
return 42 if estimator.bandwidth == .1 else 0
X, _ = make_blobs(cluster_std=.1, random_state=1,
centers=[[0, 1], [1, 0], [0, 0]])
search = dcv.GridSearchCV(KernelDensity(),
param_grid=dict(bandwidth=[.01, .1, 1]),
scoring=custom_scoring)
search.fit(X)
assert search.best_params_['bandwidth'] == .1
assert search.best_score_ == 42
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
def prepare_feats(df):
df_new = pd.DataFrame()
df_new["hour"] = df["hour"]
df_new["weekday"] = df["weekday"] + df["hour"] / 24.
df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
df_new["x"] = df["x"]
df_new["y"] = df["y"]
return df_new
logging.info("train kde_opt4 model")
df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
n_class = len(np.unique(y_train))
y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
for i in range(n_class):
X = df_cell_train_feats_kde[y_train == i]
y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
for feat in df_cell_train_feats_kde.columns.values:
X_feat = X[feat].values
BGK10_output = kdeBGK10(X_feat)
if BGK10_output is None:
kde = gaussian_kde(X_feat, "scott")
kde = gaussian_kde(X_feat, kde.factor * 0.741379)
y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
else:
bandwidth, mesh, density = BGK10_output
kde = KernelDensity(kernel='gaussian', metric='manhattan', bandwidth=bandwidth)
kde.fit(X_feat[:, np.newaxis])
y_test_pred_i *= np.exp(kde.score_samples(df_cell_test_feats_kde[feat].values[:, np.newaxis]))
y_test_pred[:, i] += y_test_pred_i
return y_test_pred
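# kdeBGK10 is not shown here; judging from its name and its
# (bandwidth, mesh, density) return value it presumably implements the
# Botev-Grotowski-Kroese (2010) diffusion KDE, with a scaled Scott-rule
# gaussian_kde as the fallback. The per-feature product above is a
# naive-Bayes-style independence assumption; a minimal sketch of that
# structure with a fixed bandwidth (an assumption, standing in for kdeBGK10):
def naive_kde_score(X_train_df, X_test_df, bandwidth=0.1):
    score = np.ones(len(X_test_df), "d")
    for feat in X_train_df.columns:
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
        kde.fit(X_train_df[feat].values[:, np.newaxis])
        score *= np.exp(kde.score_samples(X_test_df[feat].values[:, np.newaxis]))
    return score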
def get_bw_cv(x, params):
    grid = GridSearchCV(KernelDensity(), params, cv=2, n_jobs=1)
    grid.fit(x[:, None])
    bw = grid.best_estimator_.bandwidth
    return bw
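# A hedged call sketch (the sample data and bandwidth grid are made up):
x = np.random.RandomState(0).randn(300)
params = {'bandwidth': np.logspace(-1, 0, 10)}
bw = get_bw_cv(x, params)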
def fit(self, X, y=None):
"""Fit the model according to the given training data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Samples.
Returns
-------
self : detector
Return self.
"""
X = check_array(X)
    self._kde = KernelDensity(
        bandwidth=self.bandwidth,
        kernel=self.kernel,
        metric=self.metric,
        metric_params=self.metric_params
    ).fit(X)
self.y_score_ = self.anomaly_score(X)
self.threshold_ = np.percentile(
self.y_score_, 100.0 * (1.0 - self.fpr)
)
return self
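# anomaly_score is defined elsewhere in this detector. A plausible hedged
# sketch is the negative log-density under the fitted KDE, so that
# low-density points receive high scores, consistent with the percentile
# thresholding in fit():
def anomaly_score(self, X):
    X = check_array(X)
    return -self._kde.score_samples(X)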
def fit_kde(X, bandwidth):
kde = KernelDensity(bandwidth=bandwidth)
kde.fit(X=X)
return kde
def search_bandwidth(val_data, cvJobs):
data = convert_to_ndarrays(val_data)
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, n_jobs=cvJobs)
grid.fit(data)
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
return grid.best_estimator_.bandwidth
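# A hedged sketch combining the two helpers above (convert_to_ndarrays is
# assumed to return an (n_samples, n_features) array; the train/val/test
# split names are made up):
bw = search_bandwidth(val_data, cvJobs=4)
kde = fit_kde(convert_to_ndarrays(train_data), bw)
log_density = kde.score_samples(convert_to_ndarrays(test_data))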
def test_kernel_density_sampling(n_samples=100, n_features=3):
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
bandwidth = 0.2
    for kernel in ['gaussian', 'tophat']:
        # draw a sample and check its shape
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
samp = kde.sample(100)
assert_equal(X.shape, samp.shape)
# check that samples are in the right range
nbrs = NearestNeighbors(n_neighbors=1).fit(X)
dist, ind = nbrs.kneighbors(X, return_distance=True)
if kernel == 'tophat':
assert np.all(dist < bandwidth)
elif kernel == 'gaussian':
# 5 standard deviations is safe for 100 samples, but there's a
# very small chance this test could fail.
assert np.all(dist < 5 * bandwidth)
# check unsupported kernels
for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
assert_raises(NotImplementedError, kde.sample, 100)
# non-regression test: used to return a scalar
X = rng.randn(4, 1)
kde = KernelDensity(kernel="gaussian").fit(X)
assert_equal(kde.sample().shape, (1, 1))
def test_kde_badargs():
assert_raises(ValueError, KernelDensity,
algorithm='blah')
assert_raises(ValueError, KernelDensity,
bandwidth=0)
assert_raises(ValueError, KernelDensity,
kernel='blah')
assert_raises(ValueError, KernelDensity,
metric='blah')
assert_raises(ValueError, KernelDensity,
algorithm='kd_tree', metric='blah')
def test_kde_pipeline_gridsearch():
# test that kde plays nice in pipelines and grid-searches
X, _ = make_blobs(cluster_std=.1, random_state=1,
centers=[[0, 1], [1, 0], [0, 0]])
pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
KernelDensity(kernel="gaussian"))
params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
search = GridSearchCV(pipe1, param_grid=params, cv=5)
search.fit(X)
assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)
def test_gridsearch_no_predict():
# test grid-search with an estimator without predict.
# slight duplication of a test from KDE
def custom_scoring(estimator, X):
return 42 if estimator.bandwidth == .1 else 0
X, _ = make_blobs(cluster_std=.1, random_state=1,
centers=[[0, 1], [1, 0], [0, 0]])
search = GridSearchCV(KernelDensity(),
param_grid=dict(bandwidth=[.01, .1, 1]),
scoring=custom_scoring)
search.fit(X)
assert_equal(search.best_params_['bandwidth'], .1)
assert_equal(search.best_score_, 42)
def find_consensus_values(
x,
n_expected=None,
bandwidth_auto=True,
min_bandwidth=0.02, # XXX This is GC-MS specific
default_bandwidth=0.1, # XXX This too
show_plot=False,
out=sys.stdout,
err=sys.stderr):
"""
Use kernel density estimation to analyze the distribution of data points
along the X-axis, and identify consensus values for major clusters. This
is used to identify common peaks in a set of related GC-MS samples.
"""
if isinstance(x, list):
x = np.array(x)
x_grid = np.linspace(x.min() - 0.25, x.max() + 0.25, 1000)
if bandwidth_auto:
# http://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
grid = GridSearchCV(KernelDensity(),
{'bandwidth': np.linspace(min_bandwidth, 0.1, 30)},
cv=20) # 20-fold cross-validation
grid.fit(x[:, None])
bandwidth = grid.best_params_['bandwidth']
print("Best bandwidth: %.4f" % bandwidth, file=err)
kde = grid.best_estimator_
pdf = np.exp(kde.score_samples(x_grid[:, None]))
else:
bandwidth = default_bandwidth
kde = gaussian_kde(x)
pdf = kde.evaluate(x_grid)
i_maxima = local_maxima(x_grid, pdf)
max_values = []
for i_max in i_maxima:
max_values.append((x_grid[i_max], pdf[i_max]))
    # sort maxima by their height in the density estimate
    max_values.sort(key=lambda mv: mv[1], reverse=True)
pdf_max = pdf.max()
major_peaks, minor_peaks = extract_peaks(max_values, pdf_max, n_expected=n_expected,
out=out, err=err)
    # now sort major peaks by retention time
    major_peaks.sort(key=lambda peak: peak[0])
print("Major retention time peaks:", file=err)
for i_peak, (xval, pdf_val) in enumerate(major_peaks):
print(" %2d %8.3f" % (i_peak+1, xval), file=err)
if show_plot:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
        # use the local `bandwidth`: gaussian_kde has no .bandwidth attribute
        ax.plot(x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % bandwidth)
        ax.hist(x, 50, fc='gray', histtype='stepfilled', alpha=0.3, density=True)
for rt, pdf_val in major_peaks:
ax.axvline(rt, color='red')
ax.axvline(rt-bandwidth, color='magenta')
ax.axvline(rt+bandwidth, color='magenta')
plt.show()
return [xval for xval, yval in major_peaks], float(bandwidth)
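# local_maxima and extract_peaks are project helpers not shown here. A
# plausible hedged sketch of local_maxima (interior indices where the PDF
# exceeds both neighbors; x_grid is accepted only to match the call above):
def local_maxima(x_grid, pdf):
    interior = (pdf[1:-1] > pdf[:-2]) & (pdf[1:-1] > pdf[2:])
    return np.nonzero(interior)[0] + 1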