# -*- coding: utf-8 -*-
"""
This module implements nonparametric ridge estimation and relevant
functions. The main class is `Scms` which provides ridge estimation
and bootstrap functions.
"""
import warnings
import numpy as np
from multiprocess import Pool
from numpy.core.umath_tests import matrix_multiply
from numpy.lib.index_tricks import as_strided
from scipy import linalg
from scipy.spatial.distance import cdist
from scipy.special import logsumexp
from sklearn.neighbors import NearestNeighbors
from .external import neighbors
[docs]def vdist(a, b):
"""
This function is similar to cdist but returning vectors rather than norm of vectors
implemented via virtual copies of a and b
Parameters
----------
a,b : np.ndarray
2D arrays of size (M,D), (N,D) for which differences for all pairwise combinations
of rows in the two arrays will be computed. The number of columns have to the same
between a and b.
Returns
----------
numpy.array
3D array of size (N, M, D), containing all pairwise differences of rows in a and b.
"""
return as_strided(a, strides=(0, a.strides[0], a.strides[1]),
shape=(b.shape[0], a.shape[0], a.shape[1])) - \
as_strided(b, strides=(b.strides[0], 0, b.strides[1]),
shape=(b.shape[0], a.shape[0], b.shape[1]))
[docs]def bootstrap_resample(X, n=None):
""" Bootstrap resample an array_like
Parameters
----------
X : array_like
data to resample
n : int, optional
length of resampled array, equal to len(X) if n==None
Returns
-------
array_like, numpy.array
* Bootstrap resampled array
* 1D array of indices corresponding to input.
"""
if n is None:
n = len(X)
resample_i = np.floor(np.random.rand(n) * len(X)).astype(int)
samples = X[resample_i]
return samples, resample_i
[docs]def multilevel_compression(data, n=5000, threshold=None, k=20):
"""
Iteratively combine nearest neighbor pairs below distance threshold.
This function can be used to reduce the number of data points to represent the whole dataset
without greatly affect the density estimation quality. Most of the data points in low density
areas will be unaffected and data points in high density areas will be combined and averaged.
Parameters
----------
data : array
2D array of size (N, D)
n : int, optional
Choose the distance threshold by the n-th largest k-NN distance; must be smaller than `N x k`.
threshold : float or None, optional
The distance threshold for combining nearest neighbor pairs. Override `N` when given.
k : int, optional
Number of nearesest neigbors for KNN graph. Choose larger k for better approximation,
smaller for better speed. Use at least 2.
Returns
----------
tuple(list, list)
* list of level 0, level 1, level 2, ... data points. As each level a data point corresponds to
2^0, 2^1, 2^2, ... original data points.
* list of input data indices corresponding to level-0,1,2,... data points.
"""
data = np.asarray(data)
c, _ = neighbors.neighbors(data, n_neighbors=np.minimum(k, data.shape[0]), metric='euclidean')
c = c.nonzero()
d = np.linalg.norm(data[c[0], :] - data[c[1], :], axis=1)
isorted = np.argsort(d)
_, iisorted = np.unique(c[0][isorted], return_index=True)
source_i = c[0][isorted[iisorted]]
target_i = c[1][isorted[iisorted]]
source_ik = c[0][isorted]
target_ik = c[1][isorted]
dk = d[isorted]
d = d[isorted[iisorted]]
assert len(source_i) == data.shape[0]
if threshold == None:
threshold = -np.sort(-d)[n]
solo_index = source_i[d > threshold]
source_ik = source_ik[dk <= threshold]
target_ik = target_ik[dk <= threshold]
dk = dk[dk <= threshold]
i = (~np.isin(source_ik, solo_index)) * (~np.isin(target_ik, solo_index))
source_ik = source_ik[i]
target_ik = target_ik[i]
dk = dk[i]
pairs_to_combine = np.vstack([source_ik, target_ik])
pairs_to_combine_done = []
while pairs_to_combine.shape[1] > 0:
pairs_to_combine_flat = pairs_to_combine.T.flatten()
_, i = np.unique(pairs_to_combine_flat, return_index=True)
temp = pairs_to_combine_flat[i].copy()
pairs_to_combine_flat[:] = -1
pairs_to_combine_flat[i] = temp
pairs_to_combine_working = np.reshape(pairs_to_combine_flat, (int(len(pairs_to_combine_flat) / 2), 2))
pairs_to_combine_working = pairs_to_combine_working[~np.any(pairs_to_combine_working == -1, axis=1), :]
pairs_to_combine_done.append(pairs_to_combine_working)
pairs_to_combine_working = pairs_to_combine_working.flatten()
pairs_to_combine = pairs_to_combine[:, (~np.isin(pairs_to_combine[0, :], pairs_to_combine_working)) * (
~np.isin(pairs_to_combine[1, :], pairs_to_combine_working))]
while True:
retry_index = np.setdiff1d(source_ik, np.vstack(pairs_to_combine_done).flatten())
if len(retry_index) == 0:
break
c_r, _ = neighbors.neighbors(data[retry_index, :], n_neighbors=np.minimum(k, len(retry_index)),
metric='euclidean')
c_r = c_r.nonzero()
d_r = np.linalg.norm(data[retry_index, :][c_r[0], :] - data[retry_index, :][c_r[1], :], axis=1)
isorted = np.argsort(d_r)
source_ik_r = c_r[0][isorted]
target_ik_r = c_r[1][isorted]
dk_r = d_r[isorted]
source_ik_r = source_ik_r[dk_r < threshold]
target_ik_r = target_ik_r[dk_r < threshold]
dk_r = dk_r[dk_r < threshold]
if len(source_ik_r) == 0:
break
pairs_to_combine = np.vstack([retry_index[source_ik_r], retry_index[target_ik_r]])
while pairs_to_combine.shape[1] > 0:
pairs_to_combine_flat = pairs_to_combine.T.flatten()
_, i = np.unique(pairs_to_combine_flat, return_index=True)
temp = pairs_to_combine_flat[i].copy()
pairs_to_combine_flat[:] = -1
pairs_to_combine_flat[i] = temp
pairs_to_combine_working = np.reshape(pairs_to_combine_flat, (int(len(pairs_to_combine_flat) / 2), 2))
pairs_to_combine_working = pairs_to_combine_working[~np.any(pairs_to_combine_working == -1, axis=1), :]
pairs_to_combine_done.append(pairs_to_combine_working)
pairs_to_combine_working = pairs_to_combine_working.flatten()
pairs_to_combine = pairs_to_combine[:, (~np.isin(pairs_to_combine[0, :], pairs_to_combine_working)) * (
~np.isin(pairs_to_combine[1, :], pairs_to_combine_working))]
if len(pairs_to_combine_done) == 0:
solo_data = data
return [solo_data]
else:
pairs_to_combine_done = np.vstack(pairs_to_combine_done)
solo_index = np.concatenate([solo_index, np.setdiff1d(source_ik, np.vstack(pairs_to_combine_done).flatten())])
solo_data = data[solo_index, :]
pair_data = (data[pairs_to_combine_done[:, 0], :] + data[pairs_to_combine_done[:, 1], :]) / 2.0
if len(pair_data) < 50:
collapsed_index = [[solo_index], [pairs_to_combine_done[:, 0], pairs_to_combine_done[:, 1]]]
return [solo_data, pair_data], collapsed_index
else:
coarse_data, coarse_index = multilevel_compression(pair_data, 0, threshold)
collapsed_index = [[solo_index]] + [
[pairs_to_combine_done[i, 0] for i in ind] + [pairs_to_combine_done[i, 1] for i in ind] for ind in
coarse_index]
return [solo_data] + coarse_data, collapsed_index
[docs]class Scms(object):
"""
An implementation of nonparametric ridge estimation.
The main algorithm implemented is subspace constrained mean-shift (SCMS).
Parameters
----------
data : array or list
* if `data` is a 2D array of size (N, D). Regular KDE is used.
* if `data` is a list. `data` is interpreted as compressed data as given by
`ridge_estimation.multilevel_compression` function.
Multi-level KDE is used. Different levels one data point corresponds to 1, 2, 4, 8, 16, ...
regular datapoints.
bw : float
Gaussian kernel bandwidth for KDE.
min_radius: int
Set data-point specific minimum bandwidth to be the distance to its `min_radius`-th nearest neighbor.
Attributes
----------
bw : float
adaptive_bw : array
Kernel bandwidth specified for each data point. This is the bandwidth directly used in the algorithm.
params : dict
Save parameters used in running Scms.scms(...) for bootstrap.
"""
def __init__(self, data, bw, min_radius=10):
self.data = data
self.bw = bw
self.min_radius = min_radius
self.params = {}
if not isinstance(data, list):
self._set_adaptive_bandwidth(self.min_radius)
self.multilevel = False
else:
self.multilevel = True
# self.p = np.exp(self._kernel_density_estimate(data, output_onlylogp=True))
# self.maxp = np.max(self.p)
[docs] def inverse_density_sampling(self, X, n_samples=2000, n_jobs=1, batch_size=16):
"""
Extract a subset of data points to represent the whole data. Data points
from high density areas are most likely to be subsampled and those from
low density areas are preserved. This is efficiently implemented with a
trick of sorting the data by log density + Gumbel noise.
Parameters
----------
X : 2D array
Input data points
n_samples : int, optional
Number of sub-sampled data points. Default is `2000`.
n_jobs : int, optional
Number of jobs to run in parallel. Default is `1`.
batch_size : int, optional
Batch size. Set this to a number number to reduce memory usage. Default is `16`
Returns
-------
indices : 1D array
Subsample indices corresponding to input `X`.
"""
if n_jobs == 1:
logp = [self._density_estimate(pos, output_onlylogp=True)
for pos in np.array_split(X, np.ceil(X.shape[0] / batch_size))]
else:
with Pool(n_jobs) as pool:
logp = pool.map(
lambda pos: self._density_estimate(pos, output_onlylogp=True),
np.array_split(X, np.ceil(X.shape[0] / (batch_size * n_jobs))))
logp = np.concatenate(logp)
s = -logp + np.random.gumbel(size=logp.shape)
return np.argsort(-s)[:n_samples]
[docs] def reset_bw(self, bw=None, min_radius=None):
"""
Reset bandwidth.
Parameters
----------
bw : float
Gaussian kernel bandwidth for KDE.
min_radius: int
Set data-point specific minimum bandwidth to be the distance to its `min_radius`-th nearest neighbor.
Returns
-------
None
"""
if bw:
self.bw = bw
if min_radius:
self.min_radius = min_radius
if self.min_radius > 0:
self._set_adaptive_bandwidth(self.min_radius)
else:
self.adaptive_bw = np.ones(self.data.shape[0]) * self.bw
def _set_adaptive_bandwidth(self, K):
if K > 0:
self.nbrs = NearestNeighbors(n_neighbors=K + 1).fit(self.data)
self.adaptive_bw = np.maximum(self.nbrs.kneighbors(self.data)[0][:, -1], self.bw)
else:
self.adaptive_bw = np.ones(self.data.shape[0]) * self.bw
def _density_estimate(self, X, output_onlylogp=False):
"""
Estimate density and relevant quantities for data points specified by X.
Parameters
----------
X : array
2D array. Input to density estimation.
output_onlylogp : bool
If true, returns logp, else returns p, g, h, msu.
Returns
-------
p : array
1D array. Unnormalized probability density. The probability density
is not normalized due to numerical stability. Exact log probability
density is returned if `output_onlylogp=True`.
g : array, optional
2D array. Gradient of the probability density function.
h : array, optional
3D array. Hessian of the probability density function.
msu :
2D array. Meanshift update based on the probability density function.
"""
if self.multilevel:
if output_onlylogp:
p = self._multilevel_kernel_density_estimate(X, output_onlylogp)
else:
p, g, h, msu = self._multilevel_kernel_density_estimate(X, output_onlylogp)
else:
if output_onlylogp:
p = self._kernel_density_estimate(X, output_onlylogp)
else:
p, g, h, msu = self._kernel_density_estimate(X, output_onlylogp)
if output_onlylogp:
return p
else:
return p, g, h, msu
def _kernel_density_estimate(self, X, output_onlylogp=False, ):
"""
Estimate density and relevant quantities for data points specified by X
with kernel density estimation.
Parameters
----------
X : array
2D array including multiple data points. Input to density estimation.
output_onlylogp : bool
If true, returns logp, else returns p, g, h, msu.
Returns
-------
p : array
1D array. Unnormalized probability density. The probability density
is not normalized due to numerical stability. Exact log probability
density is returned if `output_onlylogp=True`.
g : array, optional
2D array. Gradient of the probability density function.
h : array, optional
3D array. Hessian of the probability density function.
msu :
2D array. Meanshift update based on the probability density function.
"""
# the number of data points and the dimensionality
n, d = self.data.shape
# and evaluate the kernel at each distance
D = cdist(self.data, X)
# prevent numerical overflow due to large exponentials
logc = -d * np.log(np.min(self.adaptive_bw)) - d / 2 * np.log(2 * np.pi)
C = (self.adaptive_bw[:, np.newaxis] / np.min(self.adaptive_bw)) ** (-d) * \
np.exp(-1 / 2. * (D / self.adaptive_bw[:, np.newaxis]) ** 2)
if output_onlylogp:
# return the kernel density estimate
return np.log(np.mean(C, axis=0).T) + logc
else:
# gradient of p
u = vdist(self.data, X)
g = np.mean((C / (self.adaptive_bw[:, np.newaxis] ** 2)).T[:, :, np.newaxis] * u, axis=1)
# hessian of p
Z = np.eye(d)
h = matrix_multiply(
(C / (n * self.adaptive_bw[:, np.newaxis] ** 4)).T[:, np.newaxis, :] * u.transpose((0, 2, 1)), u) - \
as_strided(Z, strides=(0, Z.strides[0], Z.strides[1]), shape=(C.shape[1], Z.shape[0], Z.shape[1])) * \
(np.sum(C / (n * self.adaptive_bw[:, np.newaxis] ** 2), axis=0)[:, np.newaxis, np.newaxis])
# for computation of msu = Cp * g
Cp = 1. / np.mean(C / (self.adaptive_bw[:, np.newaxis] ** 2), axis=0)
# return the kernel density estimate
return np.mean(C, axis=0).T, g, h, Cp[:, np.newaxis] * g
def _multilevel_kernel_density_estimate(self, X, output_onlylogp=False, normalized=True):
"""
Estimate density and relevant quantities for data points specified by X
with kernel density estimation and multi-level data representation.
Data point-specific bandwidth (self.adaptive_bw) is not supported and self.bw
is used instead.
Parameters
----------
X : array
2D array. Input to density estimation.
output_onlylogp : bool
If true, returns logp, else returns p, g, h, msu.
Returns
-------
p : array
1D array. Unnormalized probability density. The probability density
is not normalized due to numerical stability. Exact log probability
density is returned if `output_onlylogp=True`.
g : array, optional
2D array. Gradient of the probability density function.
h : array, optional
3D array. Hessian of the probability density function.
msu :
2D array. Meanshift update based on the probability density function.
"""
# the number of data points and the dimensionality
npgh = []
for data in self.data:
n, d = data.shape
# and evaluate the kernel at each distance
D = cdist(data, X)
if normalized:
# prevent numerical overflow due to large exponentials
logc = -d * np.log(self.bw) - d / 2 * np.log(2 * np.pi)
else:
warnings.warn("Warning: use unnormalized probablity. This does not affect density ridge-related estimates.")
logc = 0
C = np.exp(-1 / 2. * (D / self.bw) ** 2)
if output_onlylogp:
# return the kernel density estimate
npgh.append((n, np.log(np.mean(C, axis=0).T) + logc))
else:
# gradient of p
u = vdist(data, X)
g = np.mean((C / (self.bw ** 2)).T[:, :, np.newaxis] * u, axis=1)
# hessian of p
Z = np.eye(d)
h = matrix_multiply((C / (n * self.bw ** 4)).T[:, np.newaxis, :] * u.transpose((0, 2, 1)), u) - \
as_strided(Z, strides=(0, Z.strides[0], Z.strides[1]), shape=(C.shape[1], Z.shape[0], Z.shape[1])) * \
(np.sum(C / (n * self.bw ** 2), axis=0)[:, np.newaxis, np.newaxis])
Cp = 1. / np.mean(C / (self.bw ** 2), axis=0)
npgh.append((n, np.mean(C, axis=0).T, g, h, Cp))
if output_onlylogp:
ns = np.asarray([n * 2 ** i for i, (n, _) in enumerate(npgh)])
ws = ns / ns.sum()
logps = [(logp + np.log(ws[i]))[np.newaxis, :] for i, (n, logp) in enumerate(npgh)]
logps = np.vstack(logps)
logps = logsumexp(logps, axis=0)
return logps
else:
ns = np.asarray([n * 2 ** i for i, (n, _, _, _, _) in enumerate(npgh)])
ws = ns / ns.sum()
ps, gs, hs, cps = npgh[0][1:]
ps = ps * ws[0]
gs = gs * ws[0]
hs = hs * ws[0]
cps = cps * ws[0]
for i, (n, p, g, h, cp) in enumerate(npgh[1:]):
ps = ps + p * ws[i]
gs = gs + g * ws[i]
hs = hs + h * ws[i]
cps = cps + cps * ws[i]
return ps, gs, hs, cps[:, np.newaxis] * gs
def _nlocal_inv_cov(self, X):
"""
Computes the hessian of log density function.
This function also returns other density estimation related
quantities.
Parameters
----------
X : array
2D array. Input data points.
Returns
-------
negative_local_inverse_covariance : array
3D array. Hessian of log probability density function (negative local
inverse covariance).
p : array
1D array. Unnormalized probability density. The probability density
is not normalized due to numerical stability. Exact log probability
density is returned if `output_onlylogp=True`.
g : array
2D array. Gradient of the probability density function.
h : array
3D array. Hessian of the probability density function.
msu :
2D array. Meanshift update based on the probability density function.
"""
p, g, h, msu = self._density_estimate(X)
return 1. / p[:, np.newaxis, np.newaxis] * h - 1. / p[:, np.newaxis, np.newaxis] ** 2 * matrix_multiply(
g[:, :, np.newaxis], g[:, :, np.newaxis].transpose((0, 2, 1))), p, g, h, msu
[docs] def scms_update(self, X, method='LocInv', stepsize=0.01, ridge_dimensionality=1, relaxation=0):
"""
Compute the constrained mean shift update of the point at x.
Parameters
----------
X : array
2D array. Input data points.
method : str {'Gradient'|'GradientLogP'|'SuRF'|'LocInv'}, optional
* Gradient
Projected gradient update based on Hessian of probability density function p.
Update is specified by projected gradient of p * step size.
* GradientLogP
Projected gradient update based on Hessian of log probability density function log(p).
Update is specified by projected gradient of log(p) * step size.
* SuRF
Projected mean-shift update based on Hessian of probability density function p.
Update is specified by projected mean shift update * step size.
* LocInv
Projected mean-shift update based on Hessian of probability density function log p.
Update is specified by projected mean shift update * step size.
Default is 'LocInv'.
stepsize : float, optional
Step size.
ridge_dimensionality : int, optional
Ridge dimensionality.
relaxation : float [0,1], optional
Relax the ridge dimensionality based on eigen gap corresponding to the ridge_dimensionality.
Typically used with `ridge_dimensionality=1`. All point are projected to one-dimensional ridges
(trajectories) when `relaxation=0`. All points are projected to two-dimensional ridges
(surfaces) when `relaxation=1`.
Returns
----------
update : 2D array
SCMS update.
relax_bool : 1D array
Indicate whether the ridge dimensionality of a particular point was increased by 1 relative to
what `ridge_dimensionality` specified. The amount of relaxation is controlled by `relaxation` argument.
"""
if method == 'GradientP':
_, g, h, msu = self._density_estimate(X)
eigvals, eigvecs = np.linalg.eigh(-h)
eigvals = -eigvals
update = g * stepsize
elif method == 'GradientLogP':
h, p, g, _, msu = self._nlocal_inv_cov(X)
g = g / p[:, np.newaxis]
eigvals, eigvecs = np.linalg.eigh(-h)
eigvals = -eigvals
update = g * stepsize
else:
if method == 'MSP':
# h equals hessian of p(x)
p, g, h, msu = self._density_estimate(X)
eigvals, eigvecs = np.linalg.eigh(-h)
eigvals = -eigvals
elif method == 'LocInv' or method == 'MSLogP':
# h equals negative hessian of log(p(x)), or "local inverse covariance"
h, p, g, _, msu = self._nlocal_inv_cov(X)
eigvals, eigvecs = np.linalg.eigh(-h)
eigvals = -eigvals
else:
raise ValueError('Method has to be one of Gradient, GradientLogp, MSP, MSLogP, or LocInv.')
update = msu
update = update * stepsize
if ridge_dimensionality == 0:
relax_bool = np.zeros((X.shape[0],), dtype=bool)
return update, relax_bool
else:
eigvecs[:, :, :ridge_dimensionality] = 0
# sort the eigenvectors by decreasing eigenvalue
if relaxation > 0 and ridge_dimensionality >= 1:
relax_bool = (eigvals[:, ridge_dimensionality - 1] - eigvals[:, ridge_dimensionality]) / (
eigvals[:, ridge_dimensionality - 1] - eigvals[:, -1]) < relaxation
else:
relax_bool = np.zeros((X.shape[0],), dtype=bool)
eigvecs[relax_bool, :, ridge_dimensionality] = 0
return matrix_multiply(matrix_multiply(eigvecs, eigvecs.transpose((0, 2, 1))), update[:, :, np.newaxis])[:,
:, 0], relax_bool
[docs] def scms(self, X, n_iterations=50, threshold=0, tol=1e-7, method='LocInv', stepsize=0.5, ridge_dimensionality=1,
relaxation=0, n_jobs=1, batch_size=16):
"""
Performs subspace constrained mean shift on the entire data set using
a Gaussian kernel with bandwidth of bw.
Parameters
----------
X : array
2D array. Input data points.
n_iterations : int, optional
The number of iterations for running the algorithm
threshold : float, optional
Filter out data points below a probability density threshold.
The threshold value is specified relative to the maximum density.
tol : float, optional
Tolerance for assessing convergence, stop if `max(abs(new_pos - pos)) < tol`.
method : str {'GradientP'|'GradientLogP'|'MSP'|'MSLogP'|'InvCov'}, optional
* GradientP
Projected gradient update based on Hessian of probability density function p.
Update is specified by projected gradient of p * step size.
* GradientLogP
Projected gradient update based on Hessian of log probability density function log(p).
Update is specified by projected gradient of log(p) * step size.
* MSP
Projected mean-shift update based on Hessian of probability density function p.
Update is specified by projected mean shift update * step size.
* MSLogP or InvCov (equivalent)
Projected mean-shift update based on Hessian of probability density function log p.
Update is specified by projected mean shift update * step size.
Default is 'InvCov'.
stepsize : float, optional
Step size. Default is 0.5
ridge_dimensionality : int, optional
Ridge dimensionality. Default is 1.
relaxation : float [0,1], optional
Relax the ridge dimensionality based on eigen gap corresponding to the ridge_dimensionality.
Typically used with `ridge_dimensionality=1`. All point are projected to one-dimensional ridges
(trajectories) when `relaxation=0`. All points are projected to two-dimensional ridges
(surfaces) when `relaxation=1`. Default is 0.
n_jobs : int, optional
Number of jobs to run in parallel. Default is 1.
batch_size : int, optional
Number of data points to process in a batch. Reduce this number can decrease memory usage.
Default is 500.
Returns
----------
pos : array
Updated data points postions.
ifilter : array
Indices of returned positions (useful when `threshold > 0` so part of the input are filtered).
"""
self.params = {'X': X, 'n_iterations': n_iterations, 'threshold': threshold, 'tol': tol, 'method': method,
'stepsize': stepsize, 'ridge_dimensionality': ridge_dimensionality,
'relaxation': relaxation}
self.X = X.copy()
data_copy = self.data.copy()
if threshold > 0:
p = np.exp(self._kernel_density_estimate(X, output_onlylogp=True))
maxp = np.max(np.exp(self._kernel_density_estimate(self.data, output_onlylogp=True)))
ifilter = np.where(p >= (maxp * threshold))[0]
else:
ifilter = np.arange(X.shape[0])
pos = X[ifilter, :]
if n_jobs > 1:
with Pool(n_jobs) as pool:
for j in range(n_iterations):
pos_old = pos.copy()
if X is None:
self.data = pos.copy()
updates = pool.map(
lambda pos: self.scms_update(pos, method=method, stepsize=stepsize,
ridge_dimensionality=ridge_dimensionality, relaxation=relaxation),
np.array_split(pos, np.ceil(pos.shape[0] / (batch_size * n_jobs))))
if len(updates) == 1:
update = updates[0][0]
self.relax_bool = updates[0][1]
else:
update = np.vstack([i[0] for i in updates])
self.relax_bool = np.concatenate([i[1] for i in updates], axis=0)
pos = pos + update
if np.max(np.sum(np.abs(pos_old - pos), axis=1)) < tol:
break
else:
for j in range(n_iterations):
pos_old = pos.copy()
if X is None:
self.data = pos.copy()
updates = list(map(
lambda pos: self.scms_update(pos, method=method, stepsize=stepsize,
ridge_dimensionality=ridge_dimensionality, relaxation=relaxation),
np.array_split(pos, np.ceil(pos.shape[0] / batch_size ))))
if len(updates)==1:
update = updates[0][0]
self.relax_bool = updates[0][1]
else:
update = np.vstack([i[0] for i in updates])
self.relax_bool = np.concatenate([i[1] for i in updates],axis=0)
pos = pos + update
if np.max(np.sum(np.abs(pos_old - pos), axis=1)) < tol:
break
self.data = data_copy
self.ridge = pos
self.ifilter = ifilter
return pos, ifilter
[docs] def boostrap_trajectory(self, n_bootstrap=25, n_jobs=1, copy_bw=True):
"""
Construct confidence set of density ridge positions through bootstrap. For each bootstrap sample,
distance to its nearest neighbors in the original ridge estimate is recorded. The bootstrap runs
will copy the parameters of the previous `scms` call, therefore `scms` has to be executed
before this function.
Parameters
----------
n_bootstrap : int, optional
Number of bootstrap runs. Default is 25.
n_jobs : int, optional
Number of jobs to run in parallel. Default is 1. Only specify `n_jobs>1` if `n_jobs=1` is used
when running `scms`.
copy_bw : bool, optional
If true, the point-specific kernel bandwidths are unchanged after resampling.
This option only has an effect is `self.min_radius > 0`.
Returns
----------
boostrap_n : array, shape (n_samples, n_bootstrap)
2D array. Bootstrap nearest neighbor distances for each ridge position.
boostrap_s : array, shape (n_samples, n_bootstrap)
2D array. Bootstrap point-wise distances for each ridge position. Experimental.
boostrap_ifilters : array, shape (n_samples, n_bootstrap)
2D array. Bootstrap resample indices.
"""
if self.params == {}:
raise ValueError('Run scms first before bootstrap!')
if self.min_radius > 0:
warnings.warn('Boostrap with adaptive bandwidth (min_radius>0) preserves the '
'adaptive bandwidths and confidence sets contructed should be '
'interpreted as such. Use min_radius=0 to be consistent with '
'Chen et al. 2015 algorithm.')
self.bootstrap_distances_n = []
self.bootstrap_distances_s = []
self.bootstrap_ifilters = []
self.bootstrap_ridges = []
def boostrap(i):
np.random.seed(i)
bdata, bi = bootstrap_resample(self.data)
b = Scms(bdata, self.bw, min_radius=self.min_radius)
if copy_bw:
b.adaptive_bw = self.adaptive_bw[bi]
_ridge, _ifilter = b.scms(**self.params)
binaryfilter = np.zeros(self.data.shape[0], dtype=bool)
binaryfilter[_ifilter] = True
a = np.zeros(self.data.shape)
a.fill(np.nan)
b = np.zeros(self.data.shape)
b.fill(np.nan)
a[_ifilter, :] = _ridge
b[self.ifilter, :] = self.ridge
dists = linalg.norm(a - b, axis=1)[self.ifilter]
return NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(_ridge).kneighbors(self.ridge)[0], \
dists, binaryfilter, _ridge
with Pool(n_jobs) as p:
res = p.map(boostrap, range(n_bootstrap))
for dist_n, dist_s, ifilter, ridge in res:
self.bootstrap_distances_n.append(dist_n)
self.bootstrap_distances_s.append(dist_s)
self.bootstrap_ifilters.append(ifilter)
self.bootstrap_ridges.append(ridge)
self.bootstrap_distances_n = np.hstack(self.bootstrap_distances_n)
self.bootstrap_distances_s = np.vstack(self.bootstrap_distances_s).T
self.bootstrap_ifilters = np.hstack(self.bootstrap_ifilters)
return self.bootstrap_distances_n, self.bootstrap_distances_s, self.bootstrap_ifilters
# return scoreatpercentile(self.bootstrap_distances_n, percentile_confidence, axis=1),
# scoreatpercentile(self.bootstrap_distances_s, percentile_confidence, axis=1)
[docs] def bootstrap_landmark(self, X, n_bootstrap=20, method='LocInv', smooth_bootstrap=False, copy_bw=True, n_jobs=4):
"""
Experimental function. Work in progress.
"""
self.landmarks = X
if method == 'LocInv' or method == 'GradientLogP':
self.landmarks_h, self.landmarks_p, self.landmarks_g, _ = self._nlocal_inv_cov(self.landmarks)
else:
self.landmarks_p, self.landmarks_g, self.landmarks_h = self._kernel_density_estimate(self.landmarks)
# use megative of h because eigh sort eigen values in ascending order
eigvals, eigvecs = np.linalg.eigh(-self.landmarks_h)
eigvals = -eigvals
self.landmarks_h_eigvecs = eigvecs
self.landmarks_h_eigvals = eigvals
self.landmarks_bootstrap_p = []
self.landmarks_bootstrap_g = []
self.landmarks_bootstrap_h = []
self.landmarks_bootstrap_g_proj = []
self.landmarks_bootstrap_h_proj = []
self.landmarks_bootstrap_h_dir = []
def boostrap(i):
np.random.seed(i)
if smooth_bootstrap:
b = Scms(self.data + np.random.randn(*self.data.shape) * self.adaptive_bw, self.bw,
min_radius=self.min_radius)
if copy_bw:
b.adaptive_bw = self.adaptive_bw
else:
bdata, bi = bootstrap_resample(self.data)
b = Scms(bdata, self.bw, min_radius=self.min_radius)
if copy_bw:
b.adaptive_bw = self.adaptive_bw[bi]
if method == 'LocInv' or method == 'GradientLogP':
bh, bp, bg, _ = b._nlocal_inv_cov(self.landmarks)
else:
bp, bg, bh = b._kernel_density_estimate(self.landmarks)
gproj = np.sum(self.landmarks_g * bg, axis=1) / np.linalg.norm(self.landmarks_g, axis=1)
hproj = np.sum(
matrix_multiply(bh.transpose((0, 2, 1)), self.landmarks_h_eigvecs) * self.landmarks_h_eigvecs, axis=1)
return bp, bg, bh, gproj, hproj
with Pool(n_jobs) as p:
res = p.map(boostrap, range(n_bootstrap))
for bp, bg, bh, gproj, hproj in res:
self.landmarks_bootstrap_p.append(bp)
self.landmarks_bootstrap_g.append(bg)
self.landmarks_bootstrap_h.append(bh)
self.landmarks_bootstrap_g_proj.append(gproj)
self.landmarks_bootstrap_h_proj.append(hproj)
# _, eigvecs = np.linalg.eigh(-bh)
# self.landmarks_bootstrap_h_dir.append(np.abs(np.sum(eigvecs * self.landmarks_h_eigvecs,axis=1)))
self.landmarks_bootstrap_g_proj = np.vstack(self.landmarks_bootstrap_g_proj).T
self.landmarks_bootstrap_h_proj = np.dstack(self.landmarks_bootstrap_h_proj)
# self.landmarks_bootstrap_h_dir = np.dstack(self.landmarks_bootstrap_h_dir)
return self.landmarks_bootstrap_g_proj, self.landmarks_bootstrap_h_proj