Source code for quasildr.structdr

# -*- coding: utf-8 -*-
"""
This module implements nonparametric ridge estimation and relevant
functions.  The main class is `Scms` which provides ridge estimation
and bootstrap functions.
"""
import warnings

import numpy as np
from multiprocess import Pool
from numpy.core.umath_tests import matrix_multiply
from numpy.lib.index_tricks import as_strided
from scipy import linalg
from scipy.spatial.distance import cdist
from scipy.special import logsumexp
from sklearn.neighbors import NearestNeighbors

from .external import neighbors


[docs]def vdist(a, b):
    """
    This function is similar to cdist but returning vectors rather than norm of vectors
    implemented via virtual copies of a and b

    Parameters
    ----------
    a,b : np.ndarray
        2D arrays of size (M,D), (N,D) for which differences for all pairwise combinations
        of rows in the two arrays will be computed. The number of columns have to the same
        between a and b.

    Returns
    ----------
    numpy.array
        3D array of size (N, M, D), containing all pairwise differences of rows in a and b.
    """
    return as_strided(a, strides=(0, a.strides[0], a.strides[1]),
                      shape=(b.shape[0], a.shape[0], a.shape[1])) - \
           as_strided(b, strides=(b.strides[0], 0, b.strides[1]),
                      shape=(b.shape[0], a.shape[0], b.shape[1]))


[docs]def bootstrap_resample(X, n=None):
    """ Bootstrap resample an array_like

    Parameters
    ----------
    X : array_like
      data to resample
    n : int, optional
      length of resampled array, equal to len(X) if n==None

    Returns
    -------
    array_like, numpy.array
        * Bootstrap resampled array
        * 1D array of indices corresponding to input.
    """
    if n is None:
        n = len(X)

    resample_i = np.floor(np.random.rand(n) * len(X)).astype(int)
    samples = X[resample_i]
    return samples, resample_i


[docs]def multilevel_compression(data, n=5000, threshold=None, k=20):
    """
    Iteratively combine nearest neighbor pairs below distance threshold.
    This function can be used to reduce the number of data points to represent the whole dataset
    without greatly affect the density estimation quality. Most of the data points in low density
    areas will be unaffected and data points in high density areas will be combined and averaged.

    Parameters
    ----------
    data : array
        2D array of size (N, D)
    n : int, optional
        Choose the distance threshold by the n-th largest k-NN distance; must be smaller than `N x k`.
    threshold : float or None, optional
        The distance threshold for combining nearest neighbor pairs. Override `N` when given.
    k : int, optional
        Number of nearesest neigbors for KNN graph. Choose larger k for better approximation,
        smaller for better speed. Use at least 2.

    Returns
    ----------
    tuple(list, list)
        * list of level 0, level 1, level 2, ... data points. As each level a data point corresponds to
            2^0, 2^1, 2^2, ... original data points.
        * list of input data indices corresponding to level-0,1,2,... data points.

    """

    data = np.asarray(data)
    c, _ = neighbors.neighbors(data, n_neighbors=np.minimum(k, data.shape[0]), metric='euclidean')

    c = c.nonzero()
    d = np.linalg.norm(data[c[0], :] - data[c[1], :], axis=1)
    isorted = np.argsort(d)

    _, iisorted = np.unique(c[0][isorted], return_index=True)

    source_i = c[0][isorted[iisorted]]
    target_i = c[1][isorted[iisorted]]
    source_ik = c[0][isorted]
    target_ik = c[1][isorted]
    dk = d[isorted]
    d = d[isorted[iisorted]]
    assert len(source_i) == data.shape[0]
    if threshold == None:
        threshold = -np.sort(-d)[n]

    solo_index = source_i[d > threshold]
    source_ik = source_ik[dk <= threshold]
    target_ik = target_ik[dk <= threshold]
    dk = dk[dk <= threshold]

    i = (~np.isin(source_ik, solo_index)) * (~np.isin(target_ik, solo_index))
    source_ik = source_ik[i]
    target_ik = target_ik[i]
    dk = dk[i]

    pairs_to_combine = np.vstack([source_ik, target_ik])
    pairs_to_combine_done = []
    while pairs_to_combine.shape[1] > 0:
        pairs_to_combine_flat = pairs_to_combine.T.flatten()
        _, i = np.unique(pairs_to_combine_flat, return_index=True)
        temp = pairs_to_combine_flat[i].copy()
        pairs_to_combine_flat[:] = -1
        pairs_to_combine_flat[i] = temp
        pairs_to_combine_working = np.reshape(pairs_to_combine_flat, (int(len(pairs_to_combine_flat) / 2), 2))
        pairs_to_combine_working = pairs_to_combine_working[~np.any(pairs_to_combine_working == -1, axis=1), :]
        pairs_to_combine_done.append(pairs_to_combine_working)
        pairs_to_combine_working = pairs_to_combine_working.flatten()
        pairs_to_combine = pairs_to_combine[:, (~np.isin(pairs_to_combine[0, :], pairs_to_combine_working)) * (
            ~np.isin(pairs_to_combine[1, :], pairs_to_combine_working))]

    while True:
        retry_index = np.setdiff1d(source_ik, np.vstack(pairs_to_combine_done).flatten())
        if len(retry_index) == 0:
            break
        c_r, _ = neighbors.neighbors(data[retry_index, :], n_neighbors=np.minimum(k, len(retry_index)),
                                     metric='euclidean')
        c_r = c_r.nonzero()
        d_r = np.linalg.norm(data[retry_index, :][c_r[0], :] - data[retry_index, :][c_r[1], :], axis=1)

        isorted = np.argsort(d_r)
        source_ik_r = c_r[0][isorted]
        target_ik_r = c_r[1][isorted]
        dk_r = d_r[isorted]

        source_ik_r = source_ik_r[dk_r < threshold]
        target_ik_r = target_ik_r[dk_r < threshold]
        dk_r = dk_r[dk_r < threshold]
        if len(source_ik_r) == 0:
            break
        pairs_to_combine = np.vstack([retry_index[source_ik_r], retry_index[target_ik_r]])
        while pairs_to_combine.shape[1] > 0:
            pairs_to_combine_flat = pairs_to_combine.T.flatten()
            _, i = np.unique(pairs_to_combine_flat, return_index=True)
            temp = pairs_to_combine_flat[i].copy()
            pairs_to_combine_flat[:] = -1
            pairs_to_combine_flat[i] = temp
            pairs_to_combine_working = np.reshape(pairs_to_combine_flat, (int(len(pairs_to_combine_flat) / 2), 2))
            pairs_to_combine_working = pairs_to_combine_working[~np.any(pairs_to_combine_working == -1, axis=1), :]
            pairs_to_combine_done.append(pairs_to_combine_working)
            pairs_to_combine_working = pairs_to_combine_working.flatten()
            pairs_to_combine = pairs_to_combine[:, (~np.isin(pairs_to_combine[0, :], pairs_to_combine_working)) * (
                ~np.isin(pairs_to_combine[1, :], pairs_to_combine_working))]

    if len(pairs_to_combine_done) == 0:
        solo_data = data
        return [solo_data]
    else:
        pairs_to_combine_done = np.vstack(pairs_to_combine_done)

        solo_index = np.concatenate([solo_index, np.setdiff1d(source_ik, np.vstack(pairs_to_combine_done).flatten())])
        solo_data = data[solo_index, :]
        pair_data = (data[pairs_to_combine_done[:, 0], :] + data[pairs_to_combine_done[:, 1], :]) / 2.0
        if len(pair_data) < 50:
            collapsed_index = [[solo_index], [pairs_to_combine_done[:, 0], pairs_to_combine_done[:, 1]]]
            return [solo_data, pair_data], collapsed_index
        else:
            coarse_data, coarse_index = multilevel_compression(pair_data, 0, threshold)
            collapsed_index = [[solo_index]] + [
                [pairs_to_combine_done[i, 0] for i in ind] + [pairs_to_combine_done[i, 1] for i in ind] for ind in
                coarse_index]
            return [solo_data] + coarse_data, collapsed_index


[docs]class Scms(object):
    """
    An implementation of nonparametric ridge estimation.
    The main algorithm implemented is subspace constrained mean-shift (SCMS).

    Parameters
    ----------
    data : array or list
        * if `data` is a 2D array of size (N, D). Regular KDE is used.
        * if `data` is a list. `data` is interpreted as compressed data as given by
        `ridge_estimation.multilevel_compression` function.
        Multi-level KDE is used. Different levels one data point corresponds to 1, 2, 4, 8, 16, ...
        regular datapoints.

    bw : float
        Gaussian kernel bandwidth for KDE.
    min_radius: int
        Set data-point specific minimum bandwidth to be the distance to its `min_radius`-th nearest neighbor.

    Attributes
    ----------
    bw : float
    adaptive_bw : array
        Kernel bandwidth specified for each data point. This is the bandwidth directly used in the algorithm.
    params : dict
        Save parameters used in running Scms.scms(...) for bootstrap.

    """

    def __init__(self, data, bw, min_radius=10):
        self.data = data
        self.bw = bw
        self.min_radius = min_radius
        self.params = {}
        if not isinstance(data, list):
            self._set_adaptive_bandwidth(self.min_radius)
            self.multilevel = False
        else:
            self.multilevel = True

        # self.p = np.exp(self._kernel_density_estimate(data, output_onlylogp=True))
        # self.maxp = np.max(self.p)

[docs]    def inverse_density_sampling(self, X, n_samples=2000, n_jobs=1, batch_size=16):
        """
        Extract a subset of data points to represent the whole data. Data points
        from high density areas are most likely to be subsampled and those from
        low density areas are preserved.  This is efficiently implemented with a
        trick of sorting the data by log density + Gumbel noise.

        Parameters
        ----------
        X : 2D array
            Input data points
        n_samples : int, optional
            Number of sub-sampled data points. Default is `2000`.
        n_jobs : int, optional
            Number of jobs to run in parallel. Default is `1`.
        batch_size : int, optional
            Batch size. Set this to a number number to reduce memory usage. Default is `16`

        Returns
        -------
        indices : 1D array
            Subsample indices corresponding to input `X`.

        """
        if n_jobs == 1:
            logp = [self._density_estimate(pos, output_onlylogp=True)
                 for pos in np.array_split(X, np.ceil(X.shape[0] / batch_size))]
        else:
            with Pool(n_jobs) as pool:
                logp = pool.map(
                    lambda pos: self._density_estimate(pos, output_onlylogp=True),
                    np.array_split(X, np.ceil(X.shape[0] / (batch_size * n_jobs))))
        logp = np.concatenate(logp)
        s = -logp + np.random.gumbel(size=logp.shape)
        return np.argsort(-s)[:n_samples]

[docs]    def reset_bw(self, bw=None, min_radius=None):
        """
        Reset bandwidth.

        Parameters
        ----------
        bw : float
            Gaussian kernel bandwidth for KDE.
        min_radius: int
            Set data-point specific minimum bandwidth to be the distance to its `min_radius`-th nearest neighbor.


        Returns
        -------
        None
        """
        if bw:
            self.bw = bw
        if min_radius:
            self.min_radius = min_radius

        if self.min_radius > 0:
            self._set_adaptive_bandwidth(self.min_radius)
        else:
            self.adaptive_bw = np.ones(self.data.shape[0]) * self.bw

    def _set_adaptive_bandwidth(self, K):
        if K > 0:
            self.nbrs = NearestNeighbors(n_neighbors=K + 1).fit(self.data)
            self.adaptive_bw = np.maximum(self.nbrs.kneighbors(self.data)[0][:, -1], self.bw)
        else:
            self.adaptive_bw = np.ones(self.data.shape[0]) * self.bw

    def _density_estimate(self, X, output_onlylogp=False):
        """
        Estimate density and relevant quantities for data points specified by X.

        Parameters
        ----------
        X : array
            2D array. Input to density estimation.
        output_onlylogp : bool
            If true, returns logp, else returns p, g, h, msu.


        Returns
        -------
        p : array
            1D array.  Unnormalized probability density. The probability density
            is not normalized due to numerical stability. Exact log probability
            density is returned if `output_onlylogp=True`.
        g : array, optional
            2D array. Gradient of the probability density function.
        h : array, optional
            3D array. Hessian of the probability density function.
        msu :
            2D array. Meanshift update based on the probability density function.

        """
        if self.multilevel:
            if output_onlylogp:
                p = self._multilevel_kernel_density_estimate(X, output_onlylogp)
            else:
                p, g, h, msu = self._multilevel_kernel_density_estimate(X, output_onlylogp)
        else:
            if output_onlylogp:
                p = self._kernel_density_estimate(X, output_onlylogp)
            else:
                p, g, h, msu = self._kernel_density_estimate(X, output_onlylogp)

        if output_onlylogp:
            return p
        else:
            return p, g, h, msu

    def _kernel_density_estimate(self, X, output_onlylogp=False, ):
        """
        Estimate density and relevant quantities for data points specified by X
        with kernel density estimation.

        Parameters
        ----------
        X : array
            2D array including multiple data points. Input to density estimation.
        output_onlylogp : bool
            If true, returns logp, else returns p, g, h, msu.

        Returns
        -------
        p : array
            1D array.  Unnormalized probability density. The probability density
            is not normalized due to numerical stability. Exact log probability
            density is returned if `output_onlylogp=True`.
        g : array, optional
            2D array. Gradient of the probability density function.
        h : array, optional
            3D array. Hessian of the probability density function.
        msu :
            2D array. Meanshift update based on the probability density function.
        """

        # the number of data points and the dimensionality
        n, d = self.data.shape

        # and evaluate the kernel at each distance
        D = cdist(self.data, X)

        # prevent numerical overflow due to large exponentials
        logc = -d * np.log(np.min(self.adaptive_bw)) - d / 2 * np.log(2 * np.pi)
        C = (self.adaptive_bw[:, np.newaxis] / np.min(self.adaptive_bw)) ** (-d) * \
            np.exp(-1 / 2. * (D / self.adaptive_bw[:, np.newaxis]) ** 2)

        if output_onlylogp:
            # return the kernel density estimate
            return np.log(np.mean(C, axis=0).T) + logc
        else:
            # gradient of p
            u = vdist(self.data, X)
            g = np.mean((C / (self.adaptive_bw[:, np.newaxis] ** 2)).T[:, :, np.newaxis] * u, axis=1)

            # hessian of p
            Z = np.eye(d)
            h = matrix_multiply(
                (C / (n * self.adaptive_bw[:, np.newaxis] ** 4)).T[:, np.newaxis, :] * u.transpose((0, 2, 1)), u) - \
                as_strided(Z, strides=(0, Z.strides[0], Z.strides[1]), shape=(C.shape[1], Z.shape[0], Z.shape[1])) * \
                (np.sum(C / (n * self.adaptive_bw[:, np.newaxis] ** 2), axis=0)[:, np.newaxis, np.newaxis])
            # for computation of msu = Cp * g
            Cp = 1. / np.mean(C / (self.adaptive_bw[:, np.newaxis] ** 2), axis=0)
            # return the kernel density estimate
            return np.mean(C, axis=0).T, g, h, Cp[:, np.newaxis] * g

    def _multilevel_kernel_density_estimate(self, X, output_onlylogp=False, normalized=True):
        """
        Estimate density and relevant quantities for data points specified by X
        with kernel density estimation and multi-level data representation.
        Data point-specific bandwidth (self.adaptive_bw) is not supported and self.bw
        is used instead.


        Parameters
        ----------
        X : array
            2D array. Input to density estimation.
        output_onlylogp : bool
            If true, returns logp, else returns p, g, h, msu.

        Returns
        -------
        p : array
            1D array.  Unnormalized probability density. The probability density
            is not normalized due to numerical stability. Exact log probability
            density is returned if `output_onlylogp=True`.
        g : array, optional
            2D array. Gradient of the probability density function.
        h : array, optional
            3D array. Hessian of the probability density function.
        msu :
            2D array. Meanshift update based on the probability density function.
        """

        # the number of data points and the dimensionality
        npgh = []
        for data in self.data:
            n, d = data.shape
            # and evaluate the kernel at each distance
            D = cdist(data, X)
            if normalized:
                # prevent numerical overflow due to large exponentials
                logc = -d * np.log(self.bw) - d / 2 * np.log(2 * np.pi)
            else:
                warnings.warn("Warning: use unnormalized probablity. This does not affect density ridge-related estimates.")
                logc = 0
            C = np.exp(-1 / 2. * (D / self.bw) ** 2)

            if output_onlylogp:
                # return the kernel density estimate
                npgh.append((n, np.log(np.mean(C, axis=0).T) + logc))
            else:
                # gradient of p
                u = vdist(data, X)
                g = np.mean((C / (self.bw ** 2)).T[:, :, np.newaxis] * u, axis=1)

                # hessian of p
                Z = np.eye(d)
                h = matrix_multiply((C / (n * self.bw ** 4)).T[:, np.newaxis, :] * u.transpose((0, 2, 1)), u) - \
                    as_strided(Z, strides=(0, Z.strides[0], Z.strides[1]), shape=(C.shape[1], Z.shape[0], Z.shape[1])) * \
                    (np.sum(C / (n * self.bw ** 2), axis=0)[:, np.newaxis, np.newaxis])
                Cp = 1. / np.mean(C / (self.bw ** 2), axis=0)
                npgh.append((n, np.mean(C, axis=0).T, g, h, Cp))

            if output_onlylogp:
                ns = np.asarray([n * 2 ** i for i, (n, _) in enumerate(npgh)])
                ws = ns / ns.sum()
                logps = [(logp + np.log(ws[i]))[np.newaxis, :] for i, (n, logp) in enumerate(npgh)]
                logps = np.vstack(logps)
                logps = logsumexp(logps, axis=0)

                return logps

            else:
                ns = np.asarray([n * 2 ** i for i, (n, _, _, _, _) in enumerate(npgh)])
                ws = ns / ns.sum()

                ps, gs, hs, cps = npgh[0][1:]
                ps = ps * ws[0]
                gs = gs * ws[0]
                hs = hs * ws[0]
                cps = cps * ws[0]
                for i, (n, p, g, h, cp) in enumerate(npgh[1:]):
                    ps = ps + p * ws[i]
                    gs = gs + g * ws[i]
                    hs = hs + h * ws[i]
                    cps = cps + cps * ws[i]

                return ps, gs, hs, cps[:, np.newaxis] * gs

    def _nlocal_inv_cov(self, X):
        """
        Computes the hessian of log density function.
        This function also returns other density estimation related
        quantities.

        Parameters
        ----------
        X : array
            2D array. Input data points.

        Returns
        -------
        negative_local_inverse_covariance : array
            3D array. Hessian of log probability density function (negative local
            inverse covariance).
        p : array
            1D array.  Unnormalized probability density. The probability density
            is not normalized due to numerical stability. Exact log probability
            density is returned if `output_onlylogp=True`.
        g : array
            2D array. Gradient of the probability density function.
        h : array
            3D array. Hessian of the probability density function.
        msu :
            2D array. Meanshift update based on the probability density function.
        """

        p, g, h, msu = self._density_estimate(X)
        return 1. / p[:, np.newaxis, np.newaxis] * h - 1. / p[:, np.newaxis, np.newaxis] ** 2 * matrix_multiply(
            g[:, :, np.newaxis], g[:, :, np.newaxis].transpose((0, 2, 1))), p, g, h, msu

[docs]    def scms_update(self, X, method='LocInv', stepsize=0.01, ridge_dimensionality=1, relaxation=0):
        """
        Compute the constrained mean shift update of the point at x.

        Parameters
        ----------
        X : array
            2D array. Input data points.
        method : str {'Gradient'|'GradientLogP'|'SuRF'|'LocInv'}, optional
            * Gradient
                Projected gradient update based on Hessian of probability density function p.
                Update is specified by projected gradient of p * step size.
            * GradientLogP
                Projected gradient update based on Hessian of log probability density function log(p).
                Update is specified by projected gradient of log(p) * step size.
            * SuRF
                Projected mean-shift update based on Hessian of probability density function p.
                Update is specified by projected mean shift update * step size.
            * LocInv
                Projected mean-shift update based on Hessian of probability density function log p.
                Update is specified by projected mean shift update * step size.
            Default is 'LocInv'.
        stepsize : float, optional
            Step size.
        ridge_dimensionality : int, optional
            Ridge dimensionality.
        relaxation : float [0,1], optional
            Relax the ridge dimensionality based on eigen gap corresponding to the ridge_dimensionality.
            Typically used with `ridge_dimensionality=1`. All point are projected to one-dimensional ridges
            (trajectories) when `relaxation=0`. All points are projected to two-dimensional ridges
            (surfaces) when `relaxation=1`.

        Returns
        ----------
        update : 2D array
            SCMS update.
        relax_bool : 1D array
            Indicate whether the ridge dimensionality of a particular point was increased by 1 relative to
            what `ridge_dimensionality` specified. The amount of relaxation is controlled by `relaxation` argument.

        """

        if method == 'GradientP':
            _, g, h, msu = self._density_estimate(X)
            eigvals, eigvecs = np.linalg.eigh(-h)
            eigvals = -eigvals
            update = g * stepsize

        elif method == 'GradientLogP':
            h, p, g, _, msu = self._nlocal_inv_cov(X)
            g = g / p[:, np.newaxis]

            eigvals, eigvecs = np.linalg.eigh(-h)
            eigvals = -eigvals
            update = g * stepsize
        else:
            if method == 'MSP':
                # h equals hessian of p(x)
                p, g, h, msu = self._density_estimate(X)
                eigvals, eigvecs = np.linalg.eigh(-h)
                eigvals = -eigvals
            elif method == 'LocInv' or method == 'MSLogP':
                # h equals negative hessian of log(p(x)), or "local inverse covariance"
                h, p, g, _, msu = self._nlocal_inv_cov(X)
                eigvals, eigvecs = np.linalg.eigh(-h)
                eigvals = -eigvals
            else:
                raise ValueError('Method has to be one of Gradient, GradientLogp, MSP, MSLogP, or LocInv.')

            update = msu
            update = update * stepsize

        if ridge_dimensionality == 0:
            relax_bool = np.zeros((X.shape[0],), dtype=bool)
            return update, relax_bool
        else:
            eigvecs[:, :, :ridge_dimensionality] = 0
            # sort the eigenvectors by decreasing eigenvalue
            if relaxation > 0 and ridge_dimensionality >= 1:
                relax_bool = (eigvals[:, ridge_dimensionality - 1] - eigvals[:, ridge_dimensionality]) / (
                        eigvals[:, ridge_dimensionality - 1] - eigvals[:, -1]) < relaxation
            else:
                relax_bool = np.zeros((X.shape[0],), dtype=bool)
            eigvecs[relax_bool, :, ridge_dimensionality] = 0


            return matrix_multiply(matrix_multiply(eigvecs, eigvecs.transpose((0, 2, 1))), update[:, :, np.newaxis])[:,
                   :, 0], relax_bool

[docs]    def scms(self, X, n_iterations=50, threshold=0, tol=1e-7, method='LocInv', stepsize=0.5, ridge_dimensionality=1,
             relaxation=0, n_jobs=1, batch_size=16):
        """
        Performs subspace constrained mean shift on the entire data set using
        a Gaussian kernel with bandwidth of bw.

        Parameters
        ----------
        X : array
            2D array. Input data points.
        n_iterations : int, optional
            The number of iterations for running the algorithm
        threshold : float, optional
            Filter out data points below a probability density threshold.
            The threshold value is specified relative to the maximum density.
        tol : float, optional
            Tolerance for assessing convergence, stop if `max(abs(new_pos - pos)) < tol`.
        method : str {'GradientP'|'GradientLogP'|'MSP'|'MSLogP'|'InvCov'}, optional
            * GradientP
                Projected gradient update based on Hessian of probability density function p.
                Update is specified by projected gradient of p * step size.
            * GradientLogP
                Projected gradient update based on Hessian of log probability density function log(p).
                Update is specified by projected gradient of log(p) * step size.
            * MSP
                Projected mean-shift update based on Hessian of probability density function p.
                Update is specified by projected mean shift update * step size.
            * MSLogP or InvCov (equivalent)
                Projected mean-shift update based on Hessian of probability density function log p.
                Update is specified by projected mean shift update * step size.
            Default is 'InvCov'.
        stepsize : float, optional
            Step size. Default is 0.5
        ridge_dimensionality : int, optional
            Ridge dimensionality. Default is 1.
        relaxation : float [0,1], optional
            Relax the ridge dimensionality based on eigen gap corresponding to the ridge_dimensionality.
            Typically used with `ridge_dimensionality=1`. All point are projected to one-dimensional ridges
            (trajectories) when `relaxation=0`. All points are projected to two-dimensional ridges
            (surfaces) when `relaxation=1`. Default is 0.
        n_jobs : int, optional
            Number of jobs to run in parallel. Default is 1.
        batch_size : int, optional
            Number of data points to process in a batch. Reduce this number can decrease memory usage.
            Default is 500.

        Returns
        ----------
        pos : array
            Updated data points postions.
        ifilter : array
            Indices of returned positions (useful when `threshold > 0` so part of the input are filtered).
        """

        self.params = {'X': X, 'n_iterations': n_iterations, 'threshold': threshold, 'tol': tol, 'method': method,
                       'stepsize': stepsize, 'ridge_dimensionality': ridge_dimensionality,
                       'relaxation': relaxation}

        self.X = X.copy()
        data_copy = self.data.copy()

        if threshold > 0:
            p = np.exp(self._kernel_density_estimate(X, output_onlylogp=True))
            maxp = np.max(np.exp(self._kernel_density_estimate(self.data, output_onlylogp=True)))
            ifilter = np.where(p >= (maxp * threshold))[0]
        else:
            ifilter = np.arange(X.shape[0])

        pos = X[ifilter, :]
        if n_jobs > 1:
            with Pool(n_jobs) as pool:
                for j in range(n_iterations):
                    pos_old = pos.copy()
                    if X is None:
                        self.data = pos.copy()

                    updates = pool.map(
                        lambda pos: self.scms_update(pos, method=method, stepsize=stepsize,
                                                     ridge_dimensionality=ridge_dimensionality, relaxation=relaxation),
                        np.array_split(pos, np.ceil(pos.shape[0] / (batch_size * n_jobs))))

                    if len(updates) == 1:
                        update = updates[0][0]
                        self.relax_bool = updates[0][1]
                    else:
                        update = np.vstack([i[0] for i in updates])
                        self.relax_bool = np.concatenate([i[1] for i in updates], axis=0)
                    pos = pos + update
                    if np.max(np.sum(np.abs(pos_old - pos), axis=1)) < tol:
                        break
        else:
            for j in range(n_iterations):
                pos_old = pos.copy()
                if X is None:
                    self.data = pos.copy()

                updates = list(map(
                    lambda pos: self.scms_update(pos, method=method, stepsize=stepsize,
                                                 ridge_dimensionality=ridge_dimensionality, relaxation=relaxation),
                    np.array_split(pos, np.ceil(pos.shape[0] / batch_size ))))

                if len(updates)==1:
                    update = updates[0][0]
                    self.relax_bool = updates[0][1]
                else:
                    update = np.vstack([i[0] for i in updates])
                    self.relax_bool = np.concatenate([i[1] for i in updates],axis=0)
                pos = pos + update
                if np.max(np.sum(np.abs(pos_old - pos), axis=1)) < tol:
                    break

        self.data = data_copy
        self.ridge = pos
        self.ifilter = ifilter
        return pos, ifilter


[docs]    def boostrap_trajectory(self, n_bootstrap=25, n_jobs=1, copy_bw=True):
        """
        Construct confidence set of density ridge positions through bootstrap. For each bootstrap sample,
        distance to its nearest neighbors in the original ridge estimate is recorded. The bootstrap runs
        will copy the parameters of the previous `scms` call, therefore `scms` has to be executed
        before this function.

        Parameters
        ----------
        n_bootstrap : int, optional
            Number of bootstrap runs. Default is 25.
        n_jobs : int, optional
            Number of jobs to run in parallel. Default is 1. Only specify `n_jobs>1` if `n_jobs=1` is used
            when running `scms`.
        copy_bw : bool, optional
            If true, the point-specific kernel bandwidths are unchanged after resampling.
            This option only has an effect is `self.min_radius > 0`.

        Returns
        ----------
        boostrap_n : array, shape (n_samples, n_bootstrap)
            2D array. Bootstrap nearest neighbor distances for each ridge position.
        boostrap_s : array, shape (n_samples, n_bootstrap)
            2D array. Bootstrap point-wise distances for each ridge position. Experimental.
        boostrap_ifilters : array, shape (n_samples, n_bootstrap)
            2D array. Bootstrap resample indices.
        """
        if self.params == {}:
            raise ValueError('Run scms first before bootstrap!')

        if self.min_radius > 0:
            warnings.warn('Boostrap with adaptive bandwidth (min_radius>0) preserves the '
                          'adaptive bandwidths and confidence sets contructed should be '
                          'interpreted as such. Use min_radius=0 to be consistent with '
                          'Chen et al. 2015 algorithm.')

        self.bootstrap_distances_n = []
        self.bootstrap_distances_s = []
        self.bootstrap_ifilters = []
        self.bootstrap_ridges = []

        def boostrap(i):
            np.random.seed(i)
            bdata, bi = bootstrap_resample(self.data)
            b = Scms(bdata, self.bw, min_radius=self.min_radius)
            if copy_bw:
                b.adaptive_bw = self.adaptive_bw[bi]

            _ridge, _ifilter = b.scms(**self.params)
            binaryfilter = np.zeros(self.data.shape[0], dtype=bool)
            binaryfilter[_ifilter] = True
            a = np.zeros(self.data.shape)
            a.fill(np.nan)
            b = np.zeros(self.data.shape)
            b.fill(np.nan)

            a[_ifilter, :] = _ridge
            b[self.ifilter, :] = self.ridge
            dists = linalg.norm(a - b, axis=1)[self.ifilter]
            return NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(_ridge).kneighbors(self.ridge)[0], \
                   dists, binaryfilter, _ridge

        with Pool(n_jobs) as p:
            res = p.map(boostrap, range(n_bootstrap))
            for dist_n, dist_s, ifilter, ridge in res:
                self.bootstrap_distances_n.append(dist_n)
                self.bootstrap_distances_s.append(dist_s)
                self.bootstrap_ifilters.append(ifilter)
                self.bootstrap_ridges.append(ridge)


        self.bootstrap_distances_n = np.hstack(self.bootstrap_distances_n)
        self.bootstrap_distances_s = np.vstack(self.bootstrap_distances_s).T
        self.bootstrap_ifilters = np.hstack(self.bootstrap_ifilters)
        return self.bootstrap_distances_n, self.bootstrap_distances_s, self.bootstrap_ifilters
        # return scoreatpercentile(self.bootstrap_distances_n, percentile_confidence, axis=1),
        #       scoreatpercentile(self.bootstrap_distances_s, percentile_confidence, axis=1)

[docs]    def bootstrap_landmark(self, X, n_bootstrap=20, method='LocInv', smooth_bootstrap=False, copy_bw=True, n_jobs=4):
        """
        Experimental function. Work in progress.
        """
        self.landmarks = X
        if method == 'LocInv' or method == 'GradientLogP':
            self.landmarks_h, self.landmarks_p, self.landmarks_g, _ = self._nlocal_inv_cov(self.landmarks)
        else:
            self.landmarks_p, self.landmarks_g, self.landmarks_h = self._kernel_density_estimate(self.landmarks)

        # use megative of h because eigh sort eigen values in ascending order
        eigvals, eigvecs = np.linalg.eigh(-self.landmarks_h)
        eigvals = -eigvals
        self.landmarks_h_eigvecs = eigvecs
        self.landmarks_h_eigvals = eigvals

        self.landmarks_bootstrap_p = []
        self.landmarks_bootstrap_g = []
        self.landmarks_bootstrap_h = []
        self.landmarks_bootstrap_g_proj = []
        self.landmarks_bootstrap_h_proj = []
        self.landmarks_bootstrap_h_dir = []

        def boostrap(i):
            np.random.seed(i)
            if smooth_bootstrap:
                b = Scms(self.data + np.random.randn(*self.data.shape) * self.adaptive_bw, self.bw,
                         min_radius=self.min_radius)
                if copy_bw:
                    b.adaptive_bw = self.adaptive_bw
            else:
                bdata, bi = bootstrap_resample(self.data)
                b = Scms(bdata, self.bw, min_radius=self.min_radius)
                if copy_bw:
                    b.adaptive_bw = self.adaptive_bw[bi]

            if method == 'LocInv' or method == 'GradientLogP':
                bh, bp, bg, _ = b._nlocal_inv_cov(self.landmarks)
            else:
                bp, bg, bh = b._kernel_density_estimate(self.landmarks)

            gproj = np.sum(self.landmarks_g * bg, axis=1) / np.linalg.norm(self.landmarks_g, axis=1)
            hproj = np.sum(
                matrix_multiply(bh.transpose((0, 2, 1)), self.landmarks_h_eigvecs) * self.landmarks_h_eigvecs, axis=1)
            return bp, bg, bh, gproj, hproj

        with Pool(n_jobs) as p:
            res = p.map(boostrap, range(n_bootstrap))
            for bp, bg, bh, gproj, hproj in res:
                self.landmarks_bootstrap_p.append(bp)
                self.landmarks_bootstrap_g.append(bg)
                self.landmarks_bootstrap_h.append(bh)
                self.landmarks_bootstrap_g_proj.append(gproj)
                self.landmarks_bootstrap_h_proj.append(hproj)
                # _, eigvecs = np.linalg.eigh(-bh)
                # self.landmarks_bootstrap_h_dir.append(np.abs(np.sum(eigvecs * self.landmarks_h_eigvecs,axis=1)))

        self.landmarks_bootstrap_g_proj = np.vstack(self.landmarks_bootstrap_g_proj).T
        self.landmarks_bootstrap_h_proj = np.dstack(self.landmarks_bootstrap_h_proj)
        # self.landmarks_bootstrap_h_dir = np.dstack(self.landmarks_bootstrap_h_dir)

        return self.landmarks_bootstrap_g_proj, self.landmarks_bootstrap_h_proj