# -*- coding: utf-8 -*-
"""
This module implements the quaslinear representation method GraphDR and relevant
functions. The method is implemented in the graphdr function. A scikit-learn
Estimator-like interface to graphdr is also provided through the GraphDR class.
"""
import os
import scipy
import numpy as np
from scipy.sparse import csgraph, csr_matrix
from multiprocess import Pool
from scipy.sparse import issparse
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator
from sklearn.neighbors import kneighbors_graph
from sklearn.exceptions import NotFittedError
from .external import neighbors
[docs]def knn_graph(X, n_neighbors=15, space='l2', space_params=None, method='hnsw', num_threads=8, params={'post': 2},
verbose=False):
"""Construct an approximate K-nearest neighbors graph with nmslib.
Parameters
----------
X : array, shape (n_samples, n_features)
A input 2D array to construct KNN graph on. Each row represent a sample.
n_neighbors : int, optional
Default is `15`. The number of nearest neighbors per sample.
space : string, optional
Default is `l2`. The metric/non-metric distance functions to use to compute distances.
Note that l2 corresponds to Euclidean distance.
see https://github.com/nmslib/nmslib/blob/master/manual/manual.pdf
* bit_hamming
* l1
* l1_sparse
* l2, euclidean
* l2_sparse
* linf
* linf_sparse
* lp:p=...
* lp_sparse:p=...
* angulardist, angulardist sparse, angulardist sparse fast
* jsmetrslow, jsmetrfast, jsmetrfastapprox
* leven
* sqfd minus func, sqfd heuristic func:alpha=..., sqfd gaussian func:alpha=...
* jsdivslow, jsdivfast, jsdivfastapprox
* cosinesimil, cosine, cosinesimil sparse
* normleven
* kldivfast
* kldivgenslow, kldivgenfast, kldivgenfastrq
* itakurasaitoslow, itakurasaitofast, itakurasaitofastrq
* negdotprod_sparse
* querynorm_negdotprod_sparse
* renyi_diverg
* ab_diverg
space_params : dict or None, optional
Parameters for configuring the space.
method : str
The nmslib method to use. Default is `hnsw`.
num_threads : int, optional
Default is `8`. Number of threads to run nmslib with.
params : dict, optional
Default is `{'post': 2}`. Input parameters to nmslib index construction.
See https://github.com/nmslib/nmslib/blob/master/manual/methods.md for details.
verbose: bool, optional
Default is False. Print extra information.
Returns
-------
scipy.sparse.csr_matrix
Sparse matrix representing the constructed KNN graph.
"""
import nmslib
if space == 'euclidean':
space = 'l2'
elif space == 'manhattan':
space = 'l1'
elif space == 'cosine':
space = 'cosinesimil'
index = nmslib.init(method=method, space=space, space_params=space_params)
index.addDataPointBatch(X)
index.createIndex(params, print_progress=False)
neighbors = index.knnQueryBatch(X, k=n_neighbors, num_threads=num_threads)
ind = np.vstack([i for i, d in neighbors])
sind = np.repeat(np.arange(ind.shape[0]), ind.shape[1])
tind = ind.flatten()
g = csr_matrix((np.ones(ind.shape[0] * ind.shape[1]), (sind, tind)), (ind.shape[0], ind.shape[0]))
return g
[docs]def cg(A, B, x0=None, tol=1e-7, atol=1e-15, use_cuda=False):
"""
Solve X in linear equation AX = B when A is sparse with conjugate gradient method.
Support GPU with use_cuda=True.
Parameters
----------
A : sparse matrix or array
A N-by-N matrix of the linear system.
B : array
Right hand side of the linear system.
x0 : array or None, optional
If specified, initialize the solution with x0.
tol, atol : float, optional
Tolerances for convergence `norm(residual) <= max(tol*norm(b), atol)`.
Default is `tol=1e-7, atol=1e-15`.
use_cuda : bool, optional
Use GPU acceleration with CUDA. A GPU supporting CUDA is needed and CuPy
package need to be installed.
Default is `False`.
Returns
----------
array
The solution to X.
"""
if use_cuda:
import cupy
alg = cupy
A = alg.sparse.csr_matrix(A)
B = alg.array(B)
else:
alg = np
atol = alg.maximum(alg.sqrt(alg.sum(B * B, axis=0)) * tol, atol)
if not x0:
X = alg.ones((A.shape[1], B.shape[1]))
else:
X = x0
R = A.dot(X) - B
P = - R
r_k_norms = alg.sum(R * R, axis=0)
while True:
AP = A.dot(P)
alphas = r_k_norms / alg.sum(P * AP, axis=0)
X += alphas[alg.newaxis, :] * P
R += alphas[alg.newaxis, :] * AP
r_kplus1_norms = alg.sum(R * R, axis=0)
betas = r_kplus1_norms / r_k_norms
r_k_norms = r_kplus1_norms
if alg.all(alg.sqrt(r_kplus1_norms) < atol):
break
P = betas[alg.newaxis, :] * P - R
return X.get() if use_cuda else X
[docs]class GraphDR(BaseEstimator):
"""
Quasilinear data representation that preserves interpretability of a
linear space.
This class provides a scikit-learn type interface that wraps around
the graphdr function.
Parameters
----------
n_neighbors : int, optional
Number of nearest neigbors to compute per sample. Default is `10`.
regularization : float, optional
Regularization parameter determines the weight on the graph shrinkage term
relative to the reconstruction error term. A smaller regularization parameter
leads to output that are more similar to a linear representation. A larger
regularization parameter provides more contraction based on the graph. Default is `100`.
n_components : int or None, optional
Number of dimensions in the output. Specify a n_components smaller than the input
dimension when `no_rotation=True` can reduce the amount of computation.
Default is `None`.
no_rotation : bool, optional
:math:`W` is fixed to identity matrix so that the output is not rotated
relative to input. no_rotation can also speed up the computation for large
input when `n_components` is also specified, because only the required top
`n_components` are computed. If `no_rotation=True`, you may consider preprocessing
the input with another linear dimensionality reduction method like PCA.
Default is `False`.
metric : str, optional
distance metric for constructing nearest_neighbors graph. Note that here
`metric` can be non-metric distance function. The allowed values depend on
the `method` specified. As three nearest neighbors algorithms are supported,
and these algorithms support different distance metrics. Default is `euclidean`.
* 'small' :
Use sklearn.neighbors.kneighbors_graph
Supported values are
* euclidean
* manhattan
* chebyshev
* minkowski
* wminkowski
* seuclidean
* mahalanobis
* hamming
* canberra
* braycurtis
See https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html
* 'big_nndescent' or 'big_nnumap' :
umap.nndescent. Supported values are
* euclidean
* manhattan
* chebyshev
* minkowski
* canberra
* braycurtis
* mahalanobis
* wminkowski
* seuclidean
* cosine
* correlation
* haversine
* hamming
* jaccard
* dice
* russelrao
* kulsinski
* rogerstanimoto
* sokalmichener
* sokalsneath
* yule
* 'big_nmslib' :
nmslib. See https://github.com/nmslib/nmslib/blob/master/manual/manual.pdf for details:
* bit_hamming
* l1
* l1_sparse
* l2, euclidean
* l2_sparse
* linf
* linf_sparse
* lp:p=...
* lp_sparse:p=...
* angulardist, angulardist sparse, angulardist sparse fast
* jsmetrslow, jsmetrfast, jsmetrfastapprox
* leven
* sqfd minus func, sqfd heuristic func:alpha=..., sqfd gaussian func:alpha=...
* jsdivslow, jsdivfast, jsdivfastapprox
* cosinesimil, cosine, cosinesimil sparse
* normleven
* kldivfast
* kldivgenslow, kldivgenfast, kldivgenfastrq
* itakurasaitoslow, itakurasaitofast, itakurasaitofastrq
* negdotprod_sparse
* querynorm_negdotprod_sparse
* renyi_diverg
* ab_diverg
metric_params: dict, optional
Some distance metrics requires additional arguments which can be provided as
a dictionary. Default is `{}`.
method: str, optional {'auto', 'small', 'big_nmslib', 'big_umap', 'big_nndescent'}
Method to use. Different algorithms strongly affect the computation speed.
Optimal choices are dependent on the input size. Default is `auto`.
* 'auto'
Choose automatically based on input size. Recommended for most uses.
* 'small'
For input of small size, it is affordable to compute nearest-neigbors exactly
and use matrix inversion in the algorithm.
* 'big_nmslib'
For input of large size. If NMSlib is installed, it is the fastest algorithm that
is supported.
* 'big_nndescent'
For input of large size. The NN-Descent algorithm implemented in the the umap package.
Do not require external libraries.
* 'big_nnumap'
For input of large size. Do not require external libraries.
_lambda : float or None, optional
Regularization parameter :math:`\\lambda`. If not specified :math:`\\lambda` is
computed from `regularization`,
specifying `regularization` instead of `_lambda` is generally recommended as it
automatically adjusts based on graph density. Default is None.
symmetrize : bool, optional
Symmetrize the nearest-neighbors graph by averaging with its transpose. Default is `True`.
rescale : bool, optional
Rescale each dimension in the output to align with the input. It works with and without
`no_rotation`. Default is `False`
refine_iter : int, optional
Number of refinement iterations. Default is `3`.
refine_threshold : float, optional
Specify the cutoff for refinement. Unit is IQR (distance between the 25% quantile and the 75% quantile)
of overall "stress" distribution across all edges. Default is `12`.
_refine_threshold : float or None, optional
Used internally. Default is None.
tol, atol : float, optional
Tolerances for iterative linear solver `norm(residual) <= max(tol*norm(b), atol)`.
Default is `tol=1e-7, atol=1e-15`.
nmslib_params : dict, optional
Parameters given to nmslib createIndex function.
See https://github.com/nmslib/nmslib/blob/master/manual/methods.md for details.
Default is `{'post': 2}`.
n_jobs : int, optional
Number of jobs to run in parallel. Specifying `n_jobs > 1` can speed up computation
when method is one of `big_nmslib`, `big_nndescent`, or `big_nnumap`. Note that
when `use_cuda=True` specifying `n_jobs` does not run the jobs in parallel but
can reduce amount of GPU memory required by splitting the problem and run sequentially.
Default is 1.
n_jobs_nmslib : int, optional
Number of threads used by nmslib. Default is `8`.
use_cuda : bool, optional
Use CUDA GPU acceleration. CuPy has to be installed (https://github.com/cupy/cupy)
and a CUDA-enabled GPU is required. Default is `False`.
verbose : bool, optional
Print additional information. Default is `False`.
Attributes
----------
embedding_ : array, shape (n_samples, n_components)
W_ : array, shape (n_features, n_components)
Right linear operator in the transformation Z = KXW. W_ is None when `no_rotation=True`.
"""
def __init__(
self,
n_neighbors=10, regularization=1000, n_components=None, no_rotation=False,
metric='l2', metric_params={}, method='auto',
_lambda=None, symmetrize=True, rescale=False,
refine_iter=0, refine_threshold=12, _refine_threshold=None,
tol=1e-7, atol=1e-15, nmslib_params={'post': 2},
n_jobs=1, n_jobs_nmslib=8, use_cuda=False, verbose=False):
"""
Initialize parameters
"""
self.n_neighbors = n_neighbors
self.n_components = n_components
self.regularization = regularization
self._lambda = _lambda
self.metric = metric
self.metric_params = metric_params
self.method = method
self.symmetrize = symmetrize
self.rescale = rescale
self.no_rotation = no_rotation
self.refine_iter = refine_iter
self.refine_threshold = refine_threshold
self._refine_threshold = _refine_threshold
self.nmslib_params = nmslib_params
self.tol = tol
self.atol = atol
self.verbose = verbose
self.n_jobs = n_jobs
self.n_jobs_nmslib = n_jobs_nmslib
self.use_cuda = use_cuda
self.W_ = None
self.embedding_ = None
self.fitted = False
def _validate_parameters(self):
# TODO
pass
[docs] def fit(self, X, y=None, custom_graph=None, refine_protected_graph=None):
"""Fit the model from data in X.
Parameters
----------
X : array, shape (n_samples, n_features).
X may be a sparse matrix. If self.skip_pca is true, X must be orthogonal.
y : Ignored
custom_graph : sparse matrix, shape (n_samples, n_samples), optional
If provided, use it instead of the graph constructed from X.
refine_protected_graph : sparse matrix
Returns
----------
self : object
Returns the instance itself.
"""
output = graphdr(X, n_neighbors=self.n_neighbors, regularization=self.regularization,
metric=self.metric, metric_params=self.metric_params,
n_components=self.n_components, no_rotation=self.no_rotation, _lambda=self._lambda,
custom_graph=custom_graph, rescale=self.rescale, symmetrize=self.symmetrize,
refine_iter=self.refine_iter, refine_threshold=self.refine_threshold,
_refine_threshold=self._refine_threshold, refine_protected_graph=refine_protected_graph,
tol=self.tol, atol=self.atol, nmslib_params=self.nmslib_params, verbose=self.verbose,
n_jobs=self.n_jobs, n_jobs_nmslib=self.n_jobs_nmslib, use_cuda=self.use_cuda,
return_all=True)
if self.no_rotation:
Z, _ = output
self.W_ = None
else:
Z, W, _ = output
self.W_ = W
self.embedding_ = Z
self.fitted = True
return self
[docs]def graphdr(X, n_neighbors=10, regularization=100, n_components=None, no_rotation=False,
metric='euclidean', metric_params={}, method='auto',
_lambda=None, init=None, symmetrize=True,
custom_graph=None, rescale=False, refine_iter=0, refine_threshold=12,
_refine_threshold=None, refine_protected_graph=None, tol=1e-7, atol=1e-15,
nmslib_params={'post': 2}, n_jobs=1, n_jobs_nmslib=8, use_cuda=False, verbose=False,
return_all=False):
"""
Quasilinear data representation that preserves interpretability of a linear space.
Parameters
----------
X : array (n_samples, n_features)
The input array.
n_neighbors : int, optional
Number of nearest neigbors to compute per sample. Default is `10`.
regularization : float, optional
Regularization parameter determines the weight on the graph shrinkage term
relative to the reconstruction error term. A smaller regularization parameter
leads to output that are more similar to a linear representation. A larger
regularization parameter provides more contraction based on the graph. Default is `100`.
n_components : int or None, optional
Number of dimensions in the output. Specify a n_components smaller than the input
dimension when `no_rotation=True` can reduce the amount of computation.
Default is `None`.
no_rotation : bool, optional
:param refine_protected_graph:
:math:`W` is fixed to identity matrix so that the output is not rotated
relative to input. no_rotation can also speed up the computation for large
input when `n_components` is also specified, because only the required top
`n_components` are computed. If `no_rotation=True`, you may consider preprocessing
the input with another linear dimensionality reduction method like PCA.
Default is `False`.
metric : str, optional
distance metric for constructing nearest_neighbors graph. Note that here
`metric` can be non-metric distance function. The allowed values depend on
the `method` specified. As three nearest neighbors algorithms are supported,
and these algorithms support different distance metrics. Default is `euclidean`.
* 'small':
Use sklearn.neighbors.kneighbors_graph
Supported values are
* euclidean
* manhattan
* chebyshev
* minkowski
* wminkowski
* seuclidean
* mahalanobis
* hamming
* canberra
* braycurtis
See https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html
* 'big_nndescent' or 'big_nnumap':
umap.nndescent. Supported values are
* euclidean
* manhattan
* chebyshev
* minkowski
* canberra
* braycurtis
* mahalanobis
* wminkowski
* seuclidean
* cosine
* correlation
* haversine
* hamming
* jaccard
* dice
* russelrao
* kulsinski
* rogerstanimoto
* sokalmichener
* sokalsneath
* yule
* 'big_nmslib'
nmslib. See https://github.com/nmslib/nmslib/blob/master/manual/manual.pdf for details:
* bit_hamming
* l1
* l1_sparse
* l2, euclidean
* l2_sparse
* linf
* linf_sparse
* lp:p=...
* lp_sparse:p=...
* angulardist, angulardist sparse, angulardist sparse fast
* jsmetrslow, jsmetrfast, jsmetrfastapprox
* leven
* sqfd minus func, sqfd heuristic func:alpha=..., sqfd gaussian func:alpha=...
* jsdivslow, jsdivfast, jsdivfastapprox
* cosinesimil, cosine, cosinesimil sparse
* normleven
* kldivfast
* kldivgenslow, kldivgenfast, kldivgenfastrq
* itakurasaitoslow, itakurasaitofast, itakurasaitofastrq
* negdotprod_sparse
* querynorm_negdotprod_sparse
* renyi_diverg
* ab_diverg
metric_params: dict, optional
Some distance metrics requires additional arguments which can be provided as
a dictionary. Default is `{}`.
method: str, optional {'auto', 'small', 'big_nmslib', 'big_umap', 'big_nndescent'}
Method to use. Different algorithms strongly affect the computation speed.
Optimal choices are dependent on the input size. Default is `auto`.
* 'auto'
Choose automatically based on input size. Recommended for most uses.
* 'small'
For input of small size, it is affordable to compute nearest-neigbors exactly
and use matrix inversion in the algorithm.
* 'big_nmslib'
For input of large size. If NMSlib is installed, it is the fastest algorithm that
is supported.
* 'big_nndescent'
For input of large size. The NN-Descent algorithm implemented in the the umap package.
Do not require external libraries.
* 'big_nnumap'
For input of large size. Do not require external libraries.
_lambda : float or None, optional
Regularization parameter :math:`\\lambda`. If not specified :math:`\\lambda` is
computed from `regularization`,
specifying `regularization` instead of `_lambda` is generally recommended as it
automatically adjusts based on graph density. Default is None.
init : array or None, optional
Initialize output representation Z with this array if solved through iterative solver.
`init` is only used when `no_rotation=True` and method is not `small`.
Default is None.
symmetrize : bool, optional
Symmetrize the nearest-neighbors graph by averaging with its transpose. Default is `True`.
custom_graph : sparse matrix or None, optional
If specified, use this user-provided graph instead of constructing nearest-neighbors
graph from `X`. Default is None.
rescale : bool, optional
Rescale each dimension in the output to align with the input. It works with and without
`no_rotation`. Default is `False`
refine_iter : int, optional
Number of refinement iterations. Default is `3`.
refine_threshold : float, optional
Specify the cutoff for refinement. Unit is IQR (distance between the 25% quantile and the 75% quantile)
of overall "stress" distribution across all edges. Default is `12`.
_refine_threshold : float or None, optional
Used internally. Default is None.
refine_protected_graph : sparse matrix or None, optional
Sparse matrix of size (n_samples, n_samples).
Specify edges (any non-zero edges in refine_protected_graph) that will
not be removed during refinement. Useful for graph construction for complex experimental designs
like batch design so that edges connecting batches will not be removed. Default is `None`.
tol, atol : float, optional
Tolerances for iterative linear solver `norm(residual) <= max(tol*norm(b), atol)`.
Default is `tol=1e-7, atol=1e-15`.
nmslib_params : dict, optional
Parameters given to nmslib createIndex function.
See https://github.com/nmslib/nmslib/blob/master/manual/methods.md for details.
Default is `{'post': 2}`.
n_jobs : int, optional
Number of jobs to run in parallel. Specifying `n_jobs > 1` can speed up computation
when method is one of `big_nmslib`, `big_nndescent`, or `big_nnumap`. Note that
when `use_cuda=True` specifying `n_jobs` does not run the jobs in parallel but
can reduce amount of GPU memory required by splitting the problem and run sequentially.
Default is 1.
n_jobs_nmslib : int, optional
Number of threads used by nmslib. Default is `8`.
use_cuda : bool, optional
Use CUDA GPU acceleration. CuPy has to be installed (https://github.com/cupy/cupy)
and a CUDA-enabled GPU is required. Default is `False`.
verbose : bool, optional
Print additional information. Default is `False`.
return_all : bool, optional
Return a tuple of (Z, W, graph). Default is `False`.
Returns
-------
Z: array, (optional) W: array, (optional) Graph: sparse matrix
* Z : Embedding of the input data.
* W : Linear operator on the feature space, returned if `return_all==True`.
* graph : The graph used for GraphDR embedding, returned if `return_all==True`.
"""
if method == 'auto':
if X.shape[0] < 10000:
method = 'small'
else:
try:
import nmslib
method = 'big_nmslib'
except ImportError:
method = 'big_nndescent'
X_mean = X.mean(axis=0)
if method == 'small':
if custom_graph is None:
graph = kneighbors_graph(np.asarray(X), int(n_neighbors), metric=metric,
metric_params=metric_params, include_self=False)
else:
graph = custom_graph
if symmetrize:
graph = 0.5 * (graph + graph.T)
graphL = csgraph.laplacian(graph)
if _lambda is None:
_lambda = regularization / (graph.sum(axis=1).mean() / (X.shape[0] / 10000))
G = scipy.sparse.eye(X.shape[0]) + _lambda * graphL
if issparse(G):
Ginv = np.linalg.inv(G.todense())
else:
Ginv = np.linalg.inv(G)
if no_rotation:
if n_components:
try:
X = X[:, :n_components]
X_mean = X_mean[:n_components]
except:
raise ValueError('n_components value is not valid.')
Z = np.asarray(np.dot(X.T, Ginv).T)
else:
C = np.dot(np.dot(X.T, Ginv), X)
_, W = np.linalg.eigh(C)
W = np.array(W)
W = W[:, ::-1]
if n_components:
try:
W = W[:, :n_components]
except:
raise ValueError('n_components value is not valid.')
Z = np.asarray(np.dot(np.dot(W.T, X.T), Ginv).T)
graph = scipy.sparse.csr_matrix(graph)
graph.eliminate_zeros()
else:
if custom_graph is None:
if method == 'big_nmslib':
graph = knn_graph(X, n_neighbors=int(n_neighbors), num_threads=n_jobs_nmslib, space=metric,
params=nmslib_params, space_params=metric_params, verbose=verbose)
elif method == 'big_nnumap':
graph = \
neighbors.neighbors(X, n_neighbors=int(n_neighbors), metric=metric, metric_kwds=metric_params)[0]
elif method == 'big_nndescent':
graph = \
neighbors.neighbors(X, n_neighbors=int(n_neighbors), metric=metric, metric_kwds=metric_params,
smoothknn=False)[0]
else:
raise ValueError('Unrecognized method {}' % method)
else:
graph = custom_graph
if symmetrize:
graph = 0.5 * (graph + graph.T)
graphL = csgraph.laplacian(graph)
if _lambda is None:
_lambda = regularization / (graph.sum(axis=1).mean() / (X.shape[0] / 10000))
G = scipy.sparse.eye(X.shape[0]) + _lambda * graphL
def msolve(A, Y, init=None):
if use_cuda:
Z = np.asarray(np.hstack(list(
map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=True), np.split(Y, n_jobs, axis=1)))))
else:
if n_jobs <= 1:
Z = np.asarray(np.hstack([scipy.sparse.linalg.cg(A, Y[:, i],
x0=init[:, i] if init is not None else None,
tol=tol, atol=atol)[0][:, np.newaxis] for i in
range(Y.shape[1])]))
else:
p = Pool(n_jobs)
try:
if init is None:
Z = np.asarray(np.hstack(list(
p.map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=False),
np.split(Y, Y.shape[1], axis=1)))))
else:
Z = np.asarray(np.hstack(list(
p.map(lambda y, init_: cg(A, y, x0=init_, tol=tol, atol=atol, use_cuda=False),
zip(np.split(Y, Y.shape[1], axis=1), np.split(init, init.shape[1], axis=1))))))
p.close()
p.join()
except KeyboardInterrupt:
print("Caught KeyboardInterrupt, terminating workers")
p.terminate()
p.join()
raise
return Z
if no_rotation:
if n_components:
try:
X = X[:, :n_components]
X_mean = X_mean[:n_components]
except:
raise ValueError('n_components value is not valid.')
Z = msolve(G, X, init=init)
else:
GinvX = msolve(G, X)
C = np.dot(X.T, GinvX)
lambdas, W = np.linalg.eigh(C)
W = np.array(W)
W = W[:, ::-1]
if n_components:
try:
W = W[:, :n_components]
except:
raise ValueError('n_components value is not valid.')
Z = np.dot(GinvX, W)
# refine the graph by deleting edges with high stress
if refine_iter > 0:
if refine_protected_graph is not None:
frozen_bool = np.isin(graph.nonzero()[0] * graph.shape[0] + graph.nonzero()[1],
refine_protected_graph.nonzero()[0] * refine_protected_graph.shape[0] + refine_protected_graph.nonzero()[1])
distZ = np.sqrt(np.sum((Z[graph.nonzero()[0], :] - Z[graph.nonzero()[1], :]) ** 2, axis=1))
stress = distZ
if verbose:
print("Refinement round %d start..." % (refine_iter - 1))
print("Number of starting edges: %d" % np.sum(graph.data > 0))
if _refine_threshold is None:
q1, q3 = np.percentile(stress, [25, 75])
iqr = q3 - q1
_refine_threshold = q3 + (iqr * refine_threshold)
if refine_protected_graph is not None:
graph.data[frozen_bool] = graph.data[frozen_bool] * ((stress < _refine_threshold)[frozen_bool])
else:
graph.data = graph.data * (stress < _refine_threshold)
graph.eliminate_zeros()
if verbose:
print("Number of refined edges: %d" % np.sum(graph.data > 0))
n_components, labels = scipy.sparse.csgraph.connected_components(graph)
component_sizes = np.bincount(labels)
if np.any(component_sizes <= 5):
# largest_component = np.where(component_sizes == component_sizes.max())[0][0]
if verbose:
print("Connected components sizes: %s" % str(component_sizes))
large_components = np.isin(labels, np.where(component_sizes > 5)[0])
if np.mean(large_components) < 0.95:
if verbose:
print("Connected component(s) contains few than 95 percent of all data points. Stop refinement.")
else:
from sklearn.neighbors import NearestNeighbors
_, fillinds = NearestNeighbors(n_neighbors=1).fit(X[large_components, :]).kneighbors(
X[~large_components, :])
g_reconnect = scipy.sparse.csr_matrix((np.repeat(2, len(fillinds)), (
np.where(~large_components)[0], np.where(large_components)[0][fillinds[:, 0]])), graph.shape)
graph = graph + g_reconnect
if verbose:
print("Number of reconnection edges: %d" % np.sum(g_reconnect.data > 0))
Z = graphdr(X, custom_graph=graph, init=Z, _lambda=_lambda, method=method, refine_iter=refine_iter - 1,
refine_threshold=refine_threshold, _refine_threshold=_refine_threshold,
no_rotation=no_rotation, rescale=False, n_jobs_nmslib=n_jobs_nmslib, n_jobs=n_jobs,
use_cuda=use_cuda)
else:
if verbose:
print("Number of reconnection edges: %d" % 0)
Z = graphdr(X, custom_graph=graph, init=Z, _lambda=_lambda, method=method, refine_iter=refine_iter - 1,
refine_threshold=refine_threshold, _refine_threshold=_refine_threshold,
no_rotation=no_rotation, rescale=False, n_jobs_nmslib=n_jobs_nmslib, n_jobs=n_jobs,
use_cuda=use_cuda)
if rescale:
if no_rotation:
scale_factor = (np.mean(X * Z, axis=0) - X.mean(axis=0) * Z.mean(axis=0)) / (
np.mean(Z ** 2, axis=0) - np.mean(Z, axis=0) ** 2)
Z = Z * scale_factor[np.newaxis, :]
Z = Z - Z.mean(axis=0)[np.newaxis, :] + X_mean[np.newaxis, :]
else:
XW = np.dot(X, W)
scale_factor = (np.mean(XW * Z, axis=0) - XW.mean(axis=0) * Z.mean(axis=0)) / (
np.mean(Z ** 2, axis=0) - np.mean(Z, axis=0) ** 2)
Z = Z * scale_factor[np.newaxis, :]
Z = Z - Z.mean(axis=0)[np.newaxis, :]
if return_all:
if no_rotation:
return Z, graph
else:
return Z, W, graph
else:
return Z