Source code for quasildr.utils
import copy
import numpy as np
import pandas as pd
from numpy.core.umath_tests import matrix_multiply
from pynndescent import NNDescent
import scipy
from scipy.linalg import subspace_angles
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist
from scipy.sparse.csgraph import minimum_spanning_tree, csgraph_from_dense
from scipy.spatial.distance import squareform, pdist
from sklearn.neighbors import NearestNeighbors
from .external import neighbors
[docs]def locCov(data, query, bw, min_radius=0, shrinkage = False, cov_data=None):
"""
Computes Local Covariance matrices. Data points were weighted with a Gaussian kernel centered at
the query point
Parameters
----------
data : 2D array
Array containing data points of shape (N, p)
query : 2D array
Array containing the query point of shape (1, p)
bw : float
Gaussian kernel bandwidth
min_radius : float, optional
If specified, clip the bandwidth to the min_radius-th nearest neighbor.
shrinkage : bool, optional
Apply OAS shrinkage for computing local covariance. Default is False.
cov_data : 2D array, optional
Array containing data points of shape (N, p), if specified will replace `data` for covariance
calculations, while `data` is still used to define the local neighborhood.
"""
D = cdist(data, query) # n x 1
if min_radius!=0:
bw = np.maximum(bw, np.sort(D,axis=0)[min_radius])
w = np.exp( - 0.5 * (D / bw)**2) # n x 1
#C = np.cov(data.T,ddof=1,aweights=w.flatten())
if cov_data is not None:
assert data.shape[0]==cov_data.shape[0]
else:
cov_data = data
mu = (np.sum(cov_data * w,axis=0)/np.sum(w))[np.newaxis,:] #1 x p
X = cov_data - mu
C = np.dot((X*w).T, X)
effective_N = np.sum(w)**2/np.sum(w**2)
C = C / (np.sum(w)*(1-1/effective_N))
if shrinkage:
#Apply OAS with sample size replaced by effective_N
mu = np.trace(C) / cov_data.shape[1]
# formula from Chen et al.'s **implementation**
alpha = np.mean(C ** 2)
num = alpha + mu ** 2
den = (effective_N + 1.) * (alpha - (mu ** 2) / cov_data.shape[1])
shrinkage = 1. if den == 0 else min(num / den, 1.)
print(shrinkage)
C = (1. - shrinkage) * C
C.flat[::cov_data.shape[1] + 1] += shrinkage * mu
return C, effective_N
[docs]def subspace_angles(A, B):
"""
Fast computation of subspace angles given A and B are orthogonal basis matrices and
only accuracy of small angles are of concern. Large angle are not numerically accurate.
Parameters
----------
A, B : 3D array
Array containing data points of shape (n, p, p).
"""
C = B - matrix_multiply(A, matrix_multiply(A.transpose((0, 2, 1)), B))
angles = np.arcsin(np.clip(np.linalg.svd(C)[1], -1, 1)) / np.pi * 180
angles = np.minimum(angles, 180 - angles)
return angles
[docs]def extract_structural_backbone(t, data, s, max_angle=90, relaxation=0):
"""
Construct simplified graphs connecting data points there have been projected to density ridges.
Two graphs are constructed for different purposes. The first graph (g_simple) is constructed with two major steps:
1. Construct nearest neighbor graph on both ridge positions
and raw data positions and combine. 2. Simplify graph so that each point is only connected to up
to 2^ridge_dimensionality points with a set of filtering criteria.
The second graph (g_mst) has two extra steps: 3. If the graph is not fully connected, connect the components by
nearest neigbors between all pairs of components. 4. Construct a minimum spanning tree of the graph.
Parameters
-----------
t : 2D array
Density ridge positions. Typically projected to density ridges with quasildr.dridge.Scms.
data : 2D array
Original data points.
s : object
`quasildr.dridge.Scms` object which were used to produce t0.
max_angle : float, optional
Maximum angle in degree for filtering graph edges. Default is 90.
relaxation : float, optional
The relaxation parameter used to produce `t0`. See `quasildr.dridge.Scms.scms` documention.
Returns
-----------
g_simple : sparse matrix
Simplified graph constructed without explicit shape or connectivity constraints (g_simple)
and the other one is from step 2
g_mst : sparse matrix
A tree-shaped graph connecting all points (g_mst). from step 4.
"""
h, _, _, _, _ = s._nlocal_inv_cov(t)
eigvals, eigvecs = np.linalg.eigh(-h)
eigvals = -eigvals
ridge_dims = ((eigvals[:, 0] - eigvals[:, 1]) / (eigvals[:, 0] - eigvals[:, -1]) < relaxation) + 1
gknn, _ = neighbors.neighbors(t, n_neighbors=50, smoothknn=False)
gknnZ, _ = neighbors.neighbors(data, n_neighbors=50, smoothknn=False)
gknn = gknn + gknn.T + gknnZ + gknnZ.T
gknn.setdiag(0)
gknn.eliminate_zeros()
# remove edge connecting structures of different dimensionality
gknn.data[ridge_dims[gknn.nonzero()[0]] != ridge_dims[gknn.nonzero()[1]]] = 0
gknn.eliminate_zeros()
# filter edges
edges = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :]
edges_norm = edges / (np.linalg.norm(edges, axis=1)[:, np.newaxis])
angles = np.zeros(len(gknn.nonzero()[0]))
angles_edge = np.zeros(len(gknn.nonzero()[0]))
for d in np.unique(ridge_dims):
if d == 1:
ind = ridge_dims[gknn.nonzero()[0]] == 1
angles[ind] = np.arccos(
np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * eigvecs[gknn.nonzero()[1][ind], :, 0], axis=1),
-1, 1)) / np.pi * 180
angles_edge[ind] = np.arccos(
np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * edges_norm[ind, :], axis=1), -1,
1)) / np.pi * 180
angles[ind] = np.minimum(angles[ind], 180 - angles[ind])
angles_edge[ind] = np.minimum(angles_edge[ind], 180 - angles_edge[ind])
gknn.data[(angles > max_angle) * (angles_edge > max_angle)] = 0
else:
# calculate mean principal angles
ind = ridge_dims[gknn.nonzero()[0]] == d
angles[ind] = np.mean(
subspace_angles(eigvecs[gknn.nonzero()[0][ind], :, :d], eigvecs[gknn.nonzero()[1][ind], :, :d]), axis=1)
gknn.data[(angles > max_angle)] = 0
gknn.eliminate_zeros()
# gknn = gknn + gknn.T
n_components, labels = scipy.sparse.csgraph.connected_components(gknn)
# simplify graph by connecting only closest nodes in the subspace
edges_vecs = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :]
edges_dist = np.linalg.norm(edges_vecs, axis=1)
rowinds = []
colinds = []
# orient eigen vectors in the same directions
for d in range(eigvecs.shape[2]):
eigvecs[eigvecs[:, 0, d] < 0, :, d] *= -1
for d in np.unique(ridge_dims):
if d == 1:
ind = ridge_dims[gknn.nonzero()[0]] == 1
proj_vecs = np.sum(edges_vecs[ind, :] * eigvecs[gknn.nonzero()[0][ind], :, 0], axis=1)[:,
np.newaxis] * eigvecs[gknn.nonzero()[0][ind], :, 0]
else:
ind = ridge_dims[gknn.nonzero()[0]] == d
proj_vecs = matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d], \
matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d].transpose((0, 2, 1)),
edges_vecs[ind, :, np.newaxis]))
proj_dists = edges_dist[ind]
gknn.data[ind] = proj_dists
gknn_directions = []
for k in range(d):
gknn_directions.append(gknn.copy())
gknn_directions[k].data[ind] = proj_vecs[:, k].squeeze()
from itertools import product
for directions in list(product([-1, 1], repeat=len(gknn_directions))):
for i in np.where(ridge_dims == d)[0]:
dist_data = gknn[i, :].data
direction_datas = [gd[i, :].data for gd in gknn_directions]
conditions = [(directions[i] * direction_datas[i]) > 0 for i in range(len(directions))]
conditions = np.all(np.vstack(conditions).T, axis=1)
if np.any(conditions):
min_dist = np.min(dist_data[conditions])
if directions[0] < 0:
rowinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0])
colinds.append(i)
else:
rowinds.append(i)
colinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0])
gedges = np.vstack([rowinds, colinds]).T
gedges = np.unique(gedges, axis=0)
g_simple = csr_matrix((np.repeat(1, gedges.shape[0]), (gedges[:, 0], gedges[:, 1])), shape=gknn.shape)
n_components, labels = scipy.sparse.csgraph.connected_components(g_simple)
# meta graph connecting each component.
components_dimensionality = []
for i in range(n_components):
components_dimensionality.append(ridge_dims[labels == i][0])
# To connect or not to connect
fc_metaedges = []
fc_edge_indices = []
for i in range(n_components - 1):
i_inds = np.where(labels == i)[0]
if len(i_inds) > 1000:
index_group_i = NNDescent(t[i_inds, :])
index_group_data_i = NNDescent(data[i_inds, :])
else:
index_group_i = NearestNeighbors(n_neighbors=1).fit(t[i_inds, :])
index_group_data_i = NearestNeighbors(n_neighbors=1).fit(data[i_inds, :])
# for g_mst
for j in range(i + 1, n_components):
j_inds = np.where(labels == j)[0]
if len(i_inds) > 1000:
nn, _ = index_group_i.query(t[j_inds, :], k=1)
_, dist = index_group_data_i.query(data[j_inds, :], k=1)
else:
_, nn = index_group_i.kneighbors(t[j_inds, :])
dist, _ = index_group_data_i.kneighbors(data[j_inds, :])
mindist = np.min(dist)
fc_metaedges.append([i, j])
fc_edge_indices.append([i_inds[nn[dist == mindist]][0], j_inds[np.where(dist == mindist)[0]][0]])
if len(fc_edge_indices) > 0:
fc_edge_indices = np.vstack(fc_edge_indices)
g_fc_connections = csr_matrix(
(np.repeat(2, fc_edge_indices.shape[0]), (fc_edge_indices[:, 0], fc_edge_indices[:, 1])), shape=gknn.shape)
g_fc = g_simple + g_fc_connections
else:
g_fc = g_simple
g_fc.data = np.linalg.norm(t[g_fc.nonzero()[0], :] - t[g_fc.nonzero()[1], :], axis=1)
g_mst = minimum_spanning_tree(g_fc)
g_simple.data = np.linalg.norm(t[g_simple.nonzero()[0], :] - t[g_simple.nonzero()[1], :], axis=1)
return g_simple, g_mst, ridge_dims
[docs]def make_mst(t):
"""
Construct minimum spanning tree.
Takes input from Euclidean space and construct graph based on pairwise distances
Parameters
-----------
t : 2D array
Input data points.
Returns
-----------
edge_list : 2D array
Edge list of the minimum spanning tree
"""
return mst(squareform(pdist(t[:, :])))
[docs]def mst(D):
"""
Construct minimum spanning tree from dense matrix.
Parameters
-----------
D : array or sparse matrix
Edge data in 2D square array or matrix format. Non-zero values represent edges.
Returns
-----------
edge_list : 2D array
Edge list of the minimum spanning tree
"""
edge_list = minimum_spanning_tree(csgraph_from_dense(D)).nonzero()
edge_list = np.vstack(edge_list).T
return edge_list
[docs]def make_projected_mst(t, data, radius=0.1):
"""
Experimental function
"""
dist_mat = squareform(pdist(data))
smooth_mat = np.exp(-(squareform(pdist(t)) / radius) ** 2)
smooth_mat = smooth_mat / smooth_mat.sum(axis=1)[:, np.newaxis]
dist_mat_transformed = np.dot(np.dot(smooth_mat, dist_mat), smooth_mat.T)
return mst(dist_mat_transformed)
[docs]def extract_segments(edge_list, degree):
"""
Extract segments from a graph. It can be used for, for example, paritioning a graph into different branches.
Parameters
----------
edge_list : 2D array or list
2D array or list where each row or item has two values indicating start and end indices.
degree: 1D array
1D array containing the degree of each node. This has to be consistent with edge_list.
Returns
-------
segments : list
Partitioned segments represented by a list in which each item is a list containing indices in a segments.
"""
terminals = np.where((degree != 2) * (degree != 0))[0].tolist()
edge_dict = {}
for edge in edge_list:
if edge[0] in edge_dict:
edge_dict[edge[0]].append(edge[1])
else:
edge_dict[edge[0]] = [edge[1]]
if edge[1] in edge_dict:
edge_dict[edge[1]].append(edge[0])
else:
edge_dict[edge[1]] = [edge[0]]
# make raw unpruned segments
segments = []
edge_dict_copy = copy.deepcopy(edge_dict)
while len(terminals) > 0:
seg = []
seg.append(terminals[0])
current_cell = terminals[0]
notfound = False
while not notfound:
notfound = True
for i in copy.copy(edge_dict_copy[current_cell]):
edge_dict_copy[current_cell].remove(i)
edge_dict_copy[i].remove(current_cell)
if i not in terminals:
if notfound:
notfound = False
new_cell = i
seg.append(i)
if notfound:
assert len(edge_dict_copy[seg[-1]]) == 0
else:
current_cell = new_cell
assert len(edge_dict_copy[seg[-2]]) == 0
segments.append(seg)
terminals.remove(terminals[0])
return segments
[docs]def count_degree(edge_list, N):
"""
Compute degrees of all nodes in a graph.
Parameters
----------
edge_list : 2D array or list
2D array or list where each row or item has two values indicating start and end indices.
N : int
Number of nodes in the graph.
Returns
-------
degree : 1D array
1D array containing degrees of each node
"""
degree = np.zeros(N)
counted = np.unique(edge_list.flatten(), return_counts=True)
degree[counted[0]] = counted[1]
return degree
[docs]def make_trajectory(t, output_prefix=None, prune_threshold=3):
"""
Construct MST, prune minor branches, and segment.
Parameters
----------
t
output_prefix
prune_threshold
Returns
-------
"""
# initialize trajectory with MST
edge_list = make_mst(t)
edge_list = np.asarray(edge_list)
degree = count_degree(edge_list, t.shape[0])
segments = extract_segments(edge_list, degree)
# prune minor branches
seglens = np.asarray([len(seg) for seg in segments if len(seg) != 0])
seg_min_degrees = np.asarray([np.min(degree[seg]) for seg in segments if len(seg) != 0])
remove_seginds = (seglens <= prune_threshold) * (seg_min_degrees == 1)
while np.any(remove_seginds):
remove_nodeinds = np.concatenate([segments[i] for i in np.where(remove_seginds)[0]])
# remove_nodeinds = segments[np.where(remove_seginds)[0][np.argmin(seglens[np.where(remove_seginds)[0]])]]
edge_list = np.asarray(
[edge for edge in edge_list if edge[0] not in remove_nodeinds and edge[1] not in remove_nodeinds])
degree = count_degree(edge_list, t.shape[0])
segments = extract_segments(edge_list, degree)
seglens = np.asarray([len(seg) for seg in segments if len(seg) != 0])
seg_min_degrees = np.asarray([np.min(degree[seg]) for seg in segments if len(seg) != 0])
remove_seginds = (seglens <= prune_threshold) * (seg_min_degrees == 1)
if output_prefix is not None:
for i, seg in enumerate(segments):
# get some basic ordering
if np.sum(t[seg[-1], :] - t[seg[0], :]) < 0:
seg = list(reversed(seg))
np.savetxt(output_prefix + '.segment.' + str(i), seg, fmt='%d')
return segments, edge_list