Source code for quasildr.utils

import copy

import numpy as np
import pandas as pd
from numpy.core.umath_tests import matrix_multiply
from pynndescent import NNDescent
import scipy
from scipy.linalg import subspace_angles
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist
from scipy.sparse.csgraph import minimum_spanning_tree, csgraph_from_dense
from scipy.spatial.distance import squareform, pdist
from sklearn.neighbors import NearestNeighbors

from .external import neighbors

[docs]def locCov(data, query, bw, min_radius=0, shrinkage = False, cov_data=None): """ Computes Local Covariance matrices. Data points were weighted with a Gaussian kernel centered at the query point Parameters ---------- data : 2D array Array containing data points of shape (N, p) query : 2D array Array containing the query point of shape (1, p) bw : float Gaussian kernel bandwidth min_radius : float, optional If specified, clip the bandwidth to the min_radius-th nearest neighbor. shrinkage : bool, optional Apply OAS shrinkage for computing local covariance. Default is False. cov_data : 2D array, optional Array containing data points of shape (N, p), if specified will replace `data` for covariance calculations, while `data` is still used to define the local neighborhood. """ D = cdist(data, query) # n x 1 if min_radius!=0: bw = np.maximum(bw, np.sort(D,axis=0)[min_radius]) w = np.exp( - 0.5 * (D / bw)**2) # n x 1 #C = np.cov(data.T,ddof=1,aweights=w.flatten()) if cov_data is not None: assert data.shape[0]==cov_data.shape[0] else: cov_data = data mu = (np.sum(cov_data * w,axis=0)/np.sum(w))[np.newaxis,:] #1 x p X = cov_data - mu C = np.dot((X*w).T, X) effective_N = np.sum(w)**2/np.sum(w**2) C = C / (np.sum(w)*(1-1/effective_N)) if shrinkage: #Apply OAS with sample size replaced by effective_N mu = np.trace(C) / cov_data.shape[1] # formula from Chen et al.'s **implementation** alpha = np.mean(C ** 2) num = alpha + mu ** 2 den = (effective_N + 1.) * (alpha - (mu ** 2) / cov_data.shape[1]) shrinkage = 1. if den == 0 else min(num / den, 1.) print(shrinkage) C = (1. - shrinkage) * C C.flat[::cov_data.shape[1] + 1] += shrinkage * mu return C, effective_N
[docs]def subspace_angles(A, B): """ Fast computation of subspace angles given A and B are orthogonal basis matrices and only accuracy of small angles are of concern. Large angle are not numerically accurate. Parameters ---------- A, B : 3D array Array containing data points of shape (n, p, p). """ C = B - matrix_multiply(A, matrix_multiply(A.transpose((0, 2, 1)), B)) angles = np.arcsin(np.clip(np.linalg.svd(C)[1], -1, 1)) / np.pi * 180 angles = np.minimum(angles, 180 - angles) return angles
[docs]def extract_structural_backbone(t, data, s, max_angle=90, relaxation=0): """ Construct simplified graphs connecting data points there have been projected to density ridges. Two graphs are constructed for different purposes. The first graph (g_simple) is constructed with two major steps: 1. Construct nearest neighbor graph on both ridge positions and raw data positions and combine. 2. Simplify graph so that each point is only connected to up to 2^ridge_dimensionality points with a set of filtering criteria. The second graph (g_mst) has two extra steps: 3. If the graph is not fully connected, connect the components by nearest neigbors between all pairs of components. 4. Construct a minimum spanning tree of the graph. Parameters ----------- t : 2D array Density ridge positions. Typically projected to density ridges with quasildr.dridge.Scms. data : 2D array Original data points. s : object `quasildr.dridge.Scms` object which were used to produce t0. max_angle : float, optional Maximum angle in degree for filtering graph edges. Default is 90. relaxation : float, optional The relaxation parameter used to produce `t0`. See `quasildr.dridge.Scms.scms` documention. Returns ----------- g_simple : sparse matrix Simplified graph constructed without explicit shape or connectivity constraints (g_simple) and the other one is from step 2 g_mst : sparse matrix A tree-shaped graph connecting all points (g_mst). from step 4. """ h, _, _, _, _ = s._nlocal_inv_cov(t) eigvals, eigvecs = np.linalg.eigh(-h) eigvals = -eigvals ridge_dims = ((eigvals[:, 0] - eigvals[:, 1]) / (eigvals[:, 0] - eigvals[:, -1]) < relaxation) + 1 gknn, _ = neighbors.neighbors(t, n_neighbors=50, smoothknn=False) gknnZ, _ = neighbors.neighbors(data, n_neighbors=50, smoothknn=False) gknn = gknn + gknn.T + gknnZ + gknnZ.T gknn.setdiag(0) gknn.eliminate_zeros() # remove edge connecting structures of different dimensionality gknn.data[ridge_dims[gknn.nonzero()[0]] != ridge_dims[gknn.nonzero()[1]]] = 0 gknn.eliminate_zeros() # filter edges edges = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :] edges_norm = edges / (np.linalg.norm(edges, axis=1)[:, np.newaxis]) angles = np.zeros(len(gknn.nonzero()[0])) angles_edge = np.zeros(len(gknn.nonzero()[0])) for d in np.unique(ridge_dims): if d == 1: ind = ridge_dims[gknn.nonzero()[0]] == 1 angles[ind] = np.arccos( np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * eigvecs[gknn.nonzero()[1][ind], :, 0], axis=1), -1, 1)) / np.pi * 180 angles_edge[ind] = np.arccos( np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * edges_norm[ind, :], axis=1), -1, 1)) / np.pi * 180 angles[ind] = np.minimum(angles[ind], 180 - angles[ind]) angles_edge[ind] = np.minimum(angles_edge[ind], 180 - angles_edge[ind]) gknn.data[(angles > max_angle) * (angles_edge > max_angle)] = 0 else: # calculate mean principal angles ind = ridge_dims[gknn.nonzero()[0]] == d angles[ind] = np.mean( subspace_angles(eigvecs[gknn.nonzero()[0][ind], :, :d], eigvecs[gknn.nonzero()[1][ind], :, :d]), axis=1) gknn.data[(angles > max_angle)] = 0 gknn.eliminate_zeros() # gknn = gknn + gknn.T n_components, labels = scipy.sparse.csgraph.connected_components(gknn) # simplify graph by connecting only closest nodes in the subspace edges_vecs = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :] edges_dist = np.linalg.norm(edges_vecs, axis=1) rowinds = [] colinds = [] # orient eigen vectors in the same directions for d in range(eigvecs.shape[2]): eigvecs[eigvecs[:, 0, d] < 0, :, d] *= -1 for d in np.unique(ridge_dims): if d == 1: ind = ridge_dims[gknn.nonzero()[0]] == 1 proj_vecs = np.sum(edges_vecs[ind, :] * eigvecs[gknn.nonzero()[0][ind], :, 0], axis=1)[:, np.newaxis] * eigvecs[gknn.nonzero()[0][ind], :, 0] else: ind = ridge_dims[gknn.nonzero()[0]] == d proj_vecs = matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d], \ matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d].transpose((0, 2, 1)), edges_vecs[ind, :, np.newaxis])) proj_dists = edges_dist[ind] gknn.data[ind] = proj_dists gknn_directions = [] for k in range(d): gknn_directions.append(gknn.copy()) gknn_directions[k].data[ind] = proj_vecs[:, k].squeeze() from itertools import product for directions in list(product([-1, 1], repeat=len(gknn_directions))): for i in np.where(ridge_dims == d)[0]: dist_data = gknn[i, :].data direction_datas = [gd[i, :].data for gd in gknn_directions] conditions = [(directions[i] * direction_datas[i]) > 0 for i in range(len(directions))] conditions = np.all(np.vstack(conditions).T, axis=1) if np.any(conditions): min_dist = np.min(dist_data[conditions]) if directions[0] < 0: rowinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0]) colinds.append(i) else: rowinds.append(i) colinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0]) gedges = np.vstack([rowinds, colinds]).T gedges = np.unique(gedges, axis=0) g_simple = csr_matrix((np.repeat(1, gedges.shape[0]), (gedges[:, 0], gedges[:, 1])), shape=gknn.shape) n_components, labels = scipy.sparse.csgraph.connected_components(g_simple) # meta graph connecting each component. components_dimensionality = [] for i in range(n_components): components_dimensionality.append(ridge_dims[labels == i][0]) # To connect or not to connect fc_metaedges = [] fc_edge_indices = [] for i in range(n_components - 1): i_inds = np.where(labels == i)[0] if len(i_inds) > 1000: index_group_i = NNDescent(t[i_inds, :]) index_group_data_i = NNDescent(data[i_inds, :]) else: index_group_i = NearestNeighbors(n_neighbors=1).fit(t[i_inds, :]) index_group_data_i = NearestNeighbors(n_neighbors=1).fit(data[i_inds, :]) # for g_mst for j in range(i + 1, n_components): j_inds = np.where(labels == j)[0] if len(i_inds) > 1000: nn, _ = index_group_i.query(t[j_inds, :], k=1) _, dist = index_group_data_i.query(data[j_inds, :], k=1) else: _, nn = index_group_i.kneighbors(t[j_inds, :]) dist, _ = index_group_data_i.kneighbors(data[j_inds, :]) mindist = np.min(dist) fc_metaedges.append([i, j]) fc_edge_indices.append([i_inds[nn[dist == mindist]][0], j_inds[np.where(dist == mindist)[0]][0]]) if len(fc_edge_indices) > 0: fc_edge_indices = np.vstack(fc_edge_indices) g_fc_connections = csr_matrix( (np.repeat(2, fc_edge_indices.shape[0]), (fc_edge_indices[:, 0], fc_edge_indices[:, 1])), shape=gknn.shape) g_fc = g_simple + g_fc_connections else: g_fc = g_simple g_fc.data = np.linalg.norm(t[g_fc.nonzero()[0], :] - t[g_fc.nonzero()[1], :], axis=1) g_mst = minimum_spanning_tree(g_fc) g_simple.data = np.linalg.norm(t[g_simple.nonzero()[0], :] - t[g_simple.nonzero()[1], :], axis=1) return g_simple, g_mst, ridge_dims
[docs]def make_mst(t): """ Construct minimum spanning tree. Takes input from Euclidean space and construct graph based on pairwise distances Parameters ----------- t : 2D array Input data points. Returns ----------- edge_list : 2D array Edge list of the minimum spanning tree """ return mst(squareform(pdist(t[:, :])))
[docs]def mst(D): """ Construct minimum spanning tree from dense matrix. Parameters ----------- D : array or sparse matrix Edge data in 2D square array or matrix format. Non-zero values represent edges. Returns ----------- edge_list : 2D array Edge list of the minimum spanning tree """ edge_list = minimum_spanning_tree(csgraph_from_dense(D)).nonzero() edge_list = np.vstack(edge_list).T return edge_list
[docs]def make_projected_mst(t, data, radius=0.1): """ Experimental function """ dist_mat = squareform(pdist(data)) smooth_mat = np.exp(-(squareform(pdist(t)) / radius) ** 2) smooth_mat = smooth_mat / smooth_mat.sum(axis=1)[:, np.newaxis] dist_mat_transformed = np.dot(np.dot(smooth_mat, dist_mat), smooth_mat.T) return mst(dist_mat_transformed)
[docs]def extract_segments(edge_list, degree): """ Extract segments from a graph. It can be used for, for example, paritioning a graph into different branches. Parameters ---------- edge_list : 2D array or list 2D array or list where each row or item has two values indicating start and end indices. degree: 1D array 1D array containing the degree of each node. This has to be consistent with edge_list. Returns ------- segments : list Partitioned segments represented by a list in which each item is a list containing indices in a segments. """ terminals = np.where((degree != 2) * (degree != 0))[0].tolist() edge_dict = {} for edge in edge_list: if edge[0] in edge_dict: edge_dict[edge[0]].append(edge[1]) else: edge_dict[edge[0]] = [edge[1]] if edge[1] in edge_dict: edge_dict[edge[1]].append(edge[0]) else: edge_dict[edge[1]] = [edge[0]] # make raw unpruned segments segments = [] edge_dict_copy = copy.deepcopy(edge_dict) while len(terminals) > 0: seg = [] seg.append(terminals[0]) current_cell = terminals[0] notfound = False while not notfound: notfound = True for i in copy.copy(edge_dict_copy[current_cell]): edge_dict_copy[current_cell].remove(i) edge_dict_copy[i].remove(current_cell) if i not in terminals: if notfound: notfound = False new_cell = i seg.append(i) if notfound: assert len(edge_dict_copy[seg[-1]]) == 0 else: current_cell = new_cell assert len(edge_dict_copy[seg[-2]]) == 0 segments.append(seg) terminals.remove(terminals[0]) return segments
[docs]def count_degree(edge_list, N): """ Compute degrees of all nodes in a graph. Parameters ---------- edge_list : 2D array or list 2D array or list where each row or item has two values indicating start and end indices. N : int Number of nodes in the graph. Returns ------- degree : 1D array 1D array containing degrees of each node """ degree = np.zeros(N) counted = np.unique(edge_list.flatten(), return_counts=True) degree[counted[0]] = counted[1] return degree
[docs]def make_trajectory(t, output_prefix=None, prune_threshold=3): """ Construct MST, prune minor branches, and segment. Parameters ---------- t output_prefix prune_threshold Returns ------- """ # initialize trajectory with MST edge_list = make_mst(t) edge_list = np.asarray(edge_list) degree = count_degree(edge_list, t.shape[0]) segments = extract_segments(edge_list, degree) # prune minor branches seglens = np.asarray([len(seg) for seg in segments if len(seg) != 0]) seg_min_degrees = np.asarray([np.min(degree[seg]) for seg in segments if len(seg) != 0]) remove_seginds = (seglens <= prune_threshold) * (seg_min_degrees == 1) while np.any(remove_seginds): remove_nodeinds = np.concatenate([segments[i] for i in np.where(remove_seginds)[0]]) # remove_nodeinds = segments[np.where(remove_seginds)[0][np.argmin(seglens[np.where(remove_seginds)[0]])]] edge_list = np.asarray( [edge for edge in edge_list if edge[0] not in remove_nodeinds and edge[1] not in remove_nodeinds]) degree = count_degree(edge_list, t.shape[0]) segments = extract_segments(edge_list, degree) seglens = np.asarray([len(seg) for seg in segments if len(seg) != 0]) seg_min_degrees = np.asarray([np.min(degree[seg]) for seg in segments if len(seg) != 0]) remove_seginds = (seglens <= prune_threshold) * (seg_min_degrees == 1) if output_prefix is not None: for i, seg in enumerate(segments): # get some basic ordering if np.sum(t[seg[-1], :] - t[seg[0], :]) < 0: seg = list(reversed(seg)) np.savetxt(output_prefix + '.segment.' + str(i), seg, fmt='%d') return segments, edge_list