Source code for evaluation.lisi

# CODE TAKEN FROM: HarmonyPy (https://github.com/slowkow/harmonypy)
# LISI - The Local Inverse Simpson Index
# Copyright (C) 2018  Ilya Korsunsky
#               2019  Kamil Slowikowski <kslowikowski@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from typing import Iterable


def compute_lisi(
    X: np.ndarray,
    metadata: pd.DataFrame,
    label_colnames: Iterable[str],
    perplexity: float = 30,
) -> np.ndarray:
    """
    Compute the Local Inverse Simpson Index (LISI) for each label column in
    the metadata.

    LISI measures the diversity of labels in the local neighborhood of each
    sample, based on distances in the feature space `X`. It is commonly used
    to evaluate dataset mixing (e.g., integration of multiple batches or
    experimental conditions).

    A LISI score close to:

    - 1 indicates homogeneity (neighbors mostly from one label category),
    - N (the number of categories) indicates high diversity (neighbors drawn
      from all categories).

    Parameters
    ----------
    X : np.ndarray
        Data matrix of shape (n_samples, n_features), representing the
        embedding space (e.g., PCA, UMAP).
    metadata : pd.DataFrame
        DataFrame containing categorical metadata for each sample (e.g.,
        batch or cell type).
    label_colnames : Iterable[str]
        List of column names in `metadata` for which to compute LISI.
    perplexity : float, optional
        Perplexity parameter influencing the neighborhood size, by default 30.

    Returns
    -------
    np.ndarray
        A matrix of shape (n_samples, len(label_colnames)) with LISI scores
        per label.
    """
    n_cells = metadata.shape[0]
    n_labels = len(label_colnames)
    # We need at least 3 * n_neighbors neighbors to compute the perplexity.
    # sklearn requires an integer n_neighbors, so cast in case perplexity is a float.
    knn = NearestNeighbors(n_neighbors=int(perplexity * 3), algorithm="kd_tree").fit(X)
    distances, indices = knn.kneighbors(X)
    # Don't count each sample as its own neighbor.
    indices = indices[:, 1:]
    distances = distances[:, 1:]
    # Save the result: one LISI column per label.
    lisi_df = np.zeros((n_cells, n_labels))
    for i, label in enumerate(label_colnames):
        labels = pd.Categorical(metadata[label])
        n_categories = len(labels.categories)
        simpson = compute_simpson(
            distances.T, indices.T, labels, n_categories, perplexity
        )
        lisi_df[:, i] = 1 / simpson
    return lisi_df
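
A minimal usage sketch (not part of the original module): the synthetic embedding and batch labels below are invented for illustration. Two interleaved batches drawn from the same distribution should yield batch-LISI scores near 2, while a batch-separated embedding drives the scores toward 1.

# Illustrative example only; `X_demo` and `metadata_demo` are made up.
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 10))                    # shared embedding for both batches
metadata_demo = pd.DataFrame({"batch": ["a", "b"] * 100})
scores = compute_lisi(X_demo, metadata_demo, ["batch"], perplexity=30)
print(scores.mean())                                   # expected to be close to 2 (well mixed)
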
def compute_simpson(
    distances: np.ndarray,
    indices: np.ndarray,
    labels: pd.Categorical,
    n_categories: int,
    perplexity: float,
    tol: float = 1e-5,
) -> np.ndarray:
    """
    Compute Simpson's diversity index for each sample using a locally scaled
    probability distribution.

    This function computes the effective number of categories in each
    sample's neighborhood, based on the similarity-weighted label
    distribution. It is used internally by `compute_lisi`.

    Parameters
    ----------
    distances : np.ndarray
        Array of shape (n_neighbors, n_samples) with distances to each
        sample's neighbors.
    indices : np.ndarray
        Array of shape (n_neighbors, n_samples) with indices of the nearest
        neighbors of each sample.
    labels : pd.Categorical
        Categorical labels for all samples, aligned with the rows of the
        data matrix `X` passed to `compute_lisi`.
    n_categories : int
        Number of unique categories in the label.
    perplexity : float
        Perplexity value, used to set the target entropy of the neighborhood
        probability distribution.
    tol : float, optional
        Tolerance for the entropy convergence, by default 1e-5.

    Returns
    -------
    np.ndarray
        Array of shape (n_samples,) with the Simpson's index per sample.
        Values closer to 1 indicate less diversity; higher values indicate
        more.
    """
    n = distances.shape[1]
    P = np.zeros(distances.shape[0])
    simpson = np.zeros(n)
    logU = np.log(perplexity)
    # Loop through each cell.
    for i in range(n):
        beta = 1
        betamin = -np.inf
        betamax = np.inf
        # Compute Hdiff, the gap between the current entropy and the target.
        P = np.exp(-distances[:, i] * beta)
        P_sum = np.sum(P)
        if P_sum == 0:
            H = 0
            P = np.zeros(distances.shape[0])
        else:
            H = np.log(P_sum) + beta * np.sum(distances[:, i] * P) / P_sum
            P = P / P_sum
        Hdiff = H - logU
        n_tries = 50
        for t in range(n_tries):
            # Stop when we reach the tolerance.
            if abs(Hdiff) < tol:
                break
            # Update beta by binary search on the entropy.
            if Hdiff > 0:
                betamin = beta
                if not np.isfinite(betamax):
                    beta *= 2
                else:
                    beta = (beta + betamax) / 2
            else:
                betamax = beta
                if not np.isfinite(betamin):
                    beta /= 2
                else:
                    beta = (beta + betamin) / 2
            # Recompute Hdiff with the updated beta.
            P = np.exp(-distances[:, i] * beta)
            P_sum = np.sum(P)
            if P_sum == 0:
                H = 0
                P = np.zeros(distances.shape[0])
            else:
                H = np.log(P_sum) + beta * np.sum(distances[:, i] * P) / P_sum
                P = P / P_sum
            Hdiff = H - logU
        # Default value: sentinel when the kernel weights degenerated to zero.
        if H == 0:
            simpson[i] = -1
        # Simpson's index: sum of squared per-category probability masses.
        for label_category in labels.categories:
            ix = indices[:, i]
            q = labels[ix] == label_category
            if np.any(q):
                P_sum = np.sum(P[q])
                simpson[i] += P_sum * P_sum
    return simpson
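
For intuition about the binary search above: `compute_simpson` tunes `beta` so that the Shannon entropy of the Gaussian kernel weights equals `log(perplexity)`, i.e. the neighborhood behaves as if it contained roughly `perplexity` equally weighted points. A standalone sketch of that relationship, using a made-up distance vector (all names below are illustrative only):

# Illustrative example only; `d` is a fabricated distance vector.
d = np.linspace(0.1, 2.0, 90)            # fake distances to 90 neighbors
beta = 1.0
P = np.exp(-d * beta)
P = P / P.sum()
H = -np.sum(P * np.log(P))               # Shannon entropy of the kernel weights
# compute_simpson adjusts beta until H matches np.log(perplexity); the
# resulting P then feeds the Simpson sum of squared per-category masses.
print(H, np.log(30))
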