Source code for evaluation.lisi

# CODE TAKEN FROM: HarmonyPy (https://github.com/slowkow/harmonypy)
# LISI - The Local Inverse Simpson Index
# Copyright (C) 2018  Ilya Korsunsky
#               2019  Kamil Slowikowski <kslowikowski@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from typing import Iterable


def compute_lisi(
    X: np.ndarray,
    metadata: pd.DataFrame,
    label_colnames: Iterable[str],
    perplexity: float = 30,
) -> np.ndarray:
    """
    Compute the Local Inverse Simpson Index (LISI) for each label column in
    the metadata.

    LISI measures the diversity of labels in the local neighborhood of each
    sample, based on distances in the feature space `X`. It is commonly used
    to evaluate dataset mixing (e.g., integration of multiple batches or
    experimental conditions).

    A LISI score close to:

    - 1 indicates homogeneity (neighbors mostly from one label category),
    - N (the number of categories) indicates high diversity (neighbors drawn
      from all categories).

    Parameters
    ----------
    X : np.ndarray
        Data matrix of shape (n_samples, n_features), representing the
        embedding space (e.g., PCA, UMAP).
    metadata : pd.DataFrame
        DataFrame containing categorical metadata for each sample (e.g.,
        batch or cell type).
    label_colnames : Iterable[str]
        List of column names in `metadata` for which to compute LISI.
    perplexity : float, optional
        Perplexity parameter influencing the neighborhood size, by default 30.

    Returns
    -------
    np.ndarray
        A matrix of shape (n_samples, len(label_colnames)) with LISI scores
        per label.
    """
    n_cells = metadata.shape[0]
    n_labels = len(label_colnames)
    # We need at least 3 * n_neighbors neighbors to compute the perplexity.
    # sklearn requires an integer n_neighbors, so cast in case perplexity is a float.
    knn = NearestNeighbors(n_neighbors=int(perplexity * 3), algorithm="kd_tree").fit(X)
    distances, indices = knn.kneighbors(X)
    # Don't count each sample as its own neighbor.
    indices = indices[:, 1:]
    distances = distances[:, 1:]
    # Save the result: one LISI column per label.
    lisi_df = np.zeros((n_cells, n_labels))
    for i, label in enumerate(label_colnames):
        labels = pd.Categorical(metadata[label])
        n_categories = len(labels.categories)
        simpson = compute_simpson(
            distances.T, indices.T, labels, n_categories, perplexity
        )
        lisi_df[:, i] = 1 / simpson
    return lisi_df
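
A minimal usage sketch (not part of the original module): the synthetic embedding and batch labels below are invented for illustration. Two interleaved batches drawn from the same distribution should yield batch-LISI scores near 2, while a batch-separated embedding drives the scores toward 1.

# Illustrative example only; `X_demo` and `metadata_demo` are made up.
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 10))                    # shared embedding for both batches
metadata_demo = pd.DataFrame({"batch": ["a", "b"] * 100})
scores = compute_lisi(X_demo, metadata_demo, ["batch"], perplexity=30)
print(scores.mean())                                   # expected to be close to 2 (well mixed)
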
def compute_simpson(
    distances: np.ndarray,
    indices: np.ndarray,
    labels: pd.Categorical,
    n_categories: int,
    perplexity: float,
    tol: float = 1e-5,
) -> np.ndarray:
    """
    Compute Simpson's diversity index for each sample using a locally scaled
    probability distribution.

    This function computes the effective number of categories in each
    sample's neighborhood, based on the similarity-weighted label
    distribution. It is used internally by `compute_lisi`.

    Parameters
    ----------
    distances : np.ndarray
        Array of shape (n_neighbors, n_samples) with distances to each
        sample's neighbors.
    indices : np.ndarray
        Array of shape (n_neighbors, n_samples) with indices of the nearest
        neighbors of each sample.
    labels : pd.Categorical
        Categorical labels for all samples, aligned with the rows of the
        data matrix `X` passed to `compute_lisi`.
    n_categories : int
        Number of unique categories in the label.
    perplexity : float
        Perplexity value, used to set the target entropy of the neighborhood
        probability distribution.
    tol : float, optional
        Tolerance for the entropy convergence, by default 1e-5.

    Returns
    -------
    np.ndarray
        Array of shape (n_samples,) with the Simpson's index per sample.
        Values closer to 1 indicate less diversity; higher values indicate
        more.
    """
    n = distances.shape[1]
    P = np.zeros(distances.shape[0])
    simpson = np.zeros(n)
    logU = np.log(perplexity)
    # Loop through each cell.
    for i in range(n):
        beta = 1
        betamin = -np.inf
        betamax = np.inf
        # Compute Hdiff, the gap between the current entropy and the target.
        P = np.exp(-distances[:, i] * beta)
        P_sum = np.sum(P)
        if P_sum == 0:
            H = 0
            P = np.zeros(distances.shape[0])
        else:
            H = np.log(P_sum) + beta * np.sum(distances[:, i] * P) / P_sum
            P = P / P_sum
        Hdiff = H - logU
        n_tries = 50
        for t in range(n_tries):
            # Stop when we reach the tolerance.
            if abs(Hdiff) < tol:
                break
            # Update beta by binary search on the entropy.
            if Hdiff > 0:
                betamin = beta
                if not np.isfinite(betamax):
                    beta *= 2
                else:
                    beta = (beta + betamax) / 2
            else:
                betamax = beta
                if not np.isfinite(betamin):
                    beta /= 2
                else:
                    beta = (beta + betamin) / 2
            # Recompute Hdiff with the updated beta.
            P = np.exp(-distances[:, i] * beta)
            P_sum = np.sum(P)
            if P_sum == 0:
                H = 0
                P = np.zeros(distances.shape[0])
            else:
                H = np.log(P_sum) + beta * np.sum(distances[:, i] * P) / P_sum
                P = P / P_sum
            Hdiff = H - logU
        # Default value: sentinel when the kernel weights degenerated to zero.
        if H == 0:
            simpson[i] = -1
        # Simpson's index: sum of squared per-category probability masses.
        for label_category in labels.categories:
            ix = indices[:, i]
            q = labels[ix] == label_category
            if np.any(q):
                P_sum = np.sum(P[q])
                simpson[i] += P_sum * P_sum
    return simpson
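
For intuition about the binary search above: `compute_simpson` tunes `beta` so that the Shannon entropy of the Gaussian kernel weights equals `log(perplexity)`, i.e. the neighborhood behaves as if it contained roughly `perplexity` equally weighted points. A standalone sketch of that relationship, using a made-up distance vector (all names below are illustrative only):

# Illustrative example only; `d` is a fabricated distance vector.
d = np.linspace(0.1, 2.0, 90)            # fake distances to 90 neighbors
beta = 1.0
P = np.exp(-d * beta)
P = P / P.sum()
H = -np.sum(P * np.log(P))               # Shannon entropy of the kernel weights
# compute_simpson adjusts beta until H matches np.log(perplexity); the
# resulting P then feeds the Simpson sum of squared per-category masses.
print(H, np.log(30))
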