Source code for InPheC_Step1

"""
This script is part of the InPheRNo-ChIP project and is responsible for process and analyze RNA-seq and ChIP-seq data. 
It includes functionality for:

    - Parsing command-line arguments to configure paths and options for data processing.
    - Loading ChIP-seq and RNA-seq data from specified directories: ./Data/
    - Processing and filtering data to ensure consistency among gene and TF sets.
    - Performing Elastic Net regression on gene and TF expression data.

Usage:

    - Run the script from the command line "python InPheC_Step1.py" directly.
    - For detailed argument options, use the help option: `python InPheC_Step1.py --help`

Note: Ensure that all dependencies are installed and the Python environment is correctly set up for running this script.
"""

import argparse
import glob
import logging
import os
import sys
import warnings

import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels.api as sm
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV
from tqdm import tqdm

from utils_logo import print_logo



[docs]
class Step1ArgumentParser:
    """
    Handles command-line arguments for the InPheRNo-ChIP project, setting up and parsing them.
    
    :param argparse.ArgumentParser parser: Configures the command-line arguments.
    :param argparse.Namespace args: Stores the values of parsed arguments.
    """

    def __init__(self):
        """Initializes the ArgumentParser with a description of the script's purpose and setups the arguments."""
        self.parser = argparse.ArgumentParser(
            description="Configure paths and options for RNA-seq and ChIP-seq data processing."
        )
        self._add_arguments()

    def _add_arguments(self):
        """
        Adds command-line options for the script, specifying directories, files, and other parameters
        required for processing RNA-seq and ChIP-seq data.
        """
        # Directories
        self.parser.add_argument(
            "-id_r",
            "--in_dir_rna",
            default="./Data/RNA_seq",
            help="Path to GEx data directory.",
        )
        self.parser.add_argument(
            "-id_c",
            "--in_dir_chip",
            default="./Data/ChIP_GSE61475_G",
            help="Path to ChIP-seq data directory.",
        )
        self.parser.add_argument(
            "-od",
            "--out_dir",
            default="./step1/",
            help="Path to output directory for step 1.",
        )

        # Input Files
        self.parser.add_argument(
            "-it",
            "--input_tf_addr",
            default="./Data/mapped_hmTFs_legitTFs-HumanTFDB.csv",
            help="CSV file containing list of tfs.",
        )
        self.parser.add_argument(
            "-ie",
            "--input_expression",
            default="RNAseq_expr_sample_gse981-gse361-gse371_voom-zScore.csv",
            help="CSV file with gene and tf expression data.",
        )
        self.parser.add_argument(
            "-igp",
            "--input_gene_phenotype_interest",
            default="RNAseq_pval_gene-phenotype_gse361-gse981-gse371_logFC2_FDR0.01.csv",
            help="CSV file with p-values of gene-phenotype.",
        )

        # Options and Parameters
        self.parser.add_argument(
            "-loi",
            "--lineage_of_interest",
            nargs="+",
            default=["h64", "dEN"],
            help="Prefix for lineages of interest in ChIP-seq filenames.",
        )
        self.parser.add_argument(
            "-cpoi",
            "--chip_pvalue_of_interest",
            default="Distance_P_Value",
            help="P-value of interest, this corresponds to the t-Gene output column names.",
        )

        self.parser.add_argument(
            "-num_coefs",
            "--max_num_coefs",
            default=22,
            type=int,
            help="maximum number of TFs to be selected by Elastic Net, \
                we set to 22 bc all 22 TFs are relevant to the differential process. \
                This number allows the model to consider all TFs while still guarding against overfitting, \
                given the sample size of 28",
        )

        # Output Files
        self.parser.add_argument(
            "-ogp",
            "--output_gene_phenotype",
            default="RNAseq_pval_gene-pheno_intersect_chip_",
            help="Output file with gene-phenotype p-values.",
        )
        self.parser.add_argument(
            "-tgt",
            "--output_gene_tf",
            default="RNAseq_pval_gene-tf_intersect_chip_",
            help="Output file with gene-tf p-values.",
        )
        self.parser.add_argument(
            "-tgpk",
            "--output_chip_tf_gene_peaks",
            default="ChIPseq_pval_gene-tf_minQ_",
            help="Output file prefix with ChIP-seq cleaned data, which contains the minimum p-value across peaks for each lineage",
        )


[docs]
    def parse_args(self):
        """Parses the command-line arguments."""
        return self.parser.parse_args()





[docs]
class DataLoader:
    """
    Loads and prepares RNA-seq and ChIP-seq data from specified directories.

    :param chip_seq_dir: Path to the ChIP-seq data directory.
    :type chip_seq_dir: str
    :param rna_seq_dir: Path to the RNA-seq data directory.
    :type rna_seq_dir: str
    """

    def __init__(self, chip_seq_dir, rna_seq_dir):
        """
        Initializes the DataLoader with specified directories for ChIP-seq and RNA-seq data.
        """
        self.chip_seq_dir = chip_seq_dir
        self.rna_seq_dir = rna_seq_dir


[docs]
    def load_chip_data(self, lineage_of_interest):
        """
        Loads and merges ChIP-seq data files for specified lineages.

        :param lineage_of_interest: Lineages to include in the analysis.
        :type lineage_of_interest: list
        :return: Combined DataFrame of ChIP-seq data across specified lineages.
        :rtype: pandas.DataFrame
        """
        combined_df = pd.DataFrame()
        for lineage in lineage_of_interest:
            file_nms = sorted(glob.glob(os.path.join(self.chip_seq_dir, f"{lineage}*.bed.tsv")))
            for file_nm in file_nms:
                df = pd.read_csv(file_nm, sep="\t")
                df["lineage"] = lineage
                tf_name = os.path.basename(file_nm).split("_")[1]  # Extracting TF name (second element after split)
                df["TF"] = tf_name
                combined_df = pd.concat([combined_df, df], ignore_index=True)
        return combined_df



[docs]
    def load_rna_data(self, f_exp, f_gp):
        """
        Loads RNA-seq expression data and associated gene-phenotype p-values from specified files.

        :param f_exp: Filename of the RNA-seq expression data file.
        :type f_exp: str
        :param f_gp: Filename of the gene-phenotype p-values data file.
        :type f_gp: str
        :return: Tuple containing expression data and gene-phenotype p-values.
        :rtype: (pandas.DataFrame, pandas.DataFrame)
        """
        expr = pd.read_csv(
            os.path.join(self.rna_seq_dir, f_exp), index_col=0, delimiter="," if f_exp.endswith(".csv") else "\t"
        )
        gene_pheno_pval = pd.read_csv(
            os.path.join(self.rna_seq_dir, f_gp), index_col=0, delimiter="," if f_gp.endswith(".csv") else "\t"
        )
        return expr, gene_pheno_pval



[docs]
    def load_tf_list(self, f_tf):
        """
        Loads a list of transcription factors from a specified CSV file.

        :param f_tf: Path to the CSV file containing the list of transcription factors.
        :type f_tf: str
        :return: DataFrame containing the list of transcription factors.
        :rtype: pandas.DataFrame
        """
        tf_list = pd.read_csv(f_tf)
        return tf_list





[docs]
class DataProcessor:
    """
    Processes and filters RNA-seq and ChIP-seq data to ensure consistency among gene and TF sets.
    This class is crucial for preparing the data for subsequent analysis, including filtering based on shared genes and TFs.
    """

    def __init__(self, chip_data, GEx_gene_tf, gene_phenotype, hm_tf_df, chip_pvalue_of_interest):
        """
        Initializes the DataProcessor with various data sources for processing.

        :param chip_data: ChIP-seq data as a pandas DataFrame.
        :type chip_data: pandas.DataFrame
        :param GEx_gene_tf: Gene and TF expression data as a pandas DataFrame.
        :type GEx_gene_tf: pandas.DataFrame
        :param gene_phenotype: Gene-phenotype data as a pandas DataFrame.
        :type gene_phenotype: pandas.DataFrame
        :param hm_tf_df: Human transcription factors data as a pandas DataFrame.
        :type hm_tf_df: pandas.DataFrame
        :param chip_pval_flag: Column name in ChIP-seq data for the p-value of interest.
        :type chip_pval_flag: str
        """
        self.chip_data = chip_data
        self.GEx_gene_tf = GEx_gene_tf
        self.gene_phenotype = gene_phenotype
        self.hm_tf_list = set(hm_tf_df.Symbol.tolist())
        self.gex_genes_tfs = set(list(self.GEx_gene_tf.index.values))
        self.chip_pval_flag = chip_pvalue_of_interest

    def _find_common_genes(self):  # for DE genes, ChIP genes, and GEx genes
        """
        Identifies common genes across RNA-seq and ChIP-seq datasets.

        :return: A set of gene IDs common across all datasets.
        :rtype: set
        """
        chip_genes = set(self.chip_data["Gene_ID"])  # 14982 genes
        de_genes = set(self.gene_phenotype.index.values)  # 1745 DE genes
        intersect = chip_genes & de_genes & self.gex_genes_tfs  # 1624 DE-ChIP-voom genes
        print(
            f" >> Originally, # DE genes: {len(de_genes)}, # ChIP genes: {len(chip_genes)}, # GEx genes: {len(self.gex_genes_tfs)}"
        )
        print(f" >> after intersection, # DE-ChIP genes: {len(intersect)}")
        return intersect

    def _find_common_tfs(self):
        """
        Identifies common transcription factors across RNA-seq and ChIP-seq datasets.

        :return: A set of TF identifiers common to all datasets.
        :rtype: set
        """
        chip_tfs = set(self.chip_data["TF"])
        return chip_tfs & self.hm_tf_list & self.gex_genes_tfs


[docs]
    def filter_data(self):
        """
        Filters RNA-seq and ChIP-seq data to ensure consistency among gene and transcription factor (TF) sets,
        and prepares the data for subsequent analysis steps. This method processes the data by:
        - Identifying common genes and TFs across different datasets.
        - Filtering ChIP-seq data to keep rows that have genes and TFs present in both lineages.
        - Filtering to keep the minimal p-value across peaks for each (TF, gene, lineage) combination.
        - Preparing expression data matrices for genes and TFs for Elastic Net regression.

        :return: A tuple containing:
            - pandas.DataFrame: Filtered ChIP-seq data with minimal p-values across peaks, reset index.
            - pandas.DataFrame: Gene expression data for shared genes.
            - pandas.DataFrame: TF expression data for shared TFs.
            - pandas.DataFrame: Gene-phenotype data for shared genes.
        :rtype: tuple
        """
        shared_genes = self._find_common_genes()  # 1624 DE-ChIP-Pheno genes
        shared_tfs = self._find_common_tfs()

        print(f" Now filtering ChIP-seq data..")
        # filter 1: filter chip to keep rows that have gene on both lineages
        tmp = self.chip_data[self.chip_data["Gene_ID"].isin(shared_genes) & self.chip_data["TF"].isin(shared_tfs)]
        # filter 2: filter chip to keep rows that have TF on both lineages
        filtered_chip = tmp.groupby(["Gene_ID", "TF"]).filter(
            lambda x: x["lineage"].nunique() > 1
        )  # 1405 DE-ChIP-voom genes

        # filter 3: filter chip to keep (TF, gene, lineage) with lowest p-value across PEAKS
        filtered_chip_lowest_pval_across_peaks = (
            filtered_chip.sort_values(self.chip_pval_flag).groupby(["TF", "Gene_ID", "lineage"]).head(1)
        )

        chip_genes = set(filtered_chip_lowest_pval_across_peaks["Gene_ID"].tolist())
        chip_tfs = set(filtered_chip_lowest_pval_across_peaks["TF"].tolist())

        # [GEX data] split to gene and tf expression, prepare for Elastic Net
        expr_gene = self.GEx_gene_tf.loc[list(chip_genes)]  # (1405, 28)
        expr_tf = self.GEx_gene_tf.loc[list(chip_tfs)]  # (22, 28)
        filtered_gp = self.gene_phenotype.loc[list(chip_genes)]  # (1405, 5)

        # find shared genes and tfs after filtering
        refined_shared_genes = set(expr_gene.index.tolist()) & set(chip_genes)
        refined_shared_tfs = set(expr_tf.index.tolist()) & set(chip_tfs)

        print(f" >> (refined) # shared genes in ChIP-seq, DE, and GEx: {len(refined_shared_genes)}")
        print(f" >> (refined) # shared TFs in ChIP-seq, DE, and GEx: {len(refined_shared_tfs)}")

        # assert
        assert expr_gene.shape[0] == len(chip_genes), "Number of genes in expr_gene does not match chip_genes"
        assert expr_tf.shape[0] == len(chip_tfs), "Number of genes in expr_tf does not match chip_tfs"

        return filtered_chip_lowest_pval_across_peaks.reset_index(drop=True), expr_gene, expr_tf, filtered_gp





[docs]
class ElasticNetProcessor:
    """
    Conducts Elastic Net regression to identify relationships between gene expressions and transcription factors
    within the InPheRNo-ChIP project. It dynamically adjusts model parameters to optimize fit and manage convergence.

    :param expr_gene: DataFrame containing gene expression data.
    :type expr_gene: pandas.DataFrame
    :param expr_tf: DataFrame containing TF expression data.
    :type expr_tf: pandas.DataFrame
    :param max_num_coefs: Maximum allowed non-zero coefficients in the model, provides a control over model complexity.
    :type max_num_coefs: int, optional
    """

    def __init__(self, expr_gene, expr_tf, max_num_coefs=None):
        """
        Initializes the processor with gene and TF expression data sets, and sets up the Elastic Net model parameters.

        :param expr_gene: Gene expression data as a pandas DataFrame.
        :param expr_tf: TF expression data as a pandas DataFrame.
        :param max_num_coefs: Optional; sets a cap on the number of non-zero coefficients to prevent overfitting.
        """
        self.expr_gene = expr_gene
        self.expr_tf = expr_tf
        self.l1_rat = 0.5
        self.eps = sys.float_info.min  # 3e-308
        self.max_num_coefs = max_num_coefs
        self.ls_genes = list(self.expr_gene.index.values)
        self.ls_tfs = list(self.expr_tf.index.values)

        assert self.max_num_coefs is not None, "max_num_coefs must be specified"


[docs]
    def perform_elastic_net(self):
        """ 
        Performs Elastic Net regression across all genes, using transcription factor expressions to predict gene activity.

        :return: DataFrame with p-values indicating the significance of associations between genes and TFs.
        :rtype: pandas.DataFrame

        :note: This function is adapted from the InPheRNo paper with updated eps and n_alphas values to improve model performance and stability.
        """
        print(f"Performing Elastic Net regression on gene and TF expression data..")
        X_features = self.expr_tf.values.T
        pvalue_gt_array = (-1) * np.ones((len(self.ls_genes), len(self.ls_tfs)))
        convergence_issues = {}

        for i, gene in enumerate(self.ls_genes):
            logging.info(f"Pvalue_gene_tf: {i}")
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", ConvergenceWarning)

                y = self.expr_gene.iloc[i].values
                EN_model = ElasticNetCV(l1_ratio=self.l1_rat, max_iter=100000)

                #### make sure that number of nonzero coefs do not exceed max_num_coefs
                alphas1, coefs1, _ = EN_model.path(
                    X_features, y, eps=0.001, n_alphas=10000
                )  # this setting gives # non-zero coefs closed to 22.
                num_coefs = np.sum(coefs1 != 0, axis=0)
                # print(num_coefs)
                # print(num_coefs[num_coefs <= max_num_coefs][-1])
                rep_EN = 0
                if num_coefs[-1] < self.max_num_coefs:
                    EN_coef = coefs1[:, -1]
                    selected_ind = np.array(range(len(self.ls_tfs)))[EN_coef != 0]
                else:
                    while (
                        (num_coefs[0] != num_coefs[-1])
                        and (max(num_coefs[num_coefs <= self.max_num_coefs]) != self.max_num_coefs)
                        and (rep_EN < 10)
                    ):
                        rep_EN += 1
                        alpha_min = alphas1[(num_coefs <= self.max_num_coefs)][-1]
                        alpha_max = alphas1[(num_coefs > self.max_num_coefs)][0]
                        alphas3 = np.linspace(alpha_min, alpha_max, 10)
                        alphas1, coefs1, _ = EN_model.path(X_features, y, alphas=alphas3)
                        num_coefs = np.sum(coefs1 != 0, axis=0)
                        # print(num_coefs)
                        # print(num_coefs[num_coefs <= max_num_coefs][-1])
                    print("repeat", rep_EN)
                    if num_coefs[0] == num_coefs[-1]:
                        EN_coef = coefs1[:, 0]
                        selected_ind = np.array(range(len(self.ls_tfs)))[EN_coef != 0]
                    else:
                        EN_coef = coefs1[:, len(num_coefs[num_coefs <= self.max_num_coefs]) - 1]
                        selected_ind = np.array(range(len(self.ls_tfs)))[EN_coef != 0]

                print(len(selected_ind))
                # n_selected_tf = len(selected_ind)
                X_features_new = X_features[:, selected_ind]

                model = sm.OLS(y, X_features_new)
                results = model.fit()

                ts_b = results.tvalues
                if any(item.category == ConvergenceWarning for item in w):
                    convergence_issues[gene] = 1  # Set to 1 to indicate a warning for this gene

            #####prcise estimation of p-vals
            N = np.shape(X_features)[0] - 1  # degrees of freedom in ttest for regression
            pvals_precise = []
            x1_bar = 0  # mean of the first distribution
            n1 = (N + 2) // 2  # num obs first distribution
            n2 = N + 2 - n1  # num obs second distribution
            x2_std = 0
            t_precise = []
            for j in range(len(ts_b)):
                T = ts_b[j]  # T statistic
                x2_bar = -np.sign(T)  # mean of the second distribution
                x1_std = 1 / T * np.sqrt((n1 + n2 - 2) / (n1 - 1) / (1 / n1 + 1 / n2))  # std of first distribution
                (t, p) = ss.ttest_ind_from_stats(x1_bar, x1_std, n1, x2_bar, x2_std, n2)
                if p < self.eps:
                    p = self.eps
                pvals_precise.append(p)
                t_precise.append(t)

            pvalue_gt_array[i, selected_ind] = pvals_precise

        pvalue_gt_df = pd.DataFrame(
            pvalue_gt_array,
            index=self.ls_genes,
            columns=self.ls_tfs,
        )
        return pvalue_gt_df





[docs]
def save_output_files(args, chip_gt, rna_gt, rna_gp):
    """
    Saves processed data to specified output files as part of the InPheRNo-ChIP project. This function handles the
    creation of output directories and manages the naming conventions for output files based on the provided command-line arguments.

    :param args: Parsed command-line arguments containing output file paths and names.
    :type args: argparse.Namespace
    :param chip_gt: Processed ChIP-seq data to be saved.
    :type chip_gt: pandas.DataFrame
    :param rna_gt: Processed RNA-seq gene-TF data to be saved.
    :type rna_gt: pandas.DataFrame
    :param rna_gp: Processed RNA-seq gene-phenotype data to be saved.
    :type rna_gp: pandas.DataFrame
    """
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    addr_out_pvalue_gp = os.path.join(
        args.out_dir,
        f"{args.output_gene_phenotype}{'_'.join(args.lineage_of_interest)}.csv",
    )
    rna_gp.to_csv(addr_out_pvalue_gp, index=True)
    addr_out_pvalue_gt = os.path.join(
        args.out_dir,
        f"{args.output_gene_tf}{'_'.join(args.lineage_of_interest)}.csv",
    )
    rna_gt.to_csv(addr_out_pvalue_gt, index=True)
    addr_out_chip_gt = os.path.join(
        args.out_dir,
        f"{args.output_chip_tf_gene_peaks}{'_'.join(args.lineage_of_interest)}.csv",
    )
    chip_gt.to_csv(addr_out_chip_gt, index=False)
    print(f"Saved 3 output files to {args.out_dir}.")




[docs]
def step1_main():
    """
    Main function for the first step in the InPheRNo-ChIP project pipeline. Orchestrates the execution of data loading,
    processing, Elastic Net regression, and saving the output files. This function acts as the entry point when running the script.
    
    This function follows these steps:
    - 1. Parses command-line arguments.
    - 2. Loads data from specified directories.
    - 3. Processes data to align gene and TF datasets.
    - 4. Performs Elastic Net regression to analyze gene-TF relationships.
    - 5. Saves the processed data to output files.
    
    :raises SystemExit: If the dimensions of gene-phenotype data and results from Elastic Net regression do not match, indicating a potential issue in data alignment or processing.
    """
    parser = Step1ArgumentParser()
    args = parser.parse_args()

    # Load data
    data_loader = DataLoader(args.in_dir_chip, args.in_dir_rna)
    chip_both_lineage = data_loader.load_chip_data(args.lineage_of_interest)
    expr_all, gene_phenotype_interest = data_loader.load_rna_data(
        args.input_expression, args.input_gene_phenotype_interest
    )
    tf_list = data_loader.load_tf_list(args.input_tf_addr)

    # Filter RNA-seq and ChIP-seq data based on common genes and TFs
    processor = DataProcessor(
        chip_both_lineage, expr_all, gene_phenotype_interest, tf_list, args.chip_pvalue_of_interest
    )
    chip_gt, tmp_expr_gene, tmp_expr_tf, rna_gp = processor.filter_data()  # 1405 DE-ChIP-Voom Genes

    # Perform Elastic Net on GENE and TF expression
    en_processor = ElasticNetProcessor(tmp_expr_gene, tmp_expr_tf, args.max_num_coefs)
    rna_gt = en_processor.perform_elastic_net()

    # Save results
    if (rna_gp.index != rna_gt.index).sum() > 0:
        sys.exit("[GEx] Dimensions do not match!")
    save_output_files(args, chip_gt, rna_gt, rna_gp)



if __name__ == "__main__":
    print_logo("Running step 1")
    step1_main()