# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import requests
import json
import time
import scipy.stats as ss
from maayanlab_bioinformatics.normalization.filter import filter_by_expr
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.enrichment import enrich_crisp
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from math import log2, ceil
from IPython.display import display, Markdown
from random import sample
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
import urllib.request

Metadata file shape: 8 samples, 2 features
RNA-seq data file shape: 27780 genes, 8 samples

The pipeline will be run on 8 samples.

Number of genes after filtering by low expression: 15376

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 2.47 seconds.

Fitting dispersion trend curve...
... done in 5.79 seconds.

Fitting MAP dispersions...
... done in 2.89 seconds.

Fitting LFCs...
... done in 1.34 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 0.91 seconds.

Log2 fold change & Wald test p-value: Class Dex vs Control

Sample sizes <16 generally do not provide good results -- Wilcoxon Rank Sum is skipped.


# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import requests
import json
import time
import scipy.stats as ss
from maayanlab_bioinformatics.normalization.filter import filter_by_expr
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.enrichment import enrich_crisp
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from math import log2, ceil
from IPython.display import display, Markdown
from random import sample
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
import urllib.request


import warnings
warnings.filterwarnings('ignore')


# Set all variables
meta_class_column_name = 'Class'
control_name = 'Control'
treatment_name = 'Dex'

data_dir = '../RNAseq_data'
meta_data_filename = 'GSE159094_series_matrix.txt'
rnaseq_data_filename = 'GSE159094_counts.txt'


# Load in metadata
try:
    if meta_data_filename.endswith('.csv'): 
        meta_df = pd.read_csv(f"{data_dir}/{meta_data_filename}", index_col=0)
    else:
        meta_df = pd.read_csv(f"{data_dir}/{meta_data_filename}", sep="\t", index_col=0)
    print(f"Metadata file shape: {meta_df.shape[0]} samples, {meta_df.shape[1]} features")
except:
    print("Error! Please ensure the metadata file is in CSV, TXT, or TSV format.")

# Load in data
try:
    if rnaseq_data_filename.endswith('.csv'): 
        expr_df = pd.read_csv(f"{data_dir}/{rnaseq_data_filename}", index_col=0).sort_index()
    else:
        expr_df = pd.read_csv(f"{data_dir}/{rnaseq_data_filename}", sep="\t", index_col=0).sort_index()
    print(f"RNA-seq data file shape: {expr_df.shape[0]} genes, {expr_df.shape[1]} samples")
except:
    print("Error! Please ensure the RNA-seq file is in CSV, TXT, or TSV format.")

# Match samples between the metadata and the dataset
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' was not found in the metadata.")

shared_samples = set(meta_df.index).intersection(expr_df.columns)
meta_df = meta_df[meta_df.index.isin(shared_samples)]
meta_df = pd.concat([
    meta_df[meta_df['Class'] == control_name],
    meta_df[meta_df['Class'] == treatment_name]
]) 
expr_df = expr_df[meta_df.index]
print(f"\nThe pipeline will be run on {len(shared_samples)} samples.")

Metadata file shape: 8 samples, 2 features
RNA-seq data file shape: 27780 genes, 8 samples

The pipeline will be run on 8 samples.


## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
expr_df = filter_by_expr(expr_df, group=meta_df[meta_class_column_name].apply(lambda x: x==treatment_name))
print(f"Number of genes after filtering by low expression: {expr_df.shape[0]}")

Number of genes after filtering by low expression: 15376


# Display meta data
meta_df


# Display expression data
expr_df


signatures = {}


# Function for computing signatures with characteristic direction
def cd_signature(control, treatment, data, metadata, meta_class_column_name):
    
    ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist() #control
    case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
    
    signature = characteristic_direction(
        data.loc[:, ctrl_ids], 
        data.loc[:, case_ids], 
        calculate_sig=True
    )
    signature = signature.sort_values("CD-coefficient", ascending=False)
    signature['Significance'] = signature['CD-coefficient'].apply(abs)
    
    return signature

signatures['cd'] = cd_signature(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown(f"Characteristic Direction Signature"))
display(signatures['cd'].drop(columns=['Significance']))


# Function for computing signatures
def limma(control, treatment, data, metadata, meta_class_column_name):

    ctrl_ids = metadata.loc[metadata[meta_class_column_name] == control, :].index.tolist()  # control
    case_ids = metadata.loc[metadata[meta_class_column_name] == treatment, :].index.tolist()  # case

    # run limma
    signature = limma_voom_differential_expression(
        data.loc[:, ctrl_ids],
        data.loc[:, case_ids],
        voom_design=True,
        filter_genes=False
    )
    signature = signature.sort_values("t", ascending=False)
    signature['Significance'] = signature['P.Value']
    signature['StatVal'] = signature['t'].apply(abs)

    # return result
    return signature

signatures['limma'] = limma(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown(f"Limma-voom Signature"))
display(signatures['limma'].drop(columns=['Significance', 'StatVal']))


# Function for computing signatures with pyDESeq2
def deseq2(data, metadata, meta_class_column_name):
    dds = DeseqDataSet(
        counts=data.T,
        clinical=metadata,
        design_factors=meta_class_column_name
    )
    dds.deseq2()
    signature = DeseqStats(dds)
    return signature

signatures['pydeseq2'] = deseq2(expr_df, meta_df, meta_class_column_name)
display(Markdown(f"pyDESeq2 Signature"))
signatures['pydeseq2'].summary()
signatures['pydeseq2'].results_df['Significance'] = signatures['pydeseq2'].results_df['pvalue']
signatures['pydeseq2'].results_df['StatVal'] = signatures['pydeseq2'].results_df['stat'].apply(abs)
signatures['pydeseq2'] = signatures['pydeseq2'].results_df.sort_values(by=['stat'], ascending=False)

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 2.47 seconds.

Fitting dispersion trend curve...
... done in 5.79 seconds.

Fitting MAP dispersions...
... done in 2.89 seconds.

Fitting LFCs...
... done in 1.34 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 0.91 seconds.

Log2 fold change & Wald test p-value: Class Dex vs Control


def ranksum(control, treatment, data, metadata, meta_class_column_name):
  res_array = []
  ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist()
  case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
  for gene in data.index: 
    res = ss.ranksums(
      data.loc[gene, case_ids],
      data.loc[gene, ctrl_ids]
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  signature['StatVal'] = signature['Statistic'].apply(abs)
  return signature.sort_values(by=['Statistic'], ascending=False)


if expr_df.shape[1] <  32: 
  print("Sample sizes <16 generally do not provide good results -- Wilcoxon Rank Sum is skipped. ")
else:
  signatures['ranksum'] = ranksum(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
  display(Markdown(f"Wilcoxon Rank-Sum Test"))
  display(signatures['ranksum'].drop(columns=['Significance', 'StatVal']))

Sample sizes <16 generally do not provide good results -- Wilcoxon Rank Sum is skipped.


def ttest(control, treatment, data, metadata, meta_class_column_name):
  res_array = []
  ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist()
  case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
  for gene in data.index: 
    res = ss.ttest_ind(
      data.loc[gene, case_ids],
      data.loc[gene, ctrl_ids],
      equal_var = False
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  signature['StatVal'] = signature['Statistic'].apply(abs)
  return signature.sort_values(by=['Statistic'], ascending=False)

signatures['ttest'] = ttest(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown(f"Welch's t-test"))
display(signatures['ttest'].drop(columns=['Significance', 'StatVal']))


# Function for computing signatures with fold change
def logFC(control, treatment, data, metadata, meta_class_column_name):
    ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist()
    case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
    signature = data.loc[:, case_ids].mean(axis=1) / (data.loc[:, ctrl_ids].mean(axis=1) + 0.001)
    signature_df = pd.DataFrame(
        signature.apply(lambda x: log2(x+0.001)) \
        .sort_values(ascending=False), columns=['logFC']
    )
    signature_df['Significance'] = signature_df['logFC'].apply(abs)
    return signature_df

signatures['logfc'] = logFC(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown("Log Fold Change Signature"))
display(signatures['logfc'].drop(columns=['Significance']))


# Get gene lists to put into Enrichr
gene_lists = {'up': {}, 'down': {}, 'combined': {}}
for method in signatures.keys():
    if method == 'ranksum' and expr_df.shape[1] < 16: continue
    gene_lists['up'][method] = signatures[method].head(100).index.tolist()
    gene_lists['down'][method] = signatures[method].tail(100).index.tolist()
    gene_lists['combined'][method] = gene_lists['up'][method] + gene_lists['down'][method]


# Function to get Enrichr Results

def getEnrichrLibrary(library_name): 
    ENRICHR_URL = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={library_name}'
    resp = requests.get(ENRICHR_URL)
    if not resp.ok: 
        raise Exception(f"Error downloading {library_name} library from Enrichr, please try again.")
    return resp.json()[library_name]['terms']

def getLibraryIter(libdict):
    for k,v in libdict.items():
        if type(v) == list:
            yield k, v
        else:
            yield k, list(v.keys())

def enrich(gene_list, lib_json, name): 
    gene_list = [x.upper() for x in gene_list]
    all_terms = list(lib_json.keys())
    termranks = []
    enrich_res = enrich_crisp(gene_list, getLibraryIter(lib_json), 20000, False)
    enrich_res = [[r[0], r[1].pvalue] for r in enrich_res]
    sorted_res = sorted(enrich_res, key=lambda x: x[1])
    for i in range(len(sorted_res)): 
        termranks.append([name, sorted_res[i][0], i])
    for t in set(all_terms).difference([x[1] for x in termranks]): 
        i+=1
        termranks.append([name, t, i])
    return pd.DataFrame(termranks, columns=['Gene_Set', 'Term', 'Rank'])


chea2022 = getEnrichrLibrary('ChEA_2022')


# Get results
chea2022_results = []

for m in gene_lists['up'].keys(): 
    chea2022_results.append(enrich(gene_lists['up'][m], chea2022, f"{m}:up:ChEA 2022"))
    chea2022_results.append(enrich(gene_lists['down'][m], chea2022, f"{m}:down:ChEA 2022"))
    chea2022_results.append(enrich(gene_lists['combined'][m], chea2022, f"{m}:combined:ChEA 2022"))

chea2022_df = pd.concat(chea2022_results, axis=0)


targets = [
  'NR3C1', 
  'NR1I2',
  'NR0B1'
]

dex_chea2022_df = chea2022_df[chea2022_df['Term'].apply(lambda term: any(t in term for t in targets))]


def createResultsDf(df):
  df['Method'] = df['Gene_Set'].apply(lambda x: x.split(':')[0])
  df['Direction'] = df['Gene_Set'].apply(lambda x: x.split(':')[1])
  df['TF'] = df['Term'].apply(lambda x: x.split(' ')[0].split('_')[0])
  df['Library'] = df['Gene_Set'].apply(lambda x: x.split(':')[2])


createResultsDf(dex_chea2022_df)


full_df = dex_chea2022_df
full_df = full_df[full_df['Term'].apply(lambda x: 'MLEC' not in x)]

up_df = full_df[full_df['Direction'] == 'up']
down_df = full_df[full_df['Direction'] == 'down']
combined_df = full_df[full_df['Direction'] == 'combined']


# Calculate and sort by mean rank grouping by method and name
display(Markdown("#### Opposite Direction"))

up_df_averages_byTF = up_df.groupby(['TF', 'Library', 'Method']).mean().sort_values(['TF', 'Library', 'Rank']).apply(lambda x: round(x, 2))
display(Markdown("Mean rankings for each Dex target TF, based on enrichment analysis of up genes from each method"))
display(up_df_averages_byTF)

down_df_averages_byTF = down_df.groupby(['TF', 'Library', 'Method']).mean().sort_values(['TF', 'Library', 'Rank']).apply(lambda x: round(x, 2))
display(Markdown("Mean rankings for each Dex target TF, based on enrichment analysis of down genes from each method"))
display(down_df_averages_byTF)

combined_df_averages_byTF = combined_df.groupby(['TF', 'Library', 'Method']).mean().sort_values(['TF', 'Library', 'Rank']).apply(lambda x: round(x, 2))
display(Markdown("Mean rankings for each Dex target TF, based on enrichment analysis of combined up/down genes from each method"))
display(combined_df_averages_byTF)


# bootstrap random results
random_arr_chea2022 = []
for i in range(10):
    rand_100 = sample(expr_df.index.tolist(), 100)
    rand_200 = sample(expr_df.index.tolist(), 200)

    random_arr_chea2022.append(enrich(rand_100, chea2022, 'random:100'))   
    random_arr_chea2022.append(enrich(rand_200, chea2022, 'random:200'))   

rand_chea2022_df = pd.concat(random_arr_chea2022, axis=0)
rand_chea2022_df['Library'] = 'ChEA 2022'
rand_chea2022_df['TF'] = rand_chea2022_df['Term'].apply(lambda x: x.split(' ')[0].upper())

rand_df = rand_chea2022_df
rand_df = rand_df[rand_df['TF'].isin(['NR3C1', 'NR0B1', 'NR1I2'])]
rand_df['Method'] = 'random'


method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'pydeseq2': '#DC267F',
    'logfc': '#FE6100',
    'ttest': '#FFB000',
    'ranksum': '#B3B3B3',
    'paired-ttest': '#009E73'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf].set_index(['Method'])
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Method']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df.loc[gs]['Rank'].tolist(),
                name=gs, 
                marker_color=method_color_map[gs.split(':')[0]]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")


method_color_map = {
    'up': 'blue',
    'down': 'crimson',
    'combined': 'purple'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf].set_index(['Direction'])
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Direction']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df.loc[gs]['Rank'].tolist(),
                name=gs, 
                marker_color=method_color_map[gs]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")


method_color_map = {
    'up': 'blue',
    'down': 'crimson',
    'combined': 'purple'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf]
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Gene_Set'] == gs]['Rank'],
                name=gs.split(':ChEA')[0], 
                marker_color=method_color_map[gs.split(':')[1]]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf].set_index(['Gene_Set'])
    for gs in sub_rand_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_rand_df.loc[gs]['Rank'],
                name=gs,
                marker_color='black'
            )
        )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Method:Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")


method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'pydeseq2': '#DC267F',
    'logfc': '#FE6100',
    'ttest': '#FFB000',
    'ranksum': '#B3B3B3',
    'paired-ttest': '#009E73'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf]
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Gene_Set'] == gs]['Rank'],
                name=gs.split(':ChEA')[0], 
                marker_color=method_color_map[gs.split(':')[0]]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf].set_index(['Gene_Set'])
    for gs in sub_rand_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_rand_df.loc[gs]['Rank'],
                name=gs,
                marker_color='black'
            )
        )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Method:Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")


target_genesets = []
target_genesets.append({t:chea2022[t] for t in dex_chea2022_df['Term'].unique() if ('MLEC' not in t and 'Wistar' not in t)})


def singleBridgePlot(gsname, signame, target_gs):
    geneset = target_gs[gsname]
    if signame == 'cd' or signame == 'logfc':
        sorted_genes = signatures[signame].sort_values(by='Significance', ascending=False)
    else: 
        sorted_genes = signatures[signame].sort_values(by=['Significance'], ascending=True)
    select = pd.Series(
        [x.upper() in geneset for x in sorted_genes.index.tolist()]
    )
    x, y = bridge_plot(select)
    x = x/len(x)
    return x,y

def randomBridgePlot(gsname, target_gs): 
    all_x = []
    all_y = []

    for _ in range(10):
        rand_sig = sample(
            signatures['logfc'].index.tolist(), 
            signatures['logfc'].shape[0]
        )
        select = pd.Series([
            x.upper() in target_gs[gsname] for x in rand_sig
        ])
        x, y = bridge_plot(select)
        x = x/len(x)
        all_x.append(x)
        all_y.append(y)

    return all_x, all_y


def build_tf_res(target_gs):
    tf_res = {tf: {'random': {'x': [], 'y': []}} for tf in full_df['TF'].unique()}
    for gs in target_gs.keys():
        tf = gs.split(' ')[0].split('_')[0]
        rand_x, rand_y = randomBridgePlot(gs, target_gs)
        tf_res[tf]['random']['x'] += rand_x
        tf_res[tf]['random']['y'] += rand_y
        for sig in signatures.keys():
            temp_x, temp_y = singleBridgePlot(gs, sig, target_gs)
            if sig in tf_res[tf].keys():
                tf_res[tf][sig]['x'].append(temp_x)
                tf_res[tf][sig]['y'].append(temp_y)
            else: 
                tf_res[tf][sig] = {'x': [temp_x], 'y': [temp_y]}
    return tf_res


significance_tf_results = [] 
for i in range(len(target_genesets)): 
    significance_tf_results.append(build_tf_res(target_genesets[i]))


libraries = ['ChEA 2022']


# with opposite direction queries
matplotlib.rcParams.update({'font.size': 12})
for tf in full_df['TF'].unique():
    fig = plt.figure()
    tf_x = {method: [] for method in significance_tf_results[0][tf].keys()}
    tf_y = {method: [] for method in significance_tf_results[0][tf].keys()}
    for i in range(len(target_genesets)):
      for sig in significance_tf_results[i][tf].keys(): 
        tf_x[sig].append(np.mean(significance_tf_results[i][tf][sig]['x'], axis=0))
        tf_y[sig].append(np.mean(significance_tf_results[i][tf][sig]['y'], axis=0))
    for m in tf_x.keys():
      if m == 'random': 
          plt.plot(np.mean(tf_x[m], axis=0), np.mean(tf_y[m], axis=0), label=m, color='gray')
      elif m == 'cd' or m == 'logfc': 
          plt.plot(np.mean(tf_x[m], axis=0), np.mean(tf_y[m], axis=0), label=m + ' (abs)')
      else:
          label = m.replace('limma', 'limma-voom')
          plt.plot(np.mean(tf_x[m], axis=0), np.mean(tf_y[m], axis=0), label=label)

    plt.axhline(y=0, color='black', linestyle='dashed')
    plt.legend(bbox_to_anchor=(1.25, 1))
    plt.title(f"{tf} Target Gene Rankings")
    plt.xlabel('Rank')
    plt.ylabel('D(r) - r')
    plt.show()
    
    if tf == 'NR3C1':
        fig.savefig('../../publicationfigs/Figure2E.png', format='png', dpi=300, bbox_inches='tight')


# with manual libraries
matplotlib.rcParams.update({'font.size': 12})
for tf in full_df['TF'].unique():
    fig = plt.figure()
    tf_x = {method: [] for method in significance_tf_results[0][tf].keys()}
    tf_y = {method: [] for method in significance_tf_results[0][tf].keys()}
    for i in range(len(libraries)):
      for sig in significance_tf_results[i][tf].keys(): 
        tf_x[sig].append(np.mean(significance_tf_results[i][tf][sig]['x'], axis=0))
        tf_y[sig].append(np.mean(significance_tf_results[i][tf][sig]['y'], axis=0))
    for m in tf_x.keys():
      try:
        leading_x = [arr[:ceil(0.01*len(arr))] for arr in tf_x[m]]
        leading_y = [arr[:ceil(0.01*len(arr))] for arr in tf_y[m]]
      except:
        leading_x = tf_x[m][0][:ceil(0.01*len(tf_x[m][0]))]
        leading_y = tf_y[m][0][:ceil(0.01*len(tf_y[m][0]))]
      if m == 'random': 
        plt.plot(np.mean(leading_x, axis=0), np.mean(leading_y, axis=0), label=m, color='gray')
      elif m == 'cd' or m == 'logfc': 
          plt.plot(np.mean(leading_x, axis=0), np.mean(leading_y, axis=0), label=m + ' (abs)')
      else:
        label = m.replace('limma', 'limma-voom')
        plt.plot(np.mean(leading_x, axis=0), np.mean(leading_y, axis=0), label=label)

    plt.axhline(y=0, color='black', linestyle='dashed')
    plt.legend(bbox_to_anchor=(1, 1))
    plt.title(f"{tf} Target Gene Rankings Leading Edge")
    plt.xlabel('Rank')
    plt.ylabel('D(r) - r')
    plt.show()

    if tf == 'NR3C1': 
      fig.savefig('../../publicationfigs/Figure2F.png', format='png', dpi=300, bbox_inches='tight')


rank_comparison = pd.concat([
  signatures['cd']['CD-coefficient'].rename('CD').rank(),
  signatures['limma']['t'].rename('Limma').rank(),
  signatures['pydeseq2']['stat'].rename('DESeq2').rank(),
  signatures['logfc']['logFC'].apply(abs).rename('FC').rank(),
  signatures['ttest']['Statistic'].rename('Ttest').rank()
], axis=1)


expr_comparison = pd.concat([
  signatures['cd']['CD-coefficient'].rename('CD'),
  signatures['limma']['t'].rename('Limma'),
  signatures['pydeseq2']['stat'].rename('DESeq2'),
  signatures['logfc']['logFC'].apply(abs).rename('FC'),
  signatures['ttest']['Statistic'].rename('Ttest')
], axis=1)


sns.set(font_scale=1.4)


expr_correlations = 1-pairwise_distances(
  expr_comparison.T, metric='cosine'
)
np.fill_diagonal(expr_correlations, 1)

hm, hm_ax = plt.subplots(figsize=(10,8))  
sns.heatmap(pd.DataFrame(
  expr_correlations, index=expr_comparison.columns, columns=expr_comparison.columns
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()


rank_correlations = 1-pairwise_distances(
  rank_comparison.T, metric='cosine'
)
np.fill_diagonal(rank_correlations, 1)

hm, hm_ax = plt.subplots(figsize=(10,8))  
sns.heatmap(pd.DataFrame(
  rank_correlations, index=rank_comparison.columns, columns=rank_comparison.columns
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()

	Class	Pair
Sample_geo_accession
GSM4819298	Control	A
GSM4819299	Control	B
GSM4819300	Control	C
GSM4819301	Control	D
GSM4819286	Dex	A
GSM4819287	Dex	B
GSM4819288	Dex	C
GSM4819289	Dex	D

	GSM4819298	GSM4819299	GSM4819300	GSM4819301	GSM4819286	GSM4819287	GSM4819288	GSM4819289
Gene
A1BG	47	34	81	68	47	43	64	108
A1BG-AS1	100	39	131	138	115	64	128	136
A2M	86	152	67	63	142	183	52	76
A2M-AS1	11	7	6	2	9	13	15	10
A4GALT	58	46	23	126	96	47	26	146
...	...	...	...	...	...	...	...	...
ZXDC	1172	1433	1091	1161	1241	1320	1219	1617
ZYG11B	586	400	584	558	568	609	609	816
ZYX	8578	16603	5281	7752	8013	12881	4818	8480
ZZEF1	3961	3932	4041	4233	3854	4072	3957	4973
ZZZ3	1222	1079	1178	1093	1047	880	1210	1460

	CD-coefficient
Gene
TXNIP	0.598894
THBS1	0.388535
ETS1	0.184504
TSC22D3	0.169194
FKBP5	0.150122
...	...
TGFBI	-0.088838
FTH1	-0.090782
B2M	-0.112363
TMSB4X	-0.123791
TPT1	-0.125212

	logFC	AveExpr	t	P.Value	adj.P.Val	B
TRERF1	1.825465	4.508545	13.291168	5.187956e-07	0.004934	6.894803
KLF9	1.867456	6.560516	12.878515	6.717834e-07	0.004934	6.714460
LRRK2	2.912159	6.192054	12.258192	1.005224e-06	0.004934	6.317300
ZBTB16	3.670881	4.851185	11.661825	1.507455e-06	0.004934	5.866904
LINC02207	3.464744	0.844296	11.194183	2.098745e-06	0.004934	4.298915
...	...	...	...	...	...	...
TLR10	-1.285433	2.091784	-7.367492	5.522794e-05	0.014898	2.384327
RGS16	-2.211374	2.980781	-7.487366	4.891621e-05	0.014191	2.527091
TRAF1	-1.407711	6.755081	-7.853439	3.407249e-05	0.012474	2.782042
RUBCNL	-1.524953	5.816443	-8.246933	2.343390e-05	0.010668	3.163307
IL2RB	-1.142484	7.761209	-10.275840	4.171544e-06	0.004934	4.931457

	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj
Gene
A1BG	60.268470	0.095760	0.349195	0.274231	0.783907	0.999945
A1BG-AS1	105.669663	0.067365	0.398423	0.169080	0.865733	0.999945
A2M	104.645454	0.299210	0.494294	0.605327	0.544962	0.997478
A2M-AS1	9.196878	0.824300	0.572923	1.438764	0.150218	0.691082
A4GALT	68.671378	0.221482	0.583034	0.379878	0.704036	0.999945
...	...	...	...	...	...	...
ZXDC	1278.353808	0.100737	0.123178	0.817823	0.413458	0.969259
ZYG11B	587.352193	0.235630	0.146619	1.607090	0.108035	0.598073
ZYX	9124.450095	-0.189578	0.416302	-0.455386	0.648831	0.999945
ZZEF1	4119.306199	0.011666	0.062953	0.185317	0.852980	0.999945
ZZZ3	1140.231483	-0.051144	0.122481	-0.417567	0.676264	0.999945

Benchmarking RNA-seq DEG Methods with GEO Data¶

Load in Data¶

Differential Gene Expression Methods¶

Characteristic Direction¶

Limma-Voom¶

DESeq2 (via pyDESeq2)¶

Wilcoxon Rank Sum Test¶

Welch's t-test¶

(log2) Fold Change¶

Enrichment Analysis with Enrichr¶

Comparing Rankings¶

Opposite Direction¶

Random results¶

Boxplots¶

Bridge plots¶

Signature similarity¶

Correlation of signature expression values¶

Correlation of signature gene ranks¶

	Statistic	Pvalue
Geneid
MIR6883	13.578695	0.000077
ADRB2	12.976908	0.000027
KLRD1	11.149057	0.000036
TRERF1	10.878677	0.000643
NGFR	10.118823	0.000108
...	...	...
GPR15	-7.701463	0.000406
ABAT	-8.150563	0.000503
CD207	-8.198041	0.000583
ADGRE3	-9.049533	0.000154
IL2RB	-16.054562	0.000004

	logFC
Gene
LINC01736	14.269565
LOC100131496	12.358102
ALOX15B	8.811228
GLDN	7.005479
VSTM2L	6.676612
...	...
PTGS2	-5.019101
CCL7	-5.332820
IL1A	-5.484828
TNFSF15	-5.564333
CCL2	-6.440086

			Rank
TF	Library	Method
NR0B1	ChEA 2022	cd	383.0
		logfc	592.0
		ttest	612.0
		limma	662.0
		pydeseq2	675.0
NR1I2	ChEA 2022	pydeseq2	0.0
		limma	12.0
		ttest	12.0
		logfc	31.0
		cd	209.0
NR3C1	ChEA 2022	limma	9.2
		pydeseq2	18.4
		logfc	23.6
		ttest	130.2
		cd	149.0