logo Dexamethasone Benchmark Resource
  • About
  • Code
  • Data
  • Figures
  • Stats

Benchmarking RNA-seq DEG Methods with GEO Data¶

This notebook reads in designated meta data and RNA-seq expression data files from a dexamethasone study and performs several DGE methods on the code.

The results of the DGE methods are benchmarked after enrichment analysis focused on the NR3C1, NR0B1, and NR1I2 transcription factors, known targets of dexamethasone.

# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import requests
import json
import time
import scipy.stats as ss
from maayanlab_bioinformatics.normalization.filter import filter_by_expr
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.enrichment import enrich_crisp
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from math import log2, ceil
from IPython.display import display, Markdown
from random import sample
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
import urllib.request
import warnings
warnings.filterwarnings('ignore')

Load in Data¶

Using data from GEO from the study "Development of a glucocorticoid exposure signature for SLE" (GSE159094)

# Set all variables
meta_class_column_name = 'Class'
control_name = 'Control'
treatment_name = 'Dex'

data_dir = '../RNAseq_data'
meta_data_filename = 'GSE159094_series_matrix.txt'
rnaseq_data_filename = 'GSE159094_counts.txt'
# Load in metadata
try:
    if meta_data_filename.endswith('.csv'): 
        meta_df = pd.read_csv(f"{data_dir}/{meta_data_filename}", index_col=0)
    else:
        meta_df = pd.read_csv(f"{data_dir}/{meta_data_filename}", sep="\t", index_col=0)
    print(f"Metadata file shape: {meta_df.shape[0]} samples, {meta_df.shape[1]} features")
except:
    print("Error! Please ensure the metadata file is in CSV, TXT, or TSV format.")

# Load in data
try:
    if rnaseq_data_filename.endswith('.csv'): 
        expr_df = pd.read_csv(f"{data_dir}/{rnaseq_data_filename}", index_col=0).sort_index()
    else:
        expr_df = pd.read_csv(f"{data_dir}/{rnaseq_data_filename}", sep="\t", index_col=0).sort_index()
    print(f"RNA-seq data file shape: {expr_df.shape[0]} genes, {expr_df.shape[1]} samples")
except:
    print("Error! Please ensure the RNA-seq file is in CSV, TXT, or TSV format.")

# Match samples between the metadata and the dataset
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' was not found in the metadata.")

shared_samples = set(meta_df.index).intersection(expr_df.columns)
meta_df = meta_df[meta_df.index.isin(shared_samples)]
meta_df = pd.concat([
    meta_df[meta_df['Class'] == control_name],
    meta_df[meta_df['Class'] == treatment_name]
]) 
expr_df = expr_df[meta_df.index]
print(f"\nThe pipeline will be run on {len(shared_samples)} samples.")
Metadata file shape: 8 samples, 2 features
RNA-seq data file shape: 27780 genes, 8 samples

The pipeline will be run on 8 samples.
## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
expr_df = filter_by_expr(expr_df, group=meta_df[meta_class_column_name].apply(lambda x: x==treatment_name))
print(f"Number of genes after filtering by low expression: {expr_df.shape[0]}")
Number of genes after filtering by low expression: 15376
# Display meta data
meta_df
Class Pair
Sample_geo_accession
GSM4819298 Control A
GSM4819299 Control B
GSM4819300 Control C
GSM4819301 Control D
GSM4819286 Dex A
GSM4819287 Dex B
GSM4819288 Dex C
GSM4819289 Dex D
# Display expression data
expr_df
GSM4819298 GSM4819299 GSM4819300 GSM4819301 GSM4819286 GSM4819287 GSM4819288 GSM4819289
Gene
A1BG 47 34 81 68 47 43 64 108
A1BG-AS1 100 39 131 138 115 64 128 136
A2M 86 152 67 63 142 183 52 76
A2M-AS1 11 7 6 2 9 13 15 10
A4GALT 58 46 23 126 96 47 26 146
... ... ... ... ... ... ... ... ...
ZXDC 1172 1433 1091 1161 1241 1320 1219 1617
ZYG11B 586 400 584 558 568 609 609 816
ZYX 8578 16603 5281 7752 8013 12881 4818 8480
ZZEF1 3961 3932 4041 4233 3854 4072 3957 4973
ZZZ3 1222 1079 1178 1093 1047 880 1210 1460

15376 rows × 8 columns

Differential Gene Expression Methods¶

signatures = {}

Characteristic Direction¶

# Function for computing signatures with characteristic direction
def cd_signature(control, treatment, data, metadata, meta_class_column_name):
    
    ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist() #control
    case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
    
    signature = characteristic_direction(
        data.loc[:, ctrl_ids], 
        data.loc[:, case_ids], 
        calculate_sig=True
    )
    signature = signature.sort_values("CD-coefficient", ascending=False)
    signature['Significance'] = signature['CD-coefficient'].apply(abs)
    
    return signature

signatures['cd'] = cd_signature(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown(f"Characteristic Direction Signature"))
display(signatures['cd'].drop(columns=['Significance']))

Characteristic Direction Signature

CD-coefficient
Gene
TXNIP 0.598894
THBS1 0.388535
ETS1 0.184504
TSC22D3 0.169194
FKBP5 0.150122
... ...
TGFBI -0.088838
FTH1 -0.090782
B2M -0.112363
TMSB4X -0.123791
TPT1 -0.125212

15376 rows × 1 columns

Limma-Voom¶

# Function for computing signatures
def limma(control, treatment, data, metadata, meta_class_column_name):

    ctrl_ids = metadata.loc[metadata[meta_class_column_name] == control, :].index.tolist()  # control
    case_ids = metadata.loc[metadata[meta_class_column_name] == treatment, :].index.tolist()  # case

    # run limma
    signature = limma_voom_differential_expression(
        data.loc[:, ctrl_ids],
        data.loc[:, case_ids],
        voom_design=True,
        filter_genes=False
    )
    signature = signature.sort_values("t", ascending=False)
    signature['Significance'] = signature['P.Value']
    signature['StatVal'] = signature['t'].apply(abs)

    # return result
    return signature

signatures['limma'] = limma(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown(f"Limma-voom Signature"))
display(signatures['limma'].drop(columns=['Significance', 'StatVal']))

Limma-voom Signature

logFC AveExpr t P.Value adj.P.Val B
TRERF1 1.825465 4.508545 13.291168 5.187956e-07 0.004934 6.894803
KLF9 1.867456 6.560516 12.878515 6.717834e-07 0.004934 6.714460
LRRK2 2.912159 6.192054 12.258192 1.005224e-06 0.004934 6.317300
ZBTB16 3.670881 4.851185 11.661825 1.507455e-06 0.004934 5.866904
LINC02207 3.464744 0.844296 11.194183 2.098745e-06 0.004934 4.298915
... ... ... ... ... ... ...
TLR10 -1.285433 2.091784 -7.367492 5.522794e-05 0.014898 2.384327
RGS16 -2.211374 2.980781 -7.487366 4.891621e-05 0.014191 2.527091
TRAF1 -1.407711 6.755081 -7.853439 3.407249e-05 0.012474 2.782042
RUBCNL -1.524953 5.816443 -8.246933 2.343390e-05 0.010668 3.163307
IL2RB -1.142484 7.761209 -10.275840 4.171544e-06 0.004934 4.931457

15376 rows × 6 columns

DESeq2 (via pyDESeq2)¶

# Function for computing signatures with pyDESeq2
def deseq2(data, metadata, meta_class_column_name):
    dds = DeseqDataSet(
        counts=data.T,
        clinical=metadata,
        design_factors=meta_class_column_name
    )
    dds.deseq2()
    signature = DeseqStats(dds)
    return signature

signatures['pydeseq2'] = deseq2(expr_df, meta_df, meta_class_column_name)
display(Markdown(f"pyDESeq2 Signature"))
signatures['pydeseq2'].summary()
signatures['pydeseq2'].results_df['Significance'] = signatures['pydeseq2'].results_df['pvalue']
signatures['pydeseq2'].results_df['StatVal'] = signatures['pydeseq2'].results_df['stat'].apply(abs)
signatures['pydeseq2'] = signatures['pydeseq2'].results_df.sort_values(by=['stat'], ascending=False)
Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 2.47 seconds.

Fitting dispersion trend curve...
... done in 5.79 seconds.

Fitting MAP dispersions...
... done in 2.89 seconds.

Fitting LFCs...
... done in 1.34 seconds.

Refitting 0 outliers.

pyDESeq2 Signature

Running Wald tests...
... done in 0.91 seconds.

Log2 fold change & Wald test p-value: Class Dex vs Control
baseMean log2FoldChange lfcSE stat pvalue padj
Gene
A1BG 60.268470 0.095760 0.349195 0.274231 0.783907 0.999945
A1BG-AS1 105.669663 0.067365 0.398423 0.169080 0.865733 0.999945
A2M 104.645454 0.299210 0.494294 0.605327 0.544962 0.997478
A2M-AS1 9.196878 0.824300 0.572923 1.438764 0.150218 0.691082
A4GALT 68.671378 0.221482 0.583034 0.379878 0.704036 0.999945
... ... ... ... ... ... ...
ZXDC 1278.353808 0.100737 0.123178 0.817823 0.413458 0.969259
ZYG11B 587.352193 0.235630 0.146619 1.607090 0.108035 0.598073
ZYX 9124.450095 -0.189578 0.416302 -0.455386 0.648831 0.999945
ZZEF1 4119.306199 0.011666 0.062953 0.185317 0.852980 0.999945
ZZZ3 1140.231483 -0.051144 0.122481 -0.417567 0.676264 0.999945

15376 rows × 6 columns

Wilcoxon Rank Sum Test¶

def ranksum(control, treatment, data, metadata, meta_class_column_name):
  res_array = []
  ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist()
  case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
  for gene in data.index: 
    res = ss.ranksums(
      data.loc[gene, case_ids],
      data.loc[gene, ctrl_ids]
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  signature['StatVal'] = signature['Statistic'].apply(abs)
  return signature.sort_values(by=['Statistic'], ascending=False)


if expr_df.shape[1] <  32: 
  print("Sample sizes <16 generally do not provide good results -- Wilcoxon Rank Sum is skipped. ")
else:
  signatures['ranksum'] = ranksum(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
  display(Markdown(f"Wilcoxon Rank-Sum Test"))
  display(signatures['ranksum'].drop(columns=['Significance', 'StatVal']))
Sample sizes <16 generally do not provide good results -- Wilcoxon Rank Sum is skipped. 

Welch's t-test¶

def ttest(control, treatment, data, metadata, meta_class_column_name):
  res_array = []
  ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist()
  case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
  for gene in data.index: 
    res = ss.ttest_ind(
      data.loc[gene, case_ids],
      data.loc[gene, ctrl_ids],
      equal_var = False
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  signature['StatVal'] = signature['Statistic'].apply(abs)
  return signature.sort_values(by=['Statistic'], ascending=False)

signatures['ttest'] = ttest(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown(f"Welch's t-test"))
display(signatures['ttest'].drop(columns=['Significance', 'StatVal']))

Welch's t-test

Statistic Pvalue
Geneid
MIR6883 13.578695 0.000077
ADRB2 12.976908 0.000027
KLRD1 11.149057 0.000036
TRERF1 10.878677 0.000643
NGFR 10.118823 0.000108
... ... ...
GPR15 -7.701463 0.000406
ABAT -8.150563 0.000503
CD207 -8.198041 0.000583
ADGRE3 -9.049533 0.000154
IL2RB -16.054562 0.000004

15376 rows × 2 columns

(log2) Fold Change¶

# Function for computing signatures with fold change
def logFC(control, treatment, data, metadata, meta_class_column_name):
    ctrl_ids = metadata.loc[metadata[meta_class_column_name]==control, :].index.tolist()
    case_ids = metadata.loc[metadata[meta_class_column_name]==treatment,:].index.tolist() #case
    signature = data.loc[:, case_ids].mean(axis=1) / (data.loc[:, ctrl_ids].mean(axis=1) + 0.001)
    signature_df = pd.DataFrame(
        signature.apply(lambda x: log2(x+0.001)) \
        .sort_values(ascending=False), columns=['logFC']
    )
    signature_df['Significance'] = signature_df['logFC'].apply(abs)
    return signature_df

signatures['logfc'] = logFC(control_name, treatment_name, expr_df, meta_df, meta_class_column_name)
display(Markdown("Log Fold Change Signature"))
display(signatures['logfc'].drop(columns=['Significance']))

Log Fold Change Signature

logFC
Gene
LINC01736 14.269565
LOC100131496 12.358102
ALOX15B 8.811228
GLDN 7.005479
VSTM2L 6.676612
... ...
PTGS2 -5.019101
CCL7 -5.332820
IL1A -5.484828
TNFSF15 -5.564333
CCL2 -6.440086

15376 rows × 1 columns

Enrichment Analysis with Enrichr¶

The function below uses an offline version of Enrichr to compute enrichment analysis.

# Get gene lists to put into Enrichr
gene_lists = {'up': {}, 'down': {}, 'combined': {}}
for method in signatures.keys():
    if method == 'ranksum' and expr_df.shape[1] < 16: continue
    gene_lists['up'][method] = signatures[method].head(100).index.tolist()
    gene_lists['down'][method] = signatures[method].tail(100).index.tolist()
    gene_lists['combined'][method] = gene_lists['up'][method] + gene_lists['down'][method]
# Function to get Enrichr Results

def getEnrichrLibrary(library_name): 
    ENRICHR_URL = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={library_name}'
    resp = requests.get(ENRICHR_URL)
    if not resp.ok: 
        raise Exception(f"Error downloading {library_name} library from Enrichr, please try again.")
    return resp.json()[library_name]['terms']

def getLibraryIter(libdict):
    for k,v in libdict.items():
        if type(v) == list:
            yield k, v
        else:
            yield k, list(v.keys())

def enrich(gene_list, lib_json, name): 
    gene_list = [x.upper() for x in gene_list]
    all_terms = list(lib_json.keys())
    termranks = []
    enrich_res = enrich_crisp(gene_list, getLibraryIter(lib_json), 20000, False)
    enrich_res = [[r[0], r[1].pvalue] for r in enrich_res]
    sorted_res = sorted(enrich_res, key=lambda x: x[1])
    for i in range(len(sorted_res)): 
        termranks.append([name, sorted_res[i][0], i])
    for t in set(all_terms).difference([x[1] for x in termranks]): 
        i+=1
        termranks.append([name, t, i])
    return pd.DataFrame(termranks, columns=['Gene_Set', 'Term', 'Rank'])
chea2022 = getEnrichrLibrary('ChEA_2022')
# Get results
chea2022_results = []

for m in gene_lists['up'].keys(): 
    chea2022_results.append(enrich(gene_lists['up'][m], chea2022, f"{m}:up:ChEA 2022"))
    chea2022_results.append(enrich(gene_lists['down'][m], chea2022, f"{m}:down:ChEA 2022"))
    chea2022_results.append(enrich(gene_lists['combined'][m], chea2022, f"{m}:combined:ChEA 2022"))

chea2022_df = pd.concat(chea2022_results, axis=0)
targets = [
  'NR3C1', 
  'NR1I2',
  'NR0B1'
]

dex_chea2022_df = chea2022_df[chea2022_df['Term'].apply(lambda term: any(t in term for t in targets))]
def createResultsDf(df):
  df['Method'] = df['Gene_Set'].apply(lambda x: x.split(':')[0])
  df['Direction'] = df['Gene_Set'].apply(lambda x: x.split(':')[1])
  df['TF'] = df['Term'].apply(lambda x: x.split(' ')[0].split('_')[0])
  df['Library'] = df['Gene_Set'].apply(lambda x: x.split(':')[2])
createResultsDf(dex_chea2022_df)
full_df = dex_chea2022_df
full_df = full_df[full_df['Term'].apply(lambda x: 'MLEC' not in x)]

up_df = full_df[full_df['Direction'] == 'up']
down_df = full_df[full_df['Direction'] == 'down']
combined_df = full_df[full_df['Direction'] == 'combined']

Comparing Rankings¶

Below are tables and graphs with a summary of the rankings results. For each method, the rankings are averaged across the different dexamethasone target terms.

# Calculate and sort by mean rank grouping by method and name
display(Markdown("#### Opposite Direction"))

up_df_averages_byTF = up_df.groupby(['TF', 'Library', 'Method']).mean().sort_values(['TF', 'Library', 'Rank']).apply(lambda x: round(x, 2))
display(Markdown("Mean rankings for each Dex target TF, based on enrichment analysis of up genes from each method"))
display(up_df_averages_byTF)

down_df_averages_byTF = down_df.groupby(['TF', 'Library', 'Method']).mean().sort_values(['TF', 'Library', 'Rank']).apply(lambda x: round(x, 2))
display(Markdown("Mean rankings for each Dex target TF, based on enrichment analysis of down genes from each method"))
display(down_df_averages_byTF)

combined_df_averages_byTF = combined_df.groupby(['TF', 'Library', 'Method']).mean().sort_values(['TF', 'Library', 'Rank']).apply(lambda x: round(x, 2))
display(Markdown("Mean rankings for each Dex target TF, based on enrichment analysis of combined up/down genes from each method"))
display(combined_df_averages_byTF)

Opposite Direction¶

Mean rankings for each Dex target TF, based on enrichment analysis of up genes from each method

Rank
TF Library Method
NR0B1 ChEA 2022 cd 383.0
logfc 592.0
ttest 612.0
limma 662.0
pydeseq2 675.0
NR1I2 ChEA 2022 pydeseq2 0.0
limma 12.0
ttest 12.0
logfc 31.0
cd 209.0
NR3C1 ChEA 2022 limma 9.2
pydeseq2 18.4
logfc 23.6
ttest 130.2
cd 149.0

Mean rankings for each Dex target TF, based on enrichment analysis of down genes from each method

Rank
TF Library Method
NR0B1 ChEA 2022 cd 173.0
limma 178.0
logfc 200.0
pydeseq2 384.0
ttest 593.0
NR1I2 ChEA 2022 limma 350.0
pydeseq2 352.0
ttest 613.0
logfc 629.0
cd 683.0
NR3C1 ChEA 2022 pydeseq2 187.8
limma 223.0
logfc 274.6
cd 352.6
ttest 419.8

Mean rankings for each Dex target TF, based on enrichment analysis of combined up/down genes from each method

Rank
TF Library Method
NR0B1 ChEA 2022 cd 263.0
logfc 433.0
limma 483.0
pydeseq2 607.0
ttest 675.0
NR1I2 ChEA 2022 pydeseq2 30.0
limma 75.0
ttest 112.0
logfc 231.0
cd 465.0
NR3C1 ChEA 2022 limma 44.2
pydeseq2 51.2
logfc 107.2
ttest 212.4
cd 218.0

Random results¶

As a sanity check, we can bootstrap random results and compare against the results for each of the methods, as even in the case of raw data we would expect to see some prioritization of dexamethasone target genes if the data is correct.

We can achieve this by randomly sampling 250 and 500 genes, respectively, from all genes in the dataset multiple times, and then performing the same process of ranking the enrichment analysis results for the random gene sets.

# bootstrap random results
random_arr_chea2022 = []
for i in range(10):
    rand_100 = sample(expr_df.index.tolist(), 100)
    rand_200 = sample(expr_df.index.tolist(), 200)

    random_arr_chea2022.append(enrich(rand_100, chea2022, 'random:100'))   
    random_arr_chea2022.append(enrich(rand_200, chea2022, 'random:200'))   

rand_chea2022_df = pd.concat(random_arr_chea2022, axis=0)
rand_chea2022_df['Library'] = 'ChEA 2022'
rand_chea2022_df['TF'] = rand_chea2022_df['Term'].apply(lambda x: x.split(' ')[0].upper())

rand_df = rand_chea2022_df
rand_df = rand_df[rand_df['TF'].isin(['NR3C1', 'NR0B1', 'NR1I2'])]
rand_df['Method'] = 'random'

Boxplots¶

Using boxplots, we can visualize the average rankings of the dexamethasone target terms that are displayed in the tables above. This provides a clearer sense of which methods appear to be performing better (smaller numerical rank) across all the terms.

method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'pydeseq2': '#DC267F',
    'logfc': '#FE6100',
    'ttest': '#FFB000',
    'ranksum': '#B3B3B3',
    'paired-ttest': '#009E73'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf].set_index(['Method'])
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Method']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df.loc[gs]['Rank'].tolist(),
                name=gs, 
                marker_color=method_color_map[gs.split(':')[0]]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")
method_color_map = {
    'up': 'blue',
    'down': 'crimson',
    'combined': 'purple'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf].set_index(['Direction'])
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Direction']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df.loc[gs]['Rank'].tolist(),
                name=gs, 
                marker_color=method_color_map[gs]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")
method_color_map = {
    'up': 'blue',
    'down': 'crimson',
    'combined': 'purple'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf]
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Gene_Set'] == gs]['Rank'],
                name=gs.split(':ChEA')[0], 
                marker_color=method_color_map[gs.split(':')[1]]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf].set_index(['Gene_Set'])
    for gs in sub_rand_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_rand_df.loc[gs]['Rank'],
                name=gs,
                marker_color='black'
            )
        )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Method:Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")
method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'pydeseq2': '#DC267F',
    'logfc': '#FE6100',
    'ttest': '#FFB000',
    'ranksum': '#B3B3B3',
    'paired-ttest': '#009E73'
}
for tf in dex_chea2022_df['TF'].unique():
    sub_df = dex_chea2022_df[dex_chea2022_df['TF']==tf]
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Gene_Set'] == gs]['Rank'],
                name=gs.split(':ChEA')[0], 
                marker_color=method_color_map[gs.split(':')[0]]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf].set_index(['Gene_Set'])
    for gs in sub_rand_df.groupby(['Gene_Set']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_rand_df.loc[gs]['Rank'],
                name=gs,
                marker_color='black'
            )
        )

    fig1.update_layout(
        width=800,
        title_text=f"{tf} Term Rankings from ChEA 2022 for Dex RNA-seq Gene Sets",
        xaxis={
            'title': {'text': 'Method:Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")

Bridge plots¶

The below plots provide the Brownian bridge plots comparing the ranked genes from each method signature with the genes in the target TF gene sets.

target_genesets = []
target_genesets.append({t:chea2022[t] for t in dex_chea2022_df['Term'].unique() if ('MLEC' not in t and 'Wistar' not in t)})
def singleBridgePlot(gsname, signame, target_gs):
    geneset = target_gs[gsname]
    if signame == 'cd' or signame == 'logfc':
        sorted_genes = signatures[signame].sort_values(by='Significance', ascending=False)
    else: 
        sorted_genes = signatures[signame].sort_values(by=['Significance'], ascending=True)
    select = pd.Series(
        [x.upper() in geneset for x in sorted_genes.index.tolist()]
    )
    x, y = bridge_plot(select)
    x = x/len(x)
    return x,y

def randomBridgePlot(gsname, target_gs): 
    all_x = []
    all_y = []

    for _ in range(10):
        rand_sig = sample(
            signatures['logfc'].index.tolist(), 
            signatures['logfc'].shape[0]
        )
        select = pd.Series([
            x.upper() in target_gs[gsname] for x in rand_sig
        ])
        x, y = bridge_plot(select)
        x = x/len(x)
        all_x.append(x)
        all_y.append(y)

    return all_x, all_y
def build_tf_res(target_gs):
    tf_res = {tf: {'random': {'x': [], 'y': []}} for tf in full_df['TF'].unique()}
    for gs in target_gs.keys():
        tf = gs.split(' ')[0].split('_')[0]
        rand_x, rand_y = randomBridgePlot(gs, target_gs)
        tf_res[tf]['random']['x'] += rand_x
        tf_res[tf]['random']['y'] += rand_y
        for sig in signatures.keys():
            temp_x, temp_y = singleBridgePlot(gs, sig, target_gs)
            if sig in tf_res[tf].keys():
                tf_res[tf][sig]['x'].append(temp_x)
                tf_res[tf][sig]['y'].append(temp_y)
            else: 
                tf_res[tf][sig] = {'x': [temp_x], 'y': [temp_y]}
    return tf_res
significance_tf_results = [] 
for i in range(len(target_genesets)): 
    significance_tf_results.append(build_tf_res(target_genesets[i]))
libraries = ['ChEA 2022']
# with opposite direction queries
matplotlib.rcParams.update({'font.size': 12})
for tf in full_df['TF'].unique():
    fig = plt.figure()
    tf_x = {method: [] for method in significance_tf_results[0][tf].keys()}
    tf_y = {method: [] for method in significance_tf_results[0][tf].keys()}
    for i in range(len(target_genesets)):
      for sig in significance_tf_results[i][tf].keys(): 
        tf_x[sig].append(np.mean(significance_tf_results[i][tf][sig]['x'], axis=0))
        tf_y[sig].append(np.mean(significance_tf_results[i][tf][sig]['y'], axis=0))
    for m in tf_x.keys():
      if m == 'random': 
          plt.plot(np.mean(tf_x[m], axis=0), np.mean(tf_y[m], axis=0), label=m, color='gray')
      elif m == 'cd' or m == 'logfc': 
          plt.plot(np.mean(tf_x[m], axis=0), np.mean(tf_y[m], axis=0), label=m + ' (abs)')
      else:
          label = m.replace('limma', 'limma-voom')
          plt.plot(np.mean(tf_x[m], axis=0), np.mean(tf_y[m], axis=0), label=label)

    plt.axhline(y=0, color='black', linestyle='dashed')
    plt.legend(bbox_to_anchor=(1.25, 1))
    plt.title(f"{tf} Target Gene Rankings")
    plt.xlabel('Rank')
    plt.ylabel('D(r) - r')
    plt.show()
    
    if tf == 'NR3C1':
        fig.savefig('../../publicationfigs/Figure2E.png', format='png', dpi=300, bbox_inches='tight')
# with manual libraries
matplotlib.rcParams.update({'font.size': 12})
for tf in full_df['TF'].unique():
    fig = plt.figure()
    tf_x = {method: [] for method in significance_tf_results[0][tf].keys()}
    tf_y = {method: [] for method in significance_tf_results[0][tf].keys()}
    for i in range(len(libraries)):
      for sig in significance_tf_results[i][tf].keys(): 
        tf_x[sig].append(np.mean(significance_tf_results[i][tf][sig]['x'], axis=0))
        tf_y[sig].append(np.mean(significance_tf_results[i][tf][sig]['y'], axis=0))
    for m in tf_x.keys():
      try:
        leading_x = [arr[:ceil(0.01*len(arr))] for arr in tf_x[m]]
        leading_y = [arr[:ceil(0.01*len(arr))] for arr in tf_y[m]]
      except:
        leading_x = tf_x[m][0][:ceil(0.01*len(tf_x[m][0]))]
        leading_y = tf_y[m][0][:ceil(0.01*len(tf_y[m][0]))]
      if m == 'random': 
        plt.plot(np.mean(leading_x, axis=0), np.mean(leading_y, axis=0), label=m, color='gray')
      elif m == 'cd' or m == 'logfc': 
          plt.plot(np.mean(leading_x, axis=0), np.mean(leading_y, axis=0), label=m + ' (abs)')
      else:
        label = m.replace('limma', 'limma-voom')
        plt.plot(np.mean(leading_x, axis=0), np.mean(leading_y, axis=0), label=label)

    plt.axhline(y=0, color='black', linestyle='dashed')
    plt.legend(bbox_to_anchor=(1, 1))
    plt.title(f"{tf} Target Gene Rankings Leading Edge")
    plt.xlabel('Rank')
    plt.ylabel('D(r) - r')
    plt.show()

    if tf == 'NR3C1': 
      fig.savefig('../../publicationfigs/Figure2F.png', format='png', dpi=300, bbox_inches='tight')

Signature similarity¶

rank_comparison = pd.concat([
  signatures['cd']['CD-coefficient'].rename('CD').rank(),
  signatures['limma']['t'].rename('Limma').rank(),
  signatures['pydeseq2']['stat'].rename('DESeq2').rank(),
  signatures['logfc']['logFC'].apply(abs).rename('FC').rank(),
  signatures['ttest']['Statistic'].rename('Ttest').rank()
], axis=1)
expr_comparison = pd.concat([
  signatures['cd']['CD-coefficient'].rename('CD'),
  signatures['limma']['t'].rename('Limma'),
  signatures['pydeseq2']['stat'].rename('DESeq2'),
  signatures['logfc']['logFC'].apply(abs).rename('FC'),
  signatures['ttest']['Statistic'].rename('Ttest')
], axis=1)
sns.set(font_scale=1.4)

Correlation of signature expression values¶

expr_correlations = 1-pairwise_distances(
  expr_comparison.T, metric='cosine'
)
np.fill_diagonal(expr_correlations, 1)

hm, hm_ax = plt.subplots(figsize=(10,8))  
sns.heatmap(pd.DataFrame(
  expr_correlations, index=expr_comparison.columns, columns=expr_comparison.columns
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()

Correlation of signature gene ranks¶

rank_correlations = 1-pairwise_distances(
  rank_comparison.T, metric='cosine'
)
np.fill_diagonal(rank_correlations, 1)

hm, hm_ax = plt.subplots(figsize=(10,8))  
sns.heatmap(pd.DataFrame(
  rank_correlations, index=rank_comparison.columns, columns=rank_comparison.columns
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()
View source code
Submit an issue