logo Dexamethasone Benchmark Resource
  • About
  • Code
  • Data
  • Figures
  • Stats

Benchmarking the Recovery of Known Targets from L1000 CRISPR KO Data: Batch Version¶

KO Gene: NR0B1

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from IPython.display import display, Markdown
import json
import requests
import time
from random import sample
from math import log2
from maayanlab_bioinformatics.dge import characteristic_direction, limma_voom
from maayanlab_bioinformatics.plotting import bridge_plot
import matplotlib.pyplot as plt
from os.path import exists
import warnings
warnings.filterwarnings('ignore')

Load in Data¶

# Set KO gene
ko_gene = 'NR0B1'

# Set working directory
l1000_data_dir = '../L1000_data'

# Set parameters 
low_expression_threshold = 0.3
try: 
    expr_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv", sep='\t', index_col=0)
except: 
    l1000_data_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_data.tsv", sep='\t')

    l1000_data_list = []
    l1000_meta_list = []
    for row in l1000_data_df.itertuples(): 
        try:
            temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
        except:
            print(f"Unable to access data from row {row.Index} at {row.persistent_id}")
            continue
        for col in temp_df.columns:
            l1000_meta_list.append([col] + l1000_data_df.loc[row.Index].tolist())
        l1000_data_list.append(temp_df)
    expr_df = pd.concat(l1000_data_list, axis=1)
expr_df.head()
XPR032_A375.311_96H_X1_B38:N14 XPR032_A375.311_96H_X2_B38:N14 XPR032_A375.311_96H_X3_B38:N14 XPR032_A375.311_96H_X1_B38:P20 XPR032_A375.311_96H_X3_B38:P20 XPR032_A549.311_96H_X1_B38:N14 XPR032_A549.311_96H_X2_B38:N14 XPR032_A549.311_96H_X3_B38:N14 XPR032_A549.311_96H_X1_B38:P20 XPR032_A549.311_96H_X2_B38:P20 ... XPR032_U251MG.311_96H_X2_B38:N14 XPR032_U251MG.311_96H_X3_B38:N14 XPR032_U251MG.311_96H_X1_B38:P20 XPR032_U251MG.311_96H_X2_B38:P20 XPR032_U251MG.311_96H_X3_B38:P20 XPR032_YAPC.311_96H_X4.L2_B41:N14 XPR032_YAPC.311_96H_X5.L2_B41:N14 XPR032_YAPC.311_96H_X6.L2_B41:N14 XPR032_YAPC.311_96H_X4.L2_B41:P20 XPR032_YAPC.311_96H_X6.L2_B41:P20
symbol
DDR1 5.255000 5.089600 5.427650 5.166525 5.239900 5.506400 5.4724 5.4069 5.7438 5.462400 ... 5.63815 5.825700 6.23110 5.26800 5.916700 6.679475 6.938175 5.278950 7.19605 7.62890
PAX8 4.913075 5.132925 5.074600 4.669800 5.265650 4.016100 4.6268 4.2078 4.2377 4.310450 ... 4.42210 4.625200 4.62415 4.53150 5.175500 4.408500 4.002800 4.260275 4.15570 3.35210
GUCA1A 5.193900 5.247600 4.621200 5.694850 5.050300 4.331350 4.3682 4.2829 4.3346 4.207350 ... 4.98960 4.619000 4.86720 4.67185 4.904350 5.476200 5.236925 5.535125 5.46450 5.54065
EPHB3 5.674850 6.362250 6.259725 5.560650 6.506450 6.481000 6.4755 6.9908 6.4902 6.621600 ... 7.15630 7.442451 7.25270 6.36430 7.246000 7.240800 7.576700 7.964250 7.88650 7.55025
ESRRA 7.840600 7.572600 7.368150 7.538750 7.622751 7.349851 7.5800 7.1017 7.1989 7.012701 ... 6.22650 6.905600 6.75130 6.59240 7.017099 8.261250 8.548100 7.938550 8.24955 8.39360

5 rows × 52 columns

if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv"): 
    expr_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv' already exists!")
File at '../L1000_data/NR0B1_L1000_CRISPRKO_fulldata.tsv' already exists!
try: 
    meta_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv", sep='\t', index_col=0)
except:
    meta_df = pd.DataFrame(l1000_meta_list, columns=['id'] + l1000_data_df.columns.tolist()).set_index('id')
if 'batch' not in meta_df.columns:
    meta_df['batch'] = meta_df.index.map(lambda x: '_'.join(x.split('_')[:3]))
meta_df.head()
tissue disease cell_line pert_name pert_time pert_type data_level creation_time persistent_id pert_dose batch
id
XPR032_A375.311_96H_X1_B38:N14 skin of body melanoma A375.311 NR0B1 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR032_A375.311_96H
XPR032_A375.311_96H_X2_B38:N14 skin of body melanoma A375.311 NR0B1 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR032_A375.311_96H
XPR032_A375.311_96H_X3_B38:N14 skin of body melanoma A375.311 NR0B1 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR032_A375.311_96H
XPR032_A375.311_96H_X1_B38:P20 skin of body melanoma A375.311 NR0B1 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR032_A375.311_96H
XPR032_A375.311_96H_X3_B38:P20 skin of body melanoma A375.311 NR0B1 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR032_A375.311_96H
if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv"): 
    meta_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv' already exists!")
File at '../L1000_data/NR0B1_L1000_CRISPRKO_metadata.tsv' already exists!
batches = meta_df['batch'].unique()
len(batches)
9

Load in Control Data¶

ctrl_data_df = pd.read_csv(f"{l1000_data_dir}/L1000_Controls.tsv", sep='\t')
ctrl_data_df = ctrl_data_df[ctrl_data_df['batch'].isin(batches)]
set(batches).difference(ctrl_data_df['batch'])
set()
ctrl_data_df.head()
local_id persistent_id batch
10777 L1000_LINCS_DCIC_2021_XPR032_A375.311_96H_D10_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR032_A375.311_96H
10778 L1000_LINCS_DCIC_2021_XPR032_A375.311_96H_E14_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR032_A375.311_96H
10779 L1000_LINCS_DCIC_2021_XPR032_A375.311_96H_F21_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR032_A375.311_96H
10780 L1000_LINCS_DCIC_2021_XPR032_A375.311_96H_G09_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR032_A375.311_96H
10781 L1000_LINCS_DCIC_2021_XPR032_A375.311_96H_H16_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR032_A375.311_96H

The following step extracts all control profiles from the same batch as the profiles of interest, and make take up to a few minutes to complete.

try: 
    ctrl_expr_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv", sep='\t', index_col=0)
    ctrl_meta_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv", sep='\t', index_col=0)
except:
    ctrl_data_list = []
    ctrl_meta_list = []
    for row in ctrl_data_df.itertuples():
        try: 
            temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
        except:
            print(f"Unable to access data from row {row.Index} at {row.persistent_id}")
            continue
        for col in temp_df.columns: 
            ctrl_meta_list.append([col, row.batch])
        ctrl_data_list.append(temp_df)

    ctrl_expr_df = pd.concat(ctrl_data_list, axis=1)
    ctrl_meta_df = pd.DataFrame(ctrl_meta_list, columns=['id', 'batch']).set_index('id')
ctrl_expr_df.head()
XPR032_A375.311_96H_X1_B38:D10 XPR032_A375.311_96H_X2_B38:D10 XPR032_A375.311_96H_X3_B38:D10 XPR032_A375.311_96H_X1_B38:E14 XPR032_A375.311_96H_X2_B38:E14 XPR032_A375.311_96H_X3_B38:E14 XPR032_A375.311_96H_X1_B38:F21 XPR032_A375.311_96H_X2_B38:F21 XPR032_A375.311_96H_X3_B38:F21 XPR032_A375.311_96H_X1_B38:G09 ... XPR032_YAPC.311_96H_X6.L2_B41:I09 XPR032_YAPC.311_96H_X4.L2_B41:K10 XPR032_YAPC.311_96H_X5.L2_B41:K10 XPR032_YAPC.311_96H_X6.L2_B41:K10 XPR032_YAPC.311_96H_X4.L2_B41:K20 XPR032_YAPC.311_96H_X5.L2_B41:K20 XPR032_YAPC.311_96H_X6.L2_B41:K20 XPR032_YAPC.311_96H_X4.L2_B41:P03 XPR032_YAPC.311_96H_X5.L2_B41:P03 XPR032_YAPC.311_96H_X6.L2_B41:P03
symbol
NAT2 8.064151 9.469100 7.601850 8.033700 9.220400 8.24655 8.894925 8.72050 9.17865 8.68495 ... 11.370251 10.716150 10.178425 11.328951 9.41990 11.02845 12.07100 11.243151 9.276251 9.758101
ADA 6.507850 6.016400 7.335375 6.141600 5.903550 6.85880 6.393500 6.74075 6.35025 5.83335 ... 4.890375 5.205100 3.744300 5.539550 5.21020 4.89420 4.83905 4.656200 4.270800 5.249350
CDH2 4.857850 4.623050 5.011100 4.752725 4.761450 4.68530 5.200500 4.61915 4.61665 4.86035 ... 4.382100 4.921475 4.550850 4.611150 4.30870 4.04470 4.89275 5.437850 4.703550 5.628650
AKT3 4.182400 1.805000 4.125150 3.381000 3.420925 4.00585 4.777750 4.22330 6.42225 2.92995 ... 0.000000 1.254300 0.000000 0.000000 0.00000 0.00000 0.00000 0.000000 0.000000 0.000000
MED6 5.369700 6.277175 7.007350 6.533900 6.349325 6.65425 5.729750 6.16220 6.22880 6.02475 ... 6.200250 6.642900 5.949200 7.149099 6.65355 6.47650 6.49045 7.638025 6.261400 7.256250

5 rows × 238 columns

ctrl_meta_df.head()
batch
id
XPR032_A375.311_96H_X1_B38:D10 XPR032_A375.311_96H
XPR032_A375.311_96H_X2_B38:D10 XPR032_A375.311_96H
XPR032_A375.311_96H_X3_B38:D10 XPR032_A375.311_96H
XPR032_A375.311_96H_X1_B38:E14 XPR032_A375.311_96H
XPR032_A375.311_96H_X2_B38:E14 XPR032_A375.311_96H
if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv"): 
    ctrl_expr_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv' already exists!")
File at '../L1000_data/NR0B1_L1000_Controls_fulldata.tsv' already exists!
if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv"): 
    ctrl_meta_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv' already exists!")
File at '../L1000_data/NR0B1_L1000_Controls_metadata.tsv' already exists!

Normalize Data¶

Combine data and remove duplicate genes¶

combined_df = pd.concat([
    expr_df.groupby(expr_df.index).mean(), 
    ctrl_expr_df.groupby(ctrl_expr_df.index).mean()
], axis=1)
combined_df.head()
XPR032_A375.311_96H_X1_B38:N14 XPR032_A375.311_96H_X2_B38:N14 XPR032_A375.311_96H_X3_B38:N14 XPR032_A375.311_96H_X1_B38:P20 XPR032_A375.311_96H_X3_B38:P20 XPR032_A549.311_96H_X1_B38:N14 XPR032_A549.311_96H_X2_B38:N14 XPR032_A549.311_96H_X3_B38:N14 XPR032_A549.311_96H_X1_B38:P20 XPR032_A549.311_96H_X2_B38:P20 ... XPR032_YAPC.311_96H_X6.L2_B41:I09 XPR032_YAPC.311_96H_X4.L2_B41:K10 XPR032_YAPC.311_96H_X5.L2_B41:K10 XPR032_YAPC.311_96H_X6.L2_B41:K10 XPR032_YAPC.311_96H_X4.L2_B41:K20 XPR032_YAPC.311_96H_X5.L2_B41:K20 XPR032_YAPC.311_96H_X6.L2_B41:K20 XPR032_YAPC.311_96H_X4.L2_B41:P03 XPR032_YAPC.311_96H_X5.L2_B41:P03 XPR032_YAPC.311_96H_X6.L2_B41:P03
symbol
A1CF 5.042300 4.88865 4.503100 5.114400 4.19680 5.7100 5.7842 6.03365 6.00650 5.556000 ... 11.807100 11.71735 12.049049 11.65135 12.454750 11.67805 11.785749 11.98595 12.474900 12.255699
A2M 8.136351 9.75640 7.550050 9.257950 8.29125 8.0900 8.1192 7.12570 6.50205 6.652599 ... 7.871700 7.97915 7.558400 7.86500 7.667699 7.43105 8.419125 8.57765 7.610650 6.937225
A4GALT 5.682650 5.72150 5.574750 5.623775 5.77445 5.9049 5.6885 5.44520 5.58240 5.806400 ... 5.152150 5.92100 5.048950 6.11105 5.527200 5.35310 5.177000 5.31725 5.868650 5.505100
A4GNT 5.345350 5.15115 5.101900 5.487600 4.99815 4.7636 4.7672 4.78310 4.81630 4.724250 ... 10.035549 11.00025 10.085800 10.36765 10.302500 9.89095 9.880751 10.71460 10.327775 10.306700
AAAS 7.843300 8.01230 7.421226 7.221250 7.34165 7.4766 7.2301 7.06610 7.10430 7.214500 ... 6.554150 6.45785 6.389650 6.15305 6.222600 6.66180 6.661350 6.67710 6.641350 6.521500

5 rows × 290 columns

datasets = {
    'metadata': meta_df, 
    'ctrl_metadata': ctrl_meta_df,
    'rawdata': combined_df
}

Compute Signatures: Batch Perturbations vs. Batch Controls¶

batch_profiles = {x: {'perts': [], 'ctrls': []} for x in batches}
for b in batches: 
    batch_profiles[b]['perts'] = meta_df[meta_df['batch'] == b].index.tolist()
    batch_profiles[b]['ctrls'] = ctrl_meta_df[ctrl_meta_df['batch'] == b].index.tolist()
batch_signatures = {'cd': {}, 'limma': {}, 'fc': {}}

Characteristic Direction¶

# Function for computing signatures with characteristic direction
def cd_signature(ctrl_ids, case_ids, dataset, normalization):
  
    signature = characteristic_direction(
        dataset[normalization].loc[:, ctrl_ids], 
        dataset[normalization].loc[:, case_ids], 
        calculate_sig=True
    )
    signature = signature.sort_values("CD-coefficient", ascending=False)
    
    return signature

Note: the following step may take a few minutes to run.

for b in batches: 
    batch_signatures['cd'][b] = cd_signature(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        datasets, 
        'rawdata'
    )

Limma¶

# Function for computing signatures
def limma(ctrl_ids, case_ids, dataset, normalization):
    
    signature = limma_voom.limma_voom_differential_expression(
        dataset[normalization].loc[:, ctrl_ids],
        dataset[normalization].loc[:, case_ids],
        filter_genes=False
    )
    signature = signature.sort_values("t", ascending=False)

    return signature

Note: the following step may take a few minutes to run.

for b in batches: 
    batch_signatures['limma'][b] = limma(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        datasets, 
        'rawdata'
    )
R[write to console]: Loading required package: R.oo

R[write to console]: Loading required package: R.methodsS3

R[write to console]: Loading required package: R.methodsS3

R[write to console]: R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.

R[write to console]: R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.

R[write to console]: 
Attaching package: ‘R.oo’


R[write to console]: The following object is masked from ‘package:R.methodsS3’:

    throw


R[write to console]: The following objects are masked from ‘package:methods’:

    getClasses, getMethods


R[write to console]: The following objects are masked from ‘package:base’:

    attach, detach, load, save


R[write to console]: R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.

R[write to console]: 
Attaching package: ‘R.utils’


R[write to console]: The following object is masked from ‘package:utils’:

    timestamp


R[write to console]: The following objects are masked from ‘package:base’:

    cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
    warnings


R[write to console]: 
Attaching package: ‘RCurl’


R[write to console]: The following object is masked from ‘package:R.utils’:

    reset


R[write to console]: The following object is masked from ‘package:R.oo’:

    clone


R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


R[write to console]: 
Attaching package: ‘S4Vectors’


R[write to console]: The following objects are masked from ‘package:base’:

    I, expand.grid, unname


R[write to console]: Loading required package: IRanges

R[write to console]: 
Attaching package: ‘IRanges’


R[write to console]: The following object is masked from ‘package:R.oo’:

    trim


R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: GenomeInfoDb

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: MatrixGenerics

R[write to console]: Loading required package: matrixStats

R[write to console]: 
Attaching package: ‘MatrixGenerics’


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars


R[write to console]: Loading required package: Biobase

R[write to console]: Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


R[write to console]: 
Attaching package: ‘Biobase’


R[write to console]: The following object is masked from ‘package:MatrixGenerics’:

    rowMedians


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians


R[write to console]: 
Attaching package: ‘limma’


R[write to console]: The following object is masked from ‘package:DESeq2’:

    plotMA


R[write to console]: The following object is masked from ‘package:BiocGenerics’:

    plotMA


Fold Change¶

# Function for computing signatures with fold change
def logFC(ctrl_ids, case_ids, dataset, normalization):

    case_mean = dataset[normalization].loc[:, case_ids].mean(axis=1)
    ctrl_mean = dataset[normalization].loc[:, ctrl_ids].mean(axis=1)
    
    signature = case_mean / ctrl_mean

    signature = signature.apply(lambda x: log2(x+1)).rename('logFC')
    
    return pd.DataFrame(signature.sort_values(ascending=False))
for b in batches: 
    batch_signatures['fc'][b] = logFC(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        datasets,
        'rawdata'
    )

All signatures¶

batch_signatures['cd_all']= pd.concat([
    df[['CD-coefficient']].rename(columns={'CD-coefficient': b}) for (b, df) in batch_signatures['cd'].items()
], axis=1).sort_index()
batch_signatures['limma_all'] = pd.concat([
    df[['t']].rename(columns={'t': b}) for (b, df) in batch_signatures['limma'].items()
], axis=1).sort_index()
batch_signatures['fc_all'] = pd.concat([
    df.rename(b) for b, df in batch_signatures['fc'].items()
], axis=1).sort_index()

for k in batch_signatures.keys(): 
    if k.endswith('_all'): 
        method = k.split('_')[0].upper()
        display(Markdown(f"All {method} batch signatures"))
        display(batch_signatures[k])

All CD batch signatures

XPR032_AGS.311_96H XPR032_A549.311_96H XPR032_BICR6.311_96H XPR032_ES2.311_96H XPR032_PC3.311B_96H XPR032_U251MG.311_96H XPR032_A375.311_96H XPR032_HT29.311_96H XPR032_YAPC.311_96H
symbol
A1CF -0.017783 -0.016047 -0.020645 -0.022145 -0.020482 -0.019187 -0.020061 -0.020211 -0.017267
A2M 0.005073 0.004325 -0.002482 0.001378 0.010943 0.009085 0.004170 0.003904 0.008592
A4GALT 0.001997 -0.001746 0.002162 -0.001375 0.000410 -0.000217 0.000460 0.000720 0.001741
A4GNT -0.017431 -0.014972 -0.015936 -0.018799 -0.017925 -0.019558 -0.016629 -0.018877 -0.013562
AAAS 0.002736 0.003185 0.001409 0.004325 0.001021 0.003434 0.002492 0.002569 0.003237
... ... ... ... ... ... ... ... ... ...
ZXDB -0.000282 -0.000234 0.000028 -0.002297 0.001384 -0.000324 -0.000004 -0.000076 -0.001289
ZXDC -0.006142 -0.010458 -0.003045 -0.003486 -0.004996 -0.008848 -0.000090 0.002266 -0.002561
ZYX 0.012263 0.009031 0.010796 0.016862 0.014329 0.013464 0.014974 0.007596 0.009235
ZZEF1 -0.006011 -0.013921 -0.008275 -0.012102 -0.005447 -0.010448 -0.010300 -0.009300 -0.013139
ZZZ3 0.004252 0.006931 0.005963 0.010460 0.009018 0.005085 0.007604 0.006008 0.003209

12327 rows × 9 columns

All LIMMA batch signatures

XPR032_AGS.311_96H XPR032_A549.311_96H XPR032_BICR6.311_96H XPR032_ES2.311_96H XPR032_PC3.311B_96H XPR032_U251MG.311_96H XPR032_A375.311_96H XPR032_HT29.311_96H XPR032_YAPC.311_96H
A1CF -31.401609 -50.202253 -92.215935 -57.170962 -40.826071 -40.648398 -43.490817 -52.801217 -32.479343
A2M 4.969393 4.644173 -7.136606 0.107135 10.578271 11.012444 5.799222 4.674300 7.700708
A4GALT 2.188914 -5.635605 10.049661 -2.815912 2.751041 0.346938 -1.500088 -0.499869 4.627847
A4GNT -52.692189 -41.352716 -79.309756 -68.903881 -53.336398 -57.597987 -43.542568 -60.534034 -30.738910
AAAS 12.600847 14.252838 7.151148 11.757109 1.258617 15.252655 10.579879 7.990256 9.632633
... ... ... ... ... ... ... ... ... ...
ZXDB -0.078946 -0.587111 3.251433 -2.533632 0.531741 -1.372360 3.515837 2.519761 -2.505127
ZXDC -6.219017 -16.505548 -9.643654 -5.520325 -8.714572 -23.390602 -2.862318 1.014756 -6.854683
ZYX 22.489053 27.402711 35.119063 21.876607 20.948085 26.892091 22.315721 13.386875 10.346862
ZZEF1 -10.595280 -30.928752 -17.735101 -19.004267 -11.055636 -15.283503 -17.462598 -22.132080 -12.173054
ZZZ3 8.319129 12.323906 19.538155 31.135778 12.643030 12.765400 11.261598 13.941717 5.741134

12327 rows × 9 columns

All FC batch signatures

XPR032_AGS.311_96H XPR032_A549.311_96H XPR032_BICR6.311_96H XPR032_ES2.311_96H XPR032_PC3.311B_96H XPR032_U251MG.311_96H XPR032_A375.311_96H XPR032_HT29.311_96H XPR032_YAPC.311_96H
symbol
A1CF -1.246737 -1.004192 -1.690429 -1.665790 -1.630342 -1.179333 -1.321012 -1.223911 -1.056337
A2M 0.278585 0.205393 -0.272091 0.000797 0.646290 0.502073 0.319819 0.240769 0.393296
A4GALT 0.068180 -0.146484 0.248284 -0.157615 0.139366 0.004167 -0.065231 -0.021841 0.184802
A4GNT -1.126377 -1.144015 -1.158369 -1.293424 -1.172730 -1.301800 -1.089542 -1.185840 -0.852393
AAAS 0.240515 0.271056 0.096593 0.260816 0.036636 0.311805 0.278677 0.179783 0.199099
... ... ... ... ... ... ... ... ... ...
ZXDB -0.003877 -0.017850 0.057785 -0.095970 0.019653 -0.036673 0.108741 0.053650 -0.078633
ZXDC -0.307716 -0.705302 -0.272967 -0.435002 -0.509541 -0.574439 -0.265492 0.043187 -0.251703
ZYX 0.833233 0.651813 0.741641 1.234107 1.058885 0.964060 1.060643 0.631116 0.554679
ZZEF1 -0.488678 -0.787260 -0.613892 -0.653600 -0.583108 -0.635629 -0.679085 -0.670204 -0.674863
ZZZ3 0.218672 0.434406 0.416559 0.806175 0.467929 0.357918 0.403518 0.355551 0.165524

12327 rows × 9 columns

Enrichment Analysis Rankings¶

# Function to get Enrichr Results
def Enrichr_API(enrichr_gene_list, library_name, signame, method, direction):

    all_ranks = []
    all_terms = []
    all_pvalues =[] 
    all_adjusted_pvalues = []

    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
    genes_str = '\n'.join(enrichr_gene_list)
    description = f'Dex benchmark {signame} {method} {direction} genes'
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }

    response = requests.post(ENRICHR_URL, files=payload)
    if not response.ok:
        raise Exception('Error uploading gene list')

    list_data = json.loads(response.text)
    time.sleep(0.5)
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
    response = requests.get(
        ENRICHR_URL + f"?userListId={list_data['userListId']}&backgroundType={library_name}"
    )
    if not response.ok:
        raise Exception('Error fetching enrichment results')

    enrich_data = json.loads(response.text)
    results_df  = pd.DataFrame(enrich_data[library_name])
    all_ranks.append(list(results_df[0]))
    all_terms.append(list(results_df[1]))
    all_pvalues.append(list(results_df[2]))
    all_adjusted_pvalues.append(list(results_df[6]))

    res_dict = {
        'ranks': all_ranks, 
        'terms': all_terms, 
        'pvals': all_pvalues,
        'adjpvals': all_adjusted_pvalues, 
        'shortid': str(list_data['shortId'])
    }

    return res_dict
# Get gene lists to put into Enrichr
gene_lists = {}
for m in batch_signatures.keys():
    if 'all' in m:
        mname = m.split('_')[0]
        gene_lists[mname] = {'up': {}, 'down': {}, 'combined': {}}
        for sig in batch_signatures[m]: 
            gene_lists[mname]['up'][sig] = batch_signatures[m][sig].sort_values(ascending=False).index.tolist()[:250]
            gene_lists[mname]['down'][sig] = batch_signatures[m][sig].sort_values(ascending=True).index.tolist()[:250]
            gene_lists[mname]['combined'][sig] = gene_lists[mname]['up'][sig] + gene_lists[mname]['down'][sig]
# Get results
results = {}
for m in gene_lists.keys():
    results[m] = {'up': {}, 'down': {}, 'combined': {}}
    for d in gene_lists[m].keys():
        for sig in gene_lists[m][d].keys():
            results[m][d][sig]= Enrichr_API(
                gene_lists[m][d][sig], 'ChEA_2022', sig, m, d
            )
# Extract dexamethasone target rankings
# Initialize lists for storing target information
ranks = []
display(Markdown(f"## {ko_gene} Term Rankings"))

NR0B1 Term Rankings¶

for m in results.keys(): 
    for d in results[m].keys():
        for sig in results[m][d].keys():
            for i in range(len(results[m][d][sig]['ranks'][0])): 
                if ko_gene in results[m][d][sig]['terms'][0][i]: 
                    ranks.append([
                        f"{sig}:{m}:{d}",
                        results[m][d][sig]['terms'][0][i], 
                        results[m][d][sig]['ranks'][0][i],
                        results[m][d][sig]['pvals'][0][i]
                    ])
full_df = pd.DataFrame(ranks, columns=['Gene_Set', 'Term', 'Rank', 'p-value'])
full_df['Method'] = full_df['Gene_Set'].apply(lambda x: x.split(':')[1])
full_df['Direction'] = full_df['Gene_Set'].apply(lambda x : x.split(':')[2])
full_df['Method_Direction'] = full_df.apply(lambda row: row['Method'] + ':' + row['Direction'], axis=1)
up_df = full_df[full_df['Direction']=='up']
down_df = full_df[full_df['Direction']=='down']
combined_df = full_df[full_df['Direction']=='combined']
display(Markdown(f"Mean rank of {ko_gene} terms from ChEA 2022 for up genes from each method."))
display(up_df.groupby(['Method']).mean(numeric_only=True).sort_values(['Rank', 'Method']).drop(['p-value'], axis=1))
display(Markdown(f"Mean rank of {ko_gene} terms from ChEA 2022 for down genes from each method."))
display(down_df.groupby(['Method']).mean(numeric_only=True).sort_values(['Rank', 'Method']).drop(['p-value'], axis=1))
display(Markdown(f"Mean rank of {ko_gene} terms from ChEA 2022 for combined up and down genes from each method."))
display(combined_df.groupby(['Method']).mean(numeric_only=True).sort_values(['Rank', 'Method']).drop(['p-value'], axis=1))

Mean rank of NR0B1 terms from ChEA 2022 for up genes from each method.

Rank
Method
cd 159.888889
fc 233.555556
limma 330.000000

Mean rank of NR0B1 terms from ChEA 2022 for down genes from each method.

Rank
Method
cd 562.333333
limma 562.666667
fc 595.111111

Mean rank of NR0B1 terms from ChEA 2022 for combined up and down genes from each method.

Rank
Method
cd 276.222222
fc 475.666667
limma 517.666667
color_dict = {
    'up': 'crimson',
    'down': '#636EFA', 
    'combined': 'gray'
}

fig1 = go.Figure()
for gs in full_df.groupby('Method_Direction').mean().sort_values('Rank').index:
    fig1.add_trace(
        go.Box(
            y=full_df[full_df['Method_Direction']==gs]['Rank'].tolist(),
            name=gs.split(':')[0].upper() + ':' + gs.split(':')[1],
            marker_color=color_dict[gs.split(':')[1]]
        )
    )
fig1.update_layout(
    title_text=f"{ko_gene} Term Rankings for L1000 Gene Sets by Method and Direction",
    xaxis={
        'title': {'text': 'Method:Direction'}, 
    },
    yaxis={
        'title': {'text': 'Rank'}
    },
    showlegend=False
)
fig1.show("png")
color_dict = {
    'cd': 'green',
    'limma': '#b76935', 
    'fc': 'purple'
}

fig1 = go.Figure()
for gs in full_df.groupby('Method_Direction').mean().sort_values('Rank').index:
    fig1.add_trace(
        go.Box(
            y=full_df[full_df['Method_Direction']==gs]['Rank'].tolist(),
            name=gs.split(':')[0].upper() + ':' + gs.split(':')[1],
            marker_color=color_dict[gs.split(':')[0]]
        )
    )
fig1.update_layout(
    title_text=f"{ko_gene} Term Rankings for L1000 Gene Sets by Method and Direction",
    xaxis={
        'title': {'text': 'Method:Direction'}, 
    },
    yaxis={
        'title': {'text': 'Rank'}
    },
    showlegend=False
)
fig1.show("png")
full_df['Batch'] = full_df['Gene_Set'].apply(lambda x: x.split(':')[0])
fig1 = go.Figure()
for gs in full_df['Batch'].unique():
    fig1.add_trace(
        go.Box(
            y=full_df[full_df['Batch']==gs]['Rank'].tolist(),
            name=gs,
            marker_color='#636EFA'
        )
    )
fig1.update_layout(
    title_text=f"{ko_gene} Term Rankings for L1000 Gene Sets by Batch",
    xaxis={
        'title': {'text': 'Batch'}, 
    },
    yaxis={
        'title': {'text': 'Rank'}
    },
    showlegend=False
)
fig1.show("png")

Bridge plots¶

def getEnrichrGeneSets(libname, termlist): 
    term_dict = {}
    liburl = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={libname}'
    response = requests.get(liburl)
    if not response.ok:
        raise Exception('Error fetching library from Enrichr')
    for term in termlist:
        geneset = response.json()[libname]['terms'][term].keys()
        term_dict[term] = geneset
    return term_dict

target_genesets = getEnrichrGeneSets('ChEA_2022', full_df['Term'].unique())
def methodBridgePlot(gsname, method, colname, abs_val=False): 
    geneset = target_genesets[gsname]
    y = []

    for k in batch_signatures[method].keys():
        if abs_val: 
            sig = batch_signatures[method][k][colname].apply(abs)
        else: 
            sig = batch_signatures[method][k][colname]
        select = pd.Series(
            [x.upper() in geneset for x in sig.sort_values(ascending=False).index.tolist()]
        )   
        x, y_temp = bridge_plot(select)
        y.append(y_temp)
    
    x = x/len(x)
    return x, np.mean(y, axis=0)

def randomBridgePlot(gsname): 
    all_x = []
    all_y = []

    for _ in range(10):
        rand_sig = sample(
            batch_signatures['cd_all'].index.tolist(), 
            batch_signatures['cd_all'].shape[0]
        )
        select = pd.Series([
            x.upper() in target_genesets[gsname] for x in rand_sig
        ])
        x, y = bridge_plot(select)
        x = x/len(x)
        all_x.append(x)
        all_y.append(y)

    return all_x, all_y
colnames = {
    'cd': 'CD-coefficient', 
    'limma': 't', 
    'fc': 'logFC'
}
def build_res(abs_val=False):
    res = {}
    for term in target_genesets.keys():
        res[term] = {'random': {'x': [], 'y': []}}
        rand_x, rand_y = randomBridgePlot(term)
        res[term]['random']['x'] += rand_x
        res[term]['random']['y'] += rand_y
        for method in batch_signatures.keys():
            if 'all' not in method:
                temp_x, temp_y = methodBridgePlot(term, method, colnames[method], abs_val=abs_val)
                if method in res[term].keys():
                    res[term][method]['x'].append(temp_x)
                    res[term][method]['y'].append(temp_y)
                else: 
                    res[term][method] = {'x': [temp_x], 'y': [temp_y]}
    return res

method_results = build_res(abs_val=False)
method_results_abs = build_res(abs_val=True)
def drawBridgePlot(abs_val=False):
    if abs_val:
        res = method_results_abs
    else: 
        res = method_results
    for term in res.keys():
        for sig in res[term].keys(): 
            term_x = np.mean(res[term][sig]['x'], axis=0)
            term_y = np.mean(res[term][sig]['y'], axis=0)
            if sig == 'random': 
                plt.plot(term_x, term_y, label=sig, color='gray')
            elif sig == 'CD': 
                plt.plot(term_x-(0.01*max(term_x)), term_y, label=sig)
            else:
                plt.plot(term_x, term_y, label=sig)

        plt.axhline(y=0, color='black', linestyle='dashed')
        plt.legend(bbox_to_anchor=(1, 1))
        if abs_val:
            plt.title(f"{term.split(' ')[0]} Target Gene Rankings for Each Method (Abs Val)")
        else:
            plt.title(f"{term.split(' ')[0]} Target Gene Rankings for Each Method")
        plt.show()
drawBridgePlot()
View source code
Submit an issue