logo Dexamethasone Benchmark Resource
  • About
  • Code
  • Data
  • Figures
  • Stats

Benchmarking the Recovery of Known Targets from L1000 CRISPR KO Data: Batch Version¶

KO Gene: NR1I2

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from IPython.display import display, Markdown
import json
import requests
import time
from random import sample
from math import log2
from maayanlab_bioinformatics.dge import characteristic_direction, limma_voom
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from os.path import exists
import warnings
warnings.filterwarnings('ignore')

Load in Data¶

# Set KO gene
ko_gene = 'NR1I2'

# Set working directory
l1000_data_dir = '../L1000_data'

# Set parameters 
low_expression_threshold = 0.3
try: 
    expr_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv", sep='\t', index_col=0)
except: 
    l1000_data_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_data.tsv", sep='\t')

    l1000_data_list = []
    l1000_meta_list = []
    for row in l1000_data_df.itertuples(): 
        try:
            temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
        except:
            print(f"Unable to access data from row {row.Index} at {row.persistent_id}")
            continue
        for col in temp_df.columns:
            l1000_meta_list.append([col] + l1000_data_df.loc[row.Index].tolist())
        l1000_data_list.append(temp_df)
    expr_df = pd.concat(l1000_data_list, axis=1)
expr_df.head()
XPR010_A375.311_96H_X1_B35:F08 XPR010_A375.311_96H_X2_B35:F08 XPR010_A375.311_96H_X3_B35:F08 XPR010_A375.311_96H_X1_B35:F24 XPR010_A375.311_96H_X2_B35:F24 XPR010_A375.311_96H_X3_B35:F24 XPR010_A549.311_96H_X1.L2_B36:F08 XPR010_A549.311_96H_X3_B35:F08 XPR010_A549.311_96H_X1.L2_B36:F24 XPR010_A549.311_96H_X3_B35:F24 ... XPR010_U251MG.311_96H_X3_B35:F08 XPR010_U251MG.311_96H_X1_B35:F24 XPR010_U251MG.311_96H_X2_B35:F24 XPR010_U251MG.311_96H_X3_B35:F24 XPR010_YAPC.311_96H_X1_B35:F08 XPR010_YAPC.311_96H_X2_B35:F08 XPR010_YAPC.311_96H_X3_B35:F08 XPR010_YAPC.311_96H_X1_B35:F24 XPR010_YAPC.311_96H_X2_B35:F24 XPR010_YAPC.311_96H_X3_B35:F24
symbol
DDR1 6.158450 6.202200 6.285650 6.249500 6.071100 6.225575 6.03340 6.26310 6.29285 6.52600 ... 6.41030 6.8449 6.698900 6.50580 6.553050 7.549575 6.84630 6.780425 6.951875 6.821550
PAX8 6.011775 6.106200 5.664900 5.665750 5.471950 5.780600 4.99225 5.30080 4.62410 4.82470 ... 5.23020 4.5078 5.127000 5.00560 5.445525 5.153600 4.25930 5.035250 4.937550 4.004100
GUCA1A 4.835400 4.938175 4.842400 4.697725 4.872650 4.826450 5.21620 5.17145 4.99370 5.02665 ... 4.89295 4.7672 4.806500 4.92165 5.290500 5.622600 5.89155 5.574350 5.208800 5.685100
EPHB3 6.632500 6.971400 6.574300 6.753650 6.427550 6.769800 8.29650 7.51050 7.67320 8.04430 ... 8.03480 8.9509 7.990300 7.93880 7.675250 7.396850 6.47865 7.833150 7.967200 7.705101
ESRRA 7.819700 7.858900 8.497725 8.272900 8.056351 7.891300 8.73630 8.28170 8.84240 8.22250 ... 7.25300 6.8591 7.123001 7.13630 8.613050 8.440750 7.94130 8.899151 8.561750 8.045325

5 rows × 57 columns

if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv"): 
    expr_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv' already exists!")
File at '../L1000_data/NR1I2_L1000_CRISPRKO_fulldata.tsv' already exists!
try: 
    meta_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv", sep='\t', index_col=0)
except:
    meta_df = pd.DataFrame(l1000_meta_list, columns=['id'] + l1000_data_df.columns.tolist()).set_index('id')
if 'batch' not in meta_df.columns:
    meta_df['batch'] = meta_df.index.map(lambda x: '_'.join(x.split('_')[:3]))
meta_df.head()
tissue disease cell_line pert_name pert_time pert_type data_level creation_time persistent_id pert_dose batch
id
XPR010_A375.311_96H_X1_B35:F08 skin of body melanoma A375.311 NR1I2 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:F08 skin of body melanoma A375.311 NR1I2 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR010_A375.311_96H
XPR010_A375.311_96H_X3_B35:F08 skin of body melanoma A375.311 NR1I2 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR010_A375.311_96H
XPR010_A375.311_96H_X1_B35:F24 skin of body melanoma A375.311 NR1I2 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:F24 skin of body melanoma A375.311 NR1I2 96 h CRISPR Knockout 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN XPR010_A375.311_96H
if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv"): 
    meta_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv' already exists!")
File at '../L1000_data/NR1I2_L1000_CRISPRKO_metadata.tsv' already exists!
batches = meta_df['batch'].unique()
len(batches)
10

Load in Control Data¶

ctrl_data_df = pd.read_csv(f"{l1000_data_dir}/L1000_Controls.tsv", sep='\t')
ctrl_data_df = ctrl_data_df[ctrl_data_df['batch'].isin(batches)]
set(batches).difference(ctrl_data_df['batch'])
set()
ctrl_data_df.head()
local_id persistent_id batch
8910 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_A06_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H
8911 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D05_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H
8912 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D20_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H
8913 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D21_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H
8914 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_E13_... https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H

The following step extracts all control profiles from the same batch as the profiles of interest, and make take up to a few minutes to complete.

try: 
    ctrl_expr_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv", sep='\t', index_col=0)
    ctrl_meta_df = pd.read_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv", sep='\t', index_col=0)
except:
    ctrl_data_list = []
    ctrl_meta_list = []
    for row in ctrl_data_df.itertuples():
        try: 
            temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
        except:
            print(f"Unable to access data from row {row.Index} at {row.persistent_id}")
            continue
        for col in temp_df.columns: 
            ctrl_meta_list.append([col, row.batch])
        ctrl_data_list.append(temp_df)

    ctrl_expr_df = pd.concat(ctrl_data_list, axis=1)
    ctrl_meta_df = pd.DataFrame(ctrl_meta_list, columns=['id', 'batch']).set_index('id')
ctrl_expr_df.head()
XPR010_A375.311_96H_X1_B35:A06 XPR010_A375.311_96H_X2_B35:A06 XPR010_A375.311_96H_X3_B35:A06 XPR010_A375.311_96H_X1_B35:D05 XPR010_A375.311_96H_X2_B35:D05 XPR010_A375.311_96H_X3_B35:D05 XPR010_A375.311_96H_X1_B35:D20 XPR010_A375.311_96H_X2_B35:D20 XPR010_A375.311_96H_X3_B35:D20 XPR010_A375.311_96H_X1_B35:D21 ... XPR010_YAPC.311_96H_X3_B35:E14 XPR010_YAPC.311_96H_X1_B35:F04 XPR010_YAPC.311_96H_X2_B35:F04 XPR010_YAPC.311_96H_X3_B35:F04 XPR010_YAPC.311_96H_X1_B35:H24 XPR010_YAPC.311_96H_X2_B35:H24 XPR010_YAPC.311_96H_X3_B35:H24 XPR010_YAPC.311_96H_X1_B35:K17 XPR010_YAPC.311_96H_X2_B35:K17 XPR010_YAPC.311_96H_X3_B35:K17
symbol
NAT2 7.260575 7.280875 7.02010 6.966801 7.45305 7.79865 7.34975 7.165100 7.708175 7.48150 ... 12.6010 8.773951 8.02120 12.157150 7.691299 7.311550 12.085751 8.362350 9.20055 11.97370
ADA 5.064700 5.259250 5.22435 5.144300 5.38295 5.03075 5.07075 5.199075 4.932025 5.28315 ... 4.6644 4.806450 4.66950 3.391350 5.972750 5.615875 4.137550 6.133075 4.75340 4.23540
CDH2 5.706775 5.822050 5.69430 5.853000 5.64155 5.71160 5.98715 5.800150 5.671200 5.80255 ... 6.0781 5.741250 5.86595 5.915725 5.294300 6.179800 6.161100 5.736300 5.83550 5.75765
AKT3 0.896850 2.521300 2.37685 1.295950 3.11635 2.51195 2.38330 1.790250 1.295950 3.40630 ... 0.0000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
MED6 5.884050 5.757250 5.77825 5.589350 6.05750 5.44470 5.63980 5.929700 5.832875 5.86350 ... 6.2660 5.605700 6.17515 5.849500 5.851400 6.168350 5.662300 5.871375 6.37535 6.18240

5 rows × 255 columns

ctrl_meta_df.head()
batch
id
XPR010_A375.311_96H_X1_B35:A06 XPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:A06 XPR010_A375.311_96H
XPR010_A375.311_96H_X3_B35:A06 XPR010_A375.311_96H
XPR010_A375.311_96H_X1_B35:D05 XPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:D05 XPR010_A375.311_96H
if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv"): 
    ctrl_expr_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv' already exists!")
if not exists(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv"): 
    ctrl_meta_df.to_csv(f"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv' already exists!")

Normalize Data¶

Combine data and remove duplicate genes¶

combined_df = pd.concat([
    expr_df.groupby(expr_df.index).mean(), 
    ctrl_expr_df.groupby(ctrl_expr_df.index).mean()
], axis=1)
combined_df.head()
XPR010_A375.311_96H_X1_B35:F08 XPR010_A375.311_96H_X2_B35:F08 XPR010_A375.311_96H_X3_B35:F08 XPR010_A375.311_96H_X1_B35:F24 XPR010_A375.311_96H_X2_B35:F24 XPR010_A375.311_96H_X3_B35:F24 XPR010_A549.311_96H_X1.L2_B36:F08 XPR010_A549.311_96H_X3_B35:F08 XPR010_A549.311_96H_X1.L2_B36:F24 XPR010_A549.311_96H_X3_B35:F24 ... XPR010_YAPC.311_96H_X3_B35:E14 XPR010_YAPC.311_96H_X1_B35:F04 XPR010_YAPC.311_96H_X2_B35:F04 XPR010_YAPC.311_96H_X3_B35:F04 XPR010_YAPC.311_96H_X1_B35:H24 XPR010_YAPC.311_96H_X2_B35:H24 XPR010_YAPC.311_96H_X3_B35:H24 XPR010_YAPC.311_96H_X1_B35:K17 XPR010_YAPC.311_96H_X2_B35:K17 XPR010_YAPC.311_96H_X3_B35:K17
symbol
A1CF 4.45315 4.23635 4.634250 3.93165 3.87275 3.75120 5.3409 4.67510 5.60880 5.23860 ... 10.192900 10.741025 10.62880 10.17440 11.360250 10.609800 10.58320 11.2271 11.164000 10.640949
A2M 7.22560 7.47065 8.596849 7.24335 7.29490 7.76080 8.1608 5.55060 8.57160 5.88430 ... 8.146475 7.649900 8.25165 8.60140 7.470325 7.525375 7.99345 7.0502 8.668000 8.725750
A4GALT 5.43295 5.44650 5.665200 5.29545 5.68145 5.51620 6.7321 6.05495 6.72805 5.74780 ... 5.415850 6.452750 5.52640 5.43765 5.709750 6.054850 4.93555 6.1569 6.114300 4.902300
A4GNT 5.37260 5.28950 5.293250 5.18680 5.30390 5.38095 5.2655 5.52010 5.44740 5.62385 ... 8.344800 8.872550 8.37400 8.79715 8.576900 8.662701 8.30970 8.8629 8.714951 8.285049
AAAS 7.51980 7.64780 7.956600 7.59620 8.07660 7.87480 8.1372 7.55920 8.28530 7.90890 ... 6.719550 6.666550 6.71455 6.97565 6.509600 6.576250 6.78680 6.4280 6.527250 6.776250

5 rows × 312 columns

datasets = {
    'metadata': meta_df, 
    'ctrl_metadata': ctrl_meta_df,
    'rawdata': combined_df
}

Compute Signatures: Batch Perturbations vs. Batch Controls¶

batch_profiles = {x: {'perts': [], 'ctrls': []} for x in batches}
for b in batches: 
    batch_profiles[b]['perts'] = meta_df[meta_df['batch'] == b].index.tolist()
    batch_profiles[b]['ctrls'] = ctrl_meta_df[ctrl_meta_df['batch'] == b].index.tolist()
batch_signatures = {'cd': {}, 'limma': {}, 'fc': {}}

Characteristic Direction¶

# Function for computing signatures with characteristic direction
def cd_signature(ctrl_ids, case_ids, dataset, normalization):
  
    signature = characteristic_direction(
        dataset[normalization].loc[:, ctrl_ids], 
        dataset[normalization].loc[:, case_ids], 
        calculate_sig=True
    )
    signature = signature.sort_values("CD-coefficient", ascending=False)
    
    return signature

Note: the following step may take a few minutes to run.

for b in batches: 
    batch_signatures['cd'][b] = cd_signature(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        datasets, 
        'rawdata'
    )

Limma¶

# Function for computing signatures
def limma(ctrl_ids, case_ids, dataset, normalization):
    
    signature = limma_voom.limma_voom_differential_expression(
        dataset[normalization].loc[:, ctrl_ids],
        dataset[normalization].loc[:, case_ids],
        filter_genes=False
    )
    signature = signature.sort_values("t", ascending=False)

    return signature

Note: the following step may take a few minutes to run.

for b in batches: 
    batch_signatures['limma'][b] = limma(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        datasets, 
        'rawdata'
    )
R[write to console]: Loading required package: R.oo

R[write to console]: Loading required package: R.methodsS3

R[write to console]: Loading required package: R.methodsS3

R[write to console]: R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.

R[write to console]: R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.

R[write to console]: 
Attaching package: ‘R.oo’


R[write to console]: The following object is masked from ‘package:R.methodsS3’:

    throw


R[write to console]: The following objects are masked from ‘package:methods’:

    getClasses, getMethods


R[write to console]: The following objects are masked from ‘package:base’:

    attach, detach, load, save


R[write to console]: R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.

R[write to console]: 
Attaching package: ‘R.utils’


R[write to console]: The following object is masked from ‘package:utils’:

    timestamp


R[write to console]: The following objects are masked from ‘package:base’:

    cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
    warnings


R[write to console]: 
Attaching package: ‘RCurl’


R[write to console]: The following object is masked from ‘package:R.utils’:

    reset


R[write to console]: The following object is masked from ‘package:R.oo’:

    clone


R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


R[write to console]: 
Attaching package: ‘S4Vectors’


R[write to console]: The following objects are masked from ‘package:base’:

    I, expand.grid, unname


R[write to console]: Loading required package: IRanges

R[write to console]: 
Attaching package: ‘IRanges’


R[write to console]: The following object is masked from ‘package:R.oo’:

    trim


R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: GenomeInfoDb

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: MatrixGenerics

R[write to console]: Loading required package: matrixStats

R[write to console]: 
Attaching package: ‘MatrixGenerics’


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars


R[write to console]: Loading required package: Biobase

R[write to console]: Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


R[write to console]: 
Attaching package: ‘Biobase’


R[write to console]: The following object is masked from ‘package:MatrixGenerics’:

    rowMedians


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians


R[write to console]: 
Attaching package: ‘limma’


R[write to console]: The following object is masked from ‘package:DESeq2’:

    plotMA


R[write to console]: The following object is masked from ‘package:BiocGenerics’:

    plotMA


Fold Change¶

# Function for computing signatures with fold change
def logFC(ctrl_ids, case_ids, dataset, normalization):

    case_mean = dataset[normalization].loc[:, case_ids].mean(axis=1)
    ctrl_mean = dataset[normalization].loc[:, ctrl_ids].mean(axis=1)
    
    signature = case_mean / ctrl_mean

    signature = signature.apply(lambda x: log2(x+1)).rename('logFC')
    
    return pd.DataFrame(signature.sort_values(ascending=False))
for b in batches: 
    batch_signatures['fc'][b] = logFC(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        datasets,
        'rawdata'
    )

All signatures¶

batch_signatures['cd_all']= pd.concat([
    df[['CD-coefficient']].rename(columns={'CD-coefficient': b}) for (b, df) in batch_signatures['cd'].items()
], axis=1).sort_index()
batch_signatures['limma_all'] = pd.concat([
    df[['t']].rename(columns={'t': b}) for (b, df) in batch_signatures['limma'].items()
], axis=1).sort_index()
batch_signatures['fc_all'] = pd.concat([
    df.rename(b) for b, df in batch_signatures['fc'].items()
], axis=1).sort_index()

for k in batch_signatures.keys(): 
    if k.endswith('_all'): 
        method = k.split('_')[0].upper()
        display(Markdown(f"All {method} batch signatures"))
        display(batch_signatures[k])

All CD batch signatures

XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H XPR010_YAPC.311_96H
symbol
A1CF -0.019577 -0.016974 -0.015616 -0.016290 -0.020444 -0.018519 -0.017728 -0.020971 -0.017763 -0.019295
A2M 0.003215 -0.003538 0.001771 -0.001684 -0.001222 -0.002409 0.001087 0.002743 0.001958 0.004480
A4GALT -0.000740 0.000667 0.000689 0.002151 0.000054 -0.001610 -0.000614 0.000041 -0.000140 0.000447
A4GNT -0.011040 -0.008234 -0.014184 -0.013903 -0.014138 -0.015587 -0.013585 -0.012055 -0.013713 -0.009598
AAAS 0.004927 0.005585 0.005249 0.003679 0.005078 0.004642 0.005265 0.004076 0.004921 0.003304
... ... ... ... ... ... ... ... ... ... ...
ZXDB 0.000307 -0.001142 -0.000563 -0.000166 -0.001886 -0.001636 -0.000132 -0.000568 -0.001656 0.000642
ZXDC -0.004677 -0.009372 -0.007265 -0.002839 -0.005829 -0.001725 -0.007841 -0.008623 -0.010583 -0.009344
ZYX 0.018972 0.014685 0.018484 0.012741 0.019393 0.011563 0.008750 0.022170 0.016844 0.015572
ZZEF1 -0.007563 -0.007977 -0.001991 -0.002495 -0.005057 -0.001901 0.006034 -0.004659 0.000046 -0.003763
ZZZ3 0.005001 0.008117 0.002834 0.004480 0.009239 0.003054 -0.000552 0.003891 0.002508 0.001957

12327 rows × 10 columns

All LIMMA batch signatures

XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H XPR010_YAPC.311_96H
A1CF -60.019539 -28.815911 -44.964117 -42.496500 -76.125622 -40.349357 -38.287313 -65.604340 -55.876038 -43.141008
A2M 4.845533 -1.847583 3.406957 -2.852660 -2.416547 -0.537058 1.565255 9.006032 3.824993 2.729793
A4GALT -3.617646 -1.016386 0.559526 6.139557 -2.310624 -4.533725 -3.529359 0.950890 -2.583732 2.579730
A4GNT -42.919025 -19.718965 -50.182363 -30.878052 -47.377980 -36.774901 -24.278347 -24.678781 -38.743252 -27.158267
AAAS 26.282910 13.767572 18.236763 14.897501 24.929960 13.017880 16.846031 16.416623 23.690760 14.218616
... ... ... ... ... ... ... ... ... ... ...
ZXDB 0.235291 -2.809396 -0.987368 1.787694 -6.089417 -4.552719 -0.102972 -0.915882 -4.805944 -6.810436
ZXDC -16.760846 -11.868575 -15.712726 -6.614901 -12.108318 -1.424355 -16.841576 -23.859809 -22.600620 -8.387029
ZYX 36.112290 13.855560 23.705964 20.629454 37.057409 15.960206 11.385518 35.864251 30.875875 25.291581
ZZEF1 -19.567202 -7.184474 -1.622090 -7.559272 -15.772091 -9.197690 3.419958 -8.114486 -3.014485 -9.025222
ZZZ3 17.788706 4.820815 6.428117 8.105470 28.521079 8.612288 3.189601 9.125449 9.904261 2.446274

12327 rows × 10 columns

All FC batch signatures

XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H XPR010_YAPC.311_96H
symbol
A1CF -1.418332 -1.040272 -1.136226 -1.194198 -1.601675 -1.222738 -1.217520 -1.387034 -1.239450 -1.206063
A2M 0.157266 -0.123163 0.240870 -0.164634 -0.072182 -0.026677 0.108978 0.272596 0.159253 0.225869
A4GALT -0.076240 -0.044683 0.009301 0.139864 -0.056234 -0.118184 -0.108340 0.020024 -0.071076 0.106850
A4GNT -0.767693 -0.735056 -0.931857 -0.862885 -0.898295 -0.979380 -0.801131 -0.709310 -0.933009 -0.580846
AAAS 0.345681 0.417361 0.329596 0.214427 0.396350 0.324106 0.314611 0.238884 0.326043 0.263942
... ... ... ... ... ... ... ... ... ... ...
ZXDB -0.002925 -0.080670 -0.024100 0.027726 -0.114968 -0.109545 -0.005998 -0.023018 -0.091592 -0.145058
ZXDC -0.340634 -0.592188 -0.441215 -0.241991 -0.334266 -0.054244 -0.550736 -0.568386 -0.568292 -0.347858
ZYX 1.163029 0.813722 1.214162 0.733082 1.197966 0.854732 0.506321 1.124261 1.042311 1.013207
ZZEF1 -0.414575 -0.481934 -0.053903 -0.337179 -0.374649 -0.437583 0.199973 -0.259164 -0.171636 -0.375338
ZZZ3 0.325976 0.308467 0.110256 0.245853 0.591393 0.206770 0.106190 0.207667 0.236792 0.064721

12327 rows × 10 columns

Enrichment Analysis Rankings¶

# Function to get Enrichr Results
def Enrichr_API(enrichr_gene_list, library_name, signame, method, direction):

    all_ranks = []
    all_terms = []
    all_pvalues =[] 
    all_adjusted_pvalues = []

    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
    genes_str = '\n'.join(enrichr_gene_list)
    description = f'Dex benchmark {signame} {method} {direction} genes'
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }

    response = requests.post(ENRICHR_URL, files=payload)
    if not response.ok:
        raise Exception('Error uploading gene list')

    list_data = json.loads(response.text)
    time.sleep(0.5)
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
    response = requests.get(
        ENRICHR_URL + f"?userListId={list_data['userListId']}&backgroundType={library_name}"
    )
    if not response.ok:
        raise Exception('Error fetching enrichment results')

    enrich_data = json.loads(response.text)
    results_df  = pd.DataFrame(enrich_data[library_name])
    all_ranks.append(list(results_df[0]))
    all_terms.append(list(results_df[1]))
    all_pvalues.append(list(results_df[2]))
    all_adjusted_pvalues.append(list(results_df[6]))

    res_dict = {
        'ranks': all_ranks, 
        'terms': all_terms, 
        'pvals': all_pvalues,
        'adjpvals': all_adjusted_pvalues, 
        'shortid': str(list_data['shortId'])
    }

    return res_dict
# Get gene lists to put into Enrichr
gene_lists = {}
for m in batch_signatures.keys():
    if 'all' in m:
        mname = m.split('_')[0]
        gene_lists[mname] = {'up': {}, 'down': {}, 'combined': {}}
        for sig in batch_signatures[m]: 
            gene_lists[mname]['up'][sig] = batch_signatures[m][sig].sort_values(ascending=False).index.tolist()[:250]
            gene_lists[mname]['down'][sig] = batch_signatures[m][sig].sort_values(ascending=True).index.tolist()[:250]
            gene_lists[mname]['combined'][sig] = gene_lists[mname]['up'][sig] + gene_lists[mname]['down'][sig]
# Get results
results = {}
for m in gene_lists.keys():
    results[m] = {'up': {}, 'down': {}, 'combined': {}}
    for d in gene_lists[m].keys():
        for sig in gene_lists[m][d].keys():
            results[m][d][sig]= Enrichr_API(
                gene_lists[m][d][sig], 'ChEA_2022', sig, m, d
            )
# Extract dexamethasone target rankings
# Initialize lists for storing target information
ranks = []
display(Markdown(f"## {ko_gene} Term Rankings"))

NR1I2 Term Rankings¶

for m in results.keys(): 
    for d in results[m].keys():
        for sig in results[m][d].keys():
            for i in range(len(results[m][d][sig]['ranks'][0])): 
                if ko_gene in results[m][d][sig]['terms'][0][i]: 
                    ranks.append([
                        f"{sig}:{m}:{d}",
                        results[m][d][sig]['terms'][0][i], 
                        results[m][d][sig]['ranks'][0][i],
                        results[m][d][sig]['pvals'][0][i]
                    ])
full_df = pd.DataFrame(ranks, columns=['Gene_Set', 'Term', 'Rank', 'p-value'])
full_df['Method'] = full_df['Gene_Set'].apply(lambda x: x.split(':')[1])
full_df['Direction'] = full_df['Gene_Set'].apply(lambda x : x.split(':')[2])
full_df['Method_Direction'] = full_df.apply(lambda row: row['Method'] + ':' + row['Direction'], axis=1)
up_df = full_df[full_df['Direction']=='up']
down_df = full_df[full_df['Direction']=='down']
combined_df = full_df[full_df['Direction']=='combined']
display(Markdown(f"Mean rank of {ko_gene} terms from ChEA 2022 for up genes from each method."))
display(up_df.groupby(['Method']).mean(numeric_only=True).sort_values(['Rank', 'Method']).drop(['p-value'], axis=1))
display(Markdown(f"Mean rank of {ko_gene} terms from ChEA 2022 for down genes from each method."))
display(down_df.groupby(['Method']).mean(numeric_only=True).sort_values(['Rank', 'Method']).drop(['p-value'], axis=1))
display(Markdown(f"Mean rank of {ko_gene} terms from ChEA 2022 for combined up and down genes from each method."))
display(combined_df.groupby(['Method']).mean(numeric_only=True).sort_values(['Rank', 'Method']).drop(['p-value'], axis=1))

Mean rank of NR1I2 terms from ChEA 2022 for up genes from each method.

Rank
Method
limma 552.3
fc 575.6
cd 620.9

Mean rank of NR1I2 terms from ChEA 2022 for down genes from each method.

Rank
Method
cd 57.0
limma 126.6
fc 182.5

Mean rank of NR1I2 terms from ChEA 2022 for combined up and down genes from each method.

Rank
Method
cd 330.7
fc 369.2
limma 373.8
color_dict = {
    'up': 'crimson',
    'down': '#636EFA', 
    'combined': 'gray'
}

fig1 = go.Figure()
for gs in full_df.groupby('Method_Direction').mean().sort_values('Rank').index:
    fig1.add_trace(
        go.Box(
            y=full_df[full_df['Method_Direction']==gs]['Rank'].tolist(),
            name=gs.split(':')[0].upper() + ':' + gs.split(':')[1],
            marker_color=color_dict[gs.split(':')[1]]
        )
    )
fig1.update_layout(
    title_text=f"{ko_gene} Term Rankings for L1000 Gene Sets by Method and Direction",
    xaxis={
        'title': {'text': 'Method:Direction'}, 
    },
    yaxis={
        'title': {'text': 'Rank'}
    },
    showlegend=False
)
fig1.show("png")
color_dict = {
    'cd': 'green',
    'limma': '#b76935', 
    'fc': 'purple'
}

fig1 = go.Figure()
for gs in full_df.groupby('Method_Direction').mean().sort_values('Rank').index:
    fig1.add_trace(
        go.Box(
            y=full_df[full_df['Method_Direction']==gs]['Rank'].tolist(),
            name=gs.split(':')[0].upper() + ':' + gs.split(':')[1],
            marker_color=color_dict[gs.split(':')[0]]
        )
    )
fig1.update_layout(
    title_text=f"{ko_gene} Term Rankings for L1000 Gene Sets by Method and Direction",
    xaxis={
        'title': {'text': 'Method:Direction'}, 
    },
    yaxis={
        'title': {'text': 'Rank'}
    },
    showlegend=False
)
fig1.show("png")
full_df['Batch'] = full_df['Gene_Set'].apply(lambda x: x.split(':')[0])
fig1 = go.Figure()
for gs in full_df['Batch'].unique():
    fig1.add_trace(
        go.Box(
            y=full_df[full_df['Batch']==gs]['Rank'].tolist(),
            name=gs,
            marker_color='#636EFA'
        )
    )
fig1.update_layout(
    title_text=f"{ko_gene} Term Rankings for L1000 Gene Sets by Batch",
    xaxis={
        'title': {'text': 'Batch'}, 
    },
    yaxis={
        'title': {'text': 'Rank'}
    },
    showlegend=False
)
fig1.show("png")

Bridge plots¶

def getEnrichrGeneSets(libname, termlist): 
    term_dict = {}
    liburl = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={libname}'
    response = requests.get(liburl)
    if not response.ok:
        raise Exception('Error fetching library from Enrichr')
    for term in termlist:
        geneset = response.json()[libname]['terms'][term].keys()
        term_dict[term] = geneset
    return term_dict

target_genesets = getEnrichrGeneSets('ChEA_2022', full_df['Term'].unique())
def methodBridgePlot(gsname, method, colname, abs_val=False): 
    geneset = target_genesets[gsname]
    y = []

    for k in batch_signatures[method].keys():
        if abs_val: 
            sig = batch_signatures[method][k][colname].apply(abs)
        else: 
            sig = batch_signatures[method][k][colname]
        select = pd.Series(
            [x.upper() in geneset for x in sig.sort_values(ascending=False).index.tolist()]
        )   
        x, y_temp = bridge_plot(select)
        y.append(y_temp)
    
    x = x/len(x)
    return x, np.mean(y, axis=0)

def randomBridgePlot(gsname): 
    all_x = []
    all_y = []

    for _ in range(10):
        rand_sig = sample(
            batch_signatures['cd_all'].index.tolist(), 
            batch_signatures['cd_all'].shape[0]
        )
        select = pd.Series([
            x.upper() in target_genesets[gsname] for x in rand_sig
        ])
        x, y = bridge_plot(select)
        x = x/len(x)
        all_x.append(x)
        all_y.append(y)

    return all_x, all_y
colnames = {
    'cd': 'CD-coefficient', 
    'limma': 't', 
    'fc': 'logFC'
}
def build_res(abs_val=False):
    res = {}
    for term in target_genesets.keys():
        res[term] = {'random': {'x': [], 'y': []}}
        rand_x, rand_y = randomBridgePlot(term)
        res[term]['random']['x'] += rand_x
        res[term]['random']['y'] += rand_y
        for method in batch_signatures.keys():
            if 'all' not in method:
                temp_x, temp_y = methodBridgePlot(term, method, colnames[method], abs_val=abs_val)
                if method in res[term].keys():
                    res[term][method]['x'].append(temp_x)
                    res[term][method]['y'].append(temp_y)
                else: 
                    res[term][method] = {'x': [temp_x], 'y': [temp_y]}
    return res

method_results = build_res(abs_val=False)
method_results_abs = build_res(abs_val=True)
def drawBridgePlot(abs_val=False):
    if abs_val:
        res = method_results_abs
    else: 
        res = method_results
    for term in res.keys():
        for sig in res[term].keys(): 
            term_x = np.mean(res[term][sig]['x'], axis=0)
            term_y = np.mean(res[term][sig]['y'], axis=0)
            if sig == 'random': 
                plt.plot(term_x, term_y, label=sig, color='gray')
            elif sig == 'CD': 
                plt.plot(term_x-(0.01*max(term_x)), term_y, label=sig)
            else:
                plt.plot(term_x, term_y, label=sig)

        plt.axhline(y=0, color='black', linestyle='dashed')
        plt.legend(bbox_to_anchor=(1, 1))
        if abs_val:
            plt.title(f"{term.split(' ')[0]} Target Gene Rankings for Each Method (Abs Val)")
        else:
            plt.title(f"{term.split(' ')[0]} Target Gene Rankings for Each Method")
        plt.show()
drawBridgePlot()
View source code
Submit an issue