logo Dexamethasone Benchmark Resource
  • About
  • Code
  • Data
  • Figures
  • Stats

Benchmarking the Recovery of Known Targets from L1000 Dex Perturbation Data¶

# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import requests
import scipy.stats as ss
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.enrichment import enrich_crisp
from math import log2
from IPython.display import display, Markdown
from random import sample
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
from os.path import exists
from scipy.stats import ttest_ind, ranksums
import h5py
import warnings
warnings.filterwarnings('ignore')

Load in Data¶

# Set perturbagen
pert = 'dexamethasone'

# Set working directory
l1000_data_dir = '../L1000_data'
try: 
    expr_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv", sep='\t', index_col=0)
except: 
    l1000_data_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_data.tsv", sep='\t')

    l1000_data_list = []
    l1000_meta_list = []
    for row in l1000_data_df.itertuples(): 
        try:
            temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
        except:
            print(f"Unable to access data from row {row.Index} at {row.persistent_id}")
            continue
        for col in temp_df.columns:
            l1000_meta_list.append([col] + l1000_data_df.loc[row.Index].tolist())
        l1000_data_list.append(temp_df)
    expr_df = pd.concat(l1000_data_list, axis=1)
expr_df.head()
CPC004_A375_6H_X1_B3_DUO52HI53LO:E13 CPC004_A375_6H_X2_B3_DUO52HI53LO:E13 CPC004_A375_6H_X3_B3_DUO52HI53LO:E13 CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13 CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13 CPC004_HA1E_24H_X3_B3_DUO52HI53LO:E13 CPC004_HA1E_6H_X1_B3_DUO52HI53LO:E13 CPC004_HA1E_6H_X2_B3_DUO52HI53LO:E13 CPC004_HA1E_6H_X3_B3_DUO52HI53LO:E13 CPC004_HCC515_24H_X1_B3_DUO52HI53LO:E13 ... CRCGN004_PC3_6H_X2_F1B7_DUO52HI53LO:J21 LPROT002_A375_6H_X1_B22:H07 LPROT002_A375_6H_X1_B22:H09 LPROT002_A375_6H_X1_B22:H11 LPROT002_MCF7_6H_X1_B22:H08 LPROT002_MCF7_6H_X1_B22:H10 LPROT002_MCF7_6H_X1_B22:H12 LPROT004_YAPC_6H_X1_B32:H07 LPROT004_YAPC_6H_X1_B32:H09 LPROT004_YAPC_6H_X1_B32:H11
symbol
NAT2 5.347525 5.31260 5.54900 5.2620 5.3664 5.1632 6.5029 6.1924 5.701800 5.69435 ... 6.521600 3.8543 4.558200 4.8785 4.0917 4.3097 4.594400 5.051500 4.67150 5.2628
ADA 6.814449 6.21185 7.37235 7.9995 7.4117 7.6526 8.4221 7.4373 9.558400 7.31210 ... 8.412251 8.5299 8.172900 7.8001 7.7042 7.5313 7.161799 6.563300 5.72615 6.8200
CDH2 7.497050 8.35885 7.51460 10.1442 11.5089 10.4157 9.9937 6.0723 7.950950 6.61335 ... 3.211200 10.5305 9.309799 9.0100 5.4028 5.9991 5.022750 6.782700 6.52530 6.6286
AKT3 9.710200 11.61895 9.40310 10.4081 9.9464 9.1548 10.6656 9.7118 8.052999 7.07675 ... 7.751399 9.9655 8.703700 9.5298 5.4086 7.9250 5.372250 8.076001 7.89250 8.1305
MED6 7.923500 6.12090 8.32925 8.2268 7.8020 8.0996 8.9311 8.5562 9.422951 8.44575 ... 9.265400 9.7446 9.507300 9.4560 10.4112 9.0910 9.762300 9.515800 9.54300 9.8093

5 rows × 1310 columns

if not exists(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv"): 
    expr_df.to_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv' already exists!")
File at '../L1000_data/Dexamethasone_L1000_ChemPert_fulldata.tsv' already exists!
try: 
    meta_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv", sep='\t', index_col=0)
except:
    meta_df = pd.DataFrame(l1000_meta_list, columns=['id'] + l1000_data_df.columns.tolist()).set_index('id')
if 'batch' not in meta_df.columns:
    meta_df['batch'] = meta_df.index.map(lambda x: '_'.join(x.split('_')[:3]))
meta_df.head()
tissue disease cell_line pert_name pert_time pert_type data_level creation_time persistent_id pert_dose batch
id
CPC004_A375_6H_X1_B3_DUO52HI53LO:E13 skin of body melanoma A375 dexamethasone 6 h Chemical Perturbation 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN CPC004_A375_6H
CPC004_A375_6H_X2_B3_DUO52HI53LO:E13 skin of body melanoma A375 dexamethasone 6 h Chemical Perturbation 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN CPC004_A375_6H
CPC004_A375_6H_X3_B3_DUO52HI53LO:E13 skin of body melanoma A375 dexamethasone 6 h Chemical Perturbation 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN CPC004_A375_6H
CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13 kidney NaN HA1E dexamethasone 24 h Chemical Perturbation 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN CPC004_HA1E_24H
CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13 kidney NaN HA1E dexamethasone 24 h Chemical Perturbation 3 2021-01-21 https://lincs-dcic.s3.amazonaws.com/LINCS-data... NaN CPC004_HA1E_24H
if not exists(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv"): 
    meta_df.to_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv' already exists!")
File at '../L1000_data/Dexamethasone_L1000_ChemPert_metadata.tsv' already exists!
all_batches = set(['_'.join(x.split('_')[:3]) for x in meta_df.index.tolist()])
len(all_batches)
240

Load in MODZ Data¶

modz_data_df = pd.read_csv(f'{l1000_data_dir}/dex_modz.gct', sep='\t', header=2).iloc[8:].set_index('pr_gene_symbol')
modz_data_df = modz_data_df.drop(columns=['id', 'pr_gene_title', 'pr_is_lm', 'pr_is_bing', 'self_correlation', 'frac_self_rank', 'p_value', 'gene_space'])
modz_data_df = modz_data_df.astype('float32')
modz_data_df.loc['MIA2'] = modz_data_df.loc[['MIA2', 'CTAGE5']].mean(axis=0)
modz_data_df = modz_data_df.drop(index=['CTAGE5'])
modz_data_df.columns = modz_data_df.columns.map(lambda x: x.split(':')[0])
modz_data_df = modz_data_df[[x for x in modz_data_df.columns if x.endswith('24H') ]]
modz_data_df
CPC006_A375_24H CPC006_A549_24H CPC006_HA1E_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_MCF7_24H CPC006_PC3_24H CPC009_A549_24H CPC009_MCF7_24H CPC009_PC3_24H CPC006_VCAP_24H CPC009_VCAP_24H CRCGN001_HA1E_24H CRCGN004_HA1E_24H
pr_gene_symbol
PSME1 0.50 -1.32 1.37 1.03 0.67 0.47 -0.17 -0.44 1.65 0.07 0.24 0.27 -0.58 0.72
ATF1 1.03 -0.42 0.98 0.26 0.39 0.76 0.09 -0.05 -1.87 0.65 0.10 0.53 -0.78 -0.02
RHEB -0.02 -0.66 -0.27 -0.84 0.41 0.04 0.24 -0.63 1.13 1.06 0.41 -0.48 -0.76 0.05
FOXO3 -0.75 2.80 -1.63 1.05 -0.16 -0.67 -0.94 1.96 -0.21 -0.31 -0.54 0.37 -0.24 0.20
RHOA 0.10 -0.33 -0.37 1.16 -0.11 1.08 0.85 0.02 1.40 0.34 0.35 0.43 -0.45 0.46
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ADGRA2 -0.34 0.24 0.05 -1.21 -0.34 -0.34 -0.16 -0.26 -0.29 -0.25 -0.17 -1.03 0.70 -0.40
CX3CL1 -0.33 -0.18 -0.17 0.21 -0.21 -0.41 0.33 0.07 -0.44 0.38 -0.20 -0.90 0.18 -0.12
ADAP1 -0.48 -0.09 -0.99 0.23 -0.06 -0.54 0.42 -0.43 -0.65 0.53 -0.49 1.17 1.23 0.12
EPS8L1 -0.69 -0.68 -0.87 -0.30 0.15 0.49 -0.58 -0.59 0.88 -0.86 -0.38 0.87 -0.03 -0.87
ACTB -0.20 -0.32 0.14 0.40 -0.73 -0.23 -0.16 0.46 0.37 0.16 -0.03 2.51 -0.60 0.73

12327 rows × 14 columns

Load in Control Data¶

gctx = h5py.File(f"{l1000_data_dir}/Dexamethasone_L1000_Controls_fulldata.gctx", 'r')
ctrl_expr_df = pd.DataFrame(
    data=gctx['0/DATA/0/matrix'][()],
    index=gctx['0/META/COL/id'].asstr()[:],
    columns=gctx['0/META/ROW/id'].asstr()[:]
).T
gctx.close()
overlapping_dex = set(expr_df.columns.tolist()).intersection(ctrl_expr_df.columns.tolist())
ctrl_expr_df = ctrl_expr_df.drop(columns=list(overlapping_dex))
ctrl_expr_df.head()
CPC006_A375_24H_X5_B4_DUO52HI53LO:A17 CPC006_A375_24H_X1_B3_DUO52HI53LO:A18 CPC006_A375_24H_X2_B3_DUO52HI53LO:A18 CPC006_A375_24H_X5_B4_DUO52HI53LO:A18 CPC006_A375_24H_X2_B3_DUO52HI53LO:B17 CPC006_A375_24H_X4_B4_DUO52HI53LO:B17 CPC006_A375_24H_X5_B4_DUO52HI53LO:B17 CPC006_A375_24H_X1_B3_DUO52HI53LO:B18 CPC006_A375_24H_X4_B4_DUO52HI53LO:B18 CPC006_A375_24H_X5_B4_DUO52HI53LO:B18 ... CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L12 CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L12 CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L12 CPC009_PC3_24H_X4_B5_DUO52HI53LO:L12 CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L12 CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L13 CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L13 CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L13 CPC009_PC3_24H_X4_B5_DUO52HI53LO:L13 CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L13
NAT2 7.89775 8.68995 6.80705 8.04730 7.54740 6.815075 7.633851 7.339425 8.56575 7.03535 ... 4.59600 5.69120 5.410600 5.16645 5.6062 5.12355 5.612900 5.61780 5.964099 5.46440
ADA 3.51090 4.02850 4.11730 3.89205 4.56435 4.913550 4.724325 3.815050 3.26435 3.49990 ... 9.13180 9.69640 10.277700 8.01760 9.1136 8.69590 9.973300 9.98530 8.143050 8.43330
CDH2 5.07510 5.84900 5.84390 5.05145 5.76565 5.012725 4.946300 6.192550 4.91245 4.96025 ... 4.98080 3.02930 3.250800 4.04120 3.6938 5.21580 3.833500 3.92415 3.731000 6.07040
AKT3 3.64940 4.85665 4.31915 5.40425 5.03620 3.253500 4.558100 4.819650 6.11925 4.10250 ... 5.80955 6.67030 7.958799 5.72470 5.8401 6.77600 7.432001 8.05450 5.867200 6.17385
MED6 5.20905 5.51090 6.04015 5.06230 5.91720 4.498100 5.335800 5.388175 5.42000 5.15100 ... 8.67330 7.03495 8.061700 8.36450 8.3464 8.82380 7.499700 8.43520 8.015600 9.17250

5 rows × 14214 columns

# ctrl_expr_df.to_csv('../L1000_data/Dexamethasone_L1000_Control_Profiles.tsv.gz', sep='\t')
ctrl_meta_df = pd.DataFrame(
  data = [ctrl_expr_df.columns.tolist(),['_'.join(x.split('_')[:3]) for x in ctrl_expr_df.columns]],
  index = ['id', 'batch']
).T.set_index('id')
ctrl_meta_df.head()
batch
id
CPC006_A375_24H_X5_B4_DUO52HI53LO:A17 CPC006_A375_24H
CPC006_A375_24H_X1_B3_DUO52HI53LO:A18 CPC006_A375_24H
CPC006_A375_24H_X2_B3_DUO52HI53LO:A18 CPC006_A375_24H
CPC006_A375_24H_X5_B4_DUO52HI53LO:A18 CPC006_A375_24H
CPC006_A375_24H_X2_B3_DUO52HI53LO:B17 CPC006_A375_24H

Process Data¶

Remove duplicate genes¶

combined_expr_df = pd.concat([
    expr_df.groupby(expr_df.index).mean(), 
    ctrl_expr_df.groupby(ctrl_expr_df.index).mean()
], axis=1)
combined_expr_df.head()
CPC004_A375_6H_X1_B3_DUO52HI53LO:E13 CPC004_A375_6H_X2_B3_DUO52HI53LO:E13 CPC004_A375_6H_X3_B3_DUO52HI53LO:E13 CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13 CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13 CPC004_HA1E_24H_X3_B3_DUO52HI53LO:E13 CPC004_HA1E_6H_X1_B3_DUO52HI53LO:E13 CPC004_HA1E_6H_X2_B3_DUO52HI53LO:E13 CPC004_HA1E_6H_X3_B3_DUO52HI53LO:E13 CPC004_HCC515_24H_X1_B3_DUO52HI53LO:E13 ... CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L12 CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L12 CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L12 CPC009_PC3_24H_X4_B5_DUO52HI53LO:L12 CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L12 CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L13 CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L13 CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L13 CPC009_PC3_24H_X4_B5_DUO52HI53LO:L13 CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L13
A1CF 3.706150 5.19435 4.10195 4.64380 4.270800 3.9665 4.4174 4.5412 4.5436 4.50700 ... 3.42250 4.26580 3.922100 4.60500 4.60140 3.350000 4.33555 3.6938 4.4059 4.400100
A2M 7.060125 4.31380 7.43550 5.55190 7.082000 6.1772 5.7649 8.5913 6.3916 8.73610 ... 8.69920 11.61970 10.921300 8.47680 10.67320 9.651800 8.33980 8.0536 10.0006 9.749900
A4GALT 6.245200 6.80720 6.30480 7.30625 6.827850 7.1949 6.9762 7.0023 6.8364 6.90265 ... 6.71325 6.60190 7.314400 6.81515 6.59470 6.657900 6.45930 6.8790 6.4290 6.985100
A4GNT 5.558300 5.77020 5.68940 5.67050 5.731700 5.8950 6.0036 6.2611 6.3069 5.24015 ... 5.60620 5.78565 5.702550 6.01230 5.86385 5.448800 6.15945 6.0248 6.0777 5.834650
AAAS 8.256000 6.88010 8.09350 8.25630 8.228301 8.0326 7.9515 7.6869 7.2894 8.11190 ... 7.64870 7.58680 7.712349 7.53270 7.52370 7.898301 8.02430 7.7470 7.8282 7.616199

5 rows × 15524 columns

Get batches¶

batches = all_batches.intersection(ctrl_meta_df['batch'].unique())
for b in batches:
  print(b)
CPC006_A549_24H
CPC006_PC3_24H
CPC009_MCF7_24H
CPC006_HCC515_24H
CPC006_HT29_24H
CPC006_A375_24H
CPC009_A549_24H
CPC009_PC3_24H
CPC006_MCF7_24H
CPC006_HA1E_24H

Compute Signatures: Batch Perturbations vs. Batch Controls¶

batch_profiles = {x: {'perts': [], 'ctrls': []} for x in batches}
for b in batches: 
    batch_profiles[b]['perts'] = meta_df[meta_df['batch'] == b].index.tolist()
    batch_profiles[b]['ctrls'] = ctrl_meta_df[ctrl_meta_df['batch'] == b].index.tolist()
batch_signatures = {
    'cd': {}, 
    'limma': {}, 
    'limma-voom': {},
    'fc': {},
    'ranksum': {},
    'ttest': {},
    'modz': {},
}

MODZ¶

for b in batches: 
  batch_signatures['modz'][b] = pd.concat([
    modz_data_df[b].rename('MODZ'), 
    modz_data_df[b].apply(abs).rename('Significance')
  ], axis=1)

Characteristic Direction¶

# Function for computing signatures with characteristic direction
def cd_signature(ctrl_ids, case_ids, dataset):
  
    signature = characteristic_direction(
        dataset.loc[:, ctrl_ids], 
        dataset.loc[:, case_ids], 
        calculate_sig=True
    )
    signature['Significance'] = signature['CD-coefficient'].apply(abs)
    
    return signature.sort_values(by=['CD-coefficient'], ascending=False)

Note: the following step may take a few minutes to run.

for b in batches: 
    batch_signatures['cd'][b] = cd_signature(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        combined_expr_df
    )

Limma¶

# Function for computing signatures
def limma(ctrl_ids, case_ids, dataset, voom):
    
    signature = limma_voom_differential_expression(
        dataset.loc[:, ctrl_ids],
        dataset.loc[:, case_ids],
        voom_design=voom,
        filter_genes=False
    )
    signature['Significance'] = signature['P.Value']

    return signature.sort_values("t", ascending=False)

Note: the following step may take a few minutes to run.

for b in batches: 
    batch_signatures['limma'][b] = limma(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df,
        voom=False
    )
R[write to console]: Loading required package: R.oo

R[write to console]: Loading required package: R.methodsS3

R[write to console]: R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.

R[write to console]: R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.

R[write to console]: 
Attaching package: ‘R.oo’


R[write to console]: The following object is masked from ‘package:R.methodsS3’:

    throw


R[write to console]: The following objects are masked from ‘package:methods’:

    getClasses, getMethods


R[write to console]: The following objects are masked from ‘package:base’:

    attach, detach, load, save


R[write to console]: R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.

R[write to console]: 
Attaching package: ‘R.utils’


R[write to console]: The following object is masked from ‘package:utils’:

    timestamp


R[write to console]: The following objects are masked from ‘package:base’:

    cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
    warnings


R[write to console]: 
Attaching package: ‘RCurl’


R[write to console]: The following object is masked from ‘package:R.utils’:

    reset


R[write to console]: The following object is masked from ‘package:R.oo’:

    clone


R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


R[write to console]: 
Attaching package: ‘S4Vectors’


R[write to console]: The following objects are masked from ‘package:base’:

    I, expand.grid, unname


R[write to console]: Loading required package: IRanges

R[write to console]: 
Attaching package: ‘IRanges’


R[write to console]: The following object is masked from ‘package:R.oo’:

    trim


R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: GenomeInfoDb

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: MatrixGenerics

R[write to console]: Loading required package: matrixStats

R[write to console]: 
Attaching package: ‘MatrixGenerics’


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars


R[write to console]: Loading required package: Biobase

R[write to console]: Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


R[write to console]: 
Attaching package: ‘Biobase’


R[write to console]: The following object is masked from ‘package:MatrixGenerics’:

    rowMedians


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians


R[write to console]: 
Attaching package: ‘limma’


R[write to console]: The following object is masked from ‘package:DESeq2’:

    plotMA


R[write to console]: The following object is masked from ‘package:BiocGenerics’:

    plotMA


for b in batches: 
    batch_signatures['limma-voom'][b] = limma(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df,
        voom=True
    )

Wilcoxon Rank-Sum Test¶

def ranksum(ctrl_ids, case_ids, dataset):
  if len(ctrl_ids) + len(case_ids) < 32: 
    print("Warning! Sample sizes < 16 generally do not provide good results. ")
  res_array = []
  for gene in dataset.index: 
    res = ranksums(
      dataset.loc[gene, case_ids],
      dataset.loc[gene, ctrl_ids]
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  return signature.sort_values(by=['Statistic'], ascending=False)
for b in batches: 
    batch_signatures['ranksum'][b] = ranksum(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df
    )

Welch's t-test¶

def ttest(ctrl_ids, case_ids, dataset):
  res_array = []
  for gene in dataset.index: 
    res = ttest_ind(
      dataset.loc[gene, case_ids],
      dataset.loc[gene, ctrl_ids],
      equal_var = False
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  return signature.sort_values(by=['Statistic'], ascending=False)
for b in batches: 
    batch_signatures['ttest'][b] = ttest(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df
    )

(log2) Fold Change¶

# Function for computing signatures with fold change
def logFC(ctrl_ids, case_ids, dataset):

    case_mean = dataset.loc[:, case_ids].mean(axis=1)
    ctrl_mean = dataset.loc[:, ctrl_ids].mean(axis=1)

    signature = case_mean / (ctrl_mean + 0.001)

    signature_df = pd.DataFrame(
        signature.apply(lambda x: log2(x+0.001)), columns=['logFC']
    )
    signature_df['Significance'] = signature_df['logFC'].apply(abs)
    
    return signature_df.sort_values('logFC', ascending=False)
for b in batches: 
    batch_signatures['fc'][b] = logFC(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        combined_expr_df
    )

All Signatures¶

all_signatures = {}

all_signatures['cd_all']= pd.concat([
    df['CD-coefficient'].rename(b) for (b, df) in batch_signatures['cd'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['limma_all'] = pd.concat([
    df['t'].rename(b) for (b, df) in batch_signatures['limma'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['limma-voom_all'] = pd.concat([
    df['t'].rename(b) for (b, df) in batch_signatures['limma-voom'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['ranksum_all'] = pd.concat([
    df['Statistic'].rename(b) for (b, df) in batch_signatures['ranksum'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['ttest_all'] = pd.concat([
    df['Statistic'].rename(b) for (b, df) in batch_signatures['ttest'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['modz_all'] = pd.concat([
    df['MODZ'].rename(b) for (b, df) in batch_signatures['modz'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['fc_all'] = pd.concat([
    df['logFC'].rename(b) for (b, df) in batch_signatures['fc'].items()
], axis=1).sort_index().rename_axis('Gene')

for k in all_signatures.keys(): 
    method = k.split('_')[0].upper()
    display(Markdown(f"All {method} batch signatures"))
    display(all_signatures[k])

All CD batch signatures

CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
Gene
A1CF 0.000952 0.001854 0.008695 -0.001396 -0.007102 0.013102 0.005242 0.018031 -0.001283 -0.012737
A2M 0.021682 -0.026343 0.022046 0.002863 -0.009718 0.003915 -0.006546 -0.018333 -0.000499 -0.006217
A4GALT 0.006834 0.004920 -0.006121 0.003770 0.000802 0.002384 0.000730 0.003674 0.004756 0.001069
A4GNT -0.001011 0.003500 -0.004465 0.002190 -0.004268 -0.002679 0.000148 0.006255 0.002543 -0.001182
AAAS -0.003752 -0.003620 -0.003427 0.001099 -0.003757 -0.005967 -0.002238 0.002202 0.008218 -0.003162
... ... ... ... ... ... ... ... ... ... ...
ZXDB 0.003825 -0.000073 0.000986 0.000702 -0.004406 0.002464 -0.001844 -0.002293 -0.000160 -0.004288
ZXDC 0.005625 0.004271 0.003541 0.002027 0.008632 0.001144 0.000540 0.000558 0.009406 -0.002195
ZYX -0.002841 -0.014867 -0.003315 0.001899 -0.006242 -0.013534 -0.005301 -0.018816 0.009712 -0.002055
ZZEF1 -0.005954 -0.007576 0.011017 -0.005430 0.006954 0.004589 0.002710 -0.003639 0.004555 -0.006276
ZZZ3 -0.007250 -0.001067 -0.007591 -0.002336 0.010721 -0.004806 -0.009311 0.003312 -0.002183 -0.014607

12327 rows × 10 columns

All LIMMA batch signatures

CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
Gene
A1CF -0.294205 -0.823307 0.224118 -1.345300 -1.803227 -0.457685 0.488978 -0.066385 -0.496867 -1.385689
A2M 1.287795 -0.226417 0.060950 0.611259 -1.071238 0.199035 -0.957864 -0.837714 -1.592196 -0.675840
A4GALT 0.592904 0.990301 0.837572 1.536314 -0.671224 -0.084085 1.052068 0.306574 1.264702 1.458460
A4GNT -0.143384 -0.811679 -0.182202 -0.432298 -1.127489 -0.521193 -0.875634 -0.312889 -0.982429 -0.717047
AAAS 0.086139 1.050528 0.469454 0.931823 -0.123393 1.261981 -0.698037 1.166193 1.790793 0.821752
... ... ... ... ... ... ... ... ... ... ...
ZXDB 1.740872 -0.855682 1.459864 1.128436 -1.638648 -1.103166 0.583538 -1.900811 -1.288176 -0.708441
ZXDC 0.207162 -0.498437 0.114471 -0.111600 0.964735 0.191507 -0.366940 -0.705952 -0.582374 -0.951470
ZYX 0.515698 -0.059634 0.061114 0.036428 0.500599 0.348942 0.680457 0.156431 0.123771 0.932210
ZZEF1 -0.355968 -0.811313 1.224806 -1.059856 -0.369694 -0.557480 -0.009333 0.326231 -0.257675 -1.265336
ZZZ3 -0.409015 -0.177657 -2.337986 0.017358 0.614833 0.448167 -0.225108 1.262405 0.031205 -0.344133

12327 rows × 10 columns

All LIMMA-VOOM batch signatures

CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
Gene
A1CF -0.299872 -0.723968 0.233817 -1.151080 -1.426047 -0.432861 0.530413 -0.065431 -0.461192 -1.057338
A2M 1.195174 -0.221229 0.060802 0.612243 -1.045904 0.198667 -0.985443 -0.857667 -1.256354 -0.640593
A4GALT 0.586769 0.992215 0.835355 1.541795 -0.671289 -0.084093 1.041888 0.306185 1.263357 1.457404
A4GNT -0.146902 -0.778560 -0.182461 -0.420382 -1.089057 -0.518001 -0.803284 -0.310522 -0.957077 -0.693777
AAAS 0.086007 1.044281 0.467396 0.924893 -0.123432 1.250010 -0.701899 1.160633 1.762998 0.816807
... ... ... ... ... ... ... ... ... ... ...
ZXDB 1.794068 -0.842987 1.473848 1.158135 -1.582549 -1.086361 0.588070 -1.859902 -1.272800 -0.697761
ZXDC 0.209948 -0.491523 0.114394 -0.111469 0.963724 0.191458 -0.350736 -0.688386 -0.581834 -0.899001
ZYX 0.511624 -0.059739 0.060987 0.036396 0.494438 0.346584 0.666952 0.155801 0.123418 0.894256
ZZEF1 -0.358077 -0.809179 1.215608 -1.062095 -0.369780 -0.559051 -0.009334 0.325266 -0.257917 -1.270832
ZZZ3 -0.412010 -0.178105 -2.364038 0.017409 0.612756 0.446982 -0.225531 1.246159 0.031202 -0.344768

12327 rows × 10 columns

All RANKSUM batch signatures

CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
Gene
A1CF -0.869339 -0.653255 0.782913 -1.569126 -2.412502 -0.417907 0.782361 0.777130 0.269830 -2.063696
A2M 1.217205 -0.420350 0.079282 1.065474 -1.368122 0.120335 -1.262522 -1.052777 -2.055985 -0.538814
A4GALT 1.838812 1.132939 1.248697 1.542712 -0.807251 -0.227299 1.383402 0.227525 1.710192 1.742911
A4GNT -0.660620 -0.838930 0.052855 0.719418 -1.996264 0.006530 -0.970396 0.126637 -1.091930 -0.575701
AAAS 0.020807 0.966747 0.670597 0.436577 -0.350229 1.663231 -1.302816 1.034626 2.517042 0.741692
... ... ... ... ... ... ... ... ... ... ...
ZXDB 1.862220 -1.568344 1.704571 1.396988 -1.905869 -1.035439 0.849516 -2.128772 -1.379944 -1.143497
ZXDC 0.598199 -0.123685 0.361726 0.368019 1.300430 0.761498 -0.530528 -0.706636 -0.254285 -1.029542
ZYX 0.518223 -0.814134 0.044596 -0.387013 0.241755 -0.109141 1.020763 -0.607014 -0.164244 0.947864
ZZEF1 -0.531877 -0.807345 1.519579 -1.007007 -0.066010 -0.208021 -0.134311 0.497262 0.232875 -1.667160
ZZZ3 -1.148282 -0.231429 -1.641806 -0.232980 0.516304 0.350121 -0.282053 1.427624 -0.231702 -0.563844

12327 rows × 10 columns

All TTEST batch signatures

CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
Gene
A1CF -0.320777 -3.318610 0.283788 -2.466058 -3.953539 -0.815173 0.600913 -0.305456 -2.726108 -4.637847
A2M 1.029799 -0.350408 0.055592 0.621131 -1.315067 0.178960 -9.727957 -1.619311 -2.894856 -0.607508
A4GALT 0.519774 1.478609 2.107487 2.554163 -1.015374 -0.187363 2.934027 0.825312 1.883732 2.333186
A4GNT -0.246314 -5.220164 -0.197389 -3.498468 -3.900530 -1.452731 -1.084934 -1.255108 -4.730529 -6.601442
AAAS 0.051627 2.359116 0.993238 1.795143 -0.165691 3.840168 -1.004628 1.396759 4.285530 1.741888
... ... ... ... ... ... ... ... ... ... ...
ZXDB 1.817554 -1.892693 1.966396 1.818086 -3.129890 -1.477842 0.748831 -3.603297 -1.666687 -0.594905
ZXDC 0.166390 -1.068104 0.161120 -0.533967 1.703616 0.246233 -2.464852 -1.725556 -1.192863 -1.542090
ZYX 0.591836 -0.279141 0.127387 -0.150160 0.773410 0.498327 13.014875 0.594875 0.128454 2.174616
ZZEF1 -1.006280 -2.492458 1.719959 -2.763837 -0.754950 -2.341365 -0.353694 0.462720 -0.232539 -3.994231
ZZZ3 -1.103177 -0.277392 -1.241546 -0.041727 1.173309 0.500864 -0.571492 1.782880 0.015091 -0.563517

12327 rows × 10 columns

All MODZ batch signatures

CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
Gene
A1CF 0.44 0.02 1.00 -0.51 -1.47 -0.14 0.34 0.01 0.21 -0.96
A2M 1.64 0.39 0.18 0.85 -0.08 -0.19 -0.72 -0.47 -0.39 -0.41
A4GALT 1.50 0.38 1.32 -0.01 -0.84 -0.13 0.59 -0.31 0.66 0.19
A4GNT 0.63 0.04 0.75 0.38 -1.15 -0.61 -0.71 -0.31 -0.24 -1.03
AAAS 0.12 -0.15 0.75 -0.30 -0.62 0.25 -1.42 0.20 0.38 -0.85
... ... ... ... ... ... ... ... ... ... ...
ZXDB 1.07 -0.42 1.75 0.44 -0.58 -0.64 0.10 -1.07 -0.30 -0.95
ZXDC 0.97 -0.12 0.83 -0.06 0.12 0.10 -0.39 -0.58 0.16 -0.87
ZYX -0.23 -0.39 0.06 -0.07 0.03 -0.58 0.65 -0.28 -0.10 0.19
ZZEF1 0.02 -0.05 1.33 -0.31 -0.37 0.23 -0.16 -0.01 -0.27 -0.50
ZZZ3 0.11 0.08 -1.91 -0.16 -0.35 0.02 -0.42 0.76 -0.21 -0.16

12327 rows × 10 columns

All FC batch signatures

CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
Gene
A1CF -0.039201 -0.158875 0.025282 -0.134439 -0.184498 -0.057449 0.054025 -0.033161 -0.090305 -0.256182
A2M 0.202267 -0.024419 0.020151 0.052870 -0.117227 0.017373 -0.225873 -0.114115 -0.194216 -0.086272
A4GALT 0.031620 0.019167 0.049822 0.046336 -0.013258 -0.001729 0.064774 0.013017 0.028068 0.042684
A4GNT -0.011731 -0.071222 -0.009831 -0.052952 -0.076852 -0.028826 -0.079883 -0.024717 -0.058964 -0.074368
AAAS 0.004224 0.023722 0.029721 0.032498 -0.003322 0.043787 -0.046597 0.040568 0.051654 0.038041
... ... ... ... ... ... ... ... ... ... ...
ZXDB 0.045476 -0.016062 0.065967 0.024613 -0.031598 -0.019754 0.023491 -0.042219 -0.017098 -0.014245
ZXDC 0.009077 -0.019981 0.013045 -0.009909 0.028926 0.003964 -0.043277 -0.036227 -0.028414 -0.053541
ZYX 0.044630 -0.006935 0.007790 -0.006308 0.027131 0.014156 0.065377 0.007357 0.003019 0.062993
ZZEF1 -0.027045 -0.026066 0.077580 -0.059606 -0.017674 -0.032865 -0.005162 0.021032 -0.004313 -0.082674
ZZZ3 -0.030140 -0.004346 -0.178028 -0.000276 0.018240 0.014170 -0.033473 0.068013 0.001494 -0.013333

12327 rows × 10 columns

Enrichment Analysis Rankings¶

# Function to get Enrichr Results

def getEnrichrLibrary(library_name): 
    ENRICHR_URL = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={library_name}'
    resp = requests.get(ENRICHR_URL)
    if not resp.ok: 
        raise Exception(f"Error downloading {library_name} library from Enrichr, please try again.")
    return resp.json()[library_name]['terms']

def getLibraryIter(libdict):
    for k,v in libdict.items():
        if type(v) == list:
            yield k, v
        else:
            yield k, list(v.keys())

def enrich(gene_list, lib_json, name): 
    all_terms = list(lib_json.keys())
    termranks = []
    enrich_res = enrich_crisp(gene_list, getLibraryIter(lib_json), 20000, False)
    enrich_res = [[r[0], r[1].pvalue] for r in enrich_res]
    sorted_res = sorted(enrich_res, key=lambda x: x[1])
    for i in range(len(sorted_res)): 
        termranks.append([name, sorted_res[i][0], i])
    for t in set(all_terms).difference([x[1] for x in termranks]): 
        i+=1
        termranks.append([name, t, i])
    return pd.DataFrame(termranks, columns=['Gene_Set', 'Term', 'Rank'])
chea2022 = getEnrichrLibrary('ChEA_2022')
# Get gene lists to put into Enrichr
gene_lists = {}
for m in all_signatures.keys():
    mname = m.split('_')[0]
    gene_lists[mname] = {'up': {}, 'down': {}, 'combined': {}}
    for col in all_signatures[m].columns: 
        gene_lists[mname]['up'][col] = all_signatures[m][col].sort_values(ascending=False).index.tolist()[:100]
        gene_lists[mname]['down'][col] = all_signatures[m][col].sort_values(ascending=True).index.tolist()[:100]
        gene_lists[mname]['combined'][col] = gene_lists[mname]['up'][col] + gene_lists[mname]['down'][col]
# Get results
chea2022_results = []

for m in gene_lists.keys(): 
    for sig in gene_lists[m]['up'].keys(): 
        chea2022_results.append(enrich(gene_lists[m]['up'][sig], chea2022, f"{sig}:{m}:up:ChEA 2022"))
        chea2022_results.append(enrich(gene_lists[m]['down'][sig], chea2022, f"{sig}:{m}:down:ChEA 2022"))
        chea2022_results.append(enrich(gene_lists[m]['combined'][sig], chea2022, f"{sig}:{m}:combined:ChEA 2022"))

chea2022_df = pd.concat(chea2022_results, axis=0)
targets = [
  'NR3C1', 
  'NR1I2',
  'NR0B1'
]

dex_chea2022_df = chea2022_df[chea2022_df['Term'].apply(lambda term: any(t in term for t in targets))]
dex_chea2022_df['Library'] = 'ChEA 2022'
def createResultsDf(df):
  df['Method'] = df['Gene_Set'].apply(lambda x: x.split(':')[1])
  df['Direction'] = df['Gene_Set'].apply(lambda x: x.split(':')[2])
  df['Method_Direction'] = df.apply(lambda row: row.Method + ':' + row.Direction, axis=1)
  df['TF'] = df['Term'].apply(lambda x: x.split(' ')[0].split('_')[0])
  df['Cell'] = df['Gene_Set'].apply(lambda x: x.split(':')[0].split('_')[1])
  df['Batch'] = df['Gene_Set'].apply(lambda x: x.split(':')[0])
createResultsDf(dex_chea2022_df)
full_df = dex_chea2022_df

up_df = full_df[full_df['Direction'] == 'up']
down_df = full_df[full_df['Direction'] == 'down']
combined_df = full_df[full_df['Direction'] == 'combined']
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for up genes from each method."))
display(up_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for down genes from each method."))
display(down_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for combined up and down genes from each method."))
display(combined_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))

Mean ranks of Dexamethasone target TF terms for up genes from each method.

Rank
TF Library Method
NR0B1 ChEA 2022 cd 172.600000
ttest 182.900000
limma 195.500000
ranksum 210.200000
limma-voom 217.000000
modz 239.100000
fc 269.700000
NR1I2 ChEA 2022 fc 126.700000
cd 129.200000
modz 225.300000
ranksum 248.500000
limma-voom 260.400000
limma 299.300000
ttest 442.500000
NR3C1 ChEA 2022 fc 255.416667
cd 300.950000
limma-voom 306.200000
modz 320.450000
limma 351.150000
ranksum 356.833333
ttest 447.183333

Mean ranks of Dexamethasone target TF terms for down genes from each method.

Rank
TF Library Method
NR0B1 ChEA 2022 ranksum 135.100000
limma-voom 156.600000
cd 161.500000
limma 226.600000
modz 262.300000
ttest 316.100000
fc 354.700000
NR1I2 ChEA 2022 cd 238.200000
ttest 341.800000
modz 364.800000
ranksum 422.300000
limma 429.300000
fc 439.000000
limma-voom 485.900000
NR3C1 ChEA 2022 fc 255.983333
ttest 329.416667
limma 330.116667
cd 344.450000
ranksum 373.633333
limma-voom 424.350000
modz 435.383333

Mean ranks of Dexamethasone target TF terms for combined up and down genes from each method.

Rank
TF Library Method
NR0B1 ChEA 2022 limma-voom 116.200000
ranksum 123.600000
cd 129.000000
limma 171.300000
modz 196.800000
ttest 202.700000
fc 306.200000
NR1I2 ChEA 2022 cd 138.700000
fc 251.000000
modz 258.100000
ranksum 334.100000
limma-voom 355.900000
limma 362.900000
ttest 413.500000
NR3C1 ChEA 2022 fc 205.333333
limma 301.433333
cd 307.450000
limma-voom 336.583333
ranksum 337.883333
modz 353.650000
ttest 397.333333
# bootstrap random results
random_arr_chea2022 = []
for i in range(10):
    rand_100 = sample(combined_expr_df.index.tolist(), 100)
    rand_200 = sample(combined_expr_df.index.tolist(), 200)

    random_arr_chea2022.append(enrich(rand_100, chea2022, 'random:100'))
    random_arr_chea2022.append(enrich(rand_200, chea2022, 'random:200'))

rand_chea2022_df = pd.concat(random_arr_chea2022, axis=0)
rand_chea2022_df['Library'] = 'ChEA 2022'
rand_chea2022_df['TF'] = rand_chea2022_df['Term'].apply(lambda x: x.split(' ')[0])

rand_df = rand_chea2022_df
rand_df = rand_df[rand_df['TF'].isin(['NR3C1', 'NR0B1', 'NR1I2'])]
rand_df['Method'] = 'random'

Boxplots¶

# with ChEA manual libraries
method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'limma-voom': '#DA79FF',
    'fc': '#DC267F',
    'ttest': '#FE6100',
    'ranksum': '#FFB000',
    'modz': '#009E73'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
    order = sub_df.groupby('Method_Direction').mean(numeric_only=True).sort_values('Rank').index.map(lambda x: x.split(':')[0]).unique()
    sub_df['Method'] = pd.Categorical(sub_df['Method'], order)
    sub_df['Direction'] = pd.Categorical(sub_df['Direction'], ['up', 'combined', 'down'])
    sub_df = sub_df.sort_values(by=['Direction', 'Method'])

    fig1 = go.Figure()
    for method in sub_df['Method'].unique():
        m_df = sub_df[sub_df['Method'] == method]
        fig1.add_trace(
            go.Box(
                x=m_df[m_df['Method'] == method]['Direction'],
                y=m_df[m_df['Method'] == method]['Rank'],
                name=method.replace('fc', 'logfc'), 
                marker_color=method_color_map[method]
            )
        )
    sub_rand_df = rand_df[rand_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            x=sub_rand_df['Method'],
            y=sub_rand_df['Rank'],
            name='random',
            marker_color='black'
        )
    )
    fig1.update_layout(
        width=800,
        boxmode='group',
        boxgap=0.1,
        xaxis={
            'title': {'text': 'Direction'}, 
        },
        yaxis={
            'title': {'text': 'Gene Set Rank'}
        },
        legend_title_text='Method'
    )

    fig1.show("png")
    if tf == 'NR3C1': 
        fig1.write_image('/Users/maayanlab/Documents/manuscripts/dex-benchmark/revised_figures/3D_300dpi.png', scale=(800/300))
method_color_map = {
    'up': 'blue', 
    'down': 'red', 
    'combined': 'purple'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
    x_order = sub_df.groupby(['Method_Direction']).mean(numeric_only=True).sort_values('Rank').reset_index().rename_axis('Order').reset_index().set_index('Method_Direction')
    x_order['Labels'] = x_order.index.map(lambda x: x.split(':')[0]).tolist()
    fig1 = go.Figure()
    for d in ['up', 'combined', 'down']:
        d_df = sub_df[sub_df['Direction'] == d]
        fig1.add_trace(
            go.Box(
                x=[x_order.loc[x, 'Order'] for x in d_df['Method_Direction']],
                y=d_df['Rank'],
                name=d, 
                marker_color=method_color_map[d]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    sub_rand_df['Order'] = x_order.shape[0] + 1
    fig1.add_trace(
        go.Box(
            x=sub_rand_df['Order'],
            y=sub_rand_df['Rank'],
            name='random',
            marker_color='black'
        )
    )
    fig1.update_layout(
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Gene Set Rank'}
        },
    )
    fig1.update_xaxes(
        tickmode='array',
        tickangle=45,
        tickvals = x_order['Order'].tolist() + [x_order.shape[0] + 1],
        ticktext=x_order['Labels'].tolist() + ['Random']
    )
    fig1.show("png")
method_color_map = {
    'up': 'blue', 
    'down': 'red', 
    'combined': 'purple'
}
for tf in full_df['TF'].unique():
    fig1 = go.Figure()
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
    order = sub_df.groupby(['Method_Direction']).mean(numeric_only=True).sort_values('Rank').index.map(lambda x: x.split(':')[0]).unique()
    sub_df['Method'] = pd.Categorical(sub_df['Method'], order)
    sub_df['Method'] = sub_df['Method'].apply(lambda x: x.replace('fc', 'logfc'))
    sub_df = sub_df.sort_values(by=['Method'])
    for d in ['up', 'combined', 'down']:
        d_df = sub_df[sub_df['Direction'] == d]
        fig1.add_trace(
            go.Box(
                x=d_df['Method'],
                y=d_df['Rank'],
                name=d, 
                marker_color=method_color_map[d]
            )
        )
    sub_rand_df = rand_df[rand_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            x=sub_rand_df['Method'],
            y=sub_rand_df['Rank'],
            name='random',
            marker_color='black'
        )
    )
    fig1.update_layout(
        width=800,
        boxmode='group',
        boxgap=0.1,
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Gene Set Rank'}
        },
        legend_title_text="Direction"
    )
    fig1.show("png")
    if tf=='NR3C1': 
        fig1.write_image('/Users/maayanlab/Documents/manuscripts/dex-benchmark/revised_figures/3C_300dpi.png', scale=(800/300))
method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'limma-voom': '#DA79FF',
    'fc': '#DC267F',
    'ttest': '#FE6100',
    'ranksum': '#FFB000',
    'modz': '#009E73'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Method']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Method']==gs]['Rank'].tolist(),
                name=gs.replace('fc', 'logfc'), 
                marker_color=method_color_map[gs]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        title_text=f"{tf} ChEA 2022 Term Rankings for Gene Sets from A375 Cell Line",
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")
# with ChEA manual libraries
method_color_map = {
    'up': 'blue', 
    'down': 'red', 
    'combined': 'purple'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Direction']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Direction']==gs]['Rank'].tolist(),
                name=gs,
                marker_color=method_color_map[gs]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        title_text=f"{tf} ChEA 2022 Term Rankings for Gene Sets from A375 Cell Line",
        xaxis={
            'title': {'text': 'Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")

Bridge plots¶

chea2022_genesets = {
    t: chea2022[t] for t in dex_chea2022_df['Term'].unique() if ('MLEC' not in t and 'Wistar' not in t)
}
def geneSetBridgePlot(gs, method):
    y = [] 
    for col in all_signatures[f"{method}_all"].columns:
        if method in ['fc', 'cd', 'modz']:
            sig = batch_signatures[method][col]['Significance'].sort_values(ascending=False)
        else:
            sig = batch_signatures[method][col]['Significance'].sort_values(ascending=True)
        select = pd.Series(
            [x.upper() in gs for x in sig.index.tolist()]
        )
        _, y_temp = bridge_plot(select)
        y.append(y_temp)
    y_mean = np.mean(np.array(y), axis=0)

    return y_mean

def methodBridgePlot(gsdict, tf, method):
    y = []
    for gs in gsdict.keys(): 
        if tf not in gs:
            continue
        y_temp = geneSetBridgePlot(gsdict[gs], method)
        y.append(y_temp)

    if len(y) == 0:
        return y
    else:
        y_mean = np.mean(np.array(y), axis=0)
        return y_mean

def randomBridgePlot(gsdict, tf): 
    y = []
    for gs in gsdict.keys(): 
        if tf not in gs: 
            continue
        y_gs = []
        for _ in range(10):
            rand_sig = sample(
                combined_expr_df.index.tolist(), 
                combined_expr_df.shape[0]
            )
            select = pd.Series([
                x.upper() in gsdict[gs] for x in rand_sig
            ])
            _, y_temp = bridge_plot(select)
            y_gs.append(y_temp)
        y.append(np.mean(np.array(y_gs), axis=0))
    if len(y) == 0:
        return y
    else:
        y_mean = np.mean(np.array(y), axis=0)
        return y_mean

def tfBridgePlot(tf, lib_genesets, libnames, leading_edge = False): 
    fig = plt.figure()
    if leading_edge:
        fig = plt.figure(figsize=(5,5))
    else:
        fig = plt.figure(figsize=(8,5))
    for method in batch_signatures.keys(): 
        x = np.arange(all_signatures[f"{method}_all"].shape[0] + 1)
        y = []
        for lg in lib_genesets:
            y1 = methodBridgePlot(lg, tf, method)
            if len(y1) != 0:
                y.append(y1)
        y_mean = np.mean(np.array(y), axis=0)
        if leading_edge:
            x_mean = x/len(x)
            x_mean = x_mean[x_mean < 0.02]
            plt.plot(x_mean, y_mean[:len(x_mean)], label=method.split('_')[0])
        else:
            plt.plot(x/len(x), y_mean, label=method.split('_')[0])
    rand_x = np.arange(all_signatures[f"{method}_all"].shape[0] + 1)
    rand_y = []
    for lg in lib_genesets:
        rand_y1 = randomBridgePlot(lg, tf)
        if len(rand_y1) != 0: 
            rand_y.append(rand_y1)
    rand_y_mean = np.mean(np.array(rand_y), axis=0)
    if leading_edge:
        rand_x_mean = rand_x/len(rand_x)
        rand_x_mean = rand_x_mean[rand_x_mean < 0.02]
        plt.plot(rand_x_mean, rand_y_mean[:len(rand_x_mean)], label='random', color='gray')
    else:
        plt.plot(rand_x/len(rand_x), rand_y_mean, label='random', color='gray')

    plt.axhline(y=0, color='black', linestyle='dashed')
    if leading_edge:
        plt.legend(bbox_to_anchor=(1,1))
    else:
        plt.legend(bbox_to_anchor=(1, 1))
    locs, labels = plt.xticks()  # Get the current locations and labels.
    if leading_edge:
        plt.xticks(np.arange(0, 0.02, step=0.005))
    plt.xlabel('Rank')
    plt.ylabel('D(r) - r')
    plt.title(f"{tf} Target Gene Rankings for L1000 Dex Perturbation Signatures\nLibraries: {', '.join(libnames)}")
    plt.show()

Ranking by absolute expression value

tfBridgePlot('NR3C1', [chea2022], ['ChEA 2022'])
<Figure size 432x288 with 0 Axes>
tfBridgePlot('NR3C1', [chea2022], ['ChEA 2022'], True)
<Figure size 432x288 with 0 Axes>

Signature similarity¶

rank_comparisons = {
  b: pd.concat([
    all_signatures[method][b].rename(method.split('_')[0]).rank() for method in all_signatures.keys()
  ], axis=1).dropna(axis=0).sort_index(axis=1)
for b in batches}

expr_comparisons = {
  b: pd.concat([
    all_signatures[method][b].rename(method.split('_')[0]) for method in all_signatures.keys()
  ], axis=1).dropna(axis=0).sort_index(axis=1)
for b in batches}

method_cols = rank_comparisons[list(batches)[0]].columns
sns.set(font_scale=1.2)

Correlation of signature expression values¶

expr_correlations = {
  b: 1 - pairwise_distances(m.T, metric='cosine') for b,m in expr_comparisons.items()
}
for _,m in expr_correlations.items():
  np.fill_diagonal(m, 1)

expr_mean_correlations = np.zeros(shape=expr_correlations[list(batches)[0]].shape)
for i in range(expr_mean_correlations.shape[0]):
  for j in range(expr_mean_correlations.shape[1]): 
    expr_mean_correlations[i,j] = np.mean([m[i,j] for _,m in expr_correlations.items()])

hm, hm_ax = plt.subplots(figsize=(10,8))
sns.heatmap(pd.DataFrame(
  expr_mean_correlations, index=method_cols, columns=method_cols
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()

Correlation of signature gene ranks¶

rank_correlations = {
  b: 1 - pairwise_distances(m.T, metric='cosine') for b,m in rank_comparisons.items()
}
for _,m in rank_correlations.items():
  np.fill_diagonal(m, 1)

rank_mean_correlations = np.zeros(shape=rank_correlations[list(batches)[0]].shape)
for i in range(rank_mean_correlations.shape[0]):
  for j in range(rank_mean_correlations.shape[1]): 
    rank_mean_correlations[i,j] = np.mean([m[i,j] for _,m in rank_correlations.items()])

hm, hm_ax = plt.subplots(figsize=(10,8))
sns.heatmap(pd.DataFrame(
  rank_mean_correlations, index=method_cols, columns=method_cols
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()
View source code
Submit an issue