# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import requests
import scipy.stats as ss
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.enrichment import enrich_crisp
from math import log2
from IPython.display import display, Markdown
from random import sample
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
from os.path import exists
from scipy.stats import ttest_ind, ranksums
import h5py

File at '../L1000_data/Dexamethasone_L1000_ChemPert_fulldata.tsv' already exists!

File at '../L1000_data/Dexamethasone_L1000_ChemPert_metadata.tsv' already exists!

240

CPC006_A549_24H
CPC006_PC3_24H
CPC009_MCF7_24H
CPC006_HCC515_24H
CPC006_HT29_24H
CPC006_A375_24H
CPC009_A549_24H
CPC009_PC3_24H
CPC006_MCF7_24H
CPC006_HA1E_24H

R[write to console]: Loading required package: R.oo

R[write to console]: Loading required package: R.methodsS3

R[write to console]: R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.

R[write to console]: R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.

R[write to console]: 
Attaching package: ‘R.oo’


R[write to console]: The following object is masked from ‘package:R.methodsS3’:

    throw


R[write to console]: The following objects are masked from ‘package:methods’:

    getClasses, getMethods


R[write to console]: The following objects are masked from ‘package:base’:

    attach, detach, load, save


R[write to console]: R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.

R[write to console]: 
Attaching package: ‘R.utils’


R[write to console]: The following object is masked from ‘package:utils’:

    timestamp


R[write to console]: The following objects are masked from ‘package:base’:

    cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
    warnings


R[write to console]: 
Attaching package: ‘RCurl’


R[write to console]: The following object is masked from ‘package:R.utils’:

    reset


R[write to console]: The following object is masked from ‘package:R.oo’:

    clone


R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


R[write to console]: 
Attaching package: ‘S4Vectors’


R[write to console]: The following objects are masked from ‘package:base’:

    I, expand.grid, unname


R[write to console]: Loading required package: IRanges

R[write to console]: 
Attaching package: ‘IRanges’


R[write to console]: The following object is masked from ‘package:R.oo’:

    trim


R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: GenomeInfoDb

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: MatrixGenerics

R[write to console]: Loading required package: matrixStats

R[write to console]: 
Attaching package: ‘MatrixGenerics’


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars


R[write to console]: Loading required package: Biobase

R[write to console]: Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


R[write to console]: 
Attaching package: ‘Biobase’


R[write to console]: The following object is masked from ‘package:MatrixGenerics’:

    rowMedians


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians


R[write to console]: 
Attaching package: ‘limma’


R[write to console]: The following object is masked from ‘package:DESeq2’:

    plotMA


R[write to console]: The following object is masked from ‘package:BiocGenerics’:

    plotMA

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>


# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import requests
import scipy.stats as ss
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.enrichment import enrich_crisp
from math import log2
from IPython.display import display, Markdown
from random import sample
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
from os.path import exists
from scipy.stats import ttest_ind, ranksums
import h5py


import warnings
warnings.filterwarnings('ignore')


# Set perturbagen
pert = 'dexamethasone'

# Set working directory
l1000_data_dir = '../L1000_data'


try: 
    expr_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv", sep='\t', index_col=0)
except: 
    l1000_data_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_data.tsv", sep='\t')

    l1000_data_list = []
    l1000_meta_list = []
    for row in l1000_data_df.itertuples(): 
        try:
            temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
        except:
            print(f"Unable to access data from row {row.Index} at {row.persistent_id}")
            continue
        for col in temp_df.columns:
            l1000_meta_list.append([col] + l1000_data_df.loc[row.Index].tolist())
        l1000_data_list.append(temp_df)
    expr_df = pd.concat(l1000_data_list, axis=1)
expr_df.head()


if not exists(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv"): 
    expr_df.to_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv' already exists!")

File at '../L1000_data/Dexamethasone_L1000_ChemPert_fulldata.tsv' already exists!


try: 
    meta_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv", sep='\t', index_col=0)
except:
    meta_df = pd.DataFrame(l1000_meta_list, columns=['id'] + l1000_data_df.columns.tolist()).set_index('id')
if 'batch' not in meta_df.columns:
    meta_df['batch'] = meta_df.index.map(lambda x: '_'.join(x.split('_')[:3]))
meta_df.head()


if not exists(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv"): 
    meta_df.to_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv", sep='\t', index=True)
else: 
    print(f"File at '{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv' already exists!")

File at '../L1000_data/Dexamethasone_L1000_ChemPert_metadata.tsv' already exists!


all_batches = set(['_'.join(x.split('_')[:3]) for x in meta_df.index.tolist()])
len(all_batches)

240


modz_data_df = pd.read_csv(f'{l1000_data_dir}/dex_modz.gct', sep='\t', header=2).iloc[8:].set_index('pr_gene_symbol')
modz_data_df = modz_data_df.drop(columns=['id', 'pr_gene_title', 'pr_is_lm', 'pr_is_bing', 'self_correlation', 'frac_self_rank', 'p_value', 'gene_space'])
modz_data_df = modz_data_df.astype('float32')
modz_data_df.loc['MIA2'] = modz_data_df.loc[['MIA2', 'CTAGE5']].mean(axis=0)
modz_data_df = modz_data_df.drop(index=['CTAGE5'])
modz_data_df.columns = modz_data_df.columns.map(lambda x: x.split(':')[0])
modz_data_df = modz_data_df[[x for x in modz_data_df.columns if x.endswith('24H') ]]
modz_data_df


gctx = h5py.File(f"{l1000_data_dir}/Dexamethasone_L1000_Controls_fulldata.gctx", 'r')
ctrl_expr_df = pd.DataFrame(
    data=gctx['0/DATA/0/matrix'][()],
    index=gctx['0/META/COL/id'].asstr()[:],
    columns=gctx['0/META/ROW/id'].asstr()[:]
).T
gctx.close()


overlapping_dex = set(expr_df.columns.tolist()).intersection(ctrl_expr_df.columns.tolist())


ctrl_expr_df = ctrl_expr_df.drop(columns=list(overlapping_dex))
ctrl_expr_df.head()


# ctrl_expr_df.to_csv('../L1000_data/Dexamethasone_L1000_Control_Profiles.tsv.gz', sep='\t')


ctrl_meta_df = pd.DataFrame(
  data = [ctrl_expr_df.columns.tolist(),['_'.join(x.split('_')[:3]) for x in ctrl_expr_df.columns]],
  index = ['id', 'batch']
).T.set_index('id')
ctrl_meta_df.head()


combined_expr_df = pd.concat([
    expr_df.groupby(expr_df.index).mean(), 
    ctrl_expr_df.groupby(ctrl_expr_df.index).mean()
], axis=1)
combined_expr_df.head()


batches = all_batches.intersection(ctrl_meta_df['batch'].unique())
for b in batches:
  print(b)

CPC006_A549_24H
CPC006_PC3_24H
CPC009_MCF7_24H
CPC006_HCC515_24H
CPC006_HT29_24H
CPC006_A375_24H
CPC009_A549_24H
CPC009_PC3_24H
CPC006_MCF7_24H
CPC006_HA1E_24H


batch_profiles = {x: {'perts': [], 'ctrls': []} for x in batches}
for b in batches: 
    batch_profiles[b]['perts'] = meta_df[meta_df['batch'] == b].index.tolist()
    batch_profiles[b]['ctrls'] = ctrl_meta_df[ctrl_meta_df['batch'] == b].index.tolist()


batch_signatures = {
    'cd': {}, 
    'limma': {}, 
    'limma-voom': {},
    'fc': {},
    'ranksum': {},
    'ttest': {},
    'modz': {},
}


for b in batches: 
  batch_signatures['modz'][b] = pd.concat([
    modz_data_df[b].rename('MODZ'), 
    modz_data_df[b].apply(abs).rename('Significance')
  ], axis=1)


# Function for computing signatures with characteristic direction
def cd_signature(ctrl_ids, case_ids, dataset):
  
    signature = characteristic_direction(
        dataset.loc[:, ctrl_ids], 
        dataset.loc[:, case_ids], 
        calculate_sig=True
    )
    signature['Significance'] = signature['CD-coefficient'].apply(abs)
    
    return signature.sort_values(by=['CD-coefficient'], ascending=False)


for b in batches: 
    batch_signatures['cd'][b] = cd_signature(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        combined_expr_df
    )


# Function for computing signatures
def limma(ctrl_ids, case_ids, dataset, voom):
    
    signature = limma_voom_differential_expression(
        dataset.loc[:, ctrl_ids],
        dataset.loc[:, case_ids],
        voom_design=voom,
        filter_genes=False
    )
    signature['Significance'] = signature['P.Value']

    return signature.sort_values("t", ascending=False)


for b in batches: 
    batch_signatures['limma'][b] = limma(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df,
        voom=False
    )

R[write to console]: Loading required package: R.oo

R[write to console]: Loading required package: R.methodsS3

R[write to console]: R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.

R[write to console]: R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.

R[write to console]: 
Attaching package: ‘R.oo’


R[write to console]: The following object is masked from ‘package:R.methodsS3’:

    throw


R[write to console]: The following objects are masked from ‘package:methods’:

    getClasses, getMethods


R[write to console]: The following objects are masked from ‘package:base’:

    attach, detach, load, save


R[write to console]: R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.

R[write to console]: 
Attaching package: ‘R.utils’


R[write to console]: The following object is masked from ‘package:utils’:

    timestamp


R[write to console]: The following objects are masked from ‘package:base’:

    cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
    warnings


R[write to console]: 
Attaching package: ‘RCurl’


R[write to console]: The following object is masked from ‘package:R.utils’:

    reset


R[write to console]: The following object is masked from ‘package:R.oo’:

    clone


R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


R[write to console]: 
Attaching package: ‘S4Vectors’


R[write to console]: The following objects are masked from ‘package:base’:

    I, expand.grid, unname


R[write to console]: Loading required package: IRanges

R[write to console]: 
Attaching package: ‘IRanges’


R[write to console]: The following object is masked from ‘package:R.oo’:

    trim


R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: GenomeInfoDb

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: MatrixGenerics

R[write to console]: Loading required package: matrixStats

R[write to console]: 
Attaching package: ‘MatrixGenerics’


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars


R[write to console]: Loading required package: Biobase

R[write to console]: Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


R[write to console]: 
Attaching package: ‘Biobase’


R[write to console]: The following object is masked from ‘package:MatrixGenerics’:

    rowMedians


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians


R[write to console]: 
Attaching package: ‘limma’


R[write to console]: The following object is masked from ‘package:DESeq2’:

    plotMA


R[write to console]: The following object is masked from ‘package:BiocGenerics’:

    plotMA


for b in batches: 
    batch_signatures['limma-voom'][b] = limma(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df,
        voom=True
    )


def ranksum(ctrl_ids, case_ids, dataset):
  if len(ctrl_ids) + len(case_ids) < 32: 
    print("Warning! Sample sizes < 16 generally do not provide good results. ")
  res_array = []
  for gene in dataset.index: 
    res = ranksums(
      dataset.loc[gene, case_ids],
      dataset.loc[gene, ctrl_ids]
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  return signature.sort_values(by=['Statistic'], ascending=False)


for b in batches: 
    batch_signatures['ranksum'][b] = ranksum(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df
    )


def ttest(ctrl_ids, case_ids, dataset):
  res_array = []
  for gene in dataset.index: 
    res = ttest_ind(
      dataset.loc[gene, case_ids],
      dataset.loc[gene, ctrl_ids],
      equal_var = False
    )
    res_array.append([gene, res.statistic, res.pvalue])
  signature = pd.DataFrame(
    res_array, columns=['Geneid', 'Statistic', 'Pvalue']
  ).set_index('Geneid')
  signature['Significance'] = signature['Pvalue']
  return signature.sort_values(by=['Statistic'], ascending=False)


for b in batches: 
    batch_signatures['ttest'][b] = ttest(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'], 
        combined_expr_df
    )


# Function for computing signatures with fold change
def logFC(ctrl_ids, case_ids, dataset):

    case_mean = dataset.loc[:, case_ids].mean(axis=1)
    ctrl_mean = dataset.loc[:, ctrl_ids].mean(axis=1)

    signature = case_mean / (ctrl_mean + 0.001)

    signature_df = pd.DataFrame(
        signature.apply(lambda x: log2(x+0.001)), columns=['logFC']
    )
    signature_df['Significance'] = signature_df['logFC'].apply(abs)
    
    return signature_df.sort_values('logFC', ascending=False)


for b in batches: 
    batch_signatures['fc'][b] = logFC(
        batch_profiles[b]['ctrls'], 
        batch_profiles[b]['perts'],
        combined_expr_df
    )


all_signatures = {}

all_signatures['cd_all']= pd.concat([
    df['CD-coefficient'].rename(b) for (b, df) in batch_signatures['cd'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['limma_all'] = pd.concat([
    df['t'].rename(b) for (b, df) in batch_signatures['limma'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['limma-voom_all'] = pd.concat([
    df['t'].rename(b) for (b, df) in batch_signatures['limma-voom'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['ranksum_all'] = pd.concat([
    df['Statistic'].rename(b) for (b, df) in batch_signatures['ranksum'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['ttest_all'] = pd.concat([
    df['Statistic'].rename(b) for (b, df) in batch_signatures['ttest'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['modz_all'] = pd.concat([
    df['MODZ'].rename(b) for (b, df) in batch_signatures['modz'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['fc_all'] = pd.concat([
    df['logFC'].rename(b) for (b, df) in batch_signatures['fc'].items()
], axis=1).sort_index().rename_axis('Gene')

for k in all_signatures.keys(): 
    method = k.split('_')[0].upper()
    display(Markdown(f"All {method} batch signatures"))
    display(all_signatures[k])


# Function to get Enrichr Results

def getEnrichrLibrary(library_name): 
    ENRICHR_URL = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={library_name}'
    resp = requests.get(ENRICHR_URL)
    if not resp.ok: 
        raise Exception(f"Error downloading {library_name} library from Enrichr, please try again.")
    return resp.json()[library_name]['terms']

def getLibraryIter(libdict):
    for k,v in libdict.items():
        if type(v) == list:
            yield k, v
        else:
            yield k, list(v.keys())

def enrich(gene_list, lib_json, name): 
    all_terms = list(lib_json.keys())
    termranks = []
    enrich_res = enrich_crisp(gene_list, getLibraryIter(lib_json), 20000, False)
    enrich_res = [[r[0], r[1].pvalue] for r in enrich_res]
    sorted_res = sorted(enrich_res, key=lambda x: x[1])
    for i in range(len(sorted_res)): 
        termranks.append([name, sorted_res[i][0], i])
    for t in set(all_terms).difference([x[1] for x in termranks]): 
        i+=1
        termranks.append([name, t, i])
    return pd.DataFrame(termranks, columns=['Gene_Set', 'Term', 'Rank'])


chea2022 = getEnrichrLibrary('ChEA_2022')


# Get gene lists to put into Enrichr
gene_lists = {}
for m in all_signatures.keys():
    mname = m.split('_')[0]
    gene_lists[mname] = {'up': {}, 'down': {}, 'combined': {}}
    for col in all_signatures[m].columns: 
        gene_lists[mname]['up'][col] = all_signatures[m][col].sort_values(ascending=False).index.tolist()[:100]
        gene_lists[mname]['down'][col] = all_signatures[m][col].sort_values(ascending=True).index.tolist()[:100]
        gene_lists[mname]['combined'][col] = gene_lists[mname]['up'][col] + gene_lists[mname]['down'][col]


# Get results
chea2022_results = []

for m in gene_lists.keys(): 
    for sig in gene_lists[m]['up'].keys(): 
        chea2022_results.append(enrich(gene_lists[m]['up'][sig], chea2022, f"{sig}:{m}:up:ChEA 2022"))
        chea2022_results.append(enrich(gene_lists[m]['down'][sig], chea2022, f"{sig}:{m}:down:ChEA 2022"))
        chea2022_results.append(enrich(gene_lists[m]['combined'][sig], chea2022, f"{sig}:{m}:combined:ChEA 2022"))

chea2022_df = pd.concat(chea2022_results, axis=0)


targets = [
  'NR3C1', 
  'NR1I2',
  'NR0B1'
]

dex_chea2022_df = chea2022_df[chea2022_df['Term'].apply(lambda term: any(t in term for t in targets))]
dex_chea2022_df['Library'] = 'ChEA 2022'


def createResultsDf(df):
  df['Method'] = df['Gene_Set'].apply(lambda x: x.split(':')[1])
  df['Direction'] = df['Gene_Set'].apply(lambda x: x.split(':')[2])
  df['Method_Direction'] = df.apply(lambda row: row.Method + ':' + row.Direction, axis=1)
  df['TF'] = df['Term'].apply(lambda x: x.split(' ')[0].split('_')[0])
  df['Cell'] = df['Gene_Set'].apply(lambda x: x.split(':')[0].split('_')[1])
  df['Batch'] = df['Gene_Set'].apply(lambda x: x.split(':')[0])


createResultsDf(dex_chea2022_df)


full_df = dex_chea2022_df

up_df = full_df[full_df['Direction'] == 'up']
down_df = full_df[full_df['Direction'] == 'down']
combined_df = full_df[full_df['Direction'] == 'combined']


display(Markdown(f"Mean ranks of Dexamethasone target TF terms for up genes from each method."))
display(up_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for down genes from each method."))
display(down_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for combined up and down genes from each method."))
display(combined_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))


# bootstrap random results
random_arr_chea2022 = []
for i in range(10):
    rand_100 = sample(combined_expr_df.index.tolist(), 100)
    rand_200 = sample(combined_expr_df.index.tolist(), 200)

    random_arr_chea2022.append(enrich(rand_100, chea2022, 'random:100'))
    random_arr_chea2022.append(enrich(rand_200, chea2022, 'random:200'))

rand_chea2022_df = pd.concat(random_arr_chea2022, axis=0)
rand_chea2022_df['Library'] = 'ChEA 2022'
rand_chea2022_df['TF'] = rand_chea2022_df['Term'].apply(lambda x: x.split(' ')[0])

rand_df = rand_chea2022_df
rand_df = rand_df[rand_df['TF'].isin(['NR3C1', 'NR0B1', 'NR1I2'])]
rand_df['Method'] = 'random'


# with ChEA manual libraries
method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'limma-voom': '#DA79FF',
    'fc': '#DC267F',
    'ttest': '#FE6100',
    'ranksum': '#FFB000',
    'modz': '#009E73'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
    order = sub_df.groupby('Method_Direction').mean(numeric_only=True).sort_values('Rank').index.map(lambda x: x.split(':')[0]).unique()
    sub_df['Method'] = pd.Categorical(sub_df['Method'], order)
    sub_df['Direction'] = pd.Categorical(sub_df['Direction'], ['up', 'combined', 'down'])
    sub_df = sub_df.sort_values(by=['Direction', 'Method'])

    fig1 = go.Figure()
    for method in sub_df['Method'].unique():
        m_df = sub_df[sub_df['Method'] == method]
        fig1.add_trace(
            go.Box(
                x=m_df[m_df['Method'] == method]['Direction'],
                y=m_df[m_df['Method'] == method]['Rank'],
                name=method.replace('fc', 'logfc'), 
                marker_color=method_color_map[method]
            )
        )
    sub_rand_df = rand_df[rand_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            x=sub_rand_df['Method'],
            y=sub_rand_df['Rank'],
            name='random',
            marker_color='black'
        )
    )
    fig1.update_layout(
        width=800,
        boxmode='group',
        boxgap=0.1,
        xaxis={
            'title': {'text': 'Direction'}, 
        },
        yaxis={
            'title': {'text': 'Gene Set Rank'}
        },
        legend_title_text='Method'
    )

    fig1.show("png")
    if tf == 'NR3C1': 
        fig1.write_image('/Users/maayanlab/Documents/manuscripts/dex-benchmark/revised_figures/3D_300dpi.png', scale=(800/300))


method_color_map = {
    'up': 'blue', 
    'down': 'red', 
    'combined': 'purple'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
    x_order = sub_df.groupby(['Method_Direction']).mean(numeric_only=True).sort_values('Rank').reset_index().rename_axis('Order').reset_index().set_index('Method_Direction')
    x_order['Labels'] = x_order.index.map(lambda x: x.split(':')[0]).tolist()
    fig1 = go.Figure()
    for d in ['up', 'combined', 'down']:
        d_df = sub_df[sub_df['Direction'] == d]
        fig1.add_trace(
            go.Box(
                x=[x_order.loc[x, 'Order'] for x in d_df['Method_Direction']],
                y=d_df['Rank'],
                name=d, 
                marker_color=method_color_map[d]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    sub_rand_df['Order'] = x_order.shape[0] + 1
    fig1.add_trace(
        go.Box(
            x=sub_rand_df['Order'],
            y=sub_rand_df['Rank'],
            name='random',
            marker_color='black'
        )
    )
    fig1.update_layout(
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Gene Set Rank'}
        },
    )
    fig1.update_xaxes(
        tickmode='array',
        tickangle=45,
        tickvals = x_order['Order'].tolist() + [x_order.shape[0] + 1],
        ticktext=x_order['Labels'].tolist() + ['Random']
    )
    fig1.show("png")


method_color_map = {
    'up': 'blue', 
    'down': 'red', 
    'combined': 'purple'
}
for tf in full_df['TF'].unique():
    fig1 = go.Figure()
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
    order = sub_df.groupby(['Method_Direction']).mean(numeric_only=True).sort_values('Rank').index.map(lambda x: x.split(':')[0]).unique()
    sub_df['Method'] = pd.Categorical(sub_df['Method'], order)
    sub_df['Method'] = sub_df['Method'].apply(lambda x: x.replace('fc', 'logfc'))
    sub_df = sub_df.sort_values(by=['Method'])
    for d in ['up', 'combined', 'down']:
        d_df = sub_df[sub_df['Direction'] == d]
        fig1.add_trace(
            go.Box(
                x=d_df['Method'],
                y=d_df['Rank'],
                name=d, 
                marker_color=method_color_map[d]
            )
        )
    sub_rand_df = rand_df[rand_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            x=sub_rand_df['Method'],
            y=sub_rand_df['Rank'],
            name='random',
            marker_color='black'
        )
    )
    fig1.update_layout(
        width=800,
        boxmode='group',
        boxgap=0.1,
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Gene Set Rank'}
        },
        legend_title_text="Direction"
    )
    fig1.show("png")
    if tf=='NR3C1': 
        fig1.write_image('/Users/maayanlab/Documents/manuscripts/dex-benchmark/revised_figures/3C_300dpi.png', scale=(800/300))


method_color_map = {
    'random': 'black',
    'cd': '#648FFF',
    'limma': '#785EF0',
    'limma-voom': '#DA79FF',
    'fc': '#DC267F',
    'ttest': '#FE6100',
    'ranksum': '#FFB000',
    'modz': '#009E73'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Method']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Method']==gs]['Rank'].tolist(),
                name=gs.replace('fc', 'logfc'), 
                marker_color=method_color_map[gs]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        title_text=f"{tf} ChEA 2022 Term Rankings for Gene Sets from A375 Cell Line",
        xaxis={
            'title': {'text': 'Method'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")


# with ChEA manual libraries
method_color_map = {
    'up': 'blue', 
    'down': 'red', 
    'combined': 'purple'
}
for tf in full_df['TF'].unique():
    sub_df = full_df[full_df['TF'] == tf]
    sub_df = sub_df[sub_df['Cell'] == 'A375']
    sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
    fig1 = go.Figure()
    for gs in sub_df.groupby(['Direction']).mean(numeric_only=True).sort_values('Rank').index:
        fig1.add_trace(
            go.Box(
                y=sub_df[sub_df['Direction']==gs]['Rank'].tolist(),
                name=gs,
                marker_color=method_color_map[gs]
            )
        )
    sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
    fig1.add_trace(
        go.Box(
            y=sub_rand_df['Rank'].tolist(),
            name='random',
            marker_color='black'
        )
    )

    fig1.update_layout(
        title_text=f"{tf} ChEA 2022 Term Rankings for Gene Sets from A375 Cell Line",
        xaxis={
            'title': {'text': 'Direction of Gene Set'}, 
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig1.show("png")


chea2022_genesets = {
    t: chea2022[t] for t in dex_chea2022_df['Term'].unique() if ('MLEC' not in t and 'Wistar' not in t)
}


def geneSetBridgePlot(gs, method):
    y = [] 
    for col in all_signatures[f"{method}_all"].columns:
        if method in ['fc', 'cd', 'modz']:
            sig = batch_signatures[method][col]['Significance'].sort_values(ascending=False)
        else:
            sig = batch_signatures[method][col]['Significance'].sort_values(ascending=True)
        select = pd.Series(
            [x.upper() in gs for x in sig.index.tolist()]
        )
        _, y_temp = bridge_plot(select)
        y.append(y_temp)
    y_mean = np.mean(np.array(y), axis=0)

    return y_mean

def methodBridgePlot(gsdict, tf, method):
    y = []
    for gs in gsdict.keys(): 
        if tf not in gs:
            continue
        y_temp = geneSetBridgePlot(gsdict[gs], method)
        y.append(y_temp)

    if len(y) == 0:
        return y
    else:
        y_mean = np.mean(np.array(y), axis=0)
        return y_mean

def randomBridgePlot(gsdict, tf): 
    y = []
    for gs in gsdict.keys(): 
        if tf not in gs: 
            continue
        y_gs = []
        for _ in range(10):
            rand_sig = sample(
                combined_expr_df.index.tolist(), 
                combined_expr_df.shape[0]
            )
            select = pd.Series([
                x.upper() in gsdict[gs] for x in rand_sig
            ])
            _, y_temp = bridge_plot(select)
            y_gs.append(y_temp)
        y.append(np.mean(np.array(y_gs), axis=0))
    if len(y) == 0:
        return y
    else:
        y_mean = np.mean(np.array(y), axis=0)
        return y_mean

def tfBridgePlot(tf, lib_genesets, libnames, leading_edge = False): 
    fig = plt.figure()
    if leading_edge:
        fig = plt.figure(figsize=(5,5))
    else:
        fig = plt.figure(figsize=(8,5))
    for method in batch_signatures.keys(): 
        x = np.arange(all_signatures[f"{method}_all"].shape[0] + 1)
        y = []
        for lg in lib_genesets:
            y1 = methodBridgePlot(lg, tf, method)
            if len(y1) != 0:
                y.append(y1)
        y_mean = np.mean(np.array(y), axis=0)
        if leading_edge:
            x_mean = x/len(x)
            x_mean = x_mean[x_mean < 0.02]
            plt.plot(x_mean, y_mean[:len(x_mean)], label=method.split('_')[0])
        else:
            plt.plot(x/len(x), y_mean, label=method.split('_')[0])
    rand_x = np.arange(all_signatures[f"{method}_all"].shape[0] + 1)
    rand_y = []
    for lg in lib_genesets:
        rand_y1 = randomBridgePlot(lg, tf)
        if len(rand_y1) != 0: 
            rand_y.append(rand_y1)
    rand_y_mean = np.mean(np.array(rand_y), axis=0)
    if leading_edge:
        rand_x_mean = rand_x/len(rand_x)
        rand_x_mean = rand_x_mean[rand_x_mean < 0.02]
        plt.plot(rand_x_mean, rand_y_mean[:len(rand_x_mean)], label='random', color='gray')
    else:
        plt.plot(rand_x/len(rand_x), rand_y_mean, label='random', color='gray')

    plt.axhline(y=0, color='black', linestyle='dashed')
    if leading_edge:
        plt.legend(bbox_to_anchor=(1,1))
    else:
        plt.legend(bbox_to_anchor=(1, 1))
    locs, labels = plt.xticks()  # Get the current locations and labels.
    if leading_edge:
        plt.xticks(np.arange(0, 0.02, step=0.005))
    plt.xlabel('Rank')
    plt.ylabel('D(r) - r')
    plt.title(f"{tf} Target Gene Rankings for L1000 Dex Perturbation Signatures\nLibraries: {', '.join(libnames)}")
    plt.show()


tfBridgePlot('NR3C1', [chea2022], ['ChEA 2022'])

<Figure size 432x288 with 0 Axes>


tfBridgePlot('NR3C1', [chea2022], ['ChEA 2022'], True)

<Figure size 432x288 with 0 Axes>


rank_comparisons = {
  b: pd.concat([
    all_signatures[method][b].rename(method.split('_')[0]).rank() for method in all_signatures.keys()
  ], axis=1).dropna(axis=0).sort_index(axis=1)
for b in batches}

expr_comparisons = {
  b: pd.concat([
    all_signatures[method][b].rename(method.split('_')[0]) for method in all_signatures.keys()
  ], axis=1).dropna(axis=0).sort_index(axis=1)
for b in batches}

method_cols = rank_comparisons[list(batches)[0]].columns


sns.set(font_scale=1.2)


expr_correlations = {
  b: 1 - pairwise_distances(m.T, metric='cosine') for b,m in expr_comparisons.items()
}
for _,m in expr_correlations.items():
  np.fill_diagonal(m, 1)

expr_mean_correlations = np.zeros(shape=expr_correlations[list(batches)[0]].shape)
for i in range(expr_mean_correlations.shape[0]):
  for j in range(expr_mean_correlations.shape[1]): 
    expr_mean_correlations[i,j] = np.mean([m[i,j] for _,m in expr_correlations.items()])

hm, hm_ax = plt.subplots(figsize=(10,8))
sns.heatmap(pd.DataFrame(
  expr_mean_correlations, index=method_cols, columns=method_cols
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()


rank_correlations = {
  b: 1 - pairwise_distances(m.T, metric='cosine') for b,m in rank_comparisons.items()
}
for _,m in rank_correlations.items():
  np.fill_diagonal(m, 1)

rank_mean_correlations = np.zeros(shape=rank_correlations[list(batches)[0]].shape)
for i in range(rank_mean_correlations.shape[0]):
  for j in range(rank_mean_correlations.shape[1]): 
    rank_mean_correlations[i,j] = np.mean([m[i,j] for _,m in rank_correlations.items()])

hm, hm_ax = plt.subplots(figsize=(10,8))
sns.heatmap(pd.DataFrame(
  rank_mean_correlations, index=method_cols, columns=method_cols
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()

	CPC004_A375_6H_X1_B3_DUO52HI53LO:E13	CPC004_A375_6H_X2_B3_DUO52HI53LO:E13	CPC004_A375_6H_X3_B3_DUO52HI53LO:E13	CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13	CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13	CPC004_HA1E_24H_X3_B3_DUO52HI53LO:E13	CPC004_HA1E_6H_X1_B3_DUO52HI53LO:E13	CPC004_HA1E_6H_X2_B3_DUO52HI53LO:E13	CPC004_HA1E_6H_X3_B3_DUO52HI53LO:E13	CPC004_HCC515_24H_X1_B3_DUO52HI53LO:E13	...	CRCGN004_PC3_6H_X2_F1B7_DUO52HI53LO:J21	LPROT002_A375_6H_X1_B22:H07	LPROT002_A375_6H_X1_B22:H09	LPROT002_A375_6H_X1_B22:H11	LPROT002_MCF7_6H_X1_B22:H08	LPROT002_MCF7_6H_X1_B22:H10	LPROT002_MCF7_6H_X1_B22:H12	LPROT004_YAPC_6H_X1_B32:H07	LPROT004_YAPC_6H_X1_B32:H09	LPROT004_YAPC_6H_X1_B32:H11
symbol
NAT2	5.347525	5.31260	5.54900	5.2620	5.3664	5.1632	6.5029	6.1924	5.701800	5.69435	...	6.521600	3.8543	4.558200	4.8785	4.0917	4.3097	4.594400	5.051500	4.67150	5.2628
ADA	6.814449	6.21185	7.37235	7.9995	7.4117	7.6526	8.4221	7.4373	9.558400	7.31210	...	8.412251	8.5299	8.172900	7.8001	7.7042	7.5313	7.161799	6.563300	5.72615	6.8200
CDH2	7.497050	8.35885	7.51460	10.1442	11.5089	10.4157	9.9937	6.0723	7.950950	6.61335	...	3.211200	10.5305	9.309799	9.0100	5.4028	5.9991	5.022750	6.782700	6.52530	6.6286
AKT3	9.710200	11.61895	9.40310	10.4081	9.9464	9.1548	10.6656	9.7118	8.052999	7.07675	...	7.751399	9.9655	8.703700	9.5298	5.4086	7.9250	5.372250	8.076001	7.89250	8.1305
MED6	7.923500	6.12090	8.32925	8.2268	7.8020	8.0996	8.9311	8.5562	9.422951	8.44575	...	9.265400	9.7446	9.507300	9.4560	10.4112	9.0910	9.762300	9.515800	9.54300	9.8093

	tissue	disease	cell_line	pert_name	pert_time	pert_type	data_level	creation_time	persistent_id	pert_dose	batch
id
CPC004_A375_6H_X1_B3_DUO52HI53LO:E13	skin of body	melanoma	A375	dexamethasone	6 h	Chemical Perturbation	3	2021-01-21	https://lincs-dcic.s3.amazonaws.com/LINCS-data...	NaN	CPC004_A375_6H
CPC004_A375_6H_X2_B3_DUO52HI53LO:E13	skin of body	melanoma	A375	dexamethasone	6 h	Chemical Perturbation	3	2021-01-21	https://lincs-dcic.s3.amazonaws.com/LINCS-data...	NaN	CPC004_A375_6H
CPC004_A375_6H_X3_B3_DUO52HI53LO:E13	skin of body	melanoma	A375	dexamethasone	6 h	Chemical Perturbation	3	2021-01-21	https://lincs-dcic.s3.amazonaws.com/LINCS-data...	NaN	CPC004_A375_6H
CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13	kidney	NaN	HA1E	dexamethasone	24 h	Chemical Perturbation	3	2021-01-21	https://lincs-dcic.s3.amazonaws.com/LINCS-data...	NaN	CPC004_HA1E_24H
CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13	kidney	NaN	HA1E	dexamethasone	24 h	Chemical Perturbation	3	2021-01-21	https://lincs-dcic.s3.amazonaws.com/LINCS-data...	NaN	CPC004_HA1E_24H

	CPC006_A375_24H	CPC006_A549_24H	CPC006_HA1E_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_MCF7_24H	CPC006_PC3_24H	CPC009_A549_24H	CPC009_MCF7_24H	CPC009_PC3_24H	CPC006_VCAP_24H	CPC009_VCAP_24H	CRCGN001_HA1E_24H	CRCGN004_HA1E_24H
pr_gene_symbol
PSME1	0.50	-1.32	1.37	1.03	0.67	0.47	-0.17	-0.44	1.65	0.07	0.24	0.27	-0.58	0.72
ATF1	1.03	-0.42	0.98	0.26	0.39	0.76	0.09	-0.05	-1.87	0.65	0.10	0.53	-0.78	-0.02
RHEB	-0.02	-0.66	-0.27	-0.84	0.41	0.04	0.24	-0.63	1.13	1.06	0.41	-0.48	-0.76	0.05
FOXO3	-0.75	2.80	-1.63	1.05	-0.16	-0.67	-0.94	1.96	-0.21	-0.31	-0.54	0.37	-0.24	0.20
RHOA	0.10	-0.33	-0.37	1.16	-0.11	1.08	0.85	0.02	1.40	0.34	0.35	0.43	-0.45	0.46
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
ADGRA2	-0.34	0.24	0.05	-1.21	-0.34	-0.34	-0.16	-0.26	-0.29	-0.25	-0.17	-1.03	0.70	-0.40
CX3CL1	-0.33	-0.18	-0.17	0.21	-0.21	-0.41	0.33	0.07	-0.44	0.38	-0.20	-0.90	0.18	-0.12
ADAP1	-0.48	-0.09	-0.99	0.23	-0.06	-0.54	0.42	-0.43	-0.65	0.53	-0.49	1.17	1.23	0.12
EPS8L1	-0.69	-0.68	-0.87	-0.30	0.15	0.49	-0.58	-0.59	0.88	-0.86	-0.38	0.87	-0.03	-0.87
ACTB	-0.20	-0.32	0.14	0.40	-0.73	-0.23	-0.16	0.46	0.37	0.16	-0.03	2.51	-0.60	0.73

	CPC006_A375_24H_X5_B4_DUO52HI53LO:A17	CPC006_A375_24H_X1_B3_DUO52HI53LO:A18	CPC006_A375_24H_X2_B3_DUO52HI53LO:A18	CPC006_A375_24H_X5_B4_DUO52HI53LO:A18	CPC006_A375_24H_X2_B3_DUO52HI53LO:B17	CPC006_A375_24H_X4_B4_DUO52HI53LO:B17	CPC006_A375_24H_X5_B4_DUO52HI53LO:B17	CPC006_A375_24H_X1_B3_DUO52HI53LO:B18	CPC006_A375_24H_X4_B4_DUO52HI53LO:B18	CPC006_A375_24H_X5_B4_DUO52HI53LO:B18	...	CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L12	CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L12	CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L12	CPC009_PC3_24H_X4_B5_DUO52HI53LO:L12	CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L12	CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L13	CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L13	CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L13	CPC009_PC3_24H_X4_B5_DUO52HI53LO:L13	CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L13
NAT2	7.89775	8.68995	6.80705	8.04730	7.54740	6.815075	7.633851	7.339425	8.56575	7.03535	...	4.59600	5.69120	5.410600	5.16645	5.6062	5.12355	5.612900	5.61780	5.964099	5.46440
ADA	3.51090	4.02850	4.11730	3.89205	4.56435	4.913550	4.724325	3.815050	3.26435	3.49990	...	9.13180	9.69640	10.277700	8.01760	9.1136	8.69590	9.973300	9.98530	8.143050	8.43330
CDH2	5.07510	5.84900	5.84390	5.05145	5.76565	5.012725	4.946300	6.192550	4.91245	4.96025	...	4.98080	3.02930	3.250800	4.04120	3.6938	5.21580	3.833500	3.92415	3.731000	6.07040
AKT3	3.64940	4.85665	4.31915	5.40425	5.03620	3.253500	4.558100	4.819650	6.11925	4.10250	...	5.80955	6.67030	7.958799	5.72470	5.8401	6.77600	7.432001	8.05450	5.867200	6.17385
MED6	5.20905	5.51090	6.04015	5.06230	5.91720	4.498100	5.335800	5.388175	5.42000	5.15100	...	8.67330	7.03495	8.061700	8.36450	8.3464	8.82380	7.499700	8.43520	8.015600	9.17250

	CPC004_A375_6H_X1_B3_DUO52HI53LO:E13	CPC004_A375_6H_X2_B3_DUO52HI53LO:E13	CPC004_A375_6H_X3_B3_DUO52HI53LO:E13	CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13	CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13	CPC004_HA1E_24H_X3_B3_DUO52HI53LO:E13	CPC004_HA1E_6H_X1_B3_DUO52HI53LO:E13	CPC004_HA1E_6H_X2_B3_DUO52HI53LO:E13	CPC004_HA1E_6H_X3_B3_DUO52HI53LO:E13	CPC004_HCC515_24H_X1_B3_DUO52HI53LO:E13	...	CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L12	CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L12	CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L12	CPC009_PC3_24H_X4_B5_DUO52HI53LO:L12	CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L12	CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L13	CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L13	CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L13	CPC009_PC3_24H_X4_B5_DUO52HI53LO:L13	CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L13
A1CF	3.706150	5.19435	4.10195	4.64380	4.270800	3.9665	4.4174	4.5412	4.5436	4.50700	...	3.42250	4.26580	3.922100	4.60500	4.60140	3.350000	4.33555	3.6938	4.4059	4.400100
A2M	7.060125	4.31380	7.43550	5.55190	7.082000	6.1772	5.7649	8.5913	6.3916	8.73610	...	8.69920	11.61970	10.921300	8.47680	10.67320	9.651800	8.33980	8.0536	10.0006	9.749900
A4GALT	6.245200	6.80720	6.30480	7.30625	6.827850	7.1949	6.9762	7.0023	6.8364	6.90265	...	6.71325	6.60190	7.314400	6.81515	6.59470	6.657900	6.45930	6.8790	6.4290	6.985100
A4GNT	5.558300	5.77020	5.68940	5.67050	5.731700	5.8950	6.0036	6.2611	6.3069	5.24015	...	5.60620	5.78565	5.702550	6.01230	5.86385	5.448800	6.15945	6.0248	6.0777	5.834650
AAAS	8.256000	6.88010	8.09350	8.25630	8.228301	8.0326	7.9515	7.6869	7.2894	8.11190	...	7.64870	7.58680	7.712349	7.53270	7.52370	7.898301	8.02430	7.7470	7.8282	7.616199

Benchmarking the Recovery of Known Targets from L1000 Dex Perturbation Data¶

Load in Data¶

Load in MODZ Data¶

Load in Control Data¶

Process Data¶

Remove duplicate genes¶

Get batches¶

Compute Signatures: Batch Perturbations vs. Batch Controls¶

MODZ¶

Characteristic Direction¶

Limma¶

Wilcoxon Rank-Sum Test¶

Welch's t-test¶

(log2) Fold Change¶

All Signatures¶

Enrichment Analysis Rankings¶

Boxplots¶

Bridge plots¶

Signature similarity¶

Correlation of signature expression values¶

Correlation of signature gene ranks¶

	CPC006_A549_24H	CPC006_PC3_24H	CPC009_MCF7_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_A375_24H	CPC009_A549_24H	CPC009_PC3_24H	CPC006_MCF7_24H	CPC006_HA1E_24H
Gene
A1CF	0.000952	0.001854	0.008695	-0.001396	-0.007102	0.013102	0.005242	0.018031	-0.001283	-0.012737
A2M	0.021682	-0.026343	0.022046	0.002863	-0.009718	0.003915	-0.006546	-0.018333	-0.000499	-0.006217
A4GALT	0.006834	0.004920	-0.006121	0.003770	0.000802	0.002384	0.000730	0.003674	0.004756	0.001069
A4GNT	-0.001011	0.003500	-0.004465	0.002190	-0.004268	-0.002679	0.000148	0.006255	0.002543	-0.001182
AAAS	-0.003752	-0.003620	-0.003427	0.001099	-0.003757	-0.005967	-0.002238	0.002202	0.008218	-0.003162
...	...	...	...	...	...	...	...	...	...	...
ZXDB	0.003825	-0.000073	0.000986	0.000702	-0.004406	0.002464	-0.001844	-0.002293	-0.000160	-0.004288
ZXDC	0.005625	0.004271	0.003541	0.002027	0.008632	0.001144	0.000540	0.000558	0.009406	-0.002195
ZYX	-0.002841	-0.014867	-0.003315	0.001899	-0.006242	-0.013534	-0.005301	-0.018816	0.009712	-0.002055
ZZEF1	-0.005954	-0.007576	0.011017	-0.005430	0.006954	0.004589	0.002710	-0.003639	0.004555	-0.006276
ZZZ3	-0.007250	-0.001067	-0.007591	-0.002336	0.010721	-0.004806	-0.009311	0.003312	-0.002183	-0.014607

	CPC006_A549_24H	CPC006_PC3_24H	CPC009_MCF7_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_A375_24H	CPC009_A549_24H	CPC009_PC3_24H	CPC006_MCF7_24H	CPC006_HA1E_24H
Gene
A1CF	-0.294205	-0.823307	0.224118	-1.345300	-1.803227	-0.457685	0.488978	-0.066385	-0.496867	-1.385689
A2M	1.287795	-0.226417	0.060950	0.611259	-1.071238	0.199035	-0.957864	-0.837714	-1.592196	-0.675840
A4GALT	0.592904	0.990301	0.837572	1.536314	-0.671224	-0.084085	1.052068	0.306574	1.264702	1.458460
A4GNT	-0.143384	-0.811679	-0.182202	-0.432298	-1.127489	-0.521193	-0.875634	-0.312889	-0.982429	-0.717047
AAAS	0.086139	1.050528	0.469454	0.931823	-0.123393	1.261981	-0.698037	1.166193	1.790793	0.821752
...	...	...	...	...	...	...	...	...	...	...
ZXDB	1.740872	-0.855682	1.459864	1.128436	-1.638648	-1.103166	0.583538	-1.900811	-1.288176	-0.708441
ZXDC	0.207162	-0.498437	0.114471	-0.111600	0.964735	0.191507	-0.366940	-0.705952	-0.582374	-0.951470
ZYX	0.515698	-0.059634	0.061114	0.036428	0.500599	0.348942	0.680457	0.156431	0.123771	0.932210
ZZEF1	-0.355968	-0.811313	1.224806	-1.059856	-0.369694	-0.557480	-0.009333	0.326231	-0.257675	-1.265336
ZZZ3	-0.409015	-0.177657	-2.337986	0.017358	0.614833	0.448167	-0.225108	1.262405	0.031205	-0.344133

	CPC006_A549_24H	CPC006_PC3_24H	CPC009_MCF7_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_A375_24H	CPC009_A549_24H	CPC009_PC3_24H	CPC006_MCF7_24H	CPC006_HA1E_24H
Gene
A1CF	-0.299872	-0.723968	0.233817	-1.151080	-1.426047	-0.432861	0.530413	-0.065431	-0.461192	-1.057338
A2M	1.195174	-0.221229	0.060802	0.612243	-1.045904	0.198667	-0.985443	-0.857667	-1.256354	-0.640593
A4GALT	0.586769	0.992215	0.835355	1.541795	-0.671289	-0.084093	1.041888	0.306185	1.263357	1.457404
A4GNT	-0.146902	-0.778560	-0.182461	-0.420382	-1.089057	-0.518001	-0.803284	-0.310522	-0.957077	-0.693777
AAAS	0.086007	1.044281	0.467396	0.924893	-0.123432	1.250010	-0.701899	1.160633	1.762998	0.816807
...	...	...	...	...	...	...	...	...	...	...
ZXDB	1.794068	-0.842987	1.473848	1.158135	-1.582549	-1.086361	0.588070	-1.859902	-1.272800	-0.697761
ZXDC	0.209948	-0.491523	0.114394	-0.111469	0.963724	0.191458	-0.350736	-0.688386	-0.581834	-0.899001
ZYX	0.511624	-0.059739	0.060987	0.036396	0.494438	0.346584	0.666952	0.155801	0.123418	0.894256
ZZEF1	-0.358077	-0.809179	1.215608	-1.062095	-0.369780	-0.559051	-0.009334	0.325266	-0.257917	-1.270832
ZZZ3	-0.412010	-0.178105	-2.364038	0.017409	0.612756	0.446982	-0.225531	1.246159	0.031202	-0.344768

	CPC006_A549_24H	CPC006_PC3_24H	CPC009_MCF7_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_A375_24H	CPC009_A549_24H	CPC009_PC3_24H	CPC006_MCF7_24H	CPC006_HA1E_24H
Gene
A1CF	-0.869339	-0.653255	0.782913	-1.569126	-2.412502	-0.417907	0.782361	0.777130	0.269830	-2.063696
A2M	1.217205	-0.420350	0.079282	1.065474	-1.368122	0.120335	-1.262522	-1.052777	-2.055985	-0.538814
A4GALT	1.838812	1.132939	1.248697	1.542712	-0.807251	-0.227299	1.383402	0.227525	1.710192	1.742911
A4GNT	-0.660620	-0.838930	0.052855	0.719418	-1.996264	0.006530	-0.970396	0.126637	-1.091930	-0.575701
AAAS	0.020807	0.966747	0.670597	0.436577	-0.350229	1.663231	-1.302816	1.034626	2.517042	0.741692
...	...	...	...	...	...	...	...	...	...	...
ZXDB	1.862220	-1.568344	1.704571	1.396988	-1.905869	-1.035439	0.849516	-2.128772	-1.379944	-1.143497
ZXDC	0.598199	-0.123685	0.361726	0.368019	1.300430	0.761498	-0.530528	-0.706636	-0.254285	-1.029542
ZYX	0.518223	-0.814134	0.044596	-0.387013	0.241755	-0.109141	1.020763	-0.607014	-0.164244	0.947864
ZZEF1	-0.531877	-0.807345	1.519579	-1.007007	-0.066010	-0.208021	-0.134311	0.497262	0.232875	-1.667160
ZZZ3	-1.148282	-0.231429	-1.641806	-0.232980	0.516304	0.350121	-0.282053	1.427624	-0.231702	-0.563844

	CPC006_A549_24H	CPC006_PC3_24H	CPC009_MCF7_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_A375_24H	CPC009_A549_24H	CPC009_PC3_24H	CPC006_MCF7_24H	CPC006_HA1E_24H
Gene
A1CF	-0.320777	-3.318610	0.283788	-2.466058	-3.953539	-0.815173	0.600913	-0.305456	-2.726108	-4.637847
A2M	1.029799	-0.350408	0.055592	0.621131	-1.315067	0.178960	-9.727957	-1.619311	-2.894856	-0.607508
A4GALT	0.519774	1.478609	2.107487	2.554163	-1.015374	-0.187363	2.934027	0.825312	1.883732	2.333186
A4GNT	-0.246314	-5.220164	-0.197389	-3.498468	-3.900530	-1.452731	-1.084934	-1.255108	-4.730529	-6.601442
AAAS	0.051627	2.359116	0.993238	1.795143	-0.165691	3.840168	-1.004628	1.396759	4.285530	1.741888
...	...	...	...	...	...	...	...	...	...	...
ZXDB	1.817554	-1.892693	1.966396	1.818086	-3.129890	-1.477842	0.748831	-3.603297	-1.666687	-0.594905
ZXDC	0.166390	-1.068104	0.161120	-0.533967	1.703616	0.246233	-2.464852	-1.725556	-1.192863	-1.542090
ZYX	0.591836	-0.279141	0.127387	-0.150160	0.773410	0.498327	13.014875	0.594875	0.128454	2.174616
ZZEF1	-1.006280	-2.492458	1.719959	-2.763837	-0.754950	-2.341365	-0.353694	0.462720	-0.232539	-3.994231
ZZZ3	-1.103177	-0.277392	-1.241546	-0.041727	1.173309	0.500864	-0.571492	1.782880	0.015091	-0.563517

	CPC006_A549_24H	CPC006_PC3_24H	CPC009_MCF7_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_A375_24H	CPC009_A549_24H	CPC009_PC3_24H	CPC006_MCF7_24H	CPC006_HA1E_24H
Gene
A1CF	0.44	0.02	1.00	-0.51	-1.47	-0.14	0.34	0.01	0.21	-0.96
A2M	1.64	0.39	0.18	0.85	-0.08	-0.19	-0.72	-0.47	-0.39	-0.41
A4GALT	1.50	0.38	1.32	-0.01	-0.84	-0.13	0.59	-0.31	0.66	0.19
A4GNT	0.63	0.04	0.75	0.38	-1.15	-0.61	-0.71	-0.31	-0.24	-1.03
AAAS	0.12	-0.15	0.75	-0.30	-0.62	0.25	-1.42	0.20	0.38	-0.85
...	...	...	...	...	...	...	...	...	...	...
ZXDB	1.07	-0.42	1.75	0.44	-0.58	-0.64	0.10	-1.07	-0.30	-0.95
ZXDC	0.97	-0.12	0.83	-0.06	0.12	0.10	-0.39	-0.58	0.16	-0.87
ZYX	-0.23	-0.39	0.06	-0.07	0.03	-0.58	0.65	-0.28	-0.10	0.19
ZZEF1	0.02	-0.05	1.33	-0.31	-0.37	0.23	-0.16	-0.01	-0.27	-0.50
ZZZ3	0.11	0.08	-1.91	-0.16	-0.35	0.02	-0.42	0.76	-0.21	-0.16

	CPC006_A549_24H	CPC006_PC3_24H	CPC009_MCF7_24H	CPC006_HCC515_24H	CPC006_HT29_24H	CPC006_A375_24H	CPC009_A549_24H	CPC009_PC3_24H	CPC006_MCF7_24H	CPC006_HA1E_24H
Gene
A1CF	-0.039201	-0.158875	0.025282	-0.134439	-0.184498	-0.057449	0.054025	-0.033161	-0.090305	-0.256182
A2M	0.202267	-0.024419	0.020151	0.052870	-0.117227	0.017373	-0.225873	-0.114115	-0.194216	-0.086272
A4GALT	0.031620	0.019167	0.049822	0.046336	-0.013258	-0.001729	0.064774	0.013017	0.028068	0.042684
A4GNT	-0.011731	-0.071222	-0.009831	-0.052952	-0.076852	-0.028826	-0.079883	-0.024717	-0.058964	-0.074368
AAAS	0.004224	0.023722	0.029721	0.032498	-0.003322	0.043787	-0.046597	0.040568	0.051654	0.038041
...	...	...	...	...	...	...	...	...	...	...
ZXDB	0.045476	-0.016062	0.065967	0.024613	-0.031598	-0.019754	0.023491	-0.042219	-0.017098	-0.014245
ZXDC	0.009077	-0.019981	0.013045	-0.009909	0.028926	0.003964	-0.043277	-0.036227	-0.028414	-0.053541
ZYX	0.044630	-0.006935	0.007790	-0.006308	0.027131	0.014156	0.065377	0.007357	0.003019	0.062993
ZZEF1	-0.027045	-0.026066	0.077580	-0.059606	-0.017674	-0.032865	-0.005162	0.021032	-0.004313	-0.082674
ZZZ3	-0.030140	-0.004346	-0.178028	-0.000276	0.018240	0.014170	-0.033473	0.068013	0.001494	-0.013333

			Rank
TF	Library	Method
NR0B1	ChEA 2022	cd	172.600000
		ttest	182.900000
		limma	195.500000
		ranksum	210.200000
		limma-voom	217.000000
		modz	239.100000
		fc	269.700000
NR1I2	ChEA 2022	fc	126.700000
		cd	129.200000
		modz	225.300000
		ranksum	248.500000
		limma-voom	260.400000
		limma	299.300000
		ttest	442.500000
NR3C1	ChEA 2022	fc	255.416667
		cd	300.950000
		limma-voom	306.200000
		modz	320.450000
		limma	351.150000
		ranksum	356.833333
		ttest	447.183333