# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import requests
import scipy.stats as ss
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.enrichment import enrich_crisp
from math import log2
from IPython.display import display, Markdown
from random import sample
from maayanlab_bioinformatics.plotting.bridge import bridge_plot
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import seaborn as sns
from os.path import exists
from scipy.stats import ttest_ind, ranksums
import h5py
import warnings
warnings.filterwarnings('ignore')
# Set perturbagen
pert = 'dexamethasone'
# Set working directory
l1000_data_dir = '../L1000_data'
try:
expr_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv", sep='\t', index_col=0)
except:
l1000_data_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_data.tsv", sep='\t')
l1000_data_list = []
l1000_meta_list = []
for row in l1000_data_df.itertuples():
try:
temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
except:
print(f"Unable to access data from row {row.Index} at {row.persistent_id}")
continue
for col in temp_df.columns:
l1000_meta_list.append([col] + l1000_data_df.loc[row.Index].tolist())
l1000_data_list.append(temp_df)
expr_df = pd.concat(l1000_data_list, axis=1)
expr_df.head()
CPC004_A375_6H_X1_B3_DUO52HI53LO:E13 | CPC004_A375_6H_X2_B3_DUO52HI53LO:E13 | CPC004_A375_6H_X3_B3_DUO52HI53LO:E13 | CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13 | CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13 | CPC004_HA1E_24H_X3_B3_DUO52HI53LO:E13 | CPC004_HA1E_6H_X1_B3_DUO52HI53LO:E13 | CPC004_HA1E_6H_X2_B3_DUO52HI53LO:E13 | CPC004_HA1E_6H_X3_B3_DUO52HI53LO:E13 | CPC004_HCC515_24H_X1_B3_DUO52HI53LO:E13 | ... | CRCGN004_PC3_6H_X2_F1B7_DUO52HI53LO:J21 | LPROT002_A375_6H_X1_B22:H07 | LPROT002_A375_6H_X1_B22:H09 | LPROT002_A375_6H_X1_B22:H11 | LPROT002_MCF7_6H_X1_B22:H08 | LPROT002_MCF7_6H_X1_B22:H10 | LPROT002_MCF7_6H_X1_B22:H12 | LPROT004_YAPC_6H_X1_B32:H07 | LPROT004_YAPC_6H_X1_B32:H09 | LPROT004_YAPC_6H_X1_B32:H11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
symbol | |||||||||||||||||||||
NAT2 | 5.347525 | 5.31260 | 5.54900 | 5.2620 | 5.3664 | 5.1632 | 6.5029 | 6.1924 | 5.701800 | 5.69435 | ... | 6.521600 | 3.8543 | 4.558200 | 4.8785 | 4.0917 | 4.3097 | 4.594400 | 5.051500 | 4.67150 | 5.2628 |
ADA | 6.814449 | 6.21185 | 7.37235 | 7.9995 | 7.4117 | 7.6526 | 8.4221 | 7.4373 | 9.558400 | 7.31210 | ... | 8.412251 | 8.5299 | 8.172900 | 7.8001 | 7.7042 | 7.5313 | 7.161799 | 6.563300 | 5.72615 | 6.8200 |
CDH2 | 7.497050 | 8.35885 | 7.51460 | 10.1442 | 11.5089 | 10.4157 | 9.9937 | 6.0723 | 7.950950 | 6.61335 | ... | 3.211200 | 10.5305 | 9.309799 | 9.0100 | 5.4028 | 5.9991 | 5.022750 | 6.782700 | 6.52530 | 6.6286 |
AKT3 | 9.710200 | 11.61895 | 9.40310 | 10.4081 | 9.9464 | 9.1548 | 10.6656 | 9.7118 | 8.052999 | 7.07675 | ... | 7.751399 | 9.9655 | 8.703700 | 9.5298 | 5.4086 | 7.9250 | 5.372250 | 8.076001 | 7.89250 | 8.1305 |
MED6 | 7.923500 | 6.12090 | 8.32925 | 8.2268 | 7.8020 | 8.0996 | 8.9311 | 8.5562 | 9.422951 | 8.44575 | ... | 9.265400 | 9.7446 | 9.507300 | 9.4560 | 10.4112 | 9.0910 | 9.762300 | 9.515800 | 9.54300 | 9.8093 |
5 rows × 1310 columns
if not exists(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv"):
expr_df.to_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv", sep='\t', index=True)
else:
print(f"File at '{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_fulldata.tsv' already exists!")
File at '../L1000_data/Dexamethasone_L1000_ChemPert_fulldata.tsv' already exists!
try:
meta_df = pd.read_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv", sep='\t', index_col=0)
except:
meta_df = pd.DataFrame(l1000_meta_list, columns=['id'] + l1000_data_df.columns.tolist()).set_index('id')
if 'batch' not in meta_df.columns:
meta_df['batch'] = meta_df.index.map(lambda x: '_'.join(x.split('_')[:3]))
meta_df.head()
tissue | disease | cell_line | pert_name | pert_time | pert_type | data_level | creation_time | persistent_id | pert_dose | batch | |
---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||
CPC004_A375_6H_X1_B3_DUO52HI53LO:E13 | skin of body | melanoma | A375 | dexamethasone | 6 h | Chemical Perturbation | 3 | 2021-01-21 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | CPC004_A375_6H |
CPC004_A375_6H_X2_B3_DUO52HI53LO:E13 | skin of body | melanoma | A375 | dexamethasone | 6 h | Chemical Perturbation | 3 | 2021-01-21 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | CPC004_A375_6H |
CPC004_A375_6H_X3_B3_DUO52HI53LO:E13 | skin of body | melanoma | A375 | dexamethasone | 6 h | Chemical Perturbation | 3 | 2021-01-21 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | CPC004_A375_6H |
CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13 | kidney | NaN | HA1E | dexamethasone | 24 h | Chemical Perturbation | 3 | 2021-01-21 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | CPC004_HA1E_24H |
CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13 | kidney | NaN | HA1E | dexamethasone | 24 h | Chemical Perturbation | 3 | 2021-01-21 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | CPC004_HA1E_24H |
if not exists(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv"):
meta_df.to_csv(f"{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv", sep='\t', index=True)
else:
print(f"File at '{l1000_data_dir}/{pert.capitalize()}_L1000_ChemPert_metadata.tsv' already exists!")
File at '../L1000_data/Dexamethasone_L1000_ChemPert_metadata.tsv' already exists!
all_batches = set(['_'.join(x.split('_')[:3]) for x in meta_df.index.tolist()])
len(all_batches)
240
modz_data_df = pd.read_csv(f'{l1000_data_dir}/dex_modz.gct', sep='\t', header=2).iloc[8:].set_index('pr_gene_symbol')
modz_data_df = modz_data_df.drop(columns=['id', 'pr_gene_title', 'pr_is_lm', 'pr_is_bing', 'self_correlation', 'frac_self_rank', 'p_value', 'gene_space'])
modz_data_df = modz_data_df.astype('float32')
modz_data_df.loc['MIA2'] = modz_data_df.loc[['MIA2', 'CTAGE5']].mean(axis=0)
modz_data_df = modz_data_df.drop(index=['CTAGE5'])
modz_data_df.columns = modz_data_df.columns.map(lambda x: x.split(':')[0])
modz_data_df = modz_data_df[[x for x in modz_data_df.columns if x.endswith('24H') ]]
modz_data_df
CPC006_A375_24H | CPC006_A549_24H | CPC006_HA1E_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_MCF7_24H | CPC006_PC3_24H | CPC009_A549_24H | CPC009_MCF7_24H | CPC009_PC3_24H | CPC006_VCAP_24H | CPC009_VCAP_24H | CRCGN001_HA1E_24H | CRCGN004_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pr_gene_symbol | ||||||||||||||
PSME1 | 0.50 | -1.32 | 1.37 | 1.03 | 0.67 | 0.47 | -0.17 | -0.44 | 1.65 | 0.07 | 0.24 | 0.27 | -0.58 | 0.72 |
ATF1 | 1.03 | -0.42 | 0.98 | 0.26 | 0.39 | 0.76 | 0.09 | -0.05 | -1.87 | 0.65 | 0.10 | 0.53 | -0.78 | -0.02 |
RHEB | -0.02 | -0.66 | -0.27 | -0.84 | 0.41 | 0.04 | 0.24 | -0.63 | 1.13 | 1.06 | 0.41 | -0.48 | -0.76 | 0.05 |
FOXO3 | -0.75 | 2.80 | -1.63 | 1.05 | -0.16 | -0.67 | -0.94 | 1.96 | -0.21 | -0.31 | -0.54 | 0.37 | -0.24 | 0.20 |
RHOA | 0.10 | -0.33 | -0.37 | 1.16 | -0.11 | 1.08 | 0.85 | 0.02 | 1.40 | 0.34 | 0.35 | 0.43 | -0.45 | 0.46 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ADGRA2 | -0.34 | 0.24 | 0.05 | -1.21 | -0.34 | -0.34 | -0.16 | -0.26 | -0.29 | -0.25 | -0.17 | -1.03 | 0.70 | -0.40 |
CX3CL1 | -0.33 | -0.18 | -0.17 | 0.21 | -0.21 | -0.41 | 0.33 | 0.07 | -0.44 | 0.38 | -0.20 | -0.90 | 0.18 | -0.12 |
ADAP1 | -0.48 | -0.09 | -0.99 | 0.23 | -0.06 | -0.54 | 0.42 | -0.43 | -0.65 | 0.53 | -0.49 | 1.17 | 1.23 | 0.12 |
EPS8L1 | -0.69 | -0.68 | -0.87 | -0.30 | 0.15 | 0.49 | -0.58 | -0.59 | 0.88 | -0.86 | -0.38 | 0.87 | -0.03 | -0.87 |
ACTB | -0.20 | -0.32 | 0.14 | 0.40 | -0.73 | -0.23 | -0.16 | 0.46 | 0.37 | 0.16 | -0.03 | 2.51 | -0.60 | 0.73 |
12327 rows × 14 columns
gctx = h5py.File(f"{l1000_data_dir}/Dexamethasone_L1000_Controls_fulldata.gctx", 'r')
ctrl_expr_df = pd.DataFrame(
data=gctx['0/DATA/0/matrix'][()],
index=gctx['0/META/COL/id'].asstr()[:],
columns=gctx['0/META/ROW/id'].asstr()[:]
).T
gctx.close()
overlapping_dex = set(expr_df.columns.tolist()).intersection(ctrl_expr_df.columns.tolist())
ctrl_expr_df = ctrl_expr_df.drop(columns=list(overlapping_dex))
ctrl_expr_df.head()
CPC006_A375_24H_X5_B4_DUO52HI53LO:A17 | CPC006_A375_24H_X1_B3_DUO52HI53LO:A18 | CPC006_A375_24H_X2_B3_DUO52HI53LO:A18 | CPC006_A375_24H_X5_B4_DUO52HI53LO:A18 | CPC006_A375_24H_X2_B3_DUO52HI53LO:B17 | CPC006_A375_24H_X4_B4_DUO52HI53LO:B17 | CPC006_A375_24H_X5_B4_DUO52HI53LO:B17 | CPC006_A375_24H_X1_B3_DUO52HI53LO:B18 | CPC006_A375_24H_X4_B4_DUO52HI53LO:B18 | CPC006_A375_24H_X5_B4_DUO52HI53LO:B18 | ... | CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L12 | CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L12 | CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L12 | CPC009_PC3_24H_X4_B5_DUO52HI53LO:L12 | CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L12 | CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L13 | CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L13 | CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L13 | CPC009_PC3_24H_X4_B5_DUO52HI53LO:L13 | CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
NAT2 | 7.89775 | 8.68995 | 6.80705 | 8.04730 | 7.54740 | 6.815075 | 7.633851 | 7.339425 | 8.56575 | 7.03535 | ... | 4.59600 | 5.69120 | 5.410600 | 5.16645 | 5.6062 | 5.12355 | 5.612900 | 5.61780 | 5.964099 | 5.46440 |
ADA | 3.51090 | 4.02850 | 4.11730 | 3.89205 | 4.56435 | 4.913550 | 4.724325 | 3.815050 | 3.26435 | 3.49990 | ... | 9.13180 | 9.69640 | 10.277700 | 8.01760 | 9.1136 | 8.69590 | 9.973300 | 9.98530 | 8.143050 | 8.43330 |
CDH2 | 5.07510 | 5.84900 | 5.84390 | 5.05145 | 5.76565 | 5.012725 | 4.946300 | 6.192550 | 4.91245 | 4.96025 | ... | 4.98080 | 3.02930 | 3.250800 | 4.04120 | 3.6938 | 5.21580 | 3.833500 | 3.92415 | 3.731000 | 6.07040 |
AKT3 | 3.64940 | 4.85665 | 4.31915 | 5.40425 | 5.03620 | 3.253500 | 4.558100 | 4.819650 | 6.11925 | 4.10250 | ... | 5.80955 | 6.67030 | 7.958799 | 5.72470 | 5.8401 | 6.77600 | 7.432001 | 8.05450 | 5.867200 | 6.17385 |
MED6 | 5.20905 | 5.51090 | 6.04015 | 5.06230 | 5.91720 | 4.498100 | 5.335800 | 5.388175 | 5.42000 | 5.15100 | ... | 8.67330 | 7.03495 | 8.061700 | 8.36450 | 8.3464 | 8.82380 | 7.499700 | 8.43520 | 8.015600 | 9.17250 |
5 rows × 14214 columns
# ctrl_expr_df.to_csv('../L1000_data/Dexamethasone_L1000_Control_Profiles.tsv.gz', sep='\t')
ctrl_meta_df = pd.DataFrame(
data = [ctrl_expr_df.columns.tolist(),['_'.join(x.split('_')[:3]) for x in ctrl_expr_df.columns]],
index = ['id', 'batch']
).T.set_index('id')
ctrl_meta_df.head()
batch | |
---|---|
id | |
CPC006_A375_24H_X5_B4_DUO52HI53LO:A17 | CPC006_A375_24H |
CPC006_A375_24H_X1_B3_DUO52HI53LO:A18 | CPC006_A375_24H |
CPC006_A375_24H_X2_B3_DUO52HI53LO:A18 | CPC006_A375_24H |
CPC006_A375_24H_X5_B4_DUO52HI53LO:A18 | CPC006_A375_24H |
CPC006_A375_24H_X2_B3_DUO52HI53LO:B17 | CPC006_A375_24H |
combined_expr_df = pd.concat([
expr_df.groupby(expr_df.index).mean(),
ctrl_expr_df.groupby(ctrl_expr_df.index).mean()
], axis=1)
combined_expr_df.head()
CPC004_A375_6H_X1_B3_DUO52HI53LO:E13 | CPC004_A375_6H_X2_B3_DUO52HI53LO:E13 | CPC004_A375_6H_X3_B3_DUO52HI53LO:E13 | CPC004_HA1E_24H_X1_B3_DUO52HI53LO:E13 | CPC004_HA1E_24H_X2_B3_DUO52HI53LO:E13 | CPC004_HA1E_24H_X3_B3_DUO52HI53LO:E13 | CPC004_HA1E_6H_X1_B3_DUO52HI53LO:E13 | CPC004_HA1E_6H_X2_B3_DUO52HI53LO:E13 | CPC004_HA1E_6H_X3_B3_DUO52HI53LO:E13 | CPC004_HCC515_24H_X1_B3_DUO52HI53LO:E13 | ... | CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L12 | CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L12 | CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L12 | CPC009_PC3_24H_X4_B5_DUO52HI53LO:L12 | CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L12 | CPC009_PC3_24H_X1_F1B4_DUO52HI53LO:L13 | CPC009_PC3_24H_X2_F1B4_DUO52HI53LO:L13 | CPC009_PC3_24H_X3_F1B4_DUO52HI53LO:L13 | CPC009_PC3_24H_X4_B5_DUO52HI53LO:L13 | CPC009_PC3_24H_X5_F1B5_DUO52HI53LO:L13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A1CF | 3.706150 | 5.19435 | 4.10195 | 4.64380 | 4.270800 | 3.9665 | 4.4174 | 4.5412 | 4.5436 | 4.50700 | ... | 3.42250 | 4.26580 | 3.922100 | 4.60500 | 4.60140 | 3.350000 | 4.33555 | 3.6938 | 4.4059 | 4.400100 |
A2M | 7.060125 | 4.31380 | 7.43550 | 5.55190 | 7.082000 | 6.1772 | 5.7649 | 8.5913 | 6.3916 | 8.73610 | ... | 8.69920 | 11.61970 | 10.921300 | 8.47680 | 10.67320 | 9.651800 | 8.33980 | 8.0536 | 10.0006 | 9.749900 |
A4GALT | 6.245200 | 6.80720 | 6.30480 | 7.30625 | 6.827850 | 7.1949 | 6.9762 | 7.0023 | 6.8364 | 6.90265 | ... | 6.71325 | 6.60190 | 7.314400 | 6.81515 | 6.59470 | 6.657900 | 6.45930 | 6.8790 | 6.4290 | 6.985100 |
A4GNT | 5.558300 | 5.77020 | 5.68940 | 5.67050 | 5.731700 | 5.8950 | 6.0036 | 6.2611 | 6.3069 | 5.24015 | ... | 5.60620 | 5.78565 | 5.702550 | 6.01230 | 5.86385 | 5.448800 | 6.15945 | 6.0248 | 6.0777 | 5.834650 |
AAAS | 8.256000 | 6.88010 | 8.09350 | 8.25630 | 8.228301 | 8.0326 | 7.9515 | 7.6869 | 7.2894 | 8.11190 | ... | 7.64870 | 7.58680 | 7.712349 | 7.53270 | 7.52370 | 7.898301 | 8.02430 | 7.7470 | 7.8282 | 7.616199 |
5 rows × 15524 columns
batches = all_batches.intersection(ctrl_meta_df['batch'].unique())
for b in batches:
print(b)
CPC006_A549_24H CPC006_PC3_24H CPC009_MCF7_24H CPC006_HCC515_24H CPC006_HT29_24H CPC006_A375_24H CPC009_A549_24H CPC009_PC3_24H CPC006_MCF7_24H CPC006_HA1E_24H
batch_profiles = {x: {'perts': [], 'ctrls': []} for x in batches}
for b in batches:
batch_profiles[b]['perts'] = meta_df[meta_df['batch'] == b].index.tolist()
batch_profiles[b]['ctrls'] = ctrl_meta_df[ctrl_meta_df['batch'] == b].index.tolist()
batch_signatures = {
'cd': {},
'limma': {},
'limma-voom': {},
'fc': {},
'ranksum': {},
'ttest': {},
'modz': {},
}
for b in batches:
batch_signatures['modz'][b] = pd.concat([
modz_data_df[b].rename('MODZ'),
modz_data_df[b].apply(abs).rename('Significance')
], axis=1)
# Function for computing signatures with characteristic direction
def cd_signature(ctrl_ids, case_ids, dataset):
signature = characteristic_direction(
dataset.loc[:, ctrl_ids],
dataset.loc[:, case_ids],
calculate_sig=True
)
signature['Significance'] = signature['CD-coefficient'].apply(abs)
return signature.sort_values(by=['CD-coefficient'], ascending=False)
Note: the following step may take a few minutes to run.
for b in batches:
batch_signatures['cd'][b] = cd_signature(
batch_profiles[b]['ctrls'],
batch_profiles[b]['perts'],
combined_expr_df
)
# Function for computing signatures
def limma(ctrl_ids, case_ids, dataset, voom):
signature = limma_voom_differential_expression(
dataset.loc[:, ctrl_ids],
dataset.loc[:, case_ids],
voom_design=voom,
filter_genes=False
)
signature['Significance'] = signature['P.Value']
return signature.sort_values("t", ascending=False)
Note: the following step may take a few minutes to run.
for b in batches:
batch_signatures['limma'][b] = limma(
batch_profiles[b]['ctrls'],
batch_profiles[b]['perts'],
combined_expr_df,
voom=False
)
R[write to console]: Loading required package: R.oo R[write to console]: Loading required package: R.methodsS3 R[write to console]: R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help. R[write to console]: R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help. R[write to console]: Attaching package: ‘R.oo’ R[write to console]: The following object is masked from ‘package:R.methodsS3’: throw R[write to console]: The following objects are masked from ‘package:methods’: getClasses, getMethods R[write to console]: The following objects are masked from ‘package:base’: attach, detach, load, save R[write to console]: R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help. R[write to console]: Attaching package: ‘R.utils’ R[write to console]: The following object is masked from ‘package:utils’: timestamp R[write to console]: The following objects are masked from ‘package:base’: cat, commandArgs, getOption, inherits, isOpen, nullfile, parse, warnings R[write to console]: Attaching package: ‘RCurl’ R[write to console]: The following object is masked from ‘package:R.utils’: reset R[write to console]: The following object is masked from ‘package:R.oo’: clone R[write to console]: Loading required package: S4Vectors R[write to console]: Loading required package: stats4 R[write to console]: Loading required package: BiocGenerics R[write to console]: Loading required package: parallel R[write to console]: Attaching package: ‘BiocGenerics’ R[write to console]: The following objects are masked from ‘package:parallel’: clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB R[write to console]: The following objects are masked from ‘package:stats’: IQR, mad, sd, var, xtabs R[write to console]: The following objects are masked from ‘package:base’: Filter, Find, Map, Position, Reduce, anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted, lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which.max, which.min R[write to console]: Attaching package: ‘S4Vectors’ R[write to console]: The following objects are masked from ‘package:base’: I, expand.grid, unname R[write to console]: Loading required package: IRanges R[write to console]: Attaching package: ‘IRanges’ R[write to console]: The following object is masked from ‘package:R.oo’: trim R[write to console]: Loading required package: GenomicRanges R[write to console]: Loading required package: GenomeInfoDb R[write to console]: Loading required package: SummarizedExperiment R[write to console]: Loading required package: MatrixGenerics R[write to console]: Loading required package: matrixStats R[write to console]: Attaching package: ‘MatrixGenerics’ R[write to console]: The following objects are masked from ‘package:matrixStats’: colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins, colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds, colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads, colWeightedMeans, colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet, rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars R[write to console]: Loading required package: Biobase R[write to console]: Welcome to Bioconductor Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'. R[write to console]: Attaching package: ‘Biobase’ R[write to console]: The following object is masked from ‘package:MatrixGenerics’: rowMedians R[write to console]: The following objects are masked from ‘package:matrixStats’: anyMissing, rowMedians R[write to console]: Attaching package: ‘limma’ R[write to console]: The following object is masked from ‘package:DESeq2’: plotMA R[write to console]: The following object is masked from ‘package:BiocGenerics’: plotMA
for b in batches:
batch_signatures['limma-voom'][b] = limma(
batch_profiles[b]['ctrls'],
batch_profiles[b]['perts'],
combined_expr_df,
voom=True
)
def ranksum(ctrl_ids, case_ids, dataset):
if len(ctrl_ids) + len(case_ids) < 32:
print("Warning! Sample sizes < 16 generally do not provide good results. ")
res_array = []
for gene in dataset.index:
res = ranksums(
dataset.loc[gene, case_ids],
dataset.loc[gene, ctrl_ids]
)
res_array.append([gene, res.statistic, res.pvalue])
signature = pd.DataFrame(
res_array, columns=['Geneid', 'Statistic', 'Pvalue']
).set_index('Geneid')
signature['Significance'] = signature['Pvalue']
return signature.sort_values(by=['Statistic'], ascending=False)
for b in batches:
batch_signatures['ranksum'][b] = ranksum(
batch_profiles[b]['ctrls'],
batch_profiles[b]['perts'],
combined_expr_df
)
def ttest(ctrl_ids, case_ids, dataset):
res_array = []
for gene in dataset.index:
res = ttest_ind(
dataset.loc[gene, case_ids],
dataset.loc[gene, ctrl_ids],
equal_var = False
)
res_array.append([gene, res.statistic, res.pvalue])
signature = pd.DataFrame(
res_array, columns=['Geneid', 'Statistic', 'Pvalue']
).set_index('Geneid')
signature['Significance'] = signature['Pvalue']
return signature.sort_values(by=['Statistic'], ascending=False)
for b in batches:
batch_signatures['ttest'][b] = ttest(
batch_profiles[b]['ctrls'],
batch_profiles[b]['perts'],
combined_expr_df
)
# Function for computing signatures with fold change
def logFC(ctrl_ids, case_ids, dataset):
case_mean = dataset.loc[:, case_ids].mean(axis=1)
ctrl_mean = dataset.loc[:, ctrl_ids].mean(axis=1)
signature = case_mean / (ctrl_mean + 0.001)
signature_df = pd.DataFrame(
signature.apply(lambda x: log2(x+0.001)), columns=['logFC']
)
signature_df['Significance'] = signature_df['logFC'].apply(abs)
return signature_df.sort_values('logFC', ascending=False)
for b in batches:
batch_signatures['fc'][b] = logFC(
batch_profiles[b]['ctrls'],
batch_profiles[b]['perts'],
combined_expr_df
)
all_signatures = {}
all_signatures['cd_all']= pd.concat([
df['CD-coefficient'].rename(b) for (b, df) in batch_signatures['cd'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['limma_all'] = pd.concat([
df['t'].rename(b) for (b, df) in batch_signatures['limma'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['limma-voom_all'] = pd.concat([
df['t'].rename(b) for (b, df) in batch_signatures['limma-voom'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['ranksum_all'] = pd.concat([
df['Statistic'].rename(b) for (b, df) in batch_signatures['ranksum'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['ttest_all'] = pd.concat([
df['Statistic'].rename(b) for (b, df) in batch_signatures['ttest'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['modz_all'] = pd.concat([
df['MODZ'].rename(b) for (b, df) in batch_signatures['modz'].items()
], axis=1).sort_index().rename_axis('Gene')
all_signatures['fc_all'] = pd.concat([
df['logFC'].rename(b) for (b, df) in batch_signatures['fc'].items()
], axis=1).sort_index().rename_axis('Gene')
for k in all_signatures.keys():
method = k.split('_')[0].upper()
display(Markdown(f"All {method} batch signatures"))
display(all_signatures[k])
All CD batch signatures
CPC006_A549_24H | CPC006_PC3_24H | CPC009_MCF7_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_A375_24H | CPC009_A549_24H | CPC009_PC3_24H | CPC006_MCF7_24H | CPC006_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|
Gene | ||||||||||
A1CF | 0.000952 | 0.001854 | 0.008695 | -0.001396 | -0.007102 | 0.013102 | 0.005242 | 0.018031 | -0.001283 | -0.012737 |
A2M | 0.021682 | -0.026343 | 0.022046 | 0.002863 | -0.009718 | 0.003915 | -0.006546 | -0.018333 | -0.000499 | -0.006217 |
A4GALT | 0.006834 | 0.004920 | -0.006121 | 0.003770 | 0.000802 | 0.002384 | 0.000730 | 0.003674 | 0.004756 | 0.001069 |
A4GNT | -0.001011 | 0.003500 | -0.004465 | 0.002190 | -0.004268 | -0.002679 | 0.000148 | 0.006255 | 0.002543 | -0.001182 |
AAAS | -0.003752 | -0.003620 | -0.003427 | 0.001099 | -0.003757 | -0.005967 | -0.002238 | 0.002202 | 0.008218 | -0.003162 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZXDB | 0.003825 | -0.000073 | 0.000986 | 0.000702 | -0.004406 | 0.002464 | -0.001844 | -0.002293 | -0.000160 | -0.004288 |
ZXDC | 0.005625 | 0.004271 | 0.003541 | 0.002027 | 0.008632 | 0.001144 | 0.000540 | 0.000558 | 0.009406 | -0.002195 |
ZYX | -0.002841 | -0.014867 | -0.003315 | 0.001899 | -0.006242 | -0.013534 | -0.005301 | -0.018816 | 0.009712 | -0.002055 |
ZZEF1 | -0.005954 | -0.007576 | 0.011017 | -0.005430 | 0.006954 | 0.004589 | 0.002710 | -0.003639 | 0.004555 | -0.006276 |
ZZZ3 | -0.007250 | -0.001067 | -0.007591 | -0.002336 | 0.010721 | -0.004806 | -0.009311 | 0.003312 | -0.002183 | -0.014607 |
12327 rows × 10 columns
All LIMMA batch signatures
CPC006_A549_24H | CPC006_PC3_24H | CPC009_MCF7_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_A375_24H | CPC009_A549_24H | CPC009_PC3_24H | CPC006_MCF7_24H | CPC006_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|
Gene | ||||||||||
A1CF | -0.294205 | -0.823307 | 0.224118 | -1.345300 | -1.803227 | -0.457685 | 0.488978 | -0.066385 | -0.496867 | -1.385689 |
A2M | 1.287795 | -0.226417 | 0.060950 | 0.611259 | -1.071238 | 0.199035 | -0.957864 | -0.837714 | -1.592196 | -0.675840 |
A4GALT | 0.592904 | 0.990301 | 0.837572 | 1.536314 | -0.671224 | -0.084085 | 1.052068 | 0.306574 | 1.264702 | 1.458460 |
A4GNT | -0.143384 | -0.811679 | -0.182202 | -0.432298 | -1.127489 | -0.521193 | -0.875634 | -0.312889 | -0.982429 | -0.717047 |
AAAS | 0.086139 | 1.050528 | 0.469454 | 0.931823 | -0.123393 | 1.261981 | -0.698037 | 1.166193 | 1.790793 | 0.821752 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZXDB | 1.740872 | -0.855682 | 1.459864 | 1.128436 | -1.638648 | -1.103166 | 0.583538 | -1.900811 | -1.288176 | -0.708441 |
ZXDC | 0.207162 | -0.498437 | 0.114471 | -0.111600 | 0.964735 | 0.191507 | -0.366940 | -0.705952 | -0.582374 | -0.951470 |
ZYX | 0.515698 | -0.059634 | 0.061114 | 0.036428 | 0.500599 | 0.348942 | 0.680457 | 0.156431 | 0.123771 | 0.932210 |
ZZEF1 | -0.355968 | -0.811313 | 1.224806 | -1.059856 | -0.369694 | -0.557480 | -0.009333 | 0.326231 | -0.257675 | -1.265336 |
ZZZ3 | -0.409015 | -0.177657 | -2.337986 | 0.017358 | 0.614833 | 0.448167 | -0.225108 | 1.262405 | 0.031205 | -0.344133 |
12327 rows × 10 columns
All LIMMA-VOOM batch signatures
CPC006_A549_24H | CPC006_PC3_24H | CPC009_MCF7_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_A375_24H | CPC009_A549_24H | CPC009_PC3_24H | CPC006_MCF7_24H | CPC006_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|
Gene | ||||||||||
A1CF | -0.299872 | -0.723968 | 0.233817 | -1.151080 | -1.426047 | -0.432861 | 0.530413 | -0.065431 | -0.461192 | -1.057338 |
A2M | 1.195174 | -0.221229 | 0.060802 | 0.612243 | -1.045904 | 0.198667 | -0.985443 | -0.857667 | -1.256354 | -0.640593 |
A4GALT | 0.586769 | 0.992215 | 0.835355 | 1.541795 | -0.671289 | -0.084093 | 1.041888 | 0.306185 | 1.263357 | 1.457404 |
A4GNT | -0.146902 | -0.778560 | -0.182461 | -0.420382 | -1.089057 | -0.518001 | -0.803284 | -0.310522 | -0.957077 | -0.693777 |
AAAS | 0.086007 | 1.044281 | 0.467396 | 0.924893 | -0.123432 | 1.250010 | -0.701899 | 1.160633 | 1.762998 | 0.816807 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZXDB | 1.794068 | -0.842987 | 1.473848 | 1.158135 | -1.582549 | -1.086361 | 0.588070 | -1.859902 | -1.272800 | -0.697761 |
ZXDC | 0.209948 | -0.491523 | 0.114394 | -0.111469 | 0.963724 | 0.191458 | -0.350736 | -0.688386 | -0.581834 | -0.899001 |
ZYX | 0.511624 | -0.059739 | 0.060987 | 0.036396 | 0.494438 | 0.346584 | 0.666952 | 0.155801 | 0.123418 | 0.894256 |
ZZEF1 | -0.358077 | -0.809179 | 1.215608 | -1.062095 | -0.369780 | -0.559051 | -0.009334 | 0.325266 | -0.257917 | -1.270832 |
ZZZ3 | -0.412010 | -0.178105 | -2.364038 | 0.017409 | 0.612756 | 0.446982 | -0.225531 | 1.246159 | 0.031202 | -0.344768 |
12327 rows × 10 columns
All RANKSUM batch signatures
CPC006_A549_24H | CPC006_PC3_24H | CPC009_MCF7_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_A375_24H | CPC009_A549_24H | CPC009_PC3_24H | CPC006_MCF7_24H | CPC006_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|
Gene | ||||||||||
A1CF | -0.869339 | -0.653255 | 0.782913 | -1.569126 | -2.412502 | -0.417907 | 0.782361 | 0.777130 | 0.269830 | -2.063696 |
A2M | 1.217205 | -0.420350 | 0.079282 | 1.065474 | -1.368122 | 0.120335 | -1.262522 | -1.052777 | -2.055985 | -0.538814 |
A4GALT | 1.838812 | 1.132939 | 1.248697 | 1.542712 | -0.807251 | -0.227299 | 1.383402 | 0.227525 | 1.710192 | 1.742911 |
A4GNT | -0.660620 | -0.838930 | 0.052855 | 0.719418 | -1.996264 | 0.006530 | -0.970396 | 0.126637 | -1.091930 | -0.575701 |
AAAS | 0.020807 | 0.966747 | 0.670597 | 0.436577 | -0.350229 | 1.663231 | -1.302816 | 1.034626 | 2.517042 | 0.741692 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZXDB | 1.862220 | -1.568344 | 1.704571 | 1.396988 | -1.905869 | -1.035439 | 0.849516 | -2.128772 | -1.379944 | -1.143497 |
ZXDC | 0.598199 | -0.123685 | 0.361726 | 0.368019 | 1.300430 | 0.761498 | -0.530528 | -0.706636 | -0.254285 | -1.029542 |
ZYX | 0.518223 | -0.814134 | 0.044596 | -0.387013 | 0.241755 | -0.109141 | 1.020763 | -0.607014 | -0.164244 | 0.947864 |
ZZEF1 | -0.531877 | -0.807345 | 1.519579 | -1.007007 | -0.066010 | -0.208021 | -0.134311 | 0.497262 | 0.232875 | -1.667160 |
ZZZ3 | -1.148282 | -0.231429 | -1.641806 | -0.232980 | 0.516304 | 0.350121 | -0.282053 | 1.427624 | -0.231702 | -0.563844 |
12327 rows × 10 columns
All TTEST batch signatures
CPC006_A549_24H | CPC006_PC3_24H | CPC009_MCF7_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_A375_24H | CPC009_A549_24H | CPC009_PC3_24H | CPC006_MCF7_24H | CPC006_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|
Gene | ||||||||||
A1CF | -0.320777 | -3.318610 | 0.283788 | -2.466058 | -3.953539 | -0.815173 | 0.600913 | -0.305456 | -2.726108 | -4.637847 |
A2M | 1.029799 | -0.350408 | 0.055592 | 0.621131 | -1.315067 | 0.178960 | -9.727957 | -1.619311 | -2.894856 | -0.607508 |
A4GALT | 0.519774 | 1.478609 | 2.107487 | 2.554163 | -1.015374 | -0.187363 | 2.934027 | 0.825312 | 1.883732 | 2.333186 |
A4GNT | -0.246314 | -5.220164 | -0.197389 | -3.498468 | -3.900530 | -1.452731 | -1.084934 | -1.255108 | -4.730529 | -6.601442 |
AAAS | 0.051627 | 2.359116 | 0.993238 | 1.795143 | -0.165691 | 3.840168 | -1.004628 | 1.396759 | 4.285530 | 1.741888 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZXDB | 1.817554 | -1.892693 | 1.966396 | 1.818086 | -3.129890 | -1.477842 | 0.748831 | -3.603297 | -1.666687 | -0.594905 |
ZXDC | 0.166390 | -1.068104 | 0.161120 | -0.533967 | 1.703616 | 0.246233 | -2.464852 | -1.725556 | -1.192863 | -1.542090 |
ZYX | 0.591836 | -0.279141 | 0.127387 | -0.150160 | 0.773410 | 0.498327 | 13.014875 | 0.594875 | 0.128454 | 2.174616 |
ZZEF1 | -1.006280 | -2.492458 | 1.719959 | -2.763837 | -0.754950 | -2.341365 | -0.353694 | 0.462720 | -0.232539 | -3.994231 |
ZZZ3 | -1.103177 | -0.277392 | -1.241546 | -0.041727 | 1.173309 | 0.500864 | -0.571492 | 1.782880 | 0.015091 | -0.563517 |
12327 rows × 10 columns
All MODZ batch signatures
CPC006_A549_24H | CPC006_PC3_24H | CPC009_MCF7_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_A375_24H | CPC009_A549_24H | CPC009_PC3_24H | CPC006_MCF7_24H | CPC006_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|
Gene | ||||||||||
A1CF | 0.44 | 0.02 | 1.00 | -0.51 | -1.47 | -0.14 | 0.34 | 0.01 | 0.21 | -0.96 |
A2M | 1.64 | 0.39 | 0.18 | 0.85 | -0.08 | -0.19 | -0.72 | -0.47 | -0.39 | -0.41 |
A4GALT | 1.50 | 0.38 | 1.32 | -0.01 | -0.84 | -0.13 | 0.59 | -0.31 | 0.66 | 0.19 |
A4GNT | 0.63 | 0.04 | 0.75 | 0.38 | -1.15 | -0.61 | -0.71 | -0.31 | -0.24 | -1.03 |
AAAS | 0.12 | -0.15 | 0.75 | -0.30 | -0.62 | 0.25 | -1.42 | 0.20 | 0.38 | -0.85 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZXDB | 1.07 | -0.42 | 1.75 | 0.44 | -0.58 | -0.64 | 0.10 | -1.07 | -0.30 | -0.95 |
ZXDC | 0.97 | -0.12 | 0.83 | -0.06 | 0.12 | 0.10 | -0.39 | -0.58 | 0.16 | -0.87 |
ZYX | -0.23 | -0.39 | 0.06 | -0.07 | 0.03 | -0.58 | 0.65 | -0.28 | -0.10 | 0.19 |
ZZEF1 | 0.02 | -0.05 | 1.33 | -0.31 | -0.37 | 0.23 | -0.16 | -0.01 | -0.27 | -0.50 |
ZZZ3 | 0.11 | 0.08 | -1.91 | -0.16 | -0.35 | 0.02 | -0.42 | 0.76 | -0.21 | -0.16 |
12327 rows × 10 columns
All FC batch signatures
CPC006_A549_24H | CPC006_PC3_24H | CPC009_MCF7_24H | CPC006_HCC515_24H | CPC006_HT29_24H | CPC006_A375_24H | CPC009_A549_24H | CPC009_PC3_24H | CPC006_MCF7_24H | CPC006_HA1E_24H | |
---|---|---|---|---|---|---|---|---|---|---|
Gene | ||||||||||
A1CF | -0.039201 | -0.158875 | 0.025282 | -0.134439 | -0.184498 | -0.057449 | 0.054025 | -0.033161 | -0.090305 | -0.256182 |
A2M | 0.202267 | -0.024419 | 0.020151 | 0.052870 | -0.117227 | 0.017373 | -0.225873 | -0.114115 | -0.194216 | -0.086272 |
A4GALT | 0.031620 | 0.019167 | 0.049822 | 0.046336 | -0.013258 | -0.001729 | 0.064774 | 0.013017 | 0.028068 | 0.042684 |
A4GNT | -0.011731 | -0.071222 | -0.009831 | -0.052952 | -0.076852 | -0.028826 | -0.079883 | -0.024717 | -0.058964 | -0.074368 |
AAAS | 0.004224 | 0.023722 | 0.029721 | 0.032498 | -0.003322 | 0.043787 | -0.046597 | 0.040568 | 0.051654 | 0.038041 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZXDB | 0.045476 | -0.016062 | 0.065967 | 0.024613 | -0.031598 | -0.019754 | 0.023491 | -0.042219 | -0.017098 | -0.014245 |
ZXDC | 0.009077 | -0.019981 | 0.013045 | -0.009909 | 0.028926 | 0.003964 | -0.043277 | -0.036227 | -0.028414 | -0.053541 |
ZYX | 0.044630 | -0.006935 | 0.007790 | -0.006308 | 0.027131 | 0.014156 | 0.065377 | 0.007357 | 0.003019 | 0.062993 |
ZZEF1 | -0.027045 | -0.026066 | 0.077580 | -0.059606 | -0.017674 | -0.032865 | -0.005162 | 0.021032 | -0.004313 | -0.082674 |
ZZZ3 | -0.030140 | -0.004346 | -0.178028 | -0.000276 | 0.018240 | 0.014170 | -0.033473 | 0.068013 | 0.001494 | -0.013333 |
12327 rows × 10 columns
# Function to get Enrichr Results
def getEnrichrLibrary(library_name):
ENRICHR_URL = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={library_name}'
resp = requests.get(ENRICHR_URL)
if not resp.ok:
raise Exception(f"Error downloading {library_name} library from Enrichr, please try again.")
return resp.json()[library_name]['terms']
def getLibraryIter(libdict):
for k,v in libdict.items():
if type(v) == list:
yield k, v
else:
yield k, list(v.keys())
def enrich(gene_list, lib_json, name):
all_terms = list(lib_json.keys())
termranks = []
enrich_res = enrich_crisp(gene_list, getLibraryIter(lib_json), 20000, False)
enrich_res = [[r[0], r[1].pvalue] for r in enrich_res]
sorted_res = sorted(enrich_res, key=lambda x: x[1])
for i in range(len(sorted_res)):
termranks.append([name, sorted_res[i][0], i])
for t in set(all_terms).difference([x[1] for x in termranks]):
i+=1
termranks.append([name, t, i])
return pd.DataFrame(termranks, columns=['Gene_Set', 'Term', 'Rank'])
chea2022 = getEnrichrLibrary('ChEA_2022')
# Get gene lists to put into Enrichr
gene_lists = {}
for m in all_signatures.keys():
mname = m.split('_')[0]
gene_lists[mname] = {'up': {}, 'down': {}, 'combined': {}}
for col in all_signatures[m].columns:
gene_lists[mname]['up'][col] = all_signatures[m][col].sort_values(ascending=False).index.tolist()[:100]
gene_lists[mname]['down'][col] = all_signatures[m][col].sort_values(ascending=True).index.tolist()[:100]
gene_lists[mname]['combined'][col] = gene_lists[mname]['up'][col] + gene_lists[mname]['down'][col]
# Get results
chea2022_results = []
for m in gene_lists.keys():
for sig in gene_lists[m]['up'].keys():
chea2022_results.append(enrich(gene_lists[m]['up'][sig], chea2022, f"{sig}:{m}:up:ChEA 2022"))
chea2022_results.append(enrich(gene_lists[m]['down'][sig], chea2022, f"{sig}:{m}:down:ChEA 2022"))
chea2022_results.append(enrich(gene_lists[m]['combined'][sig], chea2022, f"{sig}:{m}:combined:ChEA 2022"))
chea2022_df = pd.concat(chea2022_results, axis=0)
targets = [
'NR3C1',
'NR1I2',
'NR0B1'
]
dex_chea2022_df = chea2022_df[chea2022_df['Term'].apply(lambda term: any(t in term for t in targets))]
dex_chea2022_df['Library'] = 'ChEA 2022'
def createResultsDf(df):
df['Method'] = df['Gene_Set'].apply(lambda x: x.split(':')[1])
df['Direction'] = df['Gene_Set'].apply(lambda x: x.split(':')[2])
df['Method_Direction'] = df.apply(lambda row: row.Method + ':' + row.Direction, axis=1)
df['TF'] = df['Term'].apply(lambda x: x.split(' ')[0].split('_')[0])
df['Cell'] = df['Gene_Set'].apply(lambda x: x.split(':')[0].split('_')[1])
df['Batch'] = df['Gene_Set'].apply(lambda x: x.split(':')[0])
createResultsDf(dex_chea2022_df)
full_df = dex_chea2022_df
up_df = full_df[full_df['Direction'] == 'up']
down_df = full_df[full_df['Direction'] == 'down']
combined_df = full_df[full_df['Direction'] == 'combined']
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for up genes from each method."))
display(up_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for down genes from each method."))
display(down_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))
display(Markdown(f"Mean ranks of Dexamethasone target TF terms for combined up and down genes from each method."))
display(combined_df.groupby(['TF', 'Library', 'Method']).mean(numeric_only=True).sort_values(['TF', 'Library', 'Rank']))
Mean ranks of Dexamethasone target TF terms for up genes from each method.
Rank | |||
---|---|---|---|
TF | Library | Method | |
NR0B1 | ChEA 2022 | cd | 172.600000 |
ttest | 182.900000 | ||
limma | 195.500000 | ||
ranksum | 210.200000 | ||
limma-voom | 217.000000 | ||
modz | 239.100000 | ||
fc | 269.700000 | ||
NR1I2 | ChEA 2022 | fc | 126.700000 |
cd | 129.200000 | ||
modz | 225.300000 | ||
ranksum | 248.500000 | ||
limma-voom | 260.400000 | ||
limma | 299.300000 | ||
ttest | 442.500000 | ||
NR3C1 | ChEA 2022 | fc | 255.416667 |
cd | 300.950000 | ||
limma-voom | 306.200000 | ||
modz | 320.450000 | ||
limma | 351.150000 | ||
ranksum | 356.833333 | ||
ttest | 447.183333 |
Mean ranks of Dexamethasone target TF terms for down genes from each method.
Rank | |||
---|---|---|---|
TF | Library | Method | |
NR0B1 | ChEA 2022 | ranksum | 135.100000 |
limma-voom | 156.600000 | ||
cd | 161.500000 | ||
limma | 226.600000 | ||
modz | 262.300000 | ||
ttest | 316.100000 | ||
fc | 354.700000 | ||
NR1I2 | ChEA 2022 | cd | 238.200000 |
ttest | 341.800000 | ||
modz | 364.800000 | ||
ranksum | 422.300000 | ||
limma | 429.300000 | ||
fc | 439.000000 | ||
limma-voom | 485.900000 | ||
NR3C1 | ChEA 2022 | fc | 255.983333 |
ttest | 329.416667 | ||
limma | 330.116667 | ||
cd | 344.450000 | ||
ranksum | 373.633333 | ||
limma-voom | 424.350000 | ||
modz | 435.383333 |
Mean ranks of Dexamethasone target TF terms for combined up and down genes from each method.
Rank | |||
---|---|---|---|
TF | Library | Method | |
NR0B1 | ChEA 2022 | limma-voom | 116.200000 |
ranksum | 123.600000 | ||
cd | 129.000000 | ||
limma | 171.300000 | ||
modz | 196.800000 | ||
ttest | 202.700000 | ||
fc | 306.200000 | ||
NR1I2 | ChEA 2022 | cd | 138.700000 |
fc | 251.000000 | ||
modz | 258.100000 | ||
ranksum | 334.100000 | ||
limma-voom | 355.900000 | ||
limma | 362.900000 | ||
ttest | 413.500000 | ||
NR3C1 | ChEA 2022 | fc | 205.333333 |
limma | 301.433333 | ||
cd | 307.450000 | ||
limma-voom | 336.583333 | ||
ranksum | 337.883333 | ||
modz | 353.650000 | ||
ttest | 397.333333 |
# bootstrap random results
random_arr_chea2022 = []
for i in range(10):
rand_100 = sample(combined_expr_df.index.tolist(), 100)
rand_200 = sample(combined_expr_df.index.tolist(), 200)
random_arr_chea2022.append(enrich(rand_100, chea2022, 'random:100'))
random_arr_chea2022.append(enrich(rand_200, chea2022, 'random:200'))
rand_chea2022_df = pd.concat(random_arr_chea2022, axis=0)
rand_chea2022_df['Library'] = 'ChEA 2022'
rand_chea2022_df['TF'] = rand_chea2022_df['Term'].apply(lambda x: x.split(' ')[0])
rand_df = rand_chea2022_df
rand_df = rand_df[rand_df['TF'].isin(['NR3C1', 'NR0B1', 'NR1I2'])]
rand_df['Method'] = 'random'
# with ChEA manual libraries
method_color_map = {
'random': 'black',
'cd': '#648FFF',
'limma': '#785EF0',
'limma-voom': '#DA79FF',
'fc': '#DC267F',
'ttest': '#FE6100',
'ranksum': '#FFB000',
'modz': '#009E73'
}
for tf in full_df['TF'].unique():
sub_df = full_df[full_df['TF'] == tf]
sub_df = sub_df[sub_df['Cell'] == 'A375']
sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
order = sub_df.groupby('Method_Direction').mean(numeric_only=True).sort_values('Rank').index.map(lambda x: x.split(':')[0]).unique()
sub_df['Method'] = pd.Categorical(sub_df['Method'], order)
sub_df['Direction'] = pd.Categorical(sub_df['Direction'], ['up', 'combined', 'down'])
sub_df = sub_df.sort_values(by=['Direction', 'Method'])
fig1 = go.Figure()
for method in sub_df['Method'].unique():
m_df = sub_df[sub_df['Method'] == method]
fig1.add_trace(
go.Box(
x=m_df[m_df['Method'] == method]['Direction'],
y=m_df[m_df['Method'] == method]['Rank'],
name=method.replace('fc', 'logfc'),
marker_color=method_color_map[method]
)
)
sub_rand_df = rand_df[rand_df['TF'] == tf]
fig1.add_trace(
go.Box(
x=sub_rand_df['Method'],
y=sub_rand_df['Rank'],
name='random',
marker_color='black'
)
)
fig1.update_layout(
width=800,
boxmode='group',
boxgap=0.1,
xaxis={
'title': {'text': 'Direction'},
},
yaxis={
'title': {'text': 'Gene Set Rank'}
},
legend_title_text='Method'
)
fig1.show("png")
if tf == 'NR3C1':
fig1.write_image('/Users/maayanlab/Documents/manuscripts/dex-benchmark/revised_figures/3D_300dpi.png', scale=(800/300))
method_color_map = {
'up': 'blue',
'down': 'red',
'combined': 'purple'
}
for tf in full_df['TF'].unique():
sub_df = full_df[full_df['TF'] == tf]
sub_df = sub_df[sub_df['Cell'] == 'A375']
sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
x_order = sub_df.groupby(['Method_Direction']).mean(numeric_only=True).sort_values('Rank').reset_index().rename_axis('Order').reset_index().set_index('Method_Direction')
x_order['Labels'] = x_order.index.map(lambda x: x.split(':')[0]).tolist()
fig1 = go.Figure()
for d in ['up', 'combined', 'down']:
d_df = sub_df[sub_df['Direction'] == d]
fig1.add_trace(
go.Box(
x=[x_order.loc[x, 'Order'] for x in d_df['Method_Direction']],
y=d_df['Rank'],
name=d,
marker_color=method_color_map[d]
)
)
sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
sub_rand_df['Order'] = x_order.shape[0] + 1
fig1.add_trace(
go.Box(
x=sub_rand_df['Order'],
y=sub_rand_df['Rank'],
name='random',
marker_color='black'
)
)
fig1.update_layout(
xaxis={
'title': {'text': 'Method'},
},
yaxis={
'title': {'text': 'Gene Set Rank'}
},
)
fig1.update_xaxes(
tickmode='array',
tickangle=45,
tickvals = x_order['Order'].tolist() + [x_order.shape[0] + 1],
ticktext=x_order['Labels'].tolist() + ['Random']
)
fig1.show("png")
method_color_map = {
'up': 'blue',
'down': 'red',
'combined': 'purple'
}
for tf in full_df['TF'].unique():
fig1 = go.Figure()
sub_df = full_df[full_df['TF'] == tf]
sub_df = sub_df[sub_df['Cell'] == 'A375']
sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'Wistar' not in x)]
order = sub_df.groupby(['Method_Direction']).mean(numeric_only=True).sort_values('Rank').index.map(lambda x: x.split(':')[0]).unique()
sub_df['Method'] = pd.Categorical(sub_df['Method'], order)
sub_df['Method'] = sub_df['Method'].apply(lambda x: x.replace('fc', 'logfc'))
sub_df = sub_df.sort_values(by=['Method'])
for d in ['up', 'combined', 'down']:
d_df = sub_df[sub_df['Direction'] == d]
fig1.add_trace(
go.Box(
x=d_df['Method'],
y=d_df['Rank'],
name=d,
marker_color=method_color_map[d]
)
)
sub_rand_df = rand_df[rand_df['TF'] == tf]
fig1.add_trace(
go.Box(
x=sub_rand_df['Method'],
y=sub_rand_df['Rank'],
name='random',
marker_color='black'
)
)
fig1.update_layout(
width=800,
boxmode='group',
boxgap=0.1,
xaxis={
'title': {'text': 'Method'},
},
yaxis={
'title': {'text': 'Gene Set Rank'}
},
legend_title_text="Direction"
)
fig1.show("png")
if tf=='NR3C1':
fig1.write_image('/Users/maayanlab/Documents/manuscripts/dex-benchmark/revised_figures/3C_300dpi.png', scale=(800/300))
method_color_map = {
'random': 'black',
'cd': '#648FFF',
'limma': '#785EF0',
'limma-voom': '#DA79FF',
'fc': '#DC267F',
'ttest': '#FE6100',
'ranksum': '#FFB000',
'modz': '#009E73'
}
for tf in full_df['TF'].unique():
sub_df = full_df[full_df['TF'] == tf]
sub_df = sub_df[sub_df['Cell'] == 'A375']
sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
fig1 = go.Figure()
for gs in sub_df.groupby(['Method']).mean(numeric_only=True).sort_values('Rank').index:
fig1.add_trace(
go.Box(
y=sub_df[sub_df['Method']==gs]['Rank'].tolist(),
name=gs.replace('fc', 'logfc'),
marker_color=method_color_map[gs]
)
)
sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
fig1.add_trace(
go.Box(
y=sub_rand_df['Rank'].tolist(),
name='random',
marker_color='black'
)
)
fig1.update_layout(
title_text=f"{tf} ChEA 2022 Term Rankings for Gene Sets from A375 Cell Line",
xaxis={
'title': {'text': 'Method'},
},
yaxis={
'title': {'text': 'Rank'}
},
showlegend=False
)
fig1.show("png")
# with ChEA manual libraries
method_color_map = {
'up': 'blue',
'down': 'red',
'combined': 'purple'
}
for tf in full_df['TF'].unique():
sub_df = full_df[full_df['TF'] == tf]
sub_df = sub_df[sub_df['Cell'] == 'A375']
sub_df = sub_df[sub_df['Term'].apply(lambda x: 'MLEC' not in x and 'WistarRat' not in x)]
fig1 = go.Figure()
for gs in sub_df.groupby(['Direction']).mean(numeric_only=True).sort_values('Rank').index:
fig1.add_trace(
go.Box(
y=sub_df[sub_df['Direction']==gs]['Rank'].tolist(),
name=gs,
marker_color=method_color_map[gs]
)
)
sub_rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == tf]
fig1.add_trace(
go.Box(
y=sub_rand_df['Rank'].tolist(),
name='random',
marker_color='black'
)
)
fig1.update_layout(
title_text=f"{tf} ChEA 2022 Term Rankings for Gene Sets from A375 Cell Line",
xaxis={
'title': {'text': 'Direction of Gene Set'},
},
yaxis={
'title': {'text': 'Rank'}
},
showlegend=False
)
fig1.show("png")
chea2022_genesets = {
t: chea2022[t] for t in dex_chea2022_df['Term'].unique() if ('MLEC' not in t and 'Wistar' not in t)
}
def geneSetBridgePlot(gs, method):
y = []
for col in all_signatures[f"{method}_all"].columns:
if method in ['fc', 'cd', 'modz']:
sig = batch_signatures[method][col]['Significance'].sort_values(ascending=False)
else:
sig = batch_signatures[method][col]['Significance'].sort_values(ascending=True)
select = pd.Series(
[x.upper() in gs for x in sig.index.tolist()]
)
_, y_temp = bridge_plot(select)
y.append(y_temp)
y_mean = np.mean(np.array(y), axis=0)
return y_mean
def methodBridgePlot(gsdict, tf, method):
y = []
for gs in gsdict.keys():
if tf not in gs:
continue
y_temp = geneSetBridgePlot(gsdict[gs], method)
y.append(y_temp)
if len(y) == 0:
return y
else:
y_mean = np.mean(np.array(y), axis=0)
return y_mean
def randomBridgePlot(gsdict, tf):
y = []
for gs in gsdict.keys():
if tf not in gs:
continue
y_gs = []
for _ in range(10):
rand_sig = sample(
combined_expr_df.index.tolist(),
combined_expr_df.shape[0]
)
select = pd.Series([
x.upper() in gsdict[gs] for x in rand_sig
])
_, y_temp = bridge_plot(select)
y_gs.append(y_temp)
y.append(np.mean(np.array(y_gs), axis=0))
if len(y) == 0:
return y
else:
y_mean = np.mean(np.array(y), axis=0)
return y_mean
def tfBridgePlot(tf, lib_genesets, libnames, leading_edge = False):
fig = plt.figure()
if leading_edge:
fig = plt.figure(figsize=(5,5))
else:
fig = plt.figure(figsize=(8,5))
for method in batch_signatures.keys():
x = np.arange(all_signatures[f"{method}_all"].shape[0] + 1)
y = []
for lg in lib_genesets:
y1 = methodBridgePlot(lg, tf, method)
if len(y1) != 0:
y.append(y1)
y_mean = np.mean(np.array(y), axis=0)
if leading_edge:
x_mean = x/len(x)
x_mean = x_mean[x_mean < 0.02]
plt.plot(x_mean, y_mean[:len(x_mean)], label=method.split('_')[0])
else:
plt.plot(x/len(x), y_mean, label=method.split('_')[0])
rand_x = np.arange(all_signatures[f"{method}_all"].shape[0] + 1)
rand_y = []
for lg in lib_genesets:
rand_y1 = randomBridgePlot(lg, tf)
if len(rand_y1) != 0:
rand_y.append(rand_y1)
rand_y_mean = np.mean(np.array(rand_y), axis=0)
if leading_edge:
rand_x_mean = rand_x/len(rand_x)
rand_x_mean = rand_x_mean[rand_x_mean < 0.02]
plt.plot(rand_x_mean, rand_y_mean[:len(rand_x_mean)], label='random', color='gray')
else:
plt.plot(rand_x/len(rand_x), rand_y_mean, label='random', color='gray')
plt.axhline(y=0, color='black', linestyle='dashed')
if leading_edge:
plt.legend(bbox_to_anchor=(1,1))
else:
plt.legend(bbox_to_anchor=(1, 1))
locs, labels = plt.xticks() # Get the current locations and labels.
if leading_edge:
plt.xticks(np.arange(0, 0.02, step=0.005))
plt.xlabel('Rank')
plt.ylabel('D(r) - r')
plt.title(f"{tf} Target Gene Rankings for L1000 Dex Perturbation Signatures\nLibraries: {', '.join(libnames)}")
plt.show()
Ranking by absolute expression value
tfBridgePlot('NR3C1', [chea2022], ['ChEA 2022'])
<Figure size 432x288 with 0 Axes>
tfBridgePlot('NR3C1', [chea2022], ['ChEA 2022'], True)
<Figure size 432x288 with 0 Axes>
rank_comparisons = {
b: pd.concat([
all_signatures[method][b].rename(method.split('_')[0]).rank() for method in all_signatures.keys()
], axis=1).dropna(axis=0).sort_index(axis=1)
for b in batches}
expr_comparisons = {
b: pd.concat([
all_signatures[method][b].rename(method.split('_')[0]) for method in all_signatures.keys()
], axis=1).dropna(axis=0).sort_index(axis=1)
for b in batches}
method_cols = rank_comparisons[list(batches)[0]].columns
sns.set(font_scale=1.2)
expr_correlations = {
b: 1 - pairwise_distances(m.T, metric='cosine') for b,m in expr_comparisons.items()
}
for _,m in expr_correlations.items():
np.fill_diagonal(m, 1)
expr_mean_correlations = np.zeros(shape=expr_correlations[list(batches)[0]].shape)
for i in range(expr_mean_correlations.shape[0]):
for j in range(expr_mean_correlations.shape[1]):
expr_mean_correlations[i,j] = np.mean([m[i,j] for _,m in expr_correlations.items()])
hm, hm_ax = plt.subplots(figsize=(10,8))
sns.heatmap(pd.DataFrame(
expr_mean_correlations, index=method_cols, columns=method_cols
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()
rank_correlations = {
b: 1 - pairwise_distances(m.T, metric='cosine') for b,m in rank_comparisons.items()
}
for _,m in rank_correlations.items():
np.fill_diagonal(m, 1)
rank_mean_correlations = np.zeros(shape=rank_correlations[list(batches)[0]].shape)
for i in range(rank_mean_correlations.shape[0]):
for j in range(rank_mean_correlations.shape[1]):
rank_mean_correlations[i,j] = np.mean([m[i,j] for _,m in rank_correlations.items()])
hm, hm_ax = plt.subplots(figsize=(10,8))
sns.heatmap(pd.DataFrame(
rank_mean_correlations, index=method_cols, columns=method_cols
), cmap=sns.cm.rocket_r, annot=True)
hm_ax.xaxis.tick_top()