{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Benchmarking the Recovery of Known Drug Targets from L1000 CRISPR KO Data: NR1I2 Version" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import plotly.graph_objects as go\n", "from IPython.display import display, Markdown\n", "import json\n", "import requests\n", "import time\n", "from random import sample\n", "from math import log2\n", "from maayanlab_bioinformatics.dge import characteristic_direction, limma_voom\n", "from maayanlab_bioinformatics.plotting.bridge import bridge_plot\n", "from maayanlab_bioinformatics.enrichment.crisp import enrich_crisp\n", "from pydeseq2.dds import DeseqDataSet\n", "from pydeseq2.ds import DeseqStats\n", "import matplotlib.pyplot as plt\n", "from os.path import exists\n", "from scipy.stats import ttest_ind, ranksums\n", "import h5py" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Load in Data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Set KO gene\n", "ko_gene = 'NR1I2'\n", "\n", "# Set working directory\n", "l1000_data_dir = '../L1000_data'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96H_X1_B35:F08XPR010_A375.311_96H_X2_B35:F08XPR010_A375.311_96H_X3_B35:F08XPR010_A375.311_96H_X1_B35:F24XPR010_A375.311_96H_X2_B35:F24XPR010_A375.311_96H_X3_B35:F24XPR010_A549.311_96H_X1.L2_B36:F08XPR010_A549.311_96H_X3_B35:F08XPR010_A549.311_96H_X1.L2_B36:F24XPR010_A549.311_96H_X3_B35:F24...XPR010_U251MG.311_96H_X3_B35:F08XPR010_U251MG.311_96H_X1_B35:F24XPR010_U251MG.311_96H_X2_B35:F24XPR010_U251MG.311_96H_X3_B35:F24XPR010_YAPC.311_96H_X1_B35:F08XPR010_YAPC.311_96H_X2_B35:F08XPR010_YAPC.311_96H_X3_B35:F08XPR010_YAPC.311_96H_X1_B35:F24XPR010_YAPC.311_96H_X2_B35:F24XPR010_YAPC.311_96H_X3_B35:F24
symbol
DDR16.1584506.2022006.2856506.2495006.0711006.2255756.033406.263106.292856.52600...6.410306.84496.6989006.505806.5530507.5495756.846306.7804256.9518756.821550
PAX86.0117756.1062005.6649005.6657505.4719505.7806004.992255.300804.624104.82470...5.230204.50785.1270005.005605.4455255.1536004.259305.0352504.9375504.004100
GUCA1A4.8354004.9381754.8424004.6977254.8726504.8264505.216205.171454.993705.02665...4.892954.76724.8065004.921655.2905005.6226005.891555.5743505.2088005.685100
EPHB36.6325006.9714006.5743006.7536506.4275506.7698008.296507.510507.673208.04430...8.034808.95097.9903007.938807.6752507.3968506.478657.8331507.9672007.705101
ESRRA7.8197007.8589008.4977258.2729008.0563517.8913008.736308.281708.842408.22250...7.253006.85917.1230017.136308.6130508.4407507.941308.8991518.5617508.045325
\n", "

5 rows × 57 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H_X1_B35:F08 XPR010_A375.311_96H_X2_B35:F08 \\\n", "symbol \n", "DDR1 6.158450 6.202200 \n", "PAX8 6.011775 6.106200 \n", "GUCA1A 4.835400 4.938175 \n", "EPHB3 6.632500 6.971400 \n", "ESRRA 7.819700 7.858900 \n", "\n", " XPR010_A375.311_96H_X3_B35:F08 XPR010_A375.311_96H_X1_B35:F24 \\\n", "symbol \n", "DDR1 6.285650 6.249500 \n", "PAX8 5.664900 5.665750 \n", "GUCA1A 4.842400 4.697725 \n", "EPHB3 6.574300 6.753650 \n", "ESRRA 8.497725 8.272900 \n", "\n", " XPR010_A375.311_96H_X2_B35:F24 XPR010_A375.311_96H_X3_B35:F24 \\\n", "symbol \n", "DDR1 6.071100 6.225575 \n", "PAX8 5.471950 5.780600 \n", "GUCA1A 4.872650 4.826450 \n", "EPHB3 6.427550 6.769800 \n", "ESRRA 8.056351 7.891300 \n", "\n", " XPR010_A549.311_96H_X1.L2_B36:F08 XPR010_A549.311_96H_X3_B35:F08 \\\n", "symbol \n", "DDR1 6.03340 6.26310 \n", "PAX8 4.99225 5.30080 \n", "GUCA1A 5.21620 5.17145 \n", "EPHB3 8.29650 7.51050 \n", "ESRRA 8.73630 8.28170 \n", "\n", " XPR010_A549.311_96H_X1.L2_B36:F24 XPR010_A549.311_96H_X3_B35:F24 \\\n", "symbol \n", "DDR1 6.29285 6.52600 \n", "PAX8 4.62410 4.82470 \n", "GUCA1A 4.99370 5.02665 \n", "EPHB3 7.67320 8.04430 \n", "ESRRA 8.84240 8.22250 \n", "\n", " ... XPR010_U251MG.311_96H_X3_B35:F08 \\\n", "symbol ... \n", "DDR1 ... 6.41030 \n", "PAX8 ... 5.23020 \n", "GUCA1A ... 4.89295 \n", "EPHB3 ... 8.03480 \n", "ESRRA ... 
7.25300 \n", "\n", " XPR010_U251MG.311_96H_X1_B35:F24 XPR010_U251MG.311_96H_X2_B35:F24 \\\n", "symbol \n", "DDR1 6.8449 6.698900 \n", "PAX8 4.5078 5.127000 \n", "GUCA1A 4.7672 4.806500 \n", "EPHB3 8.9509 7.990300 \n", "ESRRA 6.8591 7.123001 \n", "\n", " XPR010_U251MG.311_96H_X3_B35:F24 XPR010_YAPC.311_96H_X1_B35:F08 \\\n", "symbol \n", "DDR1 6.50580 6.553050 \n", "PAX8 5.00560 5.445525 \n", "GUCA1A 4.92165 5.290500 \n", "EPHB3 7.93880 7.675250 \n", "ESRRA 7.13630 8.613050 \n", "\n", " XPR010_YAPC.311_96H_X2_B35:F08 XPR010_YAPC.311_96H_X3_B35:F08 \\\n", "symbol \n", "DDR1 7.549575 6.84630 \n", "PAX8 5.153600 4.25930 \n", "GUCA1A 5.622600 5.89155 \n", "EPHB3 7.396850 6.47865 \n", "ESRRA 8.440750 7.94130 \n", "\n", " XPR010_YAPC.311_96H_X1_B35:F24 XPR010_YAPC.311_96H_X2_B35:F24 \\\n", "symbol \n", "DDR1 6.780425 6.951875 \n", "PAX8 5.035250 4.937550 \n", "GUCA1A 5.574350 5.208800 \n", "EPHB3 7.833150 7.967200 \n", "ESRRA 8.899151 8.561750 \n", "\n", " XPR010_YAPC.311_96H_X3_B35:F24 \n", "symbol \n", "DDR1 6.821550 \n", "PAX8 4.004100 \n", "GUCA1A 5.685100 \n", "EPHB3 7.705101 \n", "ESRRA 8.045325 \n", "\n", "[5 rows x 57 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Prefer the cached expression matrix; rebuild it from the manifest only when the cache cannot be read.\n", "try:\n", "    expr_df = pd.read_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv\", sep='\\t', index_col=0)\n", "except Exception:\n", "    l1000_data_df = pd.read_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_data.tsv\", sep='\\t')\n", "\n", "    l1000_data_list = []\n", "    l1000_meta_list = []\n", "    for row in l1000_data_df.itertuples():\n", "        try:\n", "            temp_df = pd.read_csv(row.persistent_id, sep='\\t', index_col=0)\n", "        # Exception, not a bare except: a kernel interrupt must still be able to stop this download loop.\n", "        except Exception:\n", "            print(f\"Unable to access data from row {row.Index} at {row.persistent_id}\")\n", "            continue\n", "        # One metadata record per downloaded profile column: [profile id] + the manifest row's values.\n", "        row_values = l1000_data_df.loc[row.Index].tolist()\n", "        l1000_meta_list.extend([col] + row_values for col in temp_df.columns)\n", "        l1000_data_list.append(temp_df)\n", "    expr_df = pd.concat(l1000_data_list, axis=1)\n", "\n", 
"expr_df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File at '../L1000_data/NR1I2_L1000_CRISPRKO_fulldata.tsv' already exists!\n" ] } ], "source": [ "if not exists(f\"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv\"): \n", " expr_df.to_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv\", sep='\\t', index=True)\n", "else: \n", " print(f\"File at '{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_fulldata.tsv' already exists!\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tissuediseasecell_linepert_namepert_timepert_typedata_levelcreation_timepersistent_idpert_dosebatch
id
XPR010_A375.311_96H_X1_B35:F08skin of bodymelanomaA375.311NR1I296 hCRISPR Knockout32021-01-21https://lincs-dcic.s3.amazonaws.com/LINCS-data...NaNXPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:F08skin of bodymelanomaA375.311NR1I296 hCRISPR Knockout32021-01-21https://lincs-dcic.s3.amazonaws.com/LINCS-data...NaNXPR010_A375.311_96H
XPR010_A375.311_96H_X3_B35:F08skin of bodymelanomaA375.311NR1I296 hCRISPR Knockout32021-01-21https://lincs-dcic.s3.amazonaws.com/LINCS-data...NaNXPR010_A375.311_96H
XPR010_A375.311_96H_X1_B35:F24skin of bodymelanomaA375.311NR1I296 hCRISPR Knockout32021-01-21https://lincs-dcic.s3.amazonaws.com/LINCS-data...NaNXPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:F24skin of bodymelanomaA375.311NR1I296 hCRISPR Knockout32021-01-21https://lincs-dcic.s3.amazonaws.com/LINCS-data...NaNXPR010_A375.311_96H
\n", "
" ], "text/plain": [ " tissue disease cell_line pert_name \\\n", "id \n", "XPR010_A375.311_96H_X1_B35:F08 skin of body melanoma A375.311 NR1I2 \n", "XPR010_A375.311_96H_X2_B35:F08 skin of body melanoma A375.311 NR1I2 \n", "XPR010_A375.311_96H_X3_B35:F08 skin of body melanoma A375.311 NR1I2 \n", "XPR010_A375.311_96H_X1_B35:F24 skin of body melanoma A375.311 NR1I2 \n", "XPR010_A375.311_96H_X2_B35:F24 skin of body melanoma A375.311 NR1I2 \n", "\n", " pert_time pert_type data_level \\\n", "id \n", "XPR010_A375.311_96H_X1_B35:F08 96 h CRISPR Knockout 3 \n", "XPR010_A375.311_96H_X2_B35:F08 96 h CRISPR Knockout 3 \n", "XPR010_A375.311_96H_X3_B35:F08 96 h CRISPR Knockout 3 \n", "XPR010_A375.311_96H_X1_B35:F24 96 h CRISPR Knockout 3 \n", "XPR010_A375.311_96H_X2_B35:F24 96 h CRISPR Knockout 3 \n", "\n", " creation_time \\\n", "id \n", "XPR010_A375.311_96H_X1_B35:F08 2021-01-21 \n", "XPR010_A375.311_96H_X2_B35:F08 2021-01-21 \n", "XPR010_A375.311_96H_X3_B35:F08 2021-01-21 \n", "XPR010_A375.311_96H_X1_B35:F24 2021-01-21 \n", "XPR010_A375.311_96H_X2_B35:F24 2021-01-21 \n", "\n", " persistent_id \\\n", "id \n", "XPR010_A375.311_96H_X1_B35:F08 https://lincs-dcic.s3.amazonaws.com/LINCS-data... \n", "XPR010_A375.311_96H_X2_B35:F08 https://lincs-dcic.s3.amazonaws.com/LINCS-data... \n", "XPR010_A375.311_96H_X3_B35:F08 https://lincs-dcic.s3.amazonaws.com/LINCS-data... \n", "XPR010_A375.311_96H_X1_B35:F24 https://lincs-dcic.s3.amazonaws.com/LINCS-data... \n", "XPR010_A375.311_96H_X2_B35:F24 https://lincs-dcic.s3.amazonaws.com/LINCS-data... 
\n", "\n", " pert_dose batch \n", "id \n", "XPR010_A375.311_96H_X1_B35:F08 NaN XPR010_A375.311_96H \n", "XPR010_A375.311_96H_X2_B35:F08 NaN XPR010_A375.311_96H \n", "XPR010_A375.311_96H_X3_B35:F08 NaN XPR010_A375.311_96H \n", "XPR010_A375.311_96H_X1_B35:F24 NaN XPR010_A375.311_96H \n", "XPR010_A375.311_96H_X2_B35:F24 NaN XPR010_A375.311_96H " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Prefer the cached metadata table; otherwise assemble it from the records collected while downloading profiles.\n", "try:\n", "    meta_df = pd.read_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv\", sep='\\t', index_col=0)\n", "except Exception:\n", "    try:\n", "        meta_df = pd.DataFrame(l1000_meta_list, columns=['id'] + l1000_data_df.columns.tolist()).set_index('id')\n", "    except NameError as err:\n", "        # l1000_meta_list / l1000_data_df are only created when the expression-loading cell had to download profiles.\n", "        raise RuntimeError(\n", "            \"Metadata cache could not be read and no per-profile metadata was collected in this session; \"\n", "            \"remove the cached CRISPRKO fulldata file and rerun the expression-loading cell to rebuild it.\"\n", "        ) from err\n", "# Derive the batch label from the first three underscore-separated fields of the profile id when it is not stored.\n", "if 'batch' not in meta_df.columns:\n", "    meta_df['batch'] = meta_df.index.map(lambda x: '_'.join(x.split('_')[:3]))\n", "meta_df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File at '../L1000_data/NR1I2_L1000_CRISPRKO_metadata.tsv' already exists!\n" ] } ], "source": [ "if not exists(f\"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv\"): \n", " meta_df.to_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv\", sep='\\t', index=True)\n", "else: \n", " print(f\"File at '{l1000_data_dir}/{ko_gene}_L1000_CRISPRKO_metadata.tsv' already exists!\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batches = meta_df['batch'].unique()\n", "len(batches)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Load in Control Data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "set()" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ctrl_data_df = pd.read_csv(f\"{l1000_data_dir}/L1000_Controls.tsv\", 
sep='\\t')\n", "ctrl_data_df = ctrl_data_df[ctrl_data_df['batch'].isin(batches)]\n", "set(batches).difference(ctrl_data_df['batch'])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
local_idpersistent_idbatch
8910L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_A06_...https://lincs-dcic.s3.amazonaws.com/LINCS-data...XPR010_A375.311_96H
8911L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D05_...https://lincs-dcic.s3.amazonaws.com/LINCS-data...XPR010_A375.311_96H
8912L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D20_...https://lincs-dcic.s3.amazonaws.com/LINCS-data...XPR010_A375.311_96H
8913L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D21_...https://lincs-dcic.s3.amazonaws.com/LINCS-data...XPR010_A375.311_96H
8914L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_E13_...https://lincs-dcic.s3.amazonaws.com/LINCS-data...XPR010_A375.311_96H
\n", "
" ], "text/plain": [ " local_id \\\n", "8910 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_A06_... \n", "8911 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D05_... \n", "8912 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D20_... \n", "8913 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_D21_... \n", "8914 L1000_LINCS_DCIC_2021_XPR010_A375.311_96H_E13_... \n", "\n", " persistent_id batch \n", "8910 https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H \n", "8911 https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H \n", "8912 https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H \n", "8913 https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H \n", "8914 https://lincs-dcic.s3.amazonaws.com/LINCS-data... XPR010_A375.311_96H " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ctrl_data_df.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "The following step extracts all control profiles from the same batch as the profiles of interest, and may take up to a few minutes to complete. 
" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Prefer the cached control tables; rebuild both from the control manifest if either cache cannot be read.\n", "try:\n", "    ctrl_expr_df = pd.read_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv\", sep='\\t', index_col=0)\n", "    ctrl_meta_df = pd.read_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv\", sep='\\t', index_col=0)\n", "except Exception:\n", "    ctrl_data_list = []\n", "    ctrl_meta_list = []\n", "    for row in ctrl_data_df.itertuples():\n", "        try:\n", "            temp_df = pd.read_csv(row.persistent_id, sep='\\t', index_col=0)\n", "        # Exception, not a bare except: a kernel interrupt must still be able to stop this download loop.\n", "        except Exception:\n", "            print(f\"Unable to access data from row {row.Index} at {row.persistent_id}\")\n", "            continue\n", "        # One [profile id, batch] record per downloaded control profile column.\n", "        ctrl_meta_list.extend([col, row.batch] for col in temp_df.columns)\n", "        ctrl_data_list.append(temp_df)\n", "\n", "    ctrl_expr_df = pd.concat(ctrl_data_list, axis=1)\n", "    ctrl_meta_df = pd.DataFrame(ctrl_meta_list, columns=['id', 'batch']).set_index('id')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96H_X1_B35:A06XPR010_A375.311_96H_X2_B35:A06XPR010_A375.311_96H_X3_B35:A06XPR010_A375.311_96H_X1_B35:D05XPR010_A375.311_96H_X2_B35:D05XPR010_A375.311_96H_X3_B35:D05XPR010_A375.311_96H_X1_B35:D20XPR010_A375.311_96H_X2_B35:D20XPR010_A375.311_96H_X3_B35:D20XPR010_A375.311_96H_X1_B35:D21...XPR010_YAPC.311_96H_X3_B35:E14XPR010_YAPC.311_96H_X1_B35:F04XPR010_YAPC.311_96H_X2_B35:F04XPR010_YAPC.311_96H_X3_B35:F04XPR010_YAPC.311_96H_X1_B35:H24XPR010_YAPC.311_96H_X2_B35:H24XPR010_YAPC.311_96H_X3_B35:H24XPR010_YAPC.311_96H_X1_B35:K17XPR010_YAPC.311_96H_X2_B35:K17XPR010_YAPC.311_96H_X3_B35:K17
symbol
NAT27.2605757.2808757.020106.9668017.453057.798657.349757.1651007.7081757.48150...12.60108.7739518.0212012.1571507.6912997.31155012.0857518.3623509.2005511.97370
ADA5.0647005.2592505.224355.1443005.382955.030755.070755.1990754.9320255.28315...4.66444.8064504.669503.3913505.9727505.6158754.1375506.1330754.753404.23540
CDH25.7067755.8220505.694305.8530005.641555.711605.987155.8001505.6712005.80255...6.07815.7412505.865955.9157255.2943006.1798006.1611005.7363005.835505.75765
AKT30.8968502.5213002.376851.2959503.116352.511952.383301.7902501.2959503.40630...0.00000.0000000.000000.0000000.0000000.0000000.0000000.0000000.000000.00000
MED65.8840505.7572505.778255.5893506.057505.444705.639805.9297005.8328755.86350...6.26605.6057006.175155.8495005.8514006.1683505.6623005.8713756.375356.18240
\n", "

5 rows × 255 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H_X1_B35:A06 XPR010_A375.311_96H_X2_B35:A06 \\\n", "symbol \n", "NAT2 7.260575 7.280875 \n", "ADA 5.064700 5.259250 \n", "CDH2 5.706775 5.822050 \n", "AKT3 0.896850 2.521300 \n", "MED6 5.884050 5.757250 \n", "\n", " XPR010_A375.311_96H_X3_B35:A06 XPR010_A375.311_96H_X1_B35:D05 \\\n", "symbol \n", "NAT2 7.02010 6.966801 \n", "ADA 5.22435 5.144300 \n", "CDH2 5.69430 5.853000 \n", "AKT3 2.37685 1.295950 \n", "MED6 5.77825 5.589350 \n", "\n", " XPR010_A375.311_96H_X2_B35:D05 XPR010_A375.311_96H_X3_B35:D05 \\\n", "symbol \n", "NAT2 7.45305 7.79865 \n", "ADA 5.38295 5.03075 \n", "CDH2 5.64155 5.71160 \n", "AKT3 3.11635 2.51195 \n", "MED6 6.05750 5.44470 \n", "\n", " XPR010_A375.311_96H_X1_B35:D20 XPR010_A375.311_96H_X2_B35:D20 \\\n", "symbol \n", "NAT2 7.34975 7.165100 \n", "ADA 5.07075 5.199075 \n", "CDH2 5.98715 5.800150 \n", "AKT3 2.38330 1.790250 \n", "MED6 5.63980 5.929700 \n", "\n", " XPR010_A375.311_96H_X3_B35:D20 XPR010_A375.311_96H_X1_B35:D21 ... \\\n", "symbol ... \n", "NAT2 7.708175 7.48150 ... \n", "ADA 4.932025 5.28315 ... \n", "CDH2 5.671200 5.80255 ... \n", "AKT3 1.295950 3.40630 ... \n", "MED6 5.832875 5.86350 ... 
\n", "\n", " XPR010_YAPC.311_96H_X3_B35:E14 XPR010_YAPC.311_96H_X1_B35:F04 \\\n", "symbol \n", "NAT2 12.6010 8.773951 \n", "ADA 4.6644 4.806450 \n", "CDH2 6.0781 5.741250 \n", "AKT3 0.0000 0.000000 \n", "MED6 6.2660 5.605700 \n", "\n", " XPR010_YAPC.311_96H_X2_B35:F04 XPR010_YAPC.311_96H_X3_B35:F04 \\\n", "symbol \n", "NAT2 8.02120 12.157150 \n", "ADA 4.66950 3.391350 \n", "CDH2 5.86595 5.915725 \n", "AKT3 0.00000 0.000000 \n", "MED6 6.17515 5.849500 \n", "\n", " XPR010_YAPC.311_96H_X1_B35:H24 XPR010_YAPC.311_96H_X2_B35:H24 \\\n", "symbol \n", "NAT2 7.691299 7.311550 \n", "ADA 5.972750 5.615875 \n", "CDH2 5.294300 6.179800 \n", "AKT3 0.000000 0.000000 \n", "MED6 5.851400 6.168350 \n", "\n", " XPR010_YAPC.311_96H_X3_B35:H24 XPR010_YAPC.311_96H_X1_B35:K17 \\\n", "symbol \n", "NAT2 12.085751 8.362350 \n", "ADA 4.137550 6.133075 \n", "CDH2 6.161100 5.736300 \n", "AKT3 0.000000 0.000000 \n", "MED6 5.662300 5.871375 \n", "\n", " XPR010_YAPC.311_96H_X2_B35:K17 XPR010_YAPC.311_96H_X3_B35:K17 \n", "symbol \n", "NAT2 9.20055 11.97370 \n", "ADA 4.75340 4.23540 \n", "CDH2 5.83550 5.75765 \n", "AKT3 0.00000 0.00000 \n", "MED6 6.37535 6.18240 \n", "\n", "[5 rows x 255 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ctrl_expr_df.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batch
id
XPR010_A375.311_96H_X1_B35:A06XPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:A06XPR010_A375.311_96H
XPR010_A375.311_96H_X3_B35:A06XPR010_A375.311_96H
XPR010_A375.311_96H_X1_B35:D05XPR010_A375.311_96H
XPR010_A375.311_96H_X2_B35:D05XPR010_A375.311_96H
\n", "
" ], "text/plain": [ " batch\n", "id \n", "XPR010_A375.311_96H_X1_B35:A06 XPR010_A375.311_96H\n", "XPR010_A375.311_96H_X2_B35:A06 XPR010_A375.311_96H\n", "XPR010_A375.311_96H_X3_B35:A06 XPR010_A375.311_96H\n", "XPR010_A375.311_96H_X1_B35:D05 XPR010_A375.311_96H\n", "XPR010_A375.311_96H_X2_B35:D05 XPR010_A375.311_96H" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ctrl_meta_df.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File at '../L1000_data/NR1I2_L1000_Controls_fulldata.tsv' already exists!\n" ] } ], "source": [ "if not exists(f\"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv\"): \n", " ctrl_expr_df.to_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv\", sep='\\t', index=True)\n", "else: \n", " print(f\"File at '{l1000_data_dir}/{ko_gene}_L1000_Controls_fulldata.tsv' already exists!\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File at '../L1000_data/NR1I2_L1000_Controls_metadata.tsv' already exists!\n" ] } ], "source": [ "if not exists(f\"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv\"): \n", " ctrl_meta_df.to_csv(f\"{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv\", sep='\\t', index=True)\n", "else: \n", " print(f\"File at '{l1000_data_dir}/{ko_gene}_L1000_Controls_metadata.tsv' already exists!\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Process Data" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Combine data and remove duplicate genes" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96H_X1_B35:F08XPR010_A375.311_96H_X2_B35:F08XPR010_A375.311_96H_X3_B35:F08XPR010_A375.311_96H_X1_B35:F24XPR010_A375.311_96H_X2_B35:F24XPR010_A375.311_96H_X3_B35:F24XPR010_A549.311_96H_X1.L2_B36:F08XPR010_A549.311_96H_X3_B35:F08XPR010_A549.311_96H_X1.L2_B36:F24XPR010_A549.311_96H_X3_B35:F24...XPR010_YAPC.311_96H_X3_B35:E14XPR010_YAPC.311_96H_X1_B35:F04XPR010_YAPC.311_96H_X2_B35:F04XPR010_YAPC.311_96H_X3_B35:F04XPR010_YAPC.311_96H_X1_B35:H24XPR010_YAPC.311_96H_X2_B35:H24XPR010_YAPC.311_96H_X3_B35:H24XPR010_YAPC.311_96H_X1_B35:K17XPR010_YAPC.311_96H_X2_B35:K17XPR010_YAPC.311_96H_X3_B35:K17
symbol
A1CF4.453154.236354.6342503.931653.872753.751205.34094.675105.608805.23860...10.19290010.74102510.6288010.1744011.36025010.60980010.5832011.227111.16400010.640949
A2M7.225607.470658.5968497.243357.294907.760808.16085.550608.571605.88430...8.1464757.6499008.251658.601407.4703257.5253757.993457.05028.6680008.725750
A4GALT5.432955.446505.6652005.295455.681455.516206.73216.054956.728055.74780...5.4158506.4527505.526405.437655.7097506.0548504.935556.15696.1143004.902300
A4GNT5.372605.289505.2932505.186805.303905.380955.26555.520105.447405.62385...8.3448008.8725508.374008.797158.5769008.6627018.309708.86298.7149518.285049
AAAS7.519807.647807.9566007.596208.076607.874808.13727.559208.285307.90890...6.7195506.6665506.714556.975656.5096006.5762506.786806.42806.5272506.776250
\n", "

5 rows × 312 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H_X1_B35:F08 XPR010_A375.311_96H_X2_B35:F08 \\\n", "symbol \n", "A1CF 4.45315 4.23635 \n", "A2M 7.22560 7.47065 \n", "A4GALT 5.43295 5.44650 \n", "A4GNT 5.37260 5.28950 \n", "AAAS 7.51980 7.64780 \n", "\n", " XPR010_A375.311_96H_X3_B35:F08 XPR010_A375.311_96H_X1_B35:F24 \\\n", "symbol \n", "A1CF 4.634250 3.93165 \n", "A2M 8.596849 7.24335 \n", "A4GALT 5.665200 5.29545 \n", "A4GNT 5.293250 5.18680 \n", "AAAS 7.956600 7.59620 \n", "\n", " XPR010_A375.311_96H_X2_B35:F24 XPR010_A375.311_96H_X3_B35:F24 \\\n", "symbol \n", "A1CF 3.87275 3.75120 \n", "A2M 7.29490 7.76080 \n", "A4GALT 5.68145 5.51620 \n", "A4GNT 5.30390 5.38095 \n", "AAAS 8.07660 7.87480 \n", "\n", " XPR010_A549.311_96H_X1.L2_B36:F08 XPR010_A549.311_96H_X3_B35:F08 \\\n", "symbol \n", "A1CF 5.3409 4.67510 \n", "A2M 8.1608 5.55060 \n", "A4GALT 6.7321 6.05495 \n", "A4GNT 5.2655 5.52010 \n", "AAAS 8.1372 7.55920 \n", "\n", " XPR010_A549.311_96H_X1.L2_B36:F24 XPR010_A549.311_96H_X3_B35:F24 \\\n", "symbol \n", "A1CF 5.60880 5.23860 \n", "A2M 8.57160 5.88430 \n", "A4GALT 6.72805 5.74780 \n", "A4GNT 5.44740 5.62385 \n", "AAAS 8.28530 7.90890 \n", "\n", " ... XPR010_YAPC.311_96H_X3_B35:E14 XPR010_YAPC.311_96H_X1_B35:F04 \\\n", "symbol ... \n", "A1CF ... 10.192900 10.741025 \n", "A2M ... 8.146475 7.649900 \n", "A4GALT ... 5.415850 6.452750 \n", "A4GNT ... 8.344800 8.872550 \n", "AAAS ... 
6.719550 6.666550 \n", "\n", " XPR010_YAPC.311_96H_X2_B35:F04 XPR010_YAPC.311_96H_X3_B35:F04 \\\n", "symbol \n", "A1CF 10.62880 10.17440 \n", "A2M 8.25165 8.60140 \n", "A4GALT 5.52640 5.43765 \n", "A4GNT 8.37400 8.79715 \n", "AAAS 6.71455 6.97565 \n", "\n", " XPR010_YAPC.311_96H_X1_B35:H24 XPR010_YAPC.311_96H_X2_B35:H24 \\\n", "symbol \n", "A1CF 11.360250 10.609800 \n", "A2M 7.470325 7.525375 \n", "A4GALT 5.709750 6.054850 \n", "A4GNT 8.576900 8.662701 \n", "AAAS 6.509600 6.576250 \n", "\n", " XPR010_YAPC.311_96H_X3_B35:H24 XPR010_YAPC.311_96H_X1_B35:K17 \\\n", "symbol \n", "A1CF 10.58320 11.2271 \n", "A2M 7.99345 7.0502 \n", "A4GALT 4.93555 6.1569 \n", "A4GNT 8.30970 8.8629 \n", "AAAS 6.78680 6.4280 \n", "\n", " XPR010_YAPC.311_96H_X2_B35:K17 XPR010_YAPC.311_96H_X3_B35:K17 \n", "symbol \n", "A1CF 11.164000 10.640949 \n", "A2M 8.668000 8.725750 \n", "A4GALT 6.114300 4.902300 \n", "A4GNT 8.714951 8.285049 \n", "AAAS 6.527250 6.776250 \n", "\n", "[5 rows x 312 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined_expr_df = pd.concat([\n", " expr_df.groupby(expr_df.index).mean(), \n", " ctrl_expr_df.groupby(ctrl_expr_df.index).mean()\n", "], axis=1)\n", "combined_expr_df.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Compute Signatures: Batch Perturbations vs. 
Batch Controls" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# For every batch, collect the profile ids of its perturbations and of its controls.\n", "batch_profiles = {}\n", "for b in batches:\n", "    batch_profiles[b] = {\n", "        'perts': meta_df.loc[meta_df['batch'] == b].index.tolist(),\n", "        'ctrls': ctrl_meta_df.loc[ctrl_meta_df['batch'] == b].index.tolist(),\n", "    }" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# One empty {batch: signature} container per differential-expression method.\n", "batch_signatures = {\n", "    method: {}\n", "    for method in ('cd', 'limma', 'limma-voom', 'fc', 'ranksum', 'ttest')\n", "}" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Characteristic Direction" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def cd_signature(ctrl_ids, case_ids, dataset):\n", "    \"\"\"Compute a characteristic-direction signature for cases versus controls.\n", "\n", "    ctrl_ids / case_ids are the column labels of `dataset` to use as controls / cases.\n", "    Returns the table produced by `characteristic_direction` with an added\n", "    'Significance' column holding |CD-coefficient|, sorted by 'CD-coefficient'\n", "    in descending order.\n", "    \"\"\"\n", "    controls = dataset.loc[:, ctrl_ids]\n", "    cases = dataset.loc[:, case_ids]\n", "    signature = characteristic_direction(controls, cases, calculate_sig=True)\n", "    signature['Significance'] = signature['CD-coefficient'].abs()\n", "    return signature.sort_values(by='CD-coefficient', ascending=False)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Note: the following step may take a few minutes to run." 
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Characteristic-direction signature for each batch: batch perturbations vs. batch controls.\n", "for b in batches:\n", "    batch_signatures['cd'][b] = cd_signature(\n", "        ctrl_ids=batch_profiles[b]['ctrls'],\n", "        case_ids=batch_profiles[b]['perts'],\n", "        dataset=combined_expr_df,\n", "    )" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Limma" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def limma(ctrl_ids, case_ids, dataset, voom):\n", "    \"\"\"Compute a limma signature for cases versus controls.\n", "\n", "    ctrl_ids / case_ids are the column labels of `dataset` to use as controls / cases;\n", "    `voom` is forwarded as `voom_design`. Gene filtering is disabled.\n", "    Returns the result table with an added 'Significance' column copied from\n", "    'P.Value', sorted by 't' in descending order.\n", "    \"\"\"\n", "    controls = dataset.loc[:, ctrl_ids]\n", "    cases = dataset.loc[:, case_ids]\n", "    signature = limma_voom.limma_voom_differential_expression(\n", "        controls, cases, voom_design=voom, filter_genes=False\n", "    )\n", "    signature['Significance'] = signature['P.Value']\n", "    return signature.sort_values(by=\"t\", ascending=False)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Note: the following step may take a few minutes to run." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "R[write to console]: Loading required package: R.oo\n", "\n", "R[write to console]: Loading required package: R.methodsS3\n", "\n", "R[write to console]: R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.\n", "\n", "R[write to console]: R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. 
See ?R.oo for help.\n", "\n", "R[write to console]: \n", "Attaching package: ‘R.oo’\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:R.methodsS3’:\n", "\n", " throw\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:methods’:\n", "\n", " getClasses, getMethods\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:base’:\n", "\n", " attach, detach, load, save\n", "\n", "\n", "R[write to console]: R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.\n", "\n", "R[write to console]: \n", "Attaching package: ‘R.utils’\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:utils’:\n", "\n", " timestamp\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:base’:\n", "\n", " cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,\n", " warnings\n", "\n", "\n", "R[write to console]: \n", "Attaching package: ‘RCurl’\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:R.utils’:\n", "\n", " reset\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:R.oo’:\n", "\n", " clone\n", "\n", "\n", "R[write to console]: Loading required package: S4Vectors\n", "\n", "R[write to console]: Loading required package: stats4\n", "\n", "R[write to console]: Loading required package: BiocGenerics\n", "\n", "R[write to console]: Loading required package: parallel\n", "\n", "R[write to console]: \n", "Attaching package: ‘BiocGenerics’\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, sd, var, 
xtabs\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:base’:\n", "\n", " Filter, Find, Map, Position, Reduce, anyDuplicated, append,\n", " as.data.frame, basename, cbind, colnames, dirname, do.call,\n", " duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,\n", " lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,\n", " pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,\n", " tapply, union, unique, unsplit, which.max, which.min\n", "\n", "\n", "R[write to console]: \n", "Attaching package: ‘S4Vectors’\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:base’:\n", "\n", " I, expand.grid, unname\n", "\n", "\n", "R[write to console]: Loading required package: IRanges\n", "\n", "R[write to console]: \n", "Attaching package: ‘IRanges’\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:R.oo’:\n", "\n", " trim\n", "\n", "\n", "R[write to console]: Loading required package: GenomicRanges\n", "\n", "R[write to console]: Loading required package: GenomeInfoDb\n", "\n", "R[write to console]: Loading required package: SummarizedExperiment\n", "\n", "R[write to console]: Loading required package: MatrixGenerics\n", "\n", "R[write to console]: Loading required package: matrixStats\n", "\n", "R[write to console]: \n", "Attaching package: ‘MatrixGenerics’\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:matrixStats’:\n", "\n", " colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,\n", " colCounts, colCummaxs, colCummins, colCumprods, colCumsums,\n", " colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,\n", " colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,\n", " colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,\n", " colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,\n", " colWeightedMeans, colWeightedMedians, colWeightedSds,\n", " colWeightedVars, 
rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,\n", " rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,\n", " rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,\n", " rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,\n", " rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,\n", " rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,\n", " rowWeightedMads, rowWeightedMeans, rowWeightedMedians,\n", " rowWeightedSds, rowWeightedVars\n", "\n", "\n", "R[write to console]: Loading required package: Biobase\n", "\n", "R[write to console]: Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "\n", "R[write to console]: \n", "Attaching package: ‘Biobase’\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:MatrixGenerics’:\n", "\n", " rowMedians\n", "\n", "\n", "R[write to console]: The following objects are masked from ‘package:matrixStats’:\n", "\n", " anyMissing, rowMedians\n", "\n", "\n", "R[write to console]: \n", "Attaching package: ‘limma’\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:DESeq2’:\n", "\n", " plotMA\n", "\n", "\n", "R[write to console]: The following object is masked from ‘package:BiocGenerics’:\n", "\n", " plotMA\n", "\n", "\n" ] } ], "source": [
 "# limma signature (voom disabled) of every batch\n",
 "for b in batches:\n",
 "    batch_signatures['limma'][b] = limma(\n",
 "        ctrl_ids=batch_profiles[b]['ctrls'],\n",
 "        case_ids=batch_profiles[b]['perts'],\n",
 "        dataset=combined_expr_df,\n",
 "        voom=False,\n",
 "    )"
] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
 "# limma signature (voom enabled) of every batch\n",
 "for b in batches:\n",
 "    batch_signatures['limma-voom'][b] = limma(\n",
 "        ctrl_ids=batch_profiles[b]['ctrls'],\n",
 "        case_ids=batch_profiles[b]['perts'],\n",
 "        dataset=combined_expr_df,\n",
 "        voom=True,\n",
 "    )"
] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Wilcoxon Rank-Sum Test" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [
 "def ranksum(ctrl_ids, case_ids, dataset):\n",
 "    \"\"\"Compute a per-gene Wilcoxon rank-sum signature for cases vs. controls.\n",
 "\n",
 "    Args:\n",
 "        ctrl_ids: column labels of the control profiles in `dataset`.\n",
 "        case_ids: column labels of the perturbation profiles in `dataset`.\n",
 "        dataset: expression DataFrame with one row per gene and one column\n",
 "            per sample; every row is tested independently.\n",
 "\n",
 "    Returns:\n",
 "        DataFrame indexed by 'Geneid' with columns 'Statistic', 'Pvalue' and\n",
 "        'Significance' (a copy of 'Pvalue'), sorted by 'Statistic' in\n",
 "        descending order. Cases are passed as the first sample to `ranksums`.\n",
 "    \"\"\"\n",
 "    # Check each group on its own: a handful of cases next to many controls is\n",
 "    # still a small sample, which a check on the combined size would miss.\n",
 "    if min(len(ctrl_ids), len(case_ids)) < 16:\n",
 "        print(\"Warning! Sample sizes < 16 generally do not provide good results. \")\n",
 "\n",
 "    # Slice both groups once instead of doing two label lookups per gene\n",
 "    case_values = dataset.loc[:, case_ids].to_numpy()\n",
 "    ctrl_values = dataset.loc[:, ctrl_ids].to_numpy()\n",
 "\n",
 "    res_array = []\n",
 "    for gene, case_row, ctrl_row in zip(dataset.index, case_values, ctrl_values):\n",
 "        res = ranksums(case_row, ctrl_row)\n",
 "        res_array.append([gene, res.statistic, res.pvalue])\n",
 "\n",
 "    signature = pd.DataFrame(\n",
 "        res_array, columns=['Geneid', 'Statistic', 'Pvalue']\n",
 "    ).set_index('Geneid')\n",
 "    signature['Significance'] = signature['Pvalue']\n",
 "    return signature.sort_values(by=['Statistic'], ascending=False)"
] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning! Sample sizes < 16 generally do not provide good results. \n", "Warning! Sample sizes < 16 generally do not provide good results. \n", "Warning! Sample sizes < 16 generally do not provide good results. 
\n" ] } ], "source": [
 "# Wilcoxon rank-sum signature of every batch\n",
 "for b in batches:\n",
 "    batch_signatures['ranksum'][b] = ranksum(\n",
 "        ctrl_ids=batch_profiles[b]['ctrls'],\n",
 "        case_ids=batch_profiles[b]['perts'],\n",
 "        dataset=combined_expr_df,\n",
 "    )"
] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Welch's t-test" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [
 "def ttest(ctrl_ids, case_ids, dataset):\n",
 "    \"\"\"Compute a per-gene Welch's t-test signature for cases vs. controls.\n",
 "\n",
 "    Args:\n",
 "        ctrl_ids: column labels of the control profiles in `dataset`.\n",
 "        case_ids: column labels of the perturbation profiles in `dataset`.\n",
 "        dataset: expression DataFrame with one row per gene and one column\n",
 "            per sample; every row is tested independently.\n",
 "\n",
 "    Returns:\n",
 "        DataFrame indexed by 'Geneid' with columns 'Statistic', 'Pvalue' and\n",
 "        'Significance' (a copy of 'Pvalue'), sorted by 'Statistic' in\n",
 "        descending order. Cases are passed as the first sample to `ttest_ind`.\n",
 "    \"\"\"\n",
 "    # Test all genes in one call along the sample axis instead of looping\n",
 "    # over rows; equal_var=False selects Welch's unequal-variance test.\n",
 "    res = ttest_ind(\n",
 "        dataset.loc[:, case_ids].to_numpy(),\n",
 "        dataset.loc[:, ctrl_ids].to_numpy(),\n",
 "        axis=1,\n",
 "        equal_var=False,\n",
 "    )\n",
 "    signature = pd.DataFrame(\n",
 "        {'Statistic': res.statistic, 'Pvalue': res.pvalue},\n",
 "        index=pd.Index(dataset.index, name='Geneid'),\n",
 "    )\n",
 "    signature['Significance'] = signature['Pvalue']\n",
 "    return signature.sort_values(by=['Statistic'], ascending=False)"
] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [
 "# Welch's t-test signature of every batch\n",
 "for b in batches:\n",
 "    batch_signatures['ttest'][b] = ttest(\n",
 "        ctrl_ids=batch_profiles[b]['ctrls'],\n",
 "        case_ids=batch_profiles[b]['perts'],\n",
 "        dataset=combined_expr_df,\n",
 "    )"
] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### (log2) Fold Change" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [
 "def logFC(ctrl_ids, case_ids, dataset, pseudocount=0.001):\n",
 "    \"\"\"Compute a log2 fold change signature of mean case vs. mean control expression.\n",
 "\n",
 "    Args:\n",
 "        ctrl_ids: column labels of the control profiles in `dataset`.\n",
 "        case_ids: column labels of the perturbation profiles in `dataset`.\n",
 "        dataset: expression DataFrame with one row per gene and one column\n",
 "            per sample.\n",
 "        pseudocount: constant added to both group means before taking the\n",
 "            ratio, so a zero control mean cannot divide by zero and genes\n",
 "            with equal means get a fold change of exactly 0.\n",
 "\n",
 "    Returns:\n",
 "        DataFrame indexed like `dataset` with columns 'logFC' and\n",
 "        'Significance' (the absolute 'logFC'), sorted by 'logFC' in\n",
 "        descending order.\n",
 "\n",
 "    Raises:\n",
 "        ValueError: if any gene has a non-positive mean ratio, for which the\n",
 "            logarithm is undefined.\n",
 "    \"\"\"\n",
 "    case_mean = dataset.loc[:, case_ids].mean(axis=1)\n",
 "    ctrl_mean = dataset.loc[:, ctrl_ids].mean(axis=1)\n",
 "\n",
 "    # Offset numerator and denominator alike; offsetting only the denominator\n",
 "    # (and then the ratio) skews the result for lowly expressed genes.\n",
 "    ratio = (case_mean + pseudocount) / (ctrl_mean + pseudocount)\n",
 "    if (ratio <= 0).any():\n",
 "        raise ValueError('log2 fold change is undefined for non-positive mean ratios')\n",
 "\n",
 "    signature_df = np.log2(ratio).to_frame('logFC')\n",
 "    signature_df['Significance'] = signature_df['logFC'].abs()\n",
 "    return signature_df.sort_values('logFC', ascending=False)"
] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [
 "# log2 fold change signature of every batch\n",
 "for b in batches:\n",
 "    batch_signatures['fc'][b] = logFC(\n",
 "        ctrl_ids=batch_profiles[b]['ctrls'],\n",
 "        case_ids=batch_profiles[b]['perts'],\n",
 "        dataset=combined_expr_df,\n",
 "    )"
] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## All signatures" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "All CD batch signatures" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96HXPR010_A549.311_96HXPR010_AGS.311_96HXPR010_BICR6.311_96HXPR010_ES2.311_96HXPR010_HT29.311_96HXPR010_MCF7.311_96HXPR010_PC3.311B_96HXPR010_U251MG.311_96HXPR010_YAPC.311_96H
Gene
A1CF-0.019577-0.016974-0.015616-0.016290-0.020444-0.018519-0.017728-0.020971-0.017763-0.019295
A2M0.003215-0.0035380.001771-0.001684-0.001222-0.0024090.0010870.0027430.0019580.004480
A4GALT-0.0007400.0006670.0006890.0021510.000054-0.001610-0.0006140.000041-0.0001400.000447
A4GNT-0.011040-0.008234-0.014184-0.013903-0.014138-0.015587-0.013585-0.012055-0.013713-0.009598
AAAS0.0049270.0055850.0052490.0036790.0050780.0046420.0052650.0040760.0049210.003304
.................................
ZXDB0.000307-0.001142-0.000563-0.000166-0.001886-0.001636-0.000132-0.000568-0.0016560.000642
ZXDC-0.004677-0.009372-0.007265-0.002839-0.005829-0.001725-0.007841-0.008623-0.010583-0.009344
ZYX0.0189720.0146850.0184840.0127410.0193930.0115630.0087500.0221700.0168440.015572
ZZEF1-0.007563-0.007977-0.001991-0.002495-0.005057-0.0019010.006034-0.0046590.000046-0.003763
ZZZ30.0050010.0081170.0028340.0044800.0092390.003054-0.0005520.0038910.0025080.001957
\n", "

12327 rows × 10 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H \\\n", "Gene \n", "A1CF -0.019577 -0.016974 -0.015616 \n", "A2M 0.003215 -0.003538 0.001771 \n", "A4GALT -0.000740 0.000667 0.000689 \n", "A4GNT -0.011040 -0.008234 -0.014184 \n", "AAAS 0.004927 0.005585 0.005249 \n", "... ... ... ... \n", "ZXDB 0.000307 -0.001142 -0.000563 \n", "ZXDC -0.004677 -0.009372 -0.007265 \n", "ZYX 0.018972 0.014685 0.018484 \n", "ZZEF1 -0.007563 -0.007977 -0.001991 \n", "ZZZ3 0.005001 0.008117 0.002834 \n", "\n", " XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H \\\n", "Gene \n", "A1CF -0.016290 -0.020444 -0.018519 \n", "A2M -0.001684 -0.001222 -0.002409 \n", "A4GALT 0.002151 0.000054 -0.001610 \n", "A4GNT -0.013903 -0.014138 -0.015587 \n", "AAAS 0.003679 0.005078 0.004642 \n", "... ... ... ... \n", "ZXDB -0.000166 -0.001886 -0.001636 \n", "ZXDC -0.002839 -0.005829 -0.001725 \n", "ZYX 0.012741 0.019393 0.011563 \n", "ZZEF1 -0.002495 -0.005057 -0.001901 \n", "ZZZ3 0.004480 0.009239 0.003054 \n", "\n", " XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H \\\n", "Gene \n", "A1CF -0.017728 -0.020971 -0.017763 \n", "A2M 0.001087 0.002743 0.001958 \n", "A4GALT -0.000614 0.000041 -0.000140 \n", "A4GNT -0.013585 -0.012055 -0.013713 \n", "AAAS 0.005265 0.004076 0.004921 \n", "... ... ... ... \n", "ZXDB -0.000132 -0.000568 -0.001656 \n", "ZXDC -0.007841 -0.008623 -0.010583 \n", "ZYX 0.008750 0.022170 0.016844 \n", "ZZEF1 0.006034 -0.004659 0.000046 \n", "ZZZ3 -0.000552 0.003891 0.002508 \n", "\n", " XPR010_YAPC.311_96H \n", "Gene \n", "A1CF -0.019295 \n", "A2M 0.004480 \n", "A4GALT 0.000447 \n", "A4GNT -0.009598 \n", "AAAS 0.003304 \n", "... ... 
\n", "ZXDB 0.000642 \n", "ZXDC -0.009344 \n", "ZYX 0.015572 \n", "ZZEF1 -0.003763 \n", "ZZZ3 0.001957 \n", "\n", "[12327 rows x 10 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "All LIMMA batch signatures" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96HXPR010_A549.311_96HXPR010_AGS.311_96HXPR010_BICR6.311_96HXPR010_ES2.311_96HXPR010_HT29.311_96HXPR010_MCF7.311_96HXPR010_PC3.311B_96HXPR010_U251MG.311_96HXPR010_YAPC.311_96H
Gene
A1CF-60.019539-28.815911-44.964117-42.496500-76.125622-40.349357-38.287313-65.604340-55.876038-43.141008
A2M4.845533-1.8475833.406957-2.852660-2.416547-0.5370581.5652559.0060323.8249932.729793
A4GALT-3.617646-1.0163860.5595266.139557-2.310624-4.533725-3.5293590.950890-2.5837322.579730
A4GNT-42.919025-19.718965-50.182363-30.878052-47.377980-36.774901-24.278347-24.678781-38.743252-27.158267
AAAS26.28291013.76757218.23676314.89750124.92996013.01788016.84603116.41662323.69076014.218616
.................................
ZXDB0.235291-2.809396-0.9873681.787694-6.089417-4.552719-0.102972-0.915882-4.805944-6.810436
ZXDC-16.760846-11.868575-15.712726-6.614901-12.108318-1.424355-16.841576-23.859809-22.600620-8.387029
ZYX36.11229013.85556023.70596420.62945437.05740915.96020611.38551835.86425130.87587525.291581
ZZEF1-19.567202-7.184474-1.622090-7.559272-15.772091-9.1976903.419958-8.114486-3.014485-9.025222
ZZZ317.7887064.8208156.4281178.10547028.5210798.6122883.1896019.1254499.9042612.446274
\n", "

12327 rows × 10 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H \\\n", "Gene \n", "A1CF -60.019539 -28.815911 -44.964117 \n", "A2M 4.845533 -1.847583 3.406957 \n", "A4GALT -3.617646 -1.016386 0.559526 \n", "A4GNT -42.919025 -19.718965 -50.182363 \n", "AAAS 26.282910 13.767572 18.236763 \n", "... ... ... ... \n", "ZXDB 0.235291 -2.809396 -0.987368 \n", "ZXDC -16.760846 -11.868575 -15.712726 \n", "ZYX 36.112290 13.855560 23.705964 \n", "ZZEF1 -19.567202 -7.184474 -1.622090 \n", "ZZZ3 17.788706 4.820815 6.428117 \n", "\n", " XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H \\\n", "Gene \n", "A1CF -42.496500 -76.125622 -40.349357 \n", "A2M -2.852660 -2.416547 -0.537058 \n", "A4GALT 6.139557 -2.310624 -4.533725 \n", "A4GNT -30.878052 -47.377980 -36.774901 \n", "AAAS 14.897501 24.929960 13.017880 \n", "... ... ... ... \n", "ZXDB 1.787694 -6.089417 -4.552719 \n", "ZXDC -6.614901 -12.108318 -1.424355 \n", "ZYX 20.629454 37.057409 15.960206 \n", "ZZEF1 -7.559272 -15.772091 -9.197690 \n", "ZZZ3 8.105470 28.521079 8.612288 \n", "\n", " XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H \\\n", "Gene \n", "A1CF -38.287313 -65.604340 -55.876038 \n", "A2M 1.565255 9.006032 3.824993 \n", "A4GALT -3.529359 0.950890 -2.583732 \n", "A4GNT -24.278347 -24.678781 -38.743252 \n", "AAAS 16.846031 16.416623 23.690760 \n", "... ... ... ... \n", "ZXDB -0.102972 -0.915882 -4.805944 \n", "ZXDC -16.841576 -23.859809 -22.600620 \n", "ZYX 11.385518 35.864251 30.875875 \n", "ZZEF1 3.419958 -8.114486 -3.014485 \n", "ZZZ3 3.189601 9.125449 9.904261 \n", "\n", " XPR010_YAPC.311_96H \n", "Gene \n", "A1CF -43.141008 \n", "A2M 2.729793 \n", "A4GALT 2.579730 \n", "A4GNT -27.158267 \n", "AAAS 14.218616 \n", "... ... 
\n", "ZXDB -6.810436 \n", "ZXDC -8.387029 \n", "ZYX 25.291581 \n", "ZZEF1 -9.025222 \n", "ZZZ3 2.446274 \n", "\n", "[12327 rows x 10 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "All LIMMA-VOOM batch signatures" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96HXPR010_A549.311_96HXPR010_AGS.311_96HXPR010_BICR6.311_96HXPR010_ES2.311_96HXPR010_HT29.311_96HXPR010_MCF7.311_96HXPR010_PC3.311B_96HXPR010_U251MG.311_96HXPR010_YAPC.311_96H
Gene
A1CF-50.651436-24.147615-33.979375-32.001838-42.683320-35.396960-27.405920-32.899780-39.820453-32.782214
A2M4.955400-1.8447243.600092-2.898473-2.402441-0.5384321.5654489.3407043.9122972.731678
A4GALT-3.589970-1.0115280.5543466.084361-2.312838-4.518897-3.5140230.952444-2.6297742.562922
A4GNT-39.256483-17.359412-45.350198-25.820052-40.333346-29.798909-21.557148-22.496492-34.648513-26.759321
AAAS26.20346913.83907418.30413314.37457424.73727513.11290116.68013016.33067722.50793114.093464
.................................
ZXDB0.230335-2.691749-0.9597211.785147-6.041614-4.333230-0.102150-0.908215-4.765242-6.808437
ZXDC-16.024135-10.927358-14.756960-6.754619-11.315591-1.416653-15.285268-22.543888-22.328144-8.161337
ZYX44.93267219.30950831.52298025.00826249.47213019.13133813.65833446.53796435.15709930.122082
ZZEF1-17.671956-5.840489-1.605732-6.858084-14.333785-8.3082583.458953-7.897925-2.958390-8.686235
ZZZ318.7901845.2957186.7607018.66383433.0608969.1209523.3077649.32072110.2935962.459377
\n", "

12327 rows × 10 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H \\\n", "Gene \n", "A1CF -50.651436 -24.147615 -33.979375 \n", "A2M 4.955400 -1.844724 3.600092 \n", "A4GALT -3.589970 -1.011528 0.554346 \n", "A4GNT -39.256483 -17.359412 -45.350198 \n", "AAAS 26.203469 13.839074 18.304133 \n", "... ... ... ... \n", "ZXDB 0.230335 -2.691749 -0.959721 \n", "ZXDC -16.024135 -10.927358 -14.756960 \n", "ZYX 44.932672 19.309508 31.522980 \n", "ZZEF1 -17.671956 -5.840489 -1.605732 \n", "ZZZ3 18.790184 5.295718 6.760701 \n", "\n", " XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H \\\n", "Gene \n", "A1CF -32.001838 -42.683320 -35.396960 \n", "A2M -2.898473 -2.402441 -0.538432 \n", "A4GALT 6.084361 -2.312838 -4.518897 \n", "A4GNT -25.820052 -40.333346 -29.798909 \n", "AAAS 14.374574 24.737275 13.112901 \n", "... ... ... ... \n", "ZXDB 1.785147 -6.041614 -4.333230 \n", "ZXDC -6.754619 -11.315591 -1.416653 \n", "ZYX 25.008262 49.472130 19.131338 \n", "ZZEF1 -6.858084 -14.333785 -8.308258 \n", "ZZZ3 8.663834 33.060896 9.120952 \n", "\n", " XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H \\\n", "Gene \n", "A1CF -27.405920 -32.899780 -39.820453 \n", "A2M 1.565448 9.340704 3.912297 \n", "A4GALT -3.514023 0.952444 -2.629774 \n", "A4GNT -21.557148 -22.496492 -34.648513 \n", "AAAS 16.680130 16.330677 22.507931 \n", "... ... ... ... \n", "ZXDB -0.102150 -0.908215 -4.765242 \n", "ZXDC -15.285268 -22.543888 -22.328144 \n", "ZYX 13.658334 46.537964 35.157099 \n", "ZZEF1 3.458953 -7.897925 -2.958390 \n", "ZZZ3 3.307764 9.320721 10.293596 \n", "\n", " XPR010_YAPC.311_96H \n", "Gene \n", "A1CF -32.782214 \n", "A2M 2.731678 \n", "A4GALT 2.562922 \n", "A4GNT -26.759321 \n", "AAAS 14.093464 \n", "... ... 
\n", "ZXDB -6.808437 \n", "ZXDC -8.161337 \n", "ZYX 30.122082 \n", "ZZEF1 -8.686235 \n", "ZZZ3 2.459377 \n", "\n", "[12327 rows x 10 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "All RANKSUM batch signatures" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96HXPR010_A549.311_96HXPR010_AGS.311_96HXPR010_BICR6.311_96HXPR010_ES2.311_96HXPR010_HT29.311_96HXPR010_MCF7.311_96HXPR010_PC3.311B_96HXPR010_U251MG.311_96HXPR010_YAPC.311_96H
Gene
A1CF-3.765875-3.064524-3.75-3.780756-3.478042-3.780756-3.765875-3.780756-3.780756-3.780756
A2M3.476193-0.4256282.60-2.170434-1.919879-0.0933520.7724873.6874042.8472361.166900
A4GALT-3.041669-0.851257-0.153.780756-2.532014-3.313996-3.0899490.933520-2.4738282.287124
A4GNT-3.765875-3.064524-3.75-3.780756-3.478042-3.780756-3.765875-3.780756-3.780756-3.780756
AAAS3.7658753.0645243.753.7807563.4780423.7807563.7658753.7807563.7807563.780756
.................................
ZXDB-0.289683-2.724021-1.201.400280-3.478042-3.267320-0.241402-1.586984-3.780756-3.780756
ZXDC-3.765875-3.064524-3.75-3.734080-3.478042-1.633660-3.765875-3.780756-3.780756-3.687404
ZYX3.7658753.0645243.753.7807563.4780423.7807563.7658753.7807563.7807563.780756
ZZEF1-3.765875-2.979398-1.55-3.780756-3.478042-3.7807562.945108-3.780756-3.220644-3.780756
ZZZ33.7658753.0645243.753.7807563.4780423.7807562.8968273.7340803.7807561.260252
\n", "

12327 rows × 10 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H \\\n", "Gene \n", "A1CF -3.765875 -3.064524 -3.75 \n", "A2M 3.476193 -0.425628 2.60 \n", "A4GALT -3.041669 -0.851257 -0.15 \n", "A4GNT -3.765875 -3.064524 -3.75 \n", "AAAS 3.765875 3.064524 3.75 \n", "... ... ... ... \n", "ZXDB -0.289683 -2.724021 -1.20 \n", "ZXDC -3.765875 -3.064524 -3.75 \n", "ZYX 3.765875 3.064524 3.75 \n", "ZZEF1 -3.765875 -2.979398 -1.55 \n", "ZZZ3 3.765875 3.064524 3.75 \n", "\n", " XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H \\\n", "Gene \n", "A1CF -3.780756 -3.478042 -3.780756 \n", "A2M -2.170434 -1.919879 -0.093352 \n", "A4GALT 3.780756 -2.532014 -3.313996 \n", "A4GNT -3.780756 -3.478042 -3.780756 \n", "AAAS 3.780756 3.478042 3.780756 \n", "... ... ... ... \n", "ZXDB 1.400280 -3.478042 -3.267320 \n", "ZXDC -3.734080 -3.478042 -1.633660 \n", "ZYX 3.780756 3.478042 3.780756 \n", "ZZEF1 -3.780756 -3.478042 -3.780756 \n", "ZZZ3 3.780756 3.478042 3.780756 \n", "\n", " XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H \\\n", "Gene \n", "A1CF -3.765875 -3.780756 -3.780756 \n", "A2M 0.772487 3.687404 2.847236 \n", "A4GALT -3.089949 0.933520 -2.473828 \n", "A4GNT -3.765875 -3.780756 -3.780756 \n", "AAAS 3.765875 3.780756 3.780756 \n", "... ... ... ... \n", "ZXDB -0.241402 -1.586984 -3.780756 \n", "ZXDC -3.765875 -3.780756 -3.780756 \n", "ZYX 3.765875 3.780756 3.780756 \n", "ZZEF1 2.945108 -3.780756 -3.220644 \n", "ZZZ3 2.896827 3.734080 3.780756 \n", "\n", " XPR010_YAPC.311_96H \n", "Gene \n", "A1CF -3.780756 \n", "A2M 1.166900 \n", "A4GALT 2.287124 \n", "A4GNT -3.780756 \n", "AAAS 3.780756 \n", "... ... 
\n", "ZXDB -3.780756 \n", "ZXDC -3.687404 \n", "ZYX 3.780756 \n", "ZZEF1 -3.780756 \n", "ZZZ3 1.260252 \n", "\n", "[12327 rows x 10 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "All TTEST batch signatures" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96HXPR010_A549.311_96HXPR010_AGS.311_96HXPR010_BICR6.311_96HXPR010_ES2.311_96HXPR010_HT29.311_96HXPR010_MCF7.311_96HXPR010_PC3.311B_96HXPR010_U251MG.311_96HXPR010_YAPC.311_96H
Gene
A1CF-47.343241-25.717956-42.861009-34.360667-60.751893-30.899225-35.139583-71.404329-71.268026-42.707136
A2M3.446543-0.8144052.952941-1.898905-1.956064-0.4060140.9006296.1190732.3532101.439096
A4GALT-4.378029-0.7931520.3845559.602556-4.454827-7.037581-4.1548040.991804-3.4762834.040070
A4GNT-64.783790-29.296212-51.959749-45.564620-65.113722-51.629066-35.261361-38.811093-59.815876-36.346808
AAAS17.72452612.04216829.16760312.07700220.54580514.86153912.54270516.35987130.74898717.028256
.................................
ZXDB-0.416080-4.099616-1.6013541.897163-8.683223-4.247329-0.495350-1.496384-6.292793-6.538260
ZXDC-21.116532-16.305855-23.107084-10.698635-22.546653-2.146681-18.682881-33.469199-23.994271-12.848385
ZYX39.00468319.24998326.09735922.58870452.21929727.15469511.96321947.13647121.37109323.821547
ZZEF1-30.433408-9.115103-1.803287-11.454066-18.123284-14.1509274.327542-11.624698-5.539892-14.754975
ZZZ315.1916158.2705328.12203210.11478139.83219610.3524854.0851265.3614098.4599801.403904
\n", "

12327 rows × 10 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H \\\n", "Gene \n", "A1CF -47.343241 -25.717956 -42.861009 \n", "A2M 3.446543 -0.814405 2.952941 \n", "A4GALT -4.378029 -0.793152 0.384555 \n", "A4GNT -64.783790 -29.296212 -51.959749 \n", "AAAS 17.724526 12.042168 29.167603 \n", "... ... ... ... \n", "ZXDB -0.416080 -4.099616 -1.601354 \n", "ZXDC -21.116532 -16.305855 -23.107084 \n", "ZYX 39.004683 19.249983 26.097359 \n", "ZZEF1 -30.433408 -9.115103 -1.803287 \n", "ZZZ3 15.191615 8.270532 8.122032 \n", "\n", " XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H \\\n", "Gene \n", "A1CF -34.360667 -60.751893 -30.899225 \n", "A2M -1.898905 -1.956064 -0.406014 \n", "A4GALT 9.602556 -4.454827 -7.037581 \n", "A4GNT -45.564620 -65.113722 -51.629066 \n", "AAAS 12.077002 20.545805 14.861539 \n", "... ... ... ... \n", "ZXDB 1.897163 -8.683223 -4.247329 \n", "ZXDC -10.698635 -22.546653 -2.146681 \n", "ZYX 22.588704 52.219297 27.154695 \n", "ZZEF1 -11.454066 -18.123284 -14.150927 \n", "ZZZ3 10.114781 39.832196 10.352485 \n", "\n", " XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H \\\n", "Gene \n", "A1CF -35.139583 -71.404329 -71.268026 \n", "A2M 0.900629 6.119073 2.353210 \n", "A4GALT -4.154804 0.991804 -3.476283 \n", "A4GNT -35.261361 -38.811093 -59.815876 \n", "AAAS 12.542705 16.359871 30.748987 \n", "... ... ... ... \n", "ZXDB -0.495350 -1.496384 -6.292793 \n", "ZXDC -18.682881 -33.469199 -23.994271 \n", "ZYX 11.963219 47.136471 21.371093 \n", "ZZEF1 4.327542 -11.624698 -5.539892 \n", "ZZZ3 4.085126 5.361409 8.459980 \n", "\n", " XPR010_YAPC.311_96H \n", "Gene \n", "A1CF -42.707136 \n", "A2M 1.439096 \n", "A4GALT 4.040070 \n", "A4GNT -36.346808 \n", "AAAS 17.028256 \n", "... ... 
\n", "ZXDB -6.538260 \n", "ZXDC -12.848385 \n", "ZYX 23.821547 \n", "ZZEF1 -14.754975 \n", "ZZZ3 1.403904 \n", "\n", "[12327 rows x 10 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "All FC batch signatures" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XPR010_A375.311_96HXPR010_A549.311_96HXPR010_AGS.311_96HXPR010_BICR6.311_96HXPR010_ES2.311_96HXPR010_HT29.311_96HXPR010_MCF7.311_96HXPR010_PC3.311B_96HXPR010_U251MG.311_96HXPR010_YAPC.311_96H
Gene
A1CF-1.418462-1.040406-1.136366-1.194335-1.601811-1.222860-1.217650-1.387168-1.239583-1.206195
A2M0.157055-0.1233500.240677-0.164841-0.072379-0.0268770.1087720.2724030.1590510.225687
A4GALT-0.076488-0.0449040.0090530.139620-0.056478-0.118418-0.1085460.019782-0.0713140.106599
A4GNT-0.767852-0.735214-0.931993-0.863029-0.898443-0.979517-0.801284-0.709464-0.933154-0.581013
AAAS0.3454460.4171200.3293620.2142010.3961000.3238690.3143840.2386650.3257970.263725
.................................
ZXDB-0.003182-0.080916-0.0243530.027470-0.115208-0.109797-0.006245-0.023256-0.091842-0.145286
ZXDC-0.340808-0.592351-0.441383-0.242187-0.334438-0.054448-0.550889-0.568556-0.568459-0.348030
ZYX1.1627560.8135151.2138840.7328711.1977000.8544810.5061341.1240261.0420411.012969
ZZEF1-0.414720-0.482071-0.054090-0.337330-0.374806-0.4377430.199752-0.259344-0.171825-0.375506
ZZZ30.3257690.3082560.1100880.2456610.5911740.2065790.1060090.2074730.2365880.064532
\n", "

12327 rows × 10 columns

\n", "
" ], "text/plain": [ " XPR010_A375.311_96H XPR010_A549.311_96H XPR010_AGS.311_96H \\\n", "Gene \n", "A1CF -1.418462 -1.040406 -1.136366 \n", "A2M 0.157055 -0.123350 0.240677 \n", "A4GALT -0.076488 -0.044904 0.009053 \n", "A4GNT -0.767852 -0.735214 -0.931993 \n", "AAAS 0.345446 0.417120 0.329362 \n", "... ... ... ... \n", "ZXDB -0.003182 -0.080916 -0.024353 \n", "ZXDC -0.340808 -0.592351 -0.441383 \n", "ZYX 1.162756 0.813515 1.213884 \n", "ZZEF1 -0.414720 -0.482071 -0.054090 \n", "ZZZ3 0.325769 0.308256 0.110088 \n", "\n", " XPR010_BICR6.311_96H XPR010_ES2.311_96H XPR010_HT29.311_96H \\\n", "Gene \n", "A1CF -1.194335 -1.601811 -1.222860 \n", "A2M -0.164841 -0.072379 -0.026877 \n", "A4GALT 0.139620 -0.056478 -0.118418 \n", "A4GNT -0.863029 -0.898443 -0.979517 \n", "AAAS 0.214201 0.396100 0.323869 \n", "... ... ... ... \n", "ZXDB 0.027470 -0.115208 -0.109797 \n", "ZXDC -0.242187 -0.334438 -0.054448 \n", "ZYX 0.732871 1.197700 0.854481 \n", "ZZEF1 -0.337330 -0.374806 -0.437743 \n", "ZZZ3 0.245661 0.591174 0.206579 \n", "\n", " XPR010_MCF7.311_96H XPR010_PC3.311B_96H XPR010_U251MG.311_96H \\\n", "Gene \n", "A1CF -1.217650 -1.387168 -1.239583 \n", "A2M 0.108772 0.272403 0.159051 \n", "A4GALT -0.108546 0.019782 -0.071314 \n", "A4GNT -0.801284 -0.709464 -0.933154 \n", "AAAS 0.314384 0.238665 0.325797 \n", "... ... ... ... \n", "ZXDB -0.006245 -0.023256 -0.091842 \n", "ZXDC -0.550889 -0.568556 -0.568459 \n", "ZYX 0.506134 1.124026 1.042041 \n", "ZZEF1 0.199752 -0.259344 -0.171825 \n", "ZZZ3 0.106009 0.207473 0.236588 \n", "\n", " XPR010_YAPC.311_96H \n", "Gene \n", "A1CF -1.206195 \n", "A2M 0.225687 \n", "A4GALT 0.106599 \n", "A4GNT -0.581013 \n", "AAAS 0.263725 \n", "... ... 
# %%
# Build one Gene x batch score matrix per differential-expression method.
# Maps each method name to the column of its per-batch result table that is
# used as the per-gene score.
signature_score_columns = {
    'cd': 'CD-coefficient',
    'limma': 't',
    'limma-voom': 't',
    'ranksum': 'Statistic',
    'ttest': 'Statistic',
    'fc': 'logFC',
}

all_signatures = {}
for method, score_col in signature_score_columns.items():
    all_signatures[f'{method}_all'] = pd.concat(
        [sig_df[score_col].rename(batch) for batch, sig_df in batch_signatures[method].items()],
        axis=1,
    ).sort_index().rename_axis('Gene')

for sig_key, sig_matrix in all_signatures.items():
    method_label = sig_key.split('_')[0].upper()
    display(Markdown(f"All {method_label} batch signatures"))
    display(sig_matrix)

# %% [markdown]
# ## Enrichment Analysis Rankings

# %%
def getEnrichrLibrary(library_name, timeout=60):
    """Download an Enrichr gene-set library and return its ``terms`` mapping.

    Parameters
    ----------
    library_name : str
        Enrichr library identifier, e.g. ``'ChEA_2022'``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The ``['terms']`` entry of the library's JSON payload.

    Raises
    ------
    Exception
        If the HTTP response status is not OK.
    """
    ENRICHR_URL = f'https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName={library_name}'
    resp = requests.get(ENRICHR_URL, timeout=timeout)
    if not resp.ok:
        raise Exception(f"Error downloading {library_name} library from Enrichr, please try again.")
    return resp.json()[library_name]['terms']


def getLibraryIter(libdict):
    """Yield ``(term, gene_list)`` pairs from a library mapping.

    Values that are lists are yielded unchanged; any other value is treated
    as a mapping and its keys are yielded as the gene list.
    """
    for term, genes in libdict.items():
        if isinstance(genes, list):
            yield term, genes
        else:
            yield term, list(genes.keys())


def enrich(gene_list, lib_json, name):
    """Rank every term of ``lib_json`` by its enrichment p-value for ``gene_list``.

    Parameters
    ----------
    gene_list : list of str
        Query gene set.
    lib_json : dict
        Library mapping of term -> genes (see ``getLibraryIter``).
    name : str
        Label written to the ``Gene_Set`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene_Set``, ``Term``, ``Rank``; one row per library term.
        Rank 0 is the smallest p-value. Terms for which ``enrich_crisp``
        produced no result are appended after the scored terms, in library
        order, with consecutive ranks.
    """
    # NOTE(review): 20000 is passed as enrich_crisp's third positional argument,
    # presumably the background gene count -- confirm against the library docs.
    scored = [
        (res[0], res[1].pvalue)
        for res in enrich_crisp(gene_list, getLibraryIter(lib_json), 20000, False)
    ]
    scored.sort(key=lambda pair: pair[1])

    ranked_terms = [term for term, _ in scored]
    scored_terms = set(ranked_terms)
    # Unscored terms are appended in library order instead of set-iteration
    # order so that ranks are reproducible across kernels. Counting with
    # enumerate also avoids the NameError the old loop-variable arithmetic hit
    # when no term was scored at all.
    ranked_terms.extend(term for term in lib_json if term not in scored_terms)

    termranks = [[name, term, rank] for rank, term in enumerate(ranked_terms)]
    return pd.DataFrame(termranks, columns=['Gene_Set', 'Term', 'Rank'])

# %%
chea2022 = getEnrichrLibrary('ChEA_2022')

# %%
# Get gene lists to put into Enrichr: for each method and batch, the top and
# bottom `n_top_genes` genes by score, plus their concatenation.
n_top_genes = 100

gene_lists = {}
for sig_key, sig_matrix in all_signatures.items():
    mname = sig_key.split('_')[0]
    gene_lists[mname] = {'up': {}, 'down': {}, 'combined': {}}
    for col in sig_matrix.columns:
        up_genes = sig_matrix[col].sort_values(ascending=False).index.tolist()[:n_top_genes]
        down_genes = sig_matrix[col].sort_values(ascending=True).index.tolist()[:n_top_genes]
        gene_lists[mname]['up'][col] = up_genes
        gene_lists[mname]['down'][col] = down_genes
        gene_lists[mname]['combined'][col] = up_genes + down_genes

# %%
# Get results: one ranked term table per (batch, method, direction).
chea2022_results = []

for m in gene_lists.keys():
    for sig in gene_lists[m]['up'].keys():
        for direction in ('up', 'down', 'combined'):
            chea2022_results.append(
                enrich(gene_lists[m][direction][sig], chea2022, f"{sig}:{m}:{direction}:ChEA 2022")
            )

chea2022_df = pd.concat(chea2022_results, axis=0)

# %%
# Keep only the terms that mention the knocked-out gene. `.copy()` makes this an
# independent frame, so the column assignments below (and in createResultsDf)
# do not write into a slice of chea2022_df.
dex_chea2022_df = chea2022_df[chea2022_df['Term'].str.contains(ko_gene, regex=False)].copy()
dex_chea2022_df['Library'] = 'ChEA 2022'

# %%
def createResultsDf(df):
    """Add annotation columns parsed from ``Gene_Set`` and ``Term``, in place.

    ``Gene_Set`` is expected to look like ``'<batch>:<method>:<direction>:...'``
    and ``<batch>`` like ``'<prefix>_<cell>_...'``, matching the labels built in
    the enrichment loop. Adds ``Method``, ``Direction``, ``Method_Direction``,
    ``TF``, ``Cell`` and ``Batch``. Returns None; ``df`` itself is modified.
    """
    gene_set_parts = df['Gene_Set'].str.split(':')
    df['Method'] = gene_set_parts.str[1]
    df['Direction'] = gene_set_parts.str[2]
    # Vectorised concatenation; a row-wise df.apply(axis=1) breaks on an empty frame.
    df['Method_Direction'] = df['Method'] + ':' + df['Direction']
    df['TF'] = df['Term'].str.split(' ').str[0].str.split('_').str[0]
    df['Cell'] = gene_set_parts.str[0].str.split('_').str[1]
    df['Batch'] = gene_set_parts.str[0]

# %%
createResultsDf(dex_chea2022_df)

# %%
# full_df is an alias of dex_chea2022_df (not a copy); later cells rely on that.
full_df = dex_chea2022_df

up_df = full_df[full_df['Direction'] == 'up']
down_df = full_df[full_df['Direction'] == 'down']
combined_df = full_df[full_df['Direction'] == 'combined']
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Rank
Method
ranksum388.5
fc426.0
limma562.1
ttest587.3
limma-voom600.1
cd663.9
\n", "
" ], "text/plain": [ " Rank\n", "Method \n", "ranksum 388.5\n", "fc 426.0\n", "limma 562.1\n", "ttest 587.3\n", "limma-voom 600.1\n", "cd 663.9" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Mean rank of NR1I2 terms from ChEA 2022 for down genes from each method." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Rank
Method
limma-voom37.3
cd52.8
limma58.5
ttest128.1
fc326.9
ranksum415.8
\n", "
" ], "text/plain": [ " Rank\n", "Method \n", "limma-voom 37.3\n", "cd 52.8\n", "limma 58.5\n", "ttest 128.1\n", "fc 326.9\n", "ranksum 415.8" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Mean rank of NR1I2 terms from ChEA 2022 for combined up and down genes from each method." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Rank
Method
limma-voom271.2
limma280.7
cd332.3
fc379.5
ttest404.0
ranksum420.1
\n", "
# %%
# Mean rank of the knocked-out gene's terms per method, for each direction.
direction_summaries = [
    ('up', up_df),
    ('down', down_df),
    ('combined up and down', combined_df),
]
for direction_label, direction_df in direction_summaries:
    display(Markdown(f"Mean rank of {ko_gene} terms from ChEA 2022 for {direction_label} genes from each method."))
    display(direction_df.groupby(['Method']).mean(numeric_only=True).sort_values(['Rank', 'Method']))

# %% [markdown]
# ## Random results

# %%
# Random baseline: rank the same library with randomly drawn gene sets.
# Local import because the notebook's import cell only exposes `sample`;
# a dedicated seeded generator keeps the baseline reproducible across runs.
import random

random_seed = 42
n_random_iterations = 10
# 100 matches the up/down list length, 200 the combined list length.
random_set_sizes = [100, 200]

rng = random.Random(random_seed)
background_genes = combined_expr_df.index.tolist()

random_arr_chea2022 = []
for _ in range(n_random_iterations):
    for set_size in random_set_sizes:
        random_genes = rng.sample(background_genes, set_size)
        random_arr_chea2022.append(enrich(random_genes, chea2022, f'random:{set_size}'))

rand_chea2022_df = pd.concat(random_arr_chea2022, axis=0)
rand_chea2022_df['Library'] = 'ChEA 2022'
# Same TF parsing as createResultsDf (first space-delimited token, up to the first '_').
rand_chea2022_df['TF'] = rand_chea2022_df['Term'].str.split(' ').str[0].str.split('_').str[0]

# The baseline must track the gene this notebook benchmarks. The previous
# hard-coded 'NR0B1' compared NR1I2 results against another TF's random ranks.
# `.copy()` so that adding the 'Method' column does not write into a slice.
rand_df = rand_chea2022_df[rand_chea2022_df['TF'] == ko_gene].copy()
rand_df['Method'] = 'random'

# %% [markdown]
# ## Boxplots

# %%
# Reduce the plate-qualified label to the bare cell line ('A375.311' -> 'A375').
# Splitting on '.' also maps 'PC3.311B' to 'PC3', which matches the per-batch
# plot; stripping the literal '.311' left it as 'PC3B'.
full_df['Cell'] = full_df['Cell'].str.split('.').str[0]
# %%
# Shared colours, output location and helpers for the box plots below.
direction_colors = {
    'up': '#648FFF',
    'down': '#DC267F',
    'combined': '#785EF0'
}
method_colors = {
    'cd': '#648FFF',
    'limma': '#785EF0',
    'limma-voom': '#DA79FF',
    'fc': '#DC267F',
    'ttest': '#FE6100',
    'ranksum': '#FFB000'
}
# TODO: machine-specific absolute path; point this at a project-relative
# directory before running elsewhere.
figure_dir = '/Users/maayanlab/Documents/manuscripts/dex-benchmark/revised_figures'


def meanRankOrder(results_df, by):
    """Return the group keys of ``by`` ordered by ascending mean ``Rank``.

    ``numeric_only=True`` keeps the string columns out of the mean; without it
    pandas >= 2.0 raises a TypeError.
    """
    return results_df.groupby(by).mean(numeric_only=True).sort_values('Rank').index


def orderedBy(results_df, column, categories):
    """Return a copy of ``results_df`` sorted by ``column`` in ``categories`` order.

    ``column`` is replaced by a categorical with the given category order in the
    returned copy only; ``results_df`` itself is left untouched, so plotting
    cells can be re-run in any order.
    """
    return results_df.assign(
        **{column: pd.Categorical(results_df[column], categories)}
    ).sort_values(by=[column])


def rankBoxFigure(boxes, random_ranks, title, x_title):
    """Build a figure with one box per entry of ``boxes`` plus a black 'random' box.

    Parameters
    ----------
    boxes : iterable of (label, ranks, color)
        Boxes to draw, in display order.
    random_ranks : list
        Ranks of the random baseline.
    title, x_title : str
        Figure title and x-axis title.
    """
    fig = go.Figure()
    for label, ranks, color in boxes:
        fig.add_trace(go.Box(y=ranks, name=label, marker_color=color))
    fig.add_trace(go.Box(y=random_ranks, name='random', marker_color='black'))
    fig.update_layout(
        title_text=title,
        xaxis={
            'title': {'text': x_title},
        },
        yaxis={
            'title': {'text': 'Rank'}
        },
        showlegend=False
    )
    fig.update_xaxes(tickangle=45)
    return fig


def methodDirectionBoxes(results_df, color_of):
    """Return (label, ranks, color) boxes per Method:Direction, best mean rank first.

    ``color_of`` maps a ``'method:direction'`` key to the box colour.
    """
    return [
        (
            method_direction.replace('fc', 'logfc'),
            results_df.loc[results_df['Method_Direction'] == method_direction, 'Rank'].tolist(),
            color_of(method_direction),
        )
        for method_direction in meanRankOrder(results_df, 'Method_Direction')
    ]


def groupedDirectionBoxFigure(results_df, random_df, x_col, x_title):
    """Build a grouped box plot of ``Rank`` against ``x_col``, one trace per direction.

    A black 'random' trace is added from ``random_df`` using its ``Method``
    column as the x value.
    """
    fig = go.Figure()
    for direction in ['up', 'combined', 'down']:
        direction_df = results_df[results_df['Direction'] == direction]
        fig.add_trace(
            go.Box(
                x=direction_df[x_col],
                y=direction_df['Rank'],
                name=direction,
                marker_color=direction_colors[direction]
            )
        )
    fig.add_trace(
        go.Box(
            x=random_df['Method'],
            y=random_df['Rank'],
            name='random',
            marker_color='black'
        )
    )
    fig.update_layout(
        width=800,
        boxmode='group',
        boxgap=0.1,
        xaxis={
            'title': {'text': x_title},
        },
        yaxis={
            'title': {'text': 'Gene Set Rank'}
        },
        legend_title_text="Direction"
    )
    return fig

# %%
# Method:Direction boxes coloured by direction.
fig_by_direction = rankBoxFigure(
    methodDirectionBoxes(full_df, lambda key: direction_colors[key.split(':')[1]]),
    rand_df['Rank'].tolist(),
    title=f"{ko_gene} Term Rankings for L1000 Gene Sets by Method and Direction",
    x_title='Method:Direction',
)
fig_by_direction.show("png")

# %%
# Grouped by method; methods ordered by their best-ranked Method:Direction.
method_order = meanRankOrder(full_df, ['Method_Direction']).map(lambda key: key.split(':')[0]).unique()
fig_grouped_method = groupedDirectionBoxFigure(
    orderedBy(full_df, 'Method', method_order), rand_df, x_col='Method', x_title='Method'
)
fig_grouped_method.show("png")
fig_grouped_method.write_image(f'{figure_dir}/4_{ko_gene}_1_300dpi.png', scale=(800/300))

# %%
# Method:Direction boxes coloured by method.
fig_by_method = rankBoxFigure(
    methodDirectionBoxes(full_df, lambda key: method_colors[key.split(':')[0]]),
    rand_df['Rank'].tolist(),
    title=f"{ko_gene} Term Rankings for L1000 Gene Sets by Method and Direction",
    x_title='Method:Direction',
)
fig_by_method.show("png")

# %%
# One box per (direction, batch), best mean rank first.
batch_df = full_df.assign(Batch=full_df['Gene_Set'].str.split(':').str[0])
ranks_by_direction_batch = batch_df.groupby(['Direction', 'Batch'])['Rank']
fig_by_batch = rankBoxFigure(
    [
        (
            batch.split('_')[1].split('.')[0] + ' ' + direction,
            ranks_by_direction_batch.get_group((direction, batch)).tolist(),
            direction_colors[direction],
        )
        for direction, batch in meanRankOrder(batch_df, ['Direction', 'Batch'])
    ],
    rand_df['Rank'].tolist(),
    title=f"{ko_gene} Term Rankings for L1000 Gene Sets by Batch",
    x_title='Cell Line',
)
fig_by_batch.show("png")

# %%
# Grouped by cell line; cell lines ordered by their best-ranked (direction, cell).
cell_order = meanRankOrder(full_df, ['Direction', 'Cell']).get_level_values('Cell').unique()
fig_grouped_cell = groupedDirectionBoxFigure(
    orderedBy(full_df, 'Cell', cell_order), rand_df, x_col='Cell', x_title='Cell Line'
)
fig_grouped_cell.show("png")
fig_grouped_cell.write_image(f'{figure_dir}/4_{ko_gene}_2_300dpi.png', scale=(800/300))