import json
import requests
import pandas as pd
# Base URL of the SigCom LINCS metadata API used by every query below.
metadata_api = "https://maayanlab.cloud/sigcom-lincs/metadata-api"

# Enter perturbagen of interest
chempert = 'dexamethasone'

# Look up the chemical-perturbation library through the API's
# "fullTextSearch" filter on the library metadata.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "Chemical Perturbation 2021"
            }
        }
    }
}
# requests applies no timeout by default; set one so a stalled server
# cannot hang the script indefinitely.
res = requests.post(metadata_api + "/libraries/find", json=payload, timeout=60)
# Surface HTTP errors here rather than parsing an error body as results.
res.raise_for_status()
res.json()
[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json', 'id': '54198d6e-fe17-5ef8-91ac-02b425761653', 'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d', 'dataset': 'l1000_cp', 'dataset_type': 'rank_matrix', 'meta': {'date': '2021-06-10', 'icon': './static/images/lincs/CMap.png', 'size': '35.57 GB', 'assay': 'L1000 mRNA profiling assay', 'center': 'LINCS Center for Transcriptomics (Broad Institute)', '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json', 'total_size': 35565630496, 'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/cp', 'datalevel_5': {'id': 'L1000_cp', 'date': '2021-06-10', 'link': 'https://clue.io/data/CMap2020#LINCS2020', 'size': '35.57 GB', 'version': '1', 'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/cp_coeff_mat.gctx', 'filesize': 35565630496, 'datalevel': 5, 'description': 'LINCS L1000 Chemical Perturbations (2021)'}, 'description': 'LINCS L1000 Chemical Perturbations (2021)', '$download_counter': 12}}]
# Use the first library returned by the search; fail with a clear message
# when the search matched nothing instead of a bare IndexError.
chem_libraries = res.json()
if not chem_libraries:
    raise LookupError("Library search returned no chemical perturbation library")
chem_pert_dataset = chem_libraries[0]
chem_dataset = chem_pert_dataset['dataset']
libid = chem_pert_dataset["id"]

# Query the signatures of that library whose meta.pert_name equals the
# chosen perturbagen.
payload = {
    "filter": {
        "where": {
            "meta.pert_name": chempert,
            "library": libid
        }
    }
}
# Explicit timeout and status check: requests does neither by default.
chem_res = requests.post(metadata_api + "/signatures/find", json=payload, timeout=60)
chem_res.raise_for_status()
chem_signatures = chem_res.json()
len(chem_signatures)
467
# Inspect the first returned signature record (its 'meta' dict feeds the table below).
chem_signatures[0]
{'$validator': '/dcic/signature-commons-schema/v5/core/signature.json', 'id': '0151d673-edfa-58eb-a0f4-9d163eda17ca', 'library': '54198d6e-fe17-5ef8-91ac-02b425761653', 'meta': {'md5': 'f96c2a71ca38b0ec02cc03e99b0fcf12', 'sha256': '405756e3568ae98df502ecb5e7c3ec7386b0bc8527b0a00907066fe10fdb9317', 'tissue': 'skeletal muscle organ', 'anatomy': 'UBERON:0014892', 'cmap_id': 'CPC015_SKB_24H:BRD-K47635719-001-03-9:10', 'version': 1, 'filename': 'L1000_LINCS_DCIC_CPC015_SKB_24H_C04_dexamethasone_10uM.tsv', 'local_id': 'CPC015_SKB_24H_C04_dexamethasone_10uM', 'cell_line': 'SKB', 'pert_dose': '10 uM', 'pert_name': 'dexamethasone', 'pert_time': '24 h', 'pert_type': 'Chemical', '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/l1000_signatures.json', 'data_level': 5, 'pubchem_id': 5743, 'creation_time': '2021-05-15', 'persistent_id': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/cp/L1000_LINCS_DCIC_CPC015_SKB_24H_C04_dexamethasone_10uM.tsv', 'size_in_bytes': 217239, 'uncompressed_size_in_bytes': 217239}}
# Metadata fields kept in the summary table, in output order.
chem_columns = [
    'tissue', 'disease', 'cell_line', 'pert_dose', 'pert_name',
    'pert_time', 'pert_type', 'data_level', 'creation_time', 'persistent_id',
]
# Build the frame from all 'meta' dicts in one pass instead of growing it
# with pd.concat row by row (quadratic copying). reindex() keeps only the
# wanted columns and fills any field that no record carries with NaN, so an
# optional key such as 'disease' being absent everywhere -- or an empty
# result list -- no longer raises.
chem_table = pd.DataFrame(
    [signature['meta'] for signature in chem_signatures]
).reindex(columns=chem_columns)

# Rewrite each link to the gzipped file under the LINCS-data-2020 prefix.
chem_table['persistent_id'] = chem_table['persistent_id'].apply(
    lambda url: url.replace('.tsv', '.tsv.gz').replace(
        'LINCS-sigs-2021/cd/cp', 'LINCS-data-2020/L1000/compound'
    )
)
# Batch label: the first three '_'-separated tokens following 'DCIC_' in
# the file name (e.g. CPC015_SKB_24H).
chem_table['batch'] = chem_table['persistent_id'].apply(
    lambda url: '_'.join(url.split('DCIC_')[1].split('_')[:3])
)
chem_table.head()
tissue | disease | cell_line | pert_dose | pert_name | pert_time | pert_type | data_level | creation_time | persistent_id | batch | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | skeletal muscle organ | NaN | SKB | 10 uM | dexamethasone | 24 h | Chemical | 5 | 2021-05-15 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | CPC015_SKB_24H |
1 | prostate gland | prostate adenocarcinoma | VCAP | 10 uM | dexamethasone | 6 h | Chemical | 5 | 2021-05-11 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | CPC009_VCAP_6H |
2 | prostate gland | prostate adenocarcinoma | PC3 | 10 uM | dexamethasone | 6 h | Chemical | 5 | 2021-05-10 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | CPC009_PC3_6H |
3 | lung | lung cancer | HCC15 | 10 uM | dexamethasone | 6 h | Chemical | 5 | 2021-05-08 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | CPC006_HCC15_6H |
4 | kidney | NaN | HA1E | 10 uM | dexamethasone | 6 h | Chemical | 5 | 2021-05-08 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | CPC006_HA1E_6H |
# Write the chemical-perturbation table as a tab-separated file named after
# the capitalized perturbagen; the row index is not written.
chem_table.to_csv(
    f"{chempert.capitalize()}_L1000_ChemPert_data.tsv",
    sep='\t',
    index=False,
)
# Set overexpression perturbagen
oepert = 'NR3C1'

# Look up the overexpression library through the API's "fullTextSearch"
# filter on the library metadata.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "Overexpression Perturbations 2021"
            }
        }
    }
}
# requests applies no timeout by default; set one so a stalled server
# cannot hang the script indefinitely.
res = requests.post(metadata_api + "/libraries/find", json=payload, timeout=60)
# Surface HTTP errors here rather than parsing an error body as results.
res.raise_for_status()
res.json()
[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json', 'id': 'ef9389a8-53d3-50db-90cc-57e7d150b76c', 'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d', 'dataset': 'l1000_oe', 'dataset_type': 'rank_matrix', 'meta': {'date': '2021-06-10', 'icon': './static/images/lincs/CMap.png', 'size': '1.69 GB', 'assay': 'L1000 mRNA profiling assay', 'center': 'LINCS Center for Transcriptomics (Broad Institute)', '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json', 'total_size': 1693041160, 'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/oe', 'datalevel_5': {'id': 'L1000_oe', 'date': '2021-06-10', 'link': 'https://clue.io/data/CMap2020#LINCS2020', 'size': '1.69 GB', 'version': '1', 'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/oe_coeff_mat.gctx', 'filesize': 1693041160, 'datalevel': 5, 'description': 'LINCS L1000 Overexpression Perturbations (2021)'}, 'description': 'LINCS L1000 Overexpression Perturbations (2021)', '$download_counter': 5}}]
# Use the first library returned by the search; fail with a clear message
# when the search matched nothing instead of a bare IndexError.
oe_libraries = res.json()
if not oe_libraries:
    raise LookupError("Library search returned no overexpression library")
oe_pert_dataset = oe_libraries[0]
oe_dataset = oe_pert_dataset['dataset']
libid = oe_pert_dataset["id"]

# Query the signatures of that library whose meta.pert_name equals the
# chosen perturbagen.
payload = {
    "filter": {
        "where": {
            "library": libid,
            "meta.pert_name": oepert
        }
    }
}
# Explicit timeout and status check: requests does neither by default.
oe_res = requests.post(metadata_api + "/signatures/find", json=payload, timeout=60)
oe_res.raise_for_status()
oe_signatures = oe_res.json()
len(oe_signatures)
18
# Metadata fields kept in the summary table, in output order.
oe_columns = [
    'tissue', 'disease', 'cell_line', 'pert_name', 'pert_time',
    'pert_type', 'data_level', 'creation_time', 'persistent_id', 'pert_dose',
]
# Build the frame from all 'meta' dicts in one pass instead of growing it
# with pd.concat row by row (quadratic copying). reindex() keeps only the
# wanted columns and fills any field that no record carries with NaN, so an
# optional key such as 'pert_dose' or 'disease' being absent everywhere --
# or an empty result list -- no longer raises.
oe_table = pd.DataFrame(
    [signature['meta'] for signature in oe_signatures]
).reindex(columns=oe_columns)

# Rewrite each link to the gzipped file under the LINCS-data-2020 prefix.
oe_table['persistent_id'] = oe_table['persistent_id'].apply(
    lambda url: url.replace('.tsv', '.tsv.gz').replace(
        'LINCS-sigs-2021/cd/oe', 'LINCS-data-2020/L1000/oe'
    )
)
# Batch label: the first three '_'-separated tokens following 'DCIC_' in
# the file name (e.g. OEB005_PC3_96H).
oe_table['batch'] = oe_table['persistent_id'].apply(
    lambda url: '_'.join(url.split('DCIC_')[1].split('_')[:3])
)
oe_table.head()
tissue | disease | cell_line | pert_name | pert_time | pert_type | data_level | creation_time | persistent_id | pert_dose | batch | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | prostate gland | prostate adenocarcinoma | PC3 | NR3C1 | 96 h | Overexpression | 5 | 2021-05-19 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | OEB005_PC3_96H |
1 | breast | breast adenocarcinoma | MCF7 | NR3C1 | 96 h | Overexpression | 5 | 2021-05-19 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | OEB005_MCF7_96H |
2 | intestine | colon adenocarcinoma | HT29 | NR3C1 | 96 h | Overexpression | 5 | 2021-05-19 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | OEB005_HT29_96H |
3 | liver | carcinoma | HEPG2 | NR3C1 | 96 h | Overexpression | 5 | 2021-05-19 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | OEB005_HEPG2_96H |
4 | lung | lung cancer | HCC515 | NR3C1 | 96 h | Overexpression | 5 | 2021-05-19 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | NaN | OEB005_HCC515_96H |
# Write the overexpression table as a tab-separated file named after the
# perturbagen; the row index is not written.
oe_table.to_csv(
    f"{oepert}_L1000_OE_data.tsv",
    sep='\t',
    index=False,
)
# Set shRNA perturbagen
shrnapert = 'NR3C1'

# Look up the shRNA library through the API's "fullTextSearch" filter on
# the library metadata.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "shRNA Perturbations 2021"
            }
        }
    }
}
# requests applies no timeout by default; set one so a stalled server
# cannot hang the script indefinitely.
res = requests.post(metadata_api + "/libraries/find", json=payload, timeout=60)
# Surface HTTP errors here rather than parsing an error body as results.
res.raise_for_status()
res.json()
[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json', 'id': '8f1ff550-ece8-591d-a213-2763f854c008', 'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d', 'dataset': 'l1000_shRNA', 'dataset_type': 'rank_matrix', 'meta': {'date': '2021-06-10', 'icon': './static/images/lincs/CMap.png', 'size': '7.83 GB', 'assay': 'L1000 mRNA profiling assay', 'center': 'LINCS Center for Transcriptomics (Broad Institute)', '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json', 'total_size': 7826067080, 'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/shRNA', 'datalevel_5': {'id': 'L1000_shRNA', 'date': '2021-06-10', 'link': 'https://clue.io/data/CMap2020#LINCS2020', 'size': '7.83 GB', 'version': '1', 'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/shRNA_coeff_mat.gctx', 'filesize': 7826067080, 'datalevel': 5, 'description': 'LINCS L1000 shRNA Perturbations (2021)'}, 'description': 'LINCS L1000 shRNA Perturbations (2021)', '$download_counter': 16}}]
# Use the first library returned by the search; fail with a clear message
# when the search matched nothing instead of a bare IndexError.
shrna_libraries = res.json()
if not shrna_libraries:
    raise LookupError("Library search returned no shRNA library")
shrna_pert_dataset = shrna_libraries[0]
shrna_dataset = shrna_pert_dataset['dataset']
libid = shrna_pert_dataset["id"]

# Query the signatures of that library whose meta.pert_name equals the
# chosen perturbagen.
payload = {
    "filter": {
        "where": {
            "library": libid,
            "meta.pert_name": shrnapert
        }
    }
}
# Explicit timeout and status check: requests does neither by default.
shrna_res = requests.post(metadata_api + "/signatures/find", json=payload, timeout=60)
shrna_res.raise_for_status()
shrna_signatures = shrna_res.json()
len(shrna_signatures)
96
# Metadata fields kept in the summary table, in output order.
shrna_columns = [
    'tissue', 'disease', 'cell_line', 'pert_name', 'pert_time',
    'pert_type', 'data_level', 'creation_time', 'persistent_id',
]
# Build the frame from all 'meta' dicts in one pass instead of growing it
# with pd.concat row by row (quadratic copying). reindex() keeps only the
# wanted columns and fills any field that no record carries with NaN, so an
# optional key such as 'disease' being absent everywhere -- or an empty
# result list -- no longer raises.
shrna_table = pd.DataFrame(
    [signature['meta'] for signature in shrna_signatures]
).reindex(columns=shrna_columns)

# Rewrite each link to the gzipped file under the LINCS-data-2020 prefix.
shrna_table['persistent_id'] = shrna_table['persistent_id'].apply(
    lambda url: url.replace('.tsv', '.tsv.gz').replace(
        'LINCS-sigs-2021/cd/shRNA', 'LINCS-data-2020/L1000/shRNA'
    )
)
# Batch label: the first three '_'-separated tokens following 'DCIC_' in
# the file name (e.g. KDC007_VCAP_120H).
shrna_table['batch'] = shrna_table['persistent_id'].apply(
    lambda url: '_'.join(url.split('DCIC_')[1].split('_')[:3])
)
shrna_table.head()
tissue | disease | cell_line | pert_name | pert_time | pert_type | data_level | creation_time | persistent_id | batch | |
---|---|---|---|---|---|---|---|---|---|---|
0 | prostate gland | prostate adenocarcinoma | VCAP | NR3C1 | 120 h | shRNA | 5 | 2021-05-13 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | KDC007_VCAP_120H |
1 | prostate gland | prostate adenocarcinoma | PC3 | NR3C1 | 96 h | shRNA | 5 | 2021-05-13 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | KDC007_PC3_96H |
2 | prostate gland | prostate adenocarcinoma | VCAP | NR3C1 | 120 h | shRNA | 5 | 2021-05-15 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | KDC007_VCAP_120H |
3 | prostate gland | prostate adenocarcinoma | VCAP | NR3C1 | 120 h | shRNA | 5 | 2021-05-15 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | KDC007_VCAP_120H |
4 | breast | breast adenocarcinoma | MCF7 | NR3C1 | 96 h | shRNA | 5 | 2021-05-13 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | KDC007_MCF7_96H |
# Write the shRNA table as a tab-separated file named after the
# perturbagen; the row index is not written.
shrna_table.to_csv(
    f"{shrnapert}_L1000_shRNA_data.tsv",
    sep='\t',
    index=False,
)
# Set CRISPR knockout perturbagen
crisprkopert = 'NR1I2'

# Look up the CRISPR library through the API's "fullTextSearch" filter on
# the library metadata.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "CRISPR Perturbations 2021"
            }
        }
    }
}
# requests applies no timeout by default; set one so a stalled server
# cannot hang the script indefinitely.
res = requests.post(metadata_api + "/libraries/find", json=payload, timeout=60)
# Surface HTTP errors here rather than parsing an error body as results.
res.raise_for_status()
res.json()
[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json', 'id': '96c7b8c5-1eca-5764-88e4-e4ccaee6603f', 'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d', 'dataset': 'l1000_xpr', 'dataset_type': 'rank_matrix', 'meta': {'date': '2021-06-10', 'icon': './static/images/lincs/CMap.png', 'size': '6.98 GB', 'assay': 'L1000 mRNA profiling assay', 'center': 'LINCS Center for Transcriptomics (Broad Institute)', '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json', 'total_size': 6980690856, 'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/xpr', 'datalevel_5': {'id': 'L1000_xpr', 'date': '2021-06-10', 'link': 'https://clue.io/data/CMap2020#LINCS2020', 'size': '6.98 GB', 'version': '1', 'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/xpr_coeff_mat.gctx', 'filesize': 6980690856, 'datalevel': 5, 'description': 'LINCS L1000 CRISPR Perturbations (2021)'}, 'description': 'LINCS L1000 CRISPR Perturbations (2021)', '$download_counter': 16}}]
# Use the first library returned by the search; fail with a clear message
# when the search matched nothing instead of a bare IndexError.
crisprko_libraries = res.json()
if not crisprko_libraries:
    raise LookupError("Library search returned no CRISPR library")
crisprko_pert_dataset = crisprko_libraries[0]
crisprko_dataset = crisprko_pert_dataset['dataset']
libid = crisprko_pert_dataset["id"]

# Query the signatures of that library whose meta.pert_name equals the
# chosen perturbagen.
payload = {
    "filter": {
        "where": {
            "library": libid,
            "meta.pert_name": crisprkopert
        }
    }
}
# Explicit timeout and status check: requests does neither by default.
crisprko_res = requests.post(metadata_api + "/signatures/find", json=payload, timeout=60)
crisprko_res.raise_for_status()
crisprko_signatures = crisprko_res.json()
len(crisprko_signatures)
20
# Metadata fields kept in the summary table, in output order.
crisprko_columns = [
    'tissue', 'disease', 'cell_line', 'pert_name', 'pert_time',
    'pert_type', 'data_level', 'creation_time', 'persistent_id',
]
# Build the frame from all 'meta' dicts in one pass instead of growing it
# with pd.concat row by row (quadratic copying). reindex() keeps only the
# wanted columns and fills any field that no record carries with NaN, so an
# optional key such as 'disease' being absent everywhere -- or an empty
# result list -- no longer raises.
crisprko_table = pd.DataFrame(
    [signature['meta'] for signature in crisprko_signatures]
).reindex(columns=crisprko_columns)

# Rewrite each link to the gzipped file under the LINCS-data-2020 prefix.
crisprko_table['persistent_id'] = crisprko_table['persistent_id'].apply(
    lambda url: url.replace('.tsv', '.tsv.gz').replace(
        'LINCS-sigs-2021/cd/xpr', 'LINCS-data-2020/L1000/xpr'
    )
)
# Batch label: the first three '_'-separated tokens following 'DCIC_' in
# the file name (e.g. XPR010_U251MG.311_96H).
crisprko_table['batch'] = crisprko_table['persistent_id'].apply(
    lambda url: '_'.join(url.split('DCIC_')[1].split('_')[:3])
)
crisprko_table.head()
tissue | disease | cell_line | pert_name | pert_time | pert_type | data_level | creation_time | persistent_id | batch | |
---|---|---|---|---|---|---|---|---|---|---|
0 | brain | astrocytoma | U251MG | NR1I2 | 96 h | CRISPR Knockout | 5 | 2021-05-23 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | XPR010_U251MG.311_96H |
1 | brain | astrocytoma | U251MG | NR1I2 | 96 h | CRISPR Knockout | 5 | 2021-05-23 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | XPR010_U251MG.311_96H |
2 | pancreas | pancreatic carcinoma | YAPC | NR1I2 | 96 h | CRISPR Knockout | 5 | 2021-05-23 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | XPR010_YAPC.311_96H |
3 | pancreas | pancreatic carcinoma | YAPC | NR1I2 | 96 h | CRISPR Knockout | 5 | 2021-05-23 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | XPR010_YAPC.311_96H |
4 | prostate gland | prostate adenocarcinoma | PC3 | NR1I2 | 96 h | CRISPR Knockout | 5 | 2021-05-23 | https://lincs-dcic.s3.amazonaws.com/LINCS-data... | XPR010_PC3.311B_96H |
# Write the CRISPR knockout table as a tab-separated file named after the
# perturbagen; the row index is not written.
crisprko_table.to_csv(
    f"{crisprkopert}_L1000_CRISPRKO_data.tsv",
    sep='\t',
    index=False,
)