# Retrieving L1000 Data with SigComLINCS API

In [17]:
import requests
import pandas as pd
import h5py
import os.path

In [2]:
# Base URL of the SigCom LINCS metadata API; the cells below append endpoint
# paths such as "/libraries/find" and "/signatures/find" to it.
metadata_api = "https://maayanlab.cloud/sigcom-lincs/metadata-api"

## Chemical Perturbation Signatures

In [3]:
# Enter perturbagen of interest. The value is sent as the `meta.pert_name`
# filter of the signature query and is also used (capitalized) in the name of
# the exported TSV file.
chempert = 'dexamethasone'

In [4]:
# Full-text search of the library endpoint for the chemical perturbation
# library; the JSON response is displayed as the cell output.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "Chemical Perturbation 2021"
            }
        }
    }
}

res = requests.post(metadata_api + "/libraries/find", json=payload)
# Fail here on an HTTP error instead of handing an error body to later cells,
# where it would surface as a confusing KeyError/IndexError.
res.raise_for_status()
res.json()

[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json',
  'id': '54198d6e-fe17-5ef8-91ac-02b425761653',
  'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d',
  'dataset': 'l1000_cp',
  'dataset_type': 'rank_matrix',
  'meta': {'date': '2021-06-10',
   'icon': './static/images/lincs/CMap.png',
   'size': '35.57 GB',
   'assay': 'L1000 mRNA profiling assay',
   'center': 'LINCS Center for Transcriptomics (Broad Institute)',
   '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json',
   'total_size': 35565630496,
   'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/cp',
   'datalevel_5': {'id': 'L1000_cp',
    'date': '2021-06-10',
    'link': 'https://clue.io/data/CMap2020#LINCS2020',
    'size': '35.57 GB',
    'version': '1',
    'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/cp_coeff_mat.gctx',
    'filesize': 35565630496,
    'datalevel': 5,
    'description':

In [5]:
# Keep the first library record returned by the search.
# NOTE: `res` is reassigned by the library searches in later sections, so this
# cell must run directly after the chemical perturbation search.
chem_pert_dataset = res.json()[0]
# Dataset identifier of that library (the `dataset` field of the record).
chem_dataset = chem_pert_dataset['dataset']

In [7]:
# Query the signatures endpoint for records that belong to the chemical
# perturbation library and carry the chosen perturbagen name.
libid = chem_pert_dataset["id"]
payload = {
    "filter": {
        "where": {
            "meta.pert_name": chempert,
            "library": libid
        }
    }
}

chem_res = requests.post(metadata_api + "/signatures/find", json=payload)
# Surface HTTP failures immediately rather than parsing an error payload as
# if it were a list of signatures.
chem_res.raise_for_status()
chem_signatures = chem_res.json()

In [8]:
# Number of signature records returned by the query.
len(chem_signatures)

467

In [9]:
# Show the first record to inspect which `meta` fields are available.
chem_signatures[0]

{'$validator': '/dcic/signature-commons-schema/v5/core/signature.json',
 'id': '0151d673-edfa-58eb-a0f4-9d163eda17ca',
 'library': '54198d6e-fe17-5ef8-91ac-02b425761653',
 'meta': {'md5': 'f96c2a71ca38b0ec02cc03e99b0fcf12',
  'sha256': '405756e3568ae98df502ecb5e7c3ec7386b0bc8527b0a00907066fe10fdb9317',
  'tissue': 'skeletal muscle organ',
  'anatomy': 'UBERON:0014892',
  'cmap_id': 'CPC015_SKB_24H:BRD-K47635719-001-03-9:10',
  'version': 1,
  'filename': 'L1000_LINCS_DCIC_CPC015_SKB_24H_C04_dexamethasone_10uM.tsv',
  'local_id': 'CPC015_SKB_24H_C04_dexamethasone_10uM',
  'cell_line': 'SKB',
  'pert_dose': '10 uM',
  'pert_name': 'dexamethasone',
  'pert_time': '24 h',
  'pert_type': 'Chemical',
  '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/l1000_signatures.json',
  'data_level': 5,
  'pubchem_id': 5743,
  'creation_time': '2021-05-15',
  'persistent_id': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/cp/L1000_LINCS_DCIC_CPC015_SKB_24

In [11]:
# One row per signature, one column per `meta` key. Building the frame from
# the full list of records in a single call avoids the quadratic cost of
# concatenating row by row and also copes with an empty result (which then
# yields an empty frame instead of an IndexError). Rows are numbered 0..n-1.
chem_table = pd.DataFrame([signature['meta'] for signature in chem_signatures])

In [18]:
# Restrict the table to the metadata columns of interest and renumber the
# rows from zero, discarding the previous index.
chem_table = chem_table.loc[:, [
    'tissue', 'disease', 'cell_line', 'pert_dose', 'pert_name',
    'pert_time', 'pert_type', 'data_level', 'creation_time', 'persistent_id',
]].reset_index(drop=True)

In [25]:
# Rewrite each URL: append ".gz" to the ".tsv" extension and swap the
# "LINCS-sigs-2021/cd/cp" path segment for "LINCS-data-2020/L1000/compound".
# The negative lookahead leaves ".tsv" alone when ".gz" already follows it, so
# re-running this cell does not produce ".tsv.gz.gz". The `.str` accessor also
# passes missing values through instead of raising.
chem_table['persistent_id'] = (
    chem_table['persistent_id']
    .str.replace(r'\.tsv(?!\.gz)', '.tsv.gz', regex=True)
    .str.replace('LINCS-sigs-2021/cd/cp', 'LINCS-data-2020/L1000/compound', regex=False)
)

In [26]:
# Batch label = the first three underscore-separated tokens that follow
# 'DCIC_' in each URL.
chem_table['batch'] = [
    '_'.join(url.split('DCIC_')[1].split('_')[:3])
    for url in chem_table['persistent_id']
]
chem_table.head()

Unnamed: 0,tissue,disease,cell_line,pert_dose,pert_name,pert_time,pert_type,data_level,creation_time,persistent_id,batch
0,skeletal muscle organ,,SKB,10 uM,dexamethasone,24 h,Chemical,5,2021-05-15,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,CPC015_SKB_24H
1,prostate gland,prostate adenocarcinoma,VCAP,10 uM,dexamethasone,6 h,Chemical,5,2021-05-11,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,CPC009_VCAP_6H
2,prostate gland,prostate adenocarcinoma,PC3,10 uM,dexamethasone,6 h,Chemical,5,2021-05-10,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,CPC009_PC3_6H
3,lung,lung cancer,HCC15,10 uM,dexamethasone,6 h,Chemical,5,2021-05-08,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,CPC006_HCC15_6H
4,kidney,,HA1E,10 uM,dexamethasone,6 h,Chemical,5,2021-05-08,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,CPC006_HA1E_6H


In [87]:
# Export the table as a tab-separated file named after the capitalized
# perturbagen; the row index is not written.
chem_table.to_csv(f"{chempert.capitalize()}_L1000_ChemPert_data.tsv", sep='\t', index=False)

## Overexpression

In [27]:
# Set overexpression perturbagen. The value is sent as the `meta.pert_name`
# filter of the signature query and is used in the exported file name.
oepert = 'NR3C1'

In [28]:
# Full-text search of the library endpoint for the overexpression library;
# the JSON response is displayed as the cell output.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "Overexpression Perturbations 2021"
            }
        }
    }
}

res = requests.post(metadata_api + "/libraries/find", json=payload)
# Fail here on an HTTP error instead of handing an error body to later cells.
res.raise_for_status()
res.json()

[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json',
  'id': 'ef9389a8-53d3-50db-90cc-57e7d150b76c',
  'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d',
  'dataset': 'l1000_oe',
  'dataset_type': 'rank_matrix',
  'meta': {'date': '2021-06-10',
   'icon': './static/images/lincs/CMap.png',
   'size': '1.69 GB',
   'assay': 'L1000 mRNA profiling assay',
   'center': 'LINCS Center for Transcriptomics (Broad Institute)',
   '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json',
   'total_size': 1693041160,
   'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/oe',
   'datalevel_5': {'id': 'L1000_oe',
    'date': '2021-06-10',
    'link': 'https://clue.io/data/CMap2020#LINCS2020',
    'size': '1.69 GB',
    'version': '1',
    'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/oe_coeff_mat.gctx',
    'filesize': 1693041160,
    'datalevel': 5,
    'description': 'LI

In [29]:
# Keep the first library record returned by the search.
# NOTE: `res` is shared with the other sections, so this cell must run
# directly after the overexpression library search.
oe_pert_dataset = res.json()[0]
# Dataset identifier of that library (the `dataset` field of the record).
oe_dataset = oe_pert_dataset['dataset']

In [30]:
# Query the signatures endpoint for records that belong to the overexpression
# library and carry the chosen perturbagen name.
libid = oe_pert_dataset["id"]
payload = {
    "filter": {
        "where": {
            "library": libid,
            "meta.pert_name": oepert
        }
    }
}

oe_res = requests.post(metadata_api + "/signatures/find", json=payload)
# Surface HTTP failures immediately rather than parsing an error payload as
# if it were a list of signatures.
oe_res.raise_for_status()
oe_signatures = oe_res.json()

In [31]:
# Number of signature records returned by the query.
len(oe_signatures)

18

In [32]:
# One row per signature, one column per `meta` key. A single constructor call
# replaces the row-by-row concatenation (quadratic copying) and yields an
# empty frame, not an IndexError, when the query returned nothing.
oe_table = pd.DataFrame([signature['meta'] for signature in oe_signatures])

In [33]:
# Restrict the table to the metadata columns of interest and renumber the
# rows from zero, discarding the previous index.
oe_table = oe_table.loc[:, [
    'tissue', 'disease', 'cell_line', 'pert_name', 'pert_time',
    'pert_type', 'data_level', 'creation_time', 'persistent_id', 'pert_dose',
]].reset_index(drop=True)

In [35]:
# Rewrite each URL: append ".gz" to the ".tsv" extension and swap the
# "LINCS-sigs-2021/cd/oe" path segment for "LINCS-data-2020/L1000/oe".
# The negative lookahead keeps the cell idempotent (no ".tsv.gz.gz" on a
# re-run), and the `.str` accessor passes missing values through.
oe_table['persistent_id'] = (
    oe_table['persistent_id']
    .str.replace(r'\.tsv(?!\.gz)', '.tsv.gz', regex=True)
    .str.replace('LINCS-sigs-2021/cd/oe', 'LINCS-data-2020/L1000/oe', regex=False)
)

In [36]:
# Batch label = the first three underscore-separated tokens that follow
# 'DCIC_' in each URL.
oe_table['batch'] = [
    '_'.join(url.split('DCIC_')[1].split('_')[:3])
    for url in oe_table['persistent_id']
]
oe_table.head()

Unnamed: 0,tissue,disease,cell_line,pert_name,pert_time,pert_type,data_level,creation_time,persistent_id,pert_dose,batch
0,prostate gland,prostate adenocarcinoma,PC3,NR3C1,96 h,Overexpression,5,2021-05-19,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,,OEB005_PC3_96H
1,breast,breast adenocarcinoma,MCF7,NR3C1,96 h,Overexpression,5,2021-05-19,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,,OEB005_MCF7_96H
2,intestine,colon adenocarcinoma,HT29,NR3C1,96 h,Overexpression,5,2021-05-19,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,,OEB005_HT29_96H
3,liver,carcinoma,HEPG2,NR3C1,96 h,Overexpression,5,2021-05-19,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,,OEB005_HEPG2_96H
4,lung,lung cancer,HCC515,NR3C1,96 h,Overexpression,5,2021-05-19,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,,OEB005_HCC515_96H


In [89]:
# Export the table as a tab-separated file named after the perturbagen; the
# row index is not written.
oe_table.to_csv(f"{oepert}_L1000_OE_data.tsv", sep='\t', index=False)

## shRNA Knockdown

In [43]:
# Set shRNA knockdown perturbagen. The value is sent as the `meta.pert_name`
# filter of the signature query and is used in the exported file name.
shrnapert = 'NR3C1'

In [51]:
# Full-text search of the library endpoint for the shRNA library; the JSON
# response is displayed as the cell output.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "shRNA Perturbations 2021"
            }
        }
    }
}

res = requests.post(metadata_api + "/libraries/find", json=payload)
# Fail here on an HTTP error instead of handing an error body to later cells.
res.raise_for_status()
res.json()

[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json',
  'id': '8f1ff550-ece8-591d-a213-2763f854c008',
  'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d',
  'dataset': 'l1000_shRNA',
  'dataset_type': 'rank_matrix',
  'meta': {'date': '2021-06-10',
   'icon': './static/images/lincs/CMap.png',
   'size': '7.83 GB',
   'assay': 'L1000 mRNA profiling assay',
   'center': 'LINCS Center for Transcriptomics (Broad Institute)',
   '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json',
   'total_size': 7826067080,
   'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/shRNA',
   'datalevel_5': {'id': 'L1000_shRNA',
    'date': '2021-06-10',
    'link': 'https://clue.io/data/CMap2020#LINCS2020',
    'size': '7.83 GB',
    'version': '1',
    'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/shRNA_coeff_mat.gctx',
    'filesize': 7826067080,
    'datalevel': 5,
    'descr

In [52]:
# Keep the first library record returned by the search.
# NOTE: `res` is shared with the other sections, so this cell must run
# directly after the shRNA library search.
shrna_pert_dataset = res.json()[0]
# Dataset identifier of that library (the `dataset` field of the record).
shrna_dataset = shrna_pert_dataset['dataset']

In [53]:
# Query the signatures endpoint for records that belong to the shRNA library
# and carry the chosen perturbagen name.
libid = shrna_pert_dataset["id"]
payload = {
    "filter": {
        "where": {
            "library": libid,
            "meta.pert_name": shrnapert
        }
    }
}

shrna_res = requests.post(metadata_api + "/signatures/find", json=payload)
# Surface HTTP failures immediately rather than parsing an error payload as
# if it were a list of signatures.
shrna_res.raise_for_status()
shrna_signatures = shrna_res.json()

In [54]:
# Number of signature records returned by the query.
len(shrna_signatures)

96

In [55]:
# One row per signature, one column per `meta` key. A single constructor call
# replaces the row-by-row concatenation (quadratic copying) and yields an
# empty frame, not an IndexError, when the query returned nothing.
shrna_table = pd.DataFrame([signature['meta'] for signature in shrna_signatures])

In [57]:
# Restrict the table to the metadata columns of interest and renumber the
# rows from zero, discarding the previous index.
shrna_table = shrna_table.loc[:, [
    'tissue', 'disease', 'cell_line', 'pert_name', 'pert_time',
    'pert_type', 'data_level', 'creation_time', 'persistent_id',
]].reset_index(drop=True)

In [59]:
# Rewrite each URL: append ".gz" to the ".tsv" extension and swap the
# "LINCS-sigs-2021/cd/shRNA" path segment for "LINCS-data-2020/L1000/shRNA".
# The negative lookahead keeps the cell idempotent (no ".tsv.gz.gz" on a
# re-run), and the `.str` accessor passes missing values through.
shrna_table['persistent_id'] = (
    shrna_table['persistent_id']
    .str.replace(r'\.tsv(?!\.gz)', '.tsv.gz', regex=True)
    .str.replace('LINCS-sigs-2021/cd/shRNA', 'LINCS-data-2020/L1000/shRNA', regex=False)
)

In [60]:
# Batch label = the first three underscore-separated tokens that follow
# 'DCIC_' in each URL.
shrna_table['batch'] = [
    '_'.join(url.split('DCIC_')[1].split('_')[:3])
    for url in shrna_table['persistent_id']
]
shrna_table.head()

Unnamed: 0,tissue,disease,cell_line,pert_name,pert_time,pert_type,data_level,creation_time,persistent_id,batch
0,prostate gland,prostate adenocarcinoma,VCAP,NR3C1,120 h,shRNA,5,2021-05-13,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,KDC007_VCAP_120H
1,prostate gland,prostate adenocarcinoma,PC3,NR3C1,96 h,shRNA,5,2021-05-13,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,KDC007_PC3_96H
2,prostate gland,prostate adenocarcinoma,VCAP,NR3C1,120 h,shRNA,5,2021-05-15,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,KDC007_VCAP_120H
3,prostate gland,prostate adenocarcinoma,VCAP,NR3C1,120 h,shRNA,5,2021-05-15,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,KDC007_VCAP_120H
4,breast,breast adenocarcinoma,MCF7,NR3C1,96 h,shRNA,5,2021-05-13,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,KDC007_MCF7_96H


In [None]:
# Export the table as a tab-separated file named after the perturbagen; the
# row index is not written.
shrna_table.to_csv(f"{shrnapert}_L1000_shRNA_data.tsv", sep='\t', index=False)

## CRISPR Knockout

In [3]:
# Set CRISPR knockout perturbagen. The value is sent as the `meta.pert_name`
# filter of the signature query and is used in the exported file name.
crisprkopert = 'NR1I2'

In [4]:
# Full-text search of the library endpoint for the CRISPR library; the JSON
# response is displayed as the cell output.
payload = {
    "filter": {
        "where": {
            "meta": {
                "fullTextSearch": "CRISPR Perturbations 2021"
            }
        }
    }
}

res = requests.post(metadata_api + "/libraries/find", json=payload)
# Fail here on an HTTP error instead of handing an error body to later cells.
res.raise_for_status()
res.json()

[{'$validator': '/dcic/signature-commons-schema/v5/core/library.json',
  'id': '96c7b8c5-1eca-5764-88e4-e4ccaee6603f',
  'resource': 'f2bace34-022d-4147-9ca4-7b6e450c794d',
  'dataset': 'l1000_xpr',
  'dataset_type': 'rank_matrix',
  'meta': {'date': '2021-06-10',
   'icon': './static/images/lincs/CMap.png',
   'size': '6.98 GB',
   'assay': 'L1000 mRNA profiling assay',
   'center': 'LINCS Center for Transcriptomics (Broad Institute)',
   '$validator': 'https://raw.githubusercontent.com/MaayanLab/sigcom-lincs/main/validators/lincs_datasets.json',
   'total_size': 6980690856,
   'url_prefix': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/cd/xpr',
   'datalevel_5': {'id': 'L1000_xpr',
    'date': '2021-06-10',
    'link': 'https://clue.io/data/CMap2020#LINCS2020',
    'size': '6.98 GB',
    'version': '1',
    'file_url': 'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/xpr_coeff_mat.gctx',
    'filesize': 6980690856,
    'datalevel': 5,
    'description':

In [5]:
# Keep the first library record returned by the search.
# NOTE: `res` is shared with the other sections, so this cell must run
# directly after the CRISPR library search.
crisprko_pert_dataset = res.json()[0]
# Dataset identifier of that library (the `dataset` field of the record).
crisprko_dataset = crisprko_pert_dataset['dataset']

In [6]:
# Query the signatures endpoint for records that belong to the CRISPR library
# and carry the chosen perturbagen name.
libid = crisprko_pert_dataset["id"]
payload = {
    "filter": {
        "where": {
            "library": libid,
            "meta.pert_name": crisprkopert
        }
    }
}

crisprko_res = requests.post(metadata_api + "/signatures/find", json=payload)
# Surface HTTP failures immediately rather than parsing an error payload as
# if it were a list of signatures.
crisprko_res.raise_for_status()
crisprko_signatures = crisprko_res.json()

In [7]:
# Number of signature records returned by the query.
len(crisprko_signatures)

20

In [8]:
# One row per signature, one column per `meta` key. A single constructor call
# replaces the row-by-row concatenation (quadratic copying) and yields an
# empty frame, not an IndexError, when the query returned nothing.
crisprko_table = pd.DataFrame([signature['meta'] for signature in crisprko_signatures])

In [9]:
# Restrict the table to the metadata columns of interest and renumber the
# rows from zero, discarding the previous index.
crisprko_table = crisprko_table.loc[:, [
    'tissue', 'disease', 'cell_line', 'pert_name', 'pert_time',
    'pert_type', 'data_level', 'creation_time', 'persistent_id',
]].reset_index(drop=True)

In [10]:
# Rewrite each URL: append ".gz" to the ".tsv" extension and swap the
# "LINCS-sigs-2021/cd/xpr" path segment for "LINCS-data-2020/L1000/xpr".
# The negative lookahead keeps the cell idempotent (no ".tsv.gz.gz" on a
# re-run), and the `.str` accessor passes missing values through.
crisprko_table['persistent_id'] = (
    crisprko_table['persistent_id']
    .str.replace(r'\.tsv(?!\.gz)', '.tsv.gz', regex=True)
    .str.replace('LINCS-sigs-2021/cd/xpr', 'LINCS-data-2020/L1000/xpr', regex=False)
)

In [11]:
# Batch label = the first three underscore-separated tokens that follow
# 'DCIC_' in each URL.
crisprko_table['batch'] = [
    '_'.join(url.split('DCIC_')[1].split('_')[:3])
    for url in crisprko_table['persistent_id']
]
crisprko_table.head()

Unnamed: 0,tissue,disease,cell_line,pert_name,pert_time,pert_type,data_level,creation_time,persistent_id,batch
0,brain,astrocytoma,U251MG,NR1I2,96 h,CRISPR Knockout,5,2021-05-23,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,XPR010_U251MG.311_96H
1,brain,astrocytoma,U251MG,NR1I2,96 h,CRISPR Knockout,5,2021-05-23,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,XPR010_U251MG.311_96H
2,pancreas,pancreatic carcinoma,YAPC,NR1I2,96 h,CRISPR Knockout,5,2021-05-23,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,XPR010_YAPC.311_96H
3,pancreas,pancreatic carcinoma,YAPC,NR1I2,96 h,CRISPR Knockout,5,2021-05-23,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,XPR010_YAPC.311_96H
4,prostate gland,prostate adenocarcinoma,PC3,NR1I2,96 h,CRISPR Knockout,5,2021-05-23,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,XPR010_PC3.311B_96H


In [28]:
# Export the table as a tab-separated file named after the perturbagen; the
# row index is not written.
crisprko_table.to_csv(f"{crisprkopert}_L1000_CRISPRKO_data.tsv", sep='\t', index=False)

## Controls

In [12]:
# Load the listing of control samples from a local tab-separated file. The
# next cell reads each row's `persistent_id` as the location of a data file.
ctl_table = pd.read_csv("L1000_Controls.tsv", sep='\t')
ctl_table.head()

Unnamed: 0,local_id,persistent_id,batch
0,L1000_LINCS_DCIC_2021_ABY001_A375_XH_B03_UnTrt...,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,ABY001_A375_XH
1,L1000_LINCS_DCIC_2021_ABY001_A375_XH_B04_UnTrt...,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,ABY001_A375_XH
2,L1000_LINCS_DCIC_2021_ABY001_A375_XH_B05_UnTrt...,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,ABY001_A375_XH
3,L1000_LINCS_DCIC_2021_ABY001_A375_XH_B06_UnTrt...,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,ABY001_A375_XH
4,L1000_LINCS_DCIC_2021_ABY001_A375_XH_B07_UnTrt...,https://lincs-dcic.s3.amazonaws.com/LINCS-data...,ABY001_A375_XH


In [20]:
# Collect every control profile listed in `ctl_table` into one HDF5 file laid
# out with GCTX-style dataset paths. The work is skipped when the output file
# already exists, so the file must only ever appear once it is complete.
ctl_gctx_path = 'Dexamethasone_L1000_Controls_fulldata.gctx'

if not os.path.exists(ctl_gctx_path):
    l1000_ctl_sample_list = []   # one value vector per sample column
    l1000_ctl_sampid_list = []   # sample column names, same order as above
    ctl_row_ids = None           # row labels of the last file read successfully

    for row in ctl_table.itertuples():
        try:
            temp_df = pd.read_csv(row.persistent_id, sep='\t', index_col=0)
        except Exception as err:
            # Best effort: report the unreadable file and move on. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print(f"Unable to access data from row {row.Index} at {row.persistent_id} ({err})")
            continue
        # Sort once per file so all vectors share the same row order.
        # NOTE(review): assumes every file has the same set of row labels — confirm.
        temp_df = temp_df.sort_index()
        ctl_row_ids = temp_df.index.to_numpy()
        for col in temp_df.columns:
            l1000_ctl_sampid_list.append(col)
            l1000_ctl_sample_list.append(temp_df[col].to_numpy())

    if ctl_row_ids is None:
        # Nothing could be read: do not leave an empty file behind, otherwise
        # the existence check above would skip the download on the next run.
        print("No control profiles could be read; no output file was written")
    else:
        # Write to a temporary name and rename at the end so an interrupted
        # write never leaves a partial file under the final name.
        ctl_tmp_path = ctl_gctx_path + '.tmp'
        with h5py.File(ctl_tmp_path, 'w') as ctl_file:
            ctl_file.create_dataset('0/DATA/0/matrix', data=l1000_ctl_sample_list)
            ctl_file.create_dataset('0/META/COL/id', data=l1000_ctl_sampid_list, dtype=h5py.string_dtype('utf-8'))
            ctl_file.create_dataset('0/META/ROW/id', data=ctl_row_ids, dtype=h5py.string_dtype('utf-8'))
        os.replace(ctl_tmp_path, ctl_gctx_path)

<HDF5 dataset "id": shape (12328,), type "|O">