Source code for maayanlab_bioinformatics.harmonization.ncbi_genes

import pandas as pd
from functools import lru_cache
from maayanlab_bioinformatics.utils import fetch_save_read

[docs] @lru_cache() def ncbi_genes_fetch(organism='Mammalia/Homo_sapiens', filters=lambda ncbi: ncbi['type_of_gene']=='protein-coding'): ''' Fetch the current NCBI Human Gene Info database. See ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/ for the directory/file of the organism of interest. ''' def maybe_split(record): ''' NCBI Stores Nulls as '-' and lists '|' delimited ''' if record in {'', '-'}: return set() return set(record.split('|')) # def supplement_dbXref_prefix_omitted(ids): ''' NCBI Stores external IDS with Foreign:ID while most datasets just use the ID ''' for id in ids: # add original id yield id # also add id *without* prefix if ':' in id: yield id.split(':', maxsplit=1)[1] # ncbi = fetch_save_read( 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{}.gene_info.gz'.format(organism), '{}.gene_info.tsv'.format(organism), sep='\t', ) if filters and callable(filters): ncbi = ncbi[filters(ncbi)] # ncbi['All_synonyms'] = [ set.union( maybe_split(gene_info['Symbol']), maybe_split(gene_info['Symbol_from_nomenclature_authority']), maybe_split(str(gene_info['GeneID'])), maybe_split(gene_info['Synonyms']), maybe_split(gene_info['Other_designations']), maybe_split(gene_info['LocusTag']), set(supplement_dbXref_prefix_omitted(maybe_split(gene_info['dbXrefs']))), ) for _, gene_info in ncbi.iterrows() ] return ncbi
[docs] @lru_cache() def ncbi_genes_lookup(organism='Mammalia/Homo_sapiens', filters=lambda ncbi: ncbi['type_of_gene']=='protein-coding'): ''' Return a lookup dictionary with synonyms as the keys, and official symbols as the values Usage: ```python ncbi_lookup = ncbi_genes_lookup('Mammalia/Homo_sapiens') print(ncbi_lookup('STAT3')) # any alias will get converted into the official symbol ``` ''' ncbi_genes = ncbi_genes_fetch(organism=organism, filters=filters) synonyms, symbols = zip(*{ (synonym, gene_info['Symbol']) for _, gene_info in ncbi_genes.iterrows() for synonym in gene_info['All_synonyms'] }) ncbi_lookup = pd.Series(symbols, index=synonyms) index_values = ncbi_lookup.index.value_counts() ambiguous = index_values[index_values > 1].index ncbi_lookup_disambiguated = ncbi_lookup[( (ncbi_lookup.index == ncbi_lookup) | (~ncbi_lookup.index.isin(ambiguous)) )] return ncbi_lookup_disambiguated.to_dict().get