Source code for maayanlab_bioinformatics.harmonization.transcripts

import pandas as pd
from typing import Dict, Optional

from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
from maayanlab_bioinformatics.utils.merge import merge

def transcripts_to_genes(
    df_expression: pd.DataFrame,
    df_features: Optional[pd.DataFrame] = None,
    strategy='var',
    uppercasegenes=False,
    lookup_dict: Optional[Dict[str, str]] = None,
    organism='Mammalia/Homo_sapiens',
) -> pd.DataFrame:
    ''' Map gene alternative ids/transcripts to gene symbols using `ncbi_genes_lookup`

    We take a matrix with genes/transcripts on the rows and samples on the
    columns. When several genes/transcripts map to the same symbol, the
    collision `strategy` decides how they are collapsed into a single row.

    If `df_features` is provided, its 'symbol' column is used as the
    transcript names, otherwise the `df_expression` index is used.

    The resulting matrix will naturally have fewer rows: one per gene symbol
    produced by the lookup. Identifiers the lookup maps to None/NaN are
    dropped by the group-by.

    :param df_expression: expression matrix (rows: genes/transcripts, columns: samples)
    :param df_features: optional frame with a 'symbol' column; it is joined to
      `df_expression` through the project `merge` helper (presumably on the
      index -- confirm against `utils.merge`). It is never modified in place.
    :param strategy: ('var'|'sum') collision strategy (select one with highest variance, or sum counts)
    :param uppercasegenes: upper-case the identifiers before looking them up
    :param lookup_dict: one of
      - None: `ncbi_genes_lookup(organism=organism)` is used,
      - a mapping `{identifier: symbol}`: looked up with `.get`, so unknown
        identifiers are dropped,
      - a zero-argument factory returning either of the above; NOTE that any
        callable passed here is *called with no arguments* first, so a bare
        `identifier -> symbol` function must be wrapped (`lambda: fn`).
    :param organism: forwarded to `ncbi_genes_lookup` when `lookup_dict` is None
    :return: expression matrix indexed by gene symbol (index named 'symbol')
    :raises NotImplementedError: if `strategy` is not 'var' or 'sum'
    '''
    # Local import keeps the module-level import block untouched.
    from collections.abc import Mapping

    # Fail fast: reject an unknown strategy before building the default
    # lookup, which may be expensive to construct.
    if strategy not in ('var', 'sum'):
        raise NotImplementedError(
            'Unsupported collision strategy: {!r}'.format(strategy)
        )

    # Resolve the lookup into a callable `identifier -> symbol`.
    if lookup_dict is None:
        lookup = ncbi_genes_lookup(organism=organism)
    elif callable(lookup_dict):
        lookup = lookup_dict()
    else:
        lookup = lookup_dict
    if isinstance(lookup, Mapping):
        # A plain dict is not callable; `.get` yields None for unknown ids,
        # which the group-by below then discards.
        lookup = lookup.get

    # Pick the identifiers to translate, without touching the caller's frame.
    if df_features is None:
        symbols = df_expression.index.to_series()
    else:
        symbols = df_features['symbol']

    if uppercasegenes:
        # Only strings are upper-cased so NaN / non-string ids do not raise.
        symbols = symbols.map(lambda s: s.upper() if isinstance(s, str) else s)

    # `Series.map` replaces the deprecated `DataFrame.applymap`.
    df_symbols = symbols.map(lookup).to_frame('symbol')

    if strategy == 'var':
        # For every symbol keep the label of its highest-variance transcript.
        # The labels returned by idxmax are used with `df_expression.loc`, so
        # the merged frame is expected to keep the transcript index.
        df_variance = df_expression.var(axis=1).to_frame('var')
        best_transcript = (
            merge(df_variance, df_symbols)
            .groupby('symbol')['var']
            .idxmax()
        )
        # perform the actual mapping
        df_gene_expression = df_expression.loc[best_transcript.tolist()]
        df_gene_expression.index = best_transcript.index
    else:
        # strategy == 'sum': add up every transcript sharing a symbol
        df_gene_expression = (
            merge(df_expression, df_symbols)
            .groupby('symbol')
            .sum()
        )

    return df_gene_expression