# Source code for maayanlab_bioinformatics.harmonization.transcripts
import pandas as pd
from typing import Dict, Optional
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
from maayanlab_bioinformatics.utils.merge import merge
# [docs]
def transcripts_to_genes(
    df_expression: pd.DataFrame,
    df_features: Optional[pd.DataFrame] = None,
    strategy='var',
    uppercasegenes=False,
    lookup_dict: Optional[Dict[str, str]] = None,
    organism='Mammalia/Homo_sapiens',
):
    ''' Map gene alternative ids/transcripts to gene symbols using `ncbi_genes_lookup`

    We take a matrix with genes/transcripts on the rows and samples on the columns.
    In the case of multiple gene/transcript to symbol mappings, we adopt the
    collision strategy specified.

    If df_features is provided, we will use its 'symbol' column as the transcript
    names, otherwise we will use the df_expression index.

    The resulting matrix will naturally have fewer rows, one per gene symbol
    produced by the lookup.

    :param df_expression: expression matrix (rows: genes/transcripts, columns: samples)
    :param df_features: optional frame sharing df_expression's index, with a
      'symbol' column naming each row. It is never modified by this function.
    :param strategy: ('var'|'sum') collision strategy (select one with highest
      variance, or sum counts)
    :param uppercasegenes: uppercase the names before looking them up
    :param lookup_dict: one of
      - `None`: use `ncbi_genes_lookup(organism=organism)`;
      - a mapping from alias to symbol (unknown aliases map to `None`);
      - a zero-argument callable returning either of the above or a
        one-argument lookup function.
    :param organism: forwarded to `ncbi_genes_lookup` when `lookup_dict` is None
    :return: DataFrame of expression values indexed by 'symbol'
    :raises NotImplementedError: if `strategy` is not 'var' or 'sum'
    '''
    # Resolve the lookup: default to the NCBI table, or invoke a factory.
    if lookup_dict is None:
        lookup_dict = ncbi_genes_lookup(organism=organism)
    elif callable(lookup_dict):
        lookup_dict = lookup_dict()
    # The lookup is applied as a function below. Accept a plain mapping too
    # (as the type annotation advertises) by going through its `.get`.
    lookup = lookup_dict if callable(lookup_dict) else lookup_dict.get

    # Construct df_features from the expression index if not provided.
    if df_features is None:
        df_features = pd.Series(df_expression.index).to_frame('symbol')
        df_features.index = df_expression.index

    # Work on a derived Series so the caller's df_features is left untouched.
    symbols = df_features['symbol']
    if uppercasegenes:
        symbols = symbols.apply(str.upper)
    df_symbols = symbols.map(lambda s: lookup(s)).to_frame('symbol')

    # NOTE(review): `merge` is a project helper; the code below assumes it
    # joins its arguments on their shared index -- confirm against its source.
    if strategy == 'var':
        # For each symbol, pick the row label of the highest-variance transcript.
        best_transcript = merge(
            df_expression.var(axis=1).to_frame('var'),
            df_symbols,
        ).groupby('symbol')['var'].idxmax()
        # Select those rows and relabel them with their gene symbol.
        df_gene_expression = df_expression.loc[best_transcript.values]
        df_gene_expression.index = best_transcript.index
    elif strategy == 'sum':
        # Sum every transcript that maps to the same symbol.
        df_gene_expression = merge(
            df_expression,
            df_symbols,
        ).groupby('symbol').sum()
    else:
        raise NotImplementedError(
            "Unsupported strategy {!r}; expected 'var' or 'sum'".format(strategy)
        )
    return df_gene_expression