Source code for maayanlab_bioinformatics.dge.limma_voom

import numpy as np
import pandas as pd
from typing import Optional, Tuple

R = None

def _lazy_load():
  global R
  if R is not None:
    return R
  #
  from rpy2.robjects import r
  #
  r('''
  suppressMessages(library("R.utils"))
  suppressMessages(library("RCurl"))
  suppressMessages(library("DESeq2"))
  suppressMessages(library("limma"))
  suppressMessages(library("statmod"))
  suppressMessages(library("edgeR"))
  diffExpression <- function(expression, design_dataframe, filter_genes=FALSE, voom_design=FALSE, adjust="BH") {
    design <- as.matrix(design_dataframe)
    # turn count matrix into a expression object compatible with edgeR
    dge <- DGEList(counts=expression)
    # filter genes
    if (isTRUE(filter_genes)) {
      keep <- filterByExpr(dge, design)
      dge <- dge[keep,]
    }
    # calculate normalization factors, here the different library sizes per sample are accounted for
    # the normalization factors are appended to the dge object and used in later steps
    dge <- calcNormFactors(dge)
    # to be honest I am not sure what exactly happens here. Limma was developed for affymetrix chips that have
    # values that follow different distributions than read counts. It will apply some sort of log transformation
    # and make it compatible with lmFit
    if (isTRUE(voom_design)) {
      v <- voom(dge, design, plot=FALSE)
    } else {
      v <- voom(dge, plot=FALSE)
    }
    # this is basically just applying a linear fit. The test will calculate how much the differentiation between controls and samples
    # improves the fit over the simplest possible model (average)
    fit <- lmFit(v, design)
    # this is what makes it differential expression from B - A, B and A are set in the design matrix
    cont.matrix <- makeContrasts(de=B-A, levels=design)
    fit <- contrasts.fit(fit, cont.matrix)
    # this will calculate moderated t-statistics using empirical bayes moderation of the standard errors towards a common value
    fit <- eBayes(fit)
    # Get results
    limma_dataframe <- topTable(fit, adjust=adjust, number=nrow(expression))
    limma_dataframe$gene_symbol <- rownames(limma_dataframe)
    #
    return (limma_dataframe)
  }
  ''')
  R = r
  return R

[docs] def limma_voom_differential_expression_design( expression: pd.DataFrame, design: pd.DataFrame, de: Tuple[str, str], filter_genes: bool = False, voom_design: bool = False, ): ''' Use R's voom and limma for differential expression. Note that this function expects the original raw gene counts. alex version: voom_design=True, filter_genes=False biojupies version: voom_design=False, filter_genes=True :param expression: (pd.DataFrame) the samples :param design: (pd.DataFrame) the design dataframe :param de: (Tuple[str, str]) ('control_column_name', 'case_column_name') :param filter_genes: (bool) Whether to perform R's `filterByExpr` during normalization :param voom_design: (bool) Whether to give R's voom function the design matrix (supervised) :return: A data frame with the results ''' expression = expression.copy() design = design.copy().loc[expression.columns] expression.columns = design.index = ['s'+str(i) for i, _ in enumerate(expression.columns)] design.columns = [ {de[0]: 'A', de[1]: 'B'}.get(col, 'c'+str(i)) for i, col in enumerate(design.columns) ] assert 'A' in design.columns and 'B' in design.columns import rpy2.robjects as ro from rpy2.robjects import pandas2ri from rpy2.robjects.conversion import localconverter r = _lazy_load() with localconverter(ro.default_converter + pandas2ri.converter): return r.diffExpression( expression, design, filter_genes=filter_genes, voom_design=voom_design, ).sort_values('t', ascending=False).set_index('gene_symbol')
[docs] def make_design_matrix(expression_df, controls, cases): expression_df = expression_df.copy() expression_df.index.name = 'index' expression_df = expression_df.reset_index().groupby('index').sum() design_df = pd.DataFrame([{'index': 's'+str(i), 'A': int(x in controls), 'B': int(x in cases)} for i, x in enumerate(expression_df.columns)]).set_index('index') expression_df.columns = ['s'+str(i) for i, _ in enumerate(expression_df.columns)] return expression_df, design_df
[docs] def limma_voom_differential_expression( controls_mat: pd.DataFrame, cases_mat: pd.DataFrame, all_data_mat: Optional[pd.DataFrame] = None, filter_genes: bool = False, voom_design: bool = False, ): ''' Use R's voom and limma for differential expression. Note that this function expects the original raw gene counts. alex version: voom_design=True, filter_genes=False biojupies version: voom_design=False, filter_genes=True :param controls_mat: (pd.DataFrame) the control samples :param cases_mat: (pd.DataFrame) the case samples :param all_data_mat: (pd.DataFrame) *all* samples (for full experiment normalization) :param filter_genes: (bool) Whether to perform R's `filterByExpr` during normalization :param voom_design: (bool) Whether to give R's voom function the design matrix (supervised) :return: A data frame with the results ''' if all_data_mat is None: all_data_mat = pd.concat([controls_mat, cases_mat], axis=1) all_data_mat.columns = all_data_mat.columns.to_flat_index() # transform input into expression/design ready for R functions expression, design = make_design_matrix(all_data_mat, set(controls_mat.columns.to_flat_index()), set(cases_mat.columns.to_flat_index())) import rpy2.robjects as ro from rpy2.robjects import pandas2ri from rpy2.robjects.conversion import localconverter r = _lazy_load() with localconverter(ro.default_converter + pandas2ri.converter): return r.diffExpression( expression, design, filter_genes=filter_genes, voom_design=voom_design, ).sort_values('t', ascending=False).set_index('gene_symbol')
[docs] def up_down_from_limma_voom(expr: pd.DataFrame, top_n: int = 600): ''' Given limma_voom_differential_expression output, produce a discrete up/down geneset :param top_n: (int) the number of genes in total to produce :return: UpDownGeneset a type with `.up` and `.down` methods corresponding to the list of genes. ''' most_significant_expr = expr.sort_values('P.Value').iloc[:top_n] return type('UpDownGeneset', tuple(), dict( up=list(most_significant_expr[most_significant_expr['logFC'] > 0].dropna().index), down=list(most_significant_expr[most_significant_expr['logFC'] < 0].dropna().index), ))