Source code for maayanlab_bioinformatics.enrichment.gsea2005

import numpy as np
import pandas as pd


[docs]
def GSEA2005(geneset_membership: pd.Series, correlations: pd.Series):
  '''
  Implementation of algorithm described here:
  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1239896/

  :param geneset_membership: (pd.Series) True if in set, False if not, index: all genes
  :param correlations: (pd.Series) Correlation of a given gene
  :return (Tuple[np.array, np.array]) x and y arrays ready to be plotted. ES = y.max()
  '''
  r_j = correlations.abs().sort_values(ascending=False) # r_j: correlation of gene_j in ranked order
  S = geneset_membership[correlations.index] # S: geneset mask aligned with r_j
  N = S.count()             # N: number of genes
  N_H = S.sum()             # N_H: number of hits
  N_R = r_j[S].sum()        # N_R: sum of r_j for g_j \in S
  P_hit =    S  * r_j/N_R   # P_hit: fraction of hits weighted by r_j
  P_miss = (~S) * 1/(N-N_H) # P_hit: fraction of misses up to position i
  # 0 added to beginning for plotting, doesn't affect sum
  x = np.arange(N + 1)
  y = np.concatenate([[0],np.cumsum(P_hit - P_miss)])
  return x, y