# Source code for maayanlab_bioinformatics.enrichment.gsea2005

import numpy as np
import pandas as pd

def GSEA2005(geneset_membership: pd.Series, correlations: pd.Series):
    ''' Implementation of the running-sum statistic described here:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1239896/

    Genes are ranked by the absolute value of their correlation (largest
    first). Walking down that ranking, the running sum increases by
    ``r_j / N_R`` at every gene that is in the set and decreases by
    ``1 / (N - N_H)`` at every gene that is not.

    :param geneset_membership: (pd.Series) True if in set, False if not, index: all genes.
      Values are coerced to bool, so 0/1 masks are accepted as well.
    :param correlations: (pd.Series) Correlation of a given gene, index: genes.
      Every gene in this index must be present in ``geneset_membership``.
    :return (Tuple[np.array, np.array]) x and y arrays ready to be plotted. ES = y.max()
      Both have length N + 1; ``y[0]`` is always 0.
    '''
    # r_j: |correlation| of gene_j in ranked order (strongest first)
    r_j = correlations.abs().sort_values(ascending=False)
    # S: geneset mask in the *same ranked order* as r_j. Selecting by r_j.index
    # (not correlations.index) and dropping to plain arrays keeps every
    # following operation positional, so pandas label alignment can never
    # silently reorder the running sum.
    S = geneset_membership.loc[r_j.index].astype(bool).values
    r = r_j.values
    N = len(r)                # N: number of genes
    N_H = S.sum()             # N_H: number of hits
    N_R = np.nansum(r[S])     # N_R: sum of r_j for g_j \in S (NaNs skipped)
    N_M = N - N_H             # N_M: number of misses
    # P_hit: per-position hit increment, weighted by r_j. With no hits (or all
    # hit weights zero) there is nothing to normalise, so the term stays 0
    # instead of becoming 0/0 = NaN.
    if N_R > 0:
        P_hit = S * r / N_R
    else:
        P_hit = np.zeros(N)
    # P_miss: per-position miss decrement. With no misses the term stays 0
    # instead of dividing by zero.
    if N_M > 0:
        P_miss = (~S) * 1.0 / N_M
    else:
        P_miss = np.zeros(N)
    # 0 added to beginning for plotting, doesn't affect sum
    x = np.arange(N + 1)
    y = np.concatenate([[0], np.cumsum(P_hit - P_miss)])
    return x, y