Source code for maayanlab_bioinformatics.enrichment.crisp

# import fisher
import scipy.stats
from typing import Union, Dict, Set, Iterable, Tuple, Hashable, Any, TypeVar, Optional
from dataclasses import dataclass

@dataclass(frozen=True)
class FisherOverlap:
    '''
    Immutable record describing the overlap between two sets together with
    the statistics computed for that overlap.
    '''
    # p-value of the Fisher exact test for the overlap
    pvalue: float
    # odds ratio of the 2x2 contingency table built from the overlap
    odds_ratio: float
    # number of elements shared by the two sets
    n_overlap: int
    # the shared elements themselves, or None when they were not retained
    overlap: Optional[Set[Hashable]]
# Generic payload type carried alongside each hashable key.
T = TypeVar('T')

# Either a mapping of key -> value, or an iterable of (key, value) pairs.
DictOrIterableTuple = Union[Dict[Hashable, T], Iterable[Tuple[Hashable, T]]]
# A single signature: a keyed collection (values unconstrained) or a plain set.
CompatibleSignature = Union[DictOrIterableTuple[Any], Set[Hashable]]
# A keyed collection of signatures.
CompatibleSignatures = DictOrIterableTuple[CompatibleSignature]
# Stream of (term, overlap statistics) pairs.
EnrichmentResult = Iterable[Tuple[Hashable, FisherOverlap]]


def _dict_or_iterable_tuple(it: DictOrIterableTuple[T]) -> Iterable[Tuple[Hashable, T]]:
    '''
    Normalise a mapping-or-pairs argument into an iterable of (key, value)
    pairs: objects exposing a callable ``items`` attribute are unpacked
    through it, anything else is returned unchanged.
    '''
    items = getattr(it, 'items', None)
    return items() if callable(items) else it
def safe_odds_ratio(a, b, c, d):
    '''
    Compute the odds ratio ``(a / c) / (b / d)`` returning helpful answers in
    the case of division by zero issues.

    :param a: first term of the numerator ratio ``a / c``
    :param b: first term of the denominator ratio ``b / d``
    :param c: second term of the numerator ratio ``a / c``
    :param d: second term of the denominator ratio ``b / d``
    :return: ``nan`` when either ratio is ``0 / 0``; ``inf`` when the
      numerator ratio is infinite (``c == 0``) or the denominator ratio is
      zero; ``0.0`` when only the denominator ratio is infinite
      (``d == 0``); otherwise the plain quotient.
    '''
    # NaN never compares equal to anything (not even itself), so the
    # undefined 0/0 cases are detected from the inputs directly rather than
    # by comparing an intermediate result against float('nan').
    # This would only happen if you had empty signatures.
    if (a == 0 and c == 0) or (b == 0 and d == 0):
        return float('nan')
    # numerator is infinite (a != 0, c == 0):
    #  inf / inf -- *everything* is in the input set, the occurrence of the
    #               event would be *guaranteed*, so inf makes sense
    #  inf / number = inf
    if c == 0:
        return float('inf')
    # only the denominator is infinite (b != 0, d == 0): number / inf = 0
    if d == 0:
        return 0.0
    ac = float(a / c)
    bd = float(b / d)
    # finite numerator over a zero denominator
    if bd == 0:
        return float('inf')
    return ac / bd
def fisher_overlap(
    input_signature: Set[Hashable],
    background_signature: Set[Hashable],
    n_background_entities: int,
    preserve_overlap: bool = False,
) -> Optional[FisherOverlap]:
    '''
    Intersect the input set with the background set and score the
    intersection with a one-sided ('greater') Fisher exact test and an
    odds ratio.

    :param input_signature: the set being tested
    :param background_signature: the set it is tested against
    :param n_background_entities: size of the total population both sets
      are drawn from
    :param preserve_overlap: when true, the shared elements are stored on
      the result; otherwise the result's ``overlap`` is None
    :return: a FisherOverlap, or None when the two sets share nothing
    :raises Exception: when the population size leaves a negative count for
      the "in neither set" cell of the contingency table
    '''
    shared = input_signature & background_signature
    n_shared = len(shared)
    if n_shared == 0:
        return None
    n_input = len(input_signature)
    n_background = len(background_signature)
    # cells of the 2x2 contingency table
    in_both = n_shared
    input_only = n_input - n_shared
    background_only = n_background - n_shared
    in_neither = n_background_entities - n_background - n_input + n_shared
    if in_neither < 0:
        raise Exception('The total population cannot be smaller than the current overlap..')
    table = [[in_both, input_only], [background_only, in_neither]]
    # fisher_exact returns (statistic, pvalue); only the p-value is kept
    fisher_result = scipy.stats.fisher_exact(table, 'greater')
    pvalue = fisher_result[1]
    odds_ratio = safe_odds_ratio(in_both, input_only, background_only, in_neither)
    kept_overlap = shared if preserve_overlap else None
    return FisherOverlap(
        pvalue=pvalue,
        odds_ratio=odds_ratio,
        n_overlap=n_shared,
        overlap=kept_overlap,
    )
def enrich_crisp(
    input_signature: CompatibleSignature,
    background_signatures: CompatibleSignatures,
    n_background_entities: int,
    preserve_overlap: bool = False,
) -> Iterable[Tuple[Hashable, FisherOverlap]]:
    '''
    Perform crisp set enrichment analysis using fisher overlap.

    Enriches the signature in input_signature against each signature in
    background_signatures, lazily yielding ``(term, FisherOverlap)`` pairs
    for the terms that fisher_overlap returns a result for.

    :param input_signature: the signature to test; converted with ``set()``
    :param background_signatures: mapping or iterable of
      ``(term, signature)`` pairs; each signature is converted with ``set()``
    :param n_background_entities: should correspond to the approximate number
      of entities that exist, in the case of Human Genes for instance this
      might be 21000.
    :param preserve_overlap: forwarded to fisher_overlap
    '''
    query = set(input_signature)
    for term, members in _dict_or_iterable_tuple(background_signatures):
        scored = fisher_overlap(
            query,
            set(members),
            n_background_entities=n_background_entities,
            preserve_overlap=preserve_overlap,
        )
        if scored is None:
            continue
        yield term, scored