Source code for maayanlab_bioinformatics.parse.gmt

import contextlib
import logging
import math as m
import os
import re

import pandas as pd

def _try_load_number(s):
  try:
    return int(s)
  except ValueError:
    pass
  try:
    return float(s)
  except ValueError:
    pass
  return s

@contextlib.contextmanager
def _ensure_fp(fp, mode):
  if type(fp) == str:
    with open(fp, mode) as fh:
      yield fh
  else:
    yield fp

def parse_gene_weight(gene):
  ''' A helper to parse a gmt gene token that may carry a numeric weight.

  The token is split once on the first `,`, `:` or `;`. If the part after
  the separator parses as a number, it is used as the weight; otherwise the
  separator is considered part of the gene name and the weight defaults to 1.

  :param gene: raw gene token from a gmt line
  :return: `(gene_name, weight)` where weight is an int/float, or 1 when the
    token carries no numeric weight
  '''
  token = gene.strip()
  name, *rest = re.split(r'[,:;]', token, maxsplit=1)
  if not rest:
    # Plain gene name, no separator at all.
    return token, 1
  suffix = rest[0]
  if not suffix.strip():
    # Dangling separator with nothing after it (e.g. "GENE,"): drop it.
    return name.strip(), 1
  weight = _try_load_number(suffix)
  if isinstance(weight, str):
    # The suffix is not a number, so the separator belongs to the gene name.
    # Return the token unchanged rather than gluing the two halves together
    # without the separator, which would corrupt the identifier.
    return token, 1
  return name.strip(), weight
def parse_gene_unweighted(gene):
  ''' A helper to parse gmt gene tokens without looking for weights.

  The whole token (minus surrounding whitespace) is the gene name and the
  weight is always 1.

  :param gene: raw gene token from a gmt line
  :return: `(gene_name, 1)`
  '''
  name = gene.strip()
  return (name, 1)
def gmt_read_iter(fh, parse_gene=parse_gene_weight):
  ''' Lazily read a .gmt file, yielding one `(term, geneset)` pair per line.

  Each line is expected to hold at least three tab-separated fields: two
  term fields followed by the genes. Lines with fewer fields are skipped
  with a warning. The yielded term is the non-empty term fields joined by a
  tab; the geneset is a `{gene: weight}` dict built from `parse_gene`, with
  empty gene names dropped.

  :param fh: a path or an open file-like object to read from
  :param parse_gene: callable turning a raw gene token into `(gene, weight)`
  '''
  with _ensure_fp(fh, 'r') as fh:
    for n, line in enumerate(fh):
      try:
        term1, term2, genes_str = line.strip().split('\t', maxsplit=2)
      except ValueError:
        # Fewer than three fields: nothing usable on this line.
        # logging.warning replaces the deprecated logging.warn alias.
        logging.warning('Ignoring line %s:%s because it seems empty', n, line)
        continue
      # Drop empty term fields so a missing description leaves no stray tab.
      term = '\t'.join(filter(None, map(str.strip, (term1, term2))))
      geneset = {
        k: v
        for k, v in map(parse_gene, genes_str.split('\t'))
        if k
      }
      yield term, geneset
def gmt_read_dict(fh, parse_gene=parse_gene_weight):
  ''' Read .gmt files into a dictionary of the form:
  {
    'term_1\tterm_2': {
      gene_1: weight or 1,
      ...
    },
    ...
  }

  If your genes are encoded in a weird way you can also provide your own
  `parse_gene` function; the default one supports bare gene names or gene
  names followed by a numeric weight after a `,`, `:` or `;`.

  Terms that occur more than once are merged into a single geneset (later
  weights win for repeated genes) and a warning is logged.

  :param fh: a path or an open file-like object to read from
  :param parse_gene: callable turning a raw gene token into `(gene, weight)`
  :return: `{term: {gene: weight}}`
  '''
  gmt = {}
  for n, (term, geneset) in enumerate(gmt_read_iter(fh, parse_gene=parse_gene)):
    if term in gmt:
      # logging.warning replaces the deprecated logging.warn alias.
      logging.warning('Duplicate term: %s:%s, merging', n, term)
    # Pass the mapping positionally: `update(**geneset)` would raise
    # TypeError for any non-string gene key a custom parse_gene may return.
    gmt.setdefault(term, {}).update(geneset)
  return gmt
def gmt_read_pd(fh, parse_gene=parse_gene_weight):
  ''' Read .gmt files directly into a data frame.

  The file is first loaded with `gmt_read_dict` and the resulting
  `{term: {gene: weight}}` mapping is handed to `pd.DataFrame`.

  :param fh: a path or an open file-like object to read from
  :param parse_gene: callable turning a raw gene token into `(gene, weight)`
  :return: a `pd.DataFrame` built from the parsed dictionary
  '''
  term_to_geneset = gmt_read_dict(fh, parse_gene=parse_gene)
  return pd.DataFrame(term_to_geneset)
def _serialize_gene_weight_pair(gene, weight): if weight == 1 or m.isclose(weight, 1.): return gene elif m.isclose(weight, 0.) or m.isnan(weight): return None else: return '{},{}'.format(gene, weight) def _ensure_weight(gs): if isinstance(gs, dict): return gs.items() else: return ((g, 1) for g in gs)
def gmt_write_dict(gmt, fh, serialize_gene_weight_pair=_serialize_gene_weight_pair):
  ''' Opposite of gmt_read_dict, write a dictionary to a file pointer

  serialize_gene_weight_pair can be used to customize serialization when
  dealing with weights.
  - it should return the serialized gene,weight pair or None if it should be removed

  By default, 0/nans are dropped, 1s result in a gene (crisp), and everything
  else uses gene,weight.

  Terms whose geneset serializes to nothing are skipped with a warning.

  :param gmt: `{term: geneset}` where a geneset is either a `{gene: weight}`
    dict or a plain iterable of genes
  :param fh: a path or an open file-like object to write to
  :param serialize_gene_weight_pair: callable `(gene, weight) -> str or None`
  '''
  with _ensure_fp(fh, 'w') as fh:
    for term, geneset in gmt.items():
      # A gmt line carries two term fields; a term without a tab gets an
      # empty second field so the genes still start at the third column.
      serialized_term = term if '\t' in term else term + '\t'
      serialized_geneset = '\t'.join(filter(None, (
        serialize_gene_weight_pair(gene, weight)
        for gene, weight in _ensure_weight(geneset)
      )))
      if not serialized_geneset:
        # logging.warning replaces the deprecated logging.warn alias.
        logging.warning('Ignoring term %s because its geneset seems empty', term)
        continue
      print(serialized_term, serialized_geneset, sep='\t', file=fh)
def gmt_write_pd(df, fh, serialize_gene_weight_pair=_serialize_gene_weight_pair):
  ''' Write a pandas dataframe as a gmt, where rows are genes and columns
  are terms.

  The frame is converted with `df.to_dict()` and written by `gmt_write_dict`;
  see gmt_write_dict for more information.

  :param df: data frame to write
  :param fh: a path or an open file-like object to write to
  :param serialize_gene_weight_pair: callable `(gene, weight) -> str or None`
  '''
  term_to_geneset = df.to_dict()
  gmt_write_dict(
    term_to_geneset,
    fh,
    serialize_gene_weight_pair=serialize_gene_weight_pair,
  )