Source code for maayanlab_bioinformatics.parse.gmt

import re
import math as m
import pandas as pd
import contextlib
import logging

def _try_load_number(s):
  try:
    return int(s)
  except ValueError:
    pass
  try:
    return float(s)
  except ValueError:
    pass
  return s

@contextlib.contextmanager
def _ensure_fp(fp, mode):
  if type(fp) == str:
    with open(fp, mode) as fh:
      yield fh
  else:
    yield fp


[docs]
def parse_gene_weight(gene):
  ''' A helper to parse one gmt gene entry that may carry a numeric weight.

  The entry is split at the first `,`, `:` or `;`:
    - no separator: `(entry, 1)`
    - numeric text after the separator: `(name, number)`
    - nothing after the separator: `(name, 1)`
    - non-numeric text after the separator: the separator is treated as part
      of the gene name, so the whole entry is returned unchanged with weight 1.

  Surrounding whitespace is stripped from the returned gene name.
  Returns a `(gene, weight)` tuple.
  '''
  token = gene.strip()
  found = re.search(r'[,:;]', token)
  if found is None:
    return token, 1
  name = token[:found.start()].strip()
  suffix = token[found.end():]
  if not suffix.strip():
    # Dangling separator without a weight, e.g. "TP53,".
    return name, 1
  weight = _try_load_number(suffix)
  if isinstance(weight, str):
    # The suffix is not a weight, so the separator belongs to the identifier;
    #  keep the entry intact instead of gluing the halves together without it.
    return token, 1
  return name, weight



[docs]
def parse_gene_unweighted(gene):
  ''' A helper to parse a gmt gene entry without looking for a weight.

  Returns the whitespace-stripped entry paired with a constant weight of 1.
  '''
  name = gene.strip()
  return name, 1



[docs]
def gmt_read_iter(fh, parse_gene=parse_gene_weight):
  ''' Lazily read a .gmt file, yielding one `(term, geneset)` pair per usable line.

  fh: a path or an already open, line-iterable file object.
  parse_gene: callable turning one tab-separated gene entry into a
    `(gene, weight)` pair; entries whose gene comes back falsy are dropped.

  Each line is expected to hold at least three tab-separated fields: two term
  fields followed by the gene entries. The non-empty term fields are joined
  with a tab to form `term`; `geneset` is a `{gene: weight}` dict.
  Lines with fewer than three fields are skipped with a warning.
  '''
  with _ensure_fp(fh, 'r') as fh:
    for n, line in enumerate(fh):
      try:
        term1, term2, genes_str = line.strip().split('\t', maxsplit=2)
      except ValueError:
        # Unpacking fails when the line has fewer than three fields;
        #  `n` is the zero-based line index from enumerate.
        logging.warning('Ignoring line %s:%s because it seems empty', n, line)
        continue
      term = '\t'.join(filter(None, map(str.strip, (term1, term2))))
      geneset = {
        k: v
        for k, v in map(parse_gene, genes_str.split('\t'))
        if k
      }
      yield term, geneset



[docs]
def gmt_read_dict(fh, parse_gene=parse_gene_weight):
  ''' Read .gmt files into a dictionary of the form:
  {
    'term_1<TAB>term_2': {
      gene_1: weight or 1,
      ...
    },
    ...
  }

  Terms that occur more than once are merged into a single gene set (a
  warning is logged); on conflicting genes the later occurrence wins.

  If your genes are encoded in a weird way you can also provide your own `parse_gene` function,
   the default one supports just gene names or gene names followed by a numeric weight
   separated by `,`, `:` or `;`.
  '''
  gmt = {}
  for n, (term, geneset) in enumerate(gmt_read_iter(fh, parse_gene=parse_gene)):
    if term in gmt:
      # `n` counts the entries yielded by gmt_read_iter, starting at 0.
      logging.warning('Duplicate term: %s:%s, merging', n, term)
    # Pass the mapping positionally: expanding it as keyword arguments would
    #  raise TypeError for non-string genes returned by a custom `parse_gene`.
    gmt.setdefault(term, {}).update(geneset)
  return gmt



[docs]
def gmt_read_pd(fh, parse_gene=parse_gene_weight):
  ''' Read a .gmt file straight into a pandas data frame.

  The file is first loaded with `gmt_read_dict` (same `fh` / `parse_gene`
  arguments) and the resulting dict of dicts is handed to `pd.DataFrame`.
  '''
  term_to_geneset = gmt_read_dict(fh, parse_gene=parse_gene)
  return pd.DataFrame(term_to_geneset)



def _serialize_gene_weight_pair(gene, weight):
  if weight == 1 or m.isclose(weight, 1.): return gene
  elif m.isclose(weight, 0.) or m.isnan(weight): return None
  else: return '{},{}'.format(gene, weight)

def _ensure_weight(gs):
  if isinstance(gs, dict):
    return gs.items()
  else:
    return ((g, 1) for g in gs)


[docs]
def gmt_write_dict(gmt, fh, serialize_gene_weight_pair=_serialize_gene_weight_pair):
  ''' Opposite of gmt_read_dict, write a dictionary to a file pointer

  gmt: mapping of term to gene set; a gene set is either a `{gene: weight}`
    dict or a plain iterable of genes (each then gets weight 1).
  fh: a path (opened for writing) or an already open writable file object.
  serialize_gene_weight_pair can be used to customize serialization when dealing with weights.
    - it should return the serialized gene,weight pair or None if it should be removed
  By default, 0/nans are dropped, 1s result in a gene (crisp), and everything else uses gene,weight.

  One tab-separated line is written per term. Terms whose gene set
  serializes to nothing are skipped with a warning.
  '''
  with _ensure_fp(fh, 'w') as fh:
    for term, geneset in gmt.items():
      # A term without a tab only fills the first term column; the trailing
      #  tab adds an empty second column so the genes start at the third field.
      if '\t' not in term: serialized_term = term + '\t'
      else: serialized_term = term
      # filter(None, ...) discards the genes the serializer asked to drop.
      serialized_geneset = '\t'.join(filter(None, (
        serialize_gene_weight_pair(gene, weight)
        for gene, weight in _ensure_weight(geneset)
      )))
      if not serialized_geneset:
        logging.warning('Ignoring term %s because its geneset seems empty', term)
        continue
      print(serialized_term, serialized_geneset, sep='\t', file=fh)



[docs]
def gmt_write_pd(df, fh, serialize_gene_weight_pair=_serialize_gene_weight_pair):
  ''' Write a pandas dataframe as a gmt, with genes as rows and terms as columns.

  The frame is converted with `df.to_dict()` and passed on to gmt_write_dict,
  which documents `fh` and `serialize_gene_weight_pair`.
  '''
  term_to_geneset = df.to_dict()
  gmt_write_dict(
    term_to_geneset,
    fh,
    serialize_gene_weight_pair=serialize_gene_weight_pair,
  )