# Source code for maayanlab_bioinformatics.utils.sparse

import numpy as np
import scipy.sparse as sp_sparse


[docs]
def sp_hdf_dump(hdf, sdf, **kwargs):
  ''' Write a sparse pandas DataFrame into an h5py object.

  The frame is converted to COO form through its `.sparse` accessor and stored
  as five datasets: `data`, `row` and `col` (the COO triplets) followed by
  `index` and `columns` (the axis labels). The matrix shape is recorded in the
  `shape` attribute of `hdf`.

  :param hdf: object to write into; it must offer `create_dataset` and `attrs`
  :param sdf: DataFrame exposing the pandas `.sparse` accessor
  :param kwargs: forwarded unchanged to every `create_dataset` call
  :return: the `hdf` object that was passed in

  Usage:
  ```python
  import h5py
  import pandas as pd
  import scipy.sparse as sp_sparse

  # write
  f = h5py.File('sparse.h5', 'w')
  sdf = pd.DataFrame.sparse.from_spmatrix(sp_sparse.eye(3))
  sp_hdf_dump(f, sdf)
  f.close()
  ```
  '''
  coo = sdf.sparse.to_coo()
  # (dataset name, values) pairs, written in this order
  payload = (
    ('data', coo.data),
    ('row', coo.row),
    ('col', coo.col),
    ('index', sdf.index.values),
    ('columns', sdf.columns.values),
  )
  for name, values in payload:
    hdf.create_dataset(name, data=values, **kwargs)
  hdf.attrs['shape'] = coo.shape
  return hdf



[docs]
def sp_hdf_load(hdf):
  ''' Load Sparse Pandas DataFrame from h5py object.

  Reads the `data`, `row`, `col`, `index` and `columns` datasets and the
  `shape` attribute (the layout written by `sp_hdf_dump`) and rebuilds the
  sparse DataFrame from them.

  Axis labels stored as bytes are decoded as UTF-8. Any other label (for
  example the integer labels of a default index) is kept as-is, so frames
  with non-string labels load too.

  :param hdf: object to read from; it must support `hdf[name]` and `hdf.attrs`
  :return: sparse DataFrame with the stored values, index and columns

  Usage:
  ```python
  import h5py
  import pandas as pd
  import scipy.sparse as sp_sparse

  f = h5py.File('sparse.h5', 'r')
  sdf = sp_hdf_load(f)
  f.close()
  ```
  '''
  import pandas as pd

  def _labels(values):
    # Only bytes need decoding; calling `.str.decode` on numeric labels would
    # raise because the `.str` accessor only accepts string-like values.
    return pd.Index([
      label.decode('utf8') if isinstance(label, bytes) else label
      for label in np.asarray(values)
    ])

  return pd.DataFrame.sparse.from_spmatrix(
    sp_sparse.coo_array((hdf['data'], (hdf['row'], hdf['col'])), shape=hdf.attrs['shape']),
    index=_labels(hdf['index']),
    columns=_labels(hdf['columns']),
  )



[docs]
def sp_std(X_ij, ddof=1):
  r''' Column-wise standard deviation that also works on sparse matrices.
  i is the row index, j is the column index, N is the number of rows.

  \sigma_j = \sqrt{\frac{\sum_i (x_{ij} - \mu_j)^2}{N - ddof}}

  :param X_ij: dense array-like or scipy sparse matrix/array of observations
  :param ddof: delta degrees of freedom subtracted from N in the denominator
  :return: for sparse input a 1-D numpy array with one value per column;
    for dense input whatever `X_ij.sum(axis=0)` yields for that type
  '''
  # The reduction runs down the rows (axis=0), so N is the row count.
  n_rows = X_ij.shape[0]
  denom = n_rows - ddof
  if sp_sparse.issparse(X_ij):
    # Work on a private canonical CSC copy: duplicates are merged and the
    # caller's matrix is never modified or densified.
    csc = X_ij.tocsc(copy=True)
    csc.sum_duplicates()
    n_cols = csc.shape[1]
    mu_j = np.asarray(csc.sum(axis=0)).ravel() / n_rows
    stored_j = np.diff(csc.indptr)  # number of stored entries per column
    col_of_entry = np.repeat(np.arange(n_cols), stored_j)
    # Stored entries contribute (x - mu)^2 ...
    sq_dev = (csc.data - mu_j[col_of_entry])**2
    num_j = np.bincount(col_of_entry, weights=sq_dev, minlength=n_cols)
    # ... and every implicit zero contributes (0 - mu)^2.
    num_j = num_j + (n_rows - stored_j) * mu_j**2
    return (num_j / denom)**(1/2)
  mu_j = X_ij.sum(axis=0) / n_rows
  # np.square is element-wise for every input type (`**2` on np.matrix is a
  # matrix power).
  num_j = np.square(X_ij - mu_j).sum(axis=0)
  return (num_j / denom)**(1/2)



[docs]
def sp_nanpercentile(sp, q, axis=None, method='linear'):
  ''' nanpercentile for a sparse matrix: entries that are not stored are
  treated as missing, so percentiles are computed over the stored values only.
  Explicitly stored NaNs are ignored as well.

  :param sp: anything accepted by `scipy.sparse.coo_array`
  :param q: percentile or sequence of percentiles, as for `np.percentile`
  :param axis: None for all stored values, 0 for one result per column,
    1 for one result per row
  :param method: interpolation method forwarded to `np.percentile`
  :return: the percentile(s); with an axis, a numpy array whose first
    dimension runs over columns (axis=0) or rows (axis=1). A row/column
    without usable values yields NaN.
  :raises NotImplementedError: for any other `axis`
  '''
  coo = sp_sparse.coo_array(sp)
  if axis is None:
    return _sp_percentile_or_nan(coo.data, q, method)
  elif axis == 0:
    groups, n_groups = coo.col, coo.shape[1]
  elif axis == 1:
    groups, n_groups = coo.row, coo.shape[0]
  else:
    raise NotImplementedError
  # Sort the stored values by row/column once, then slice each group out,
  # instead of building one boolean mask over all entries per group.
  order = np.argsort(groups, kind='stable')
  values = coo.data[order]
  bounds = np.searchsorted(groups[order], np.arange(n_groups + 1))
  return np.array([
    _sp_percentile_or_nan(values[lo:hi], q, method)
    for lo, hi in zip(bounds[:-1], bounds[1:])
  ])


def _sp_percentile_or_nan(values, q, method):
  ''' Percentile of the non-NaN `values`; NaN shaped like `q` if none remain. '''
  kept = values[~np.isnan(values)]
  if kept.size == 0:
    # Empty rows/columns are normal in sparse data; answer NaN explicitly
    # instead of passing an empty array to np.percentile.
    return np.full(np.shape(q), np.nan)[()]
  return np.percentile(kept, q, method=method)