Source code for maayanlab_bioinformatics.utils.sparse
import numpy as np
import scipy.sparse as sp_sparse
[docs]
def sp_hdf_dump(hdf, sdf, **kwargs):
    ''' Dump Sparse Pandas DataFrame to h5py object.

    The frame is converted to COO form and written as the datasets `data`,
    `row` and `col`; its labels are written as the datasets `index` and
    `columns`, and its dimensions as the `shape` attribute.

    Text labels are stored as UTF-8 encoded bytes, which is the form
    `sp_hdf_load` decodes when reading them back. Any other kind of label is
    written unchanged.

    Usage:
    ```python
    import h5py
    import pandas as pd
    import scipy.sparse as sp_sparse
    # write
    f = h5py.File('sparse.h5', 'w')
    sdf = pd.DataFrame.sparse.from_spmatrix(sp_sparse.eye(3))
    sp_hdf_dump(f, sdf)
    f.close()
    ```

    :param hdf: writable object exposing `create_dataset` and `attrs`
      (an h5py File or Group)
    :param sdf: pandas DataFrame readable through its `.sparse` accessor
    :param kwargs: forwarded unchanged to every `create_dataset` call
    :return: the `hdf` object that was passed in
    '''
    def _storable(labels):
        # numpy str ('U') and object arrays have no direct HDF5 equivalent,
        # so text labels are encoded to a fixed-width bytes array first.
        values = np.asarray(labels)
        is_text = (
            values.size > 0
            and values.dtype.kind in ('O', 'U')
            and all(isinstance(v, str) for v in values.tolist())
        )
        if is_text:
            return np.array([v.encode('utf8') for v in values.tolist()])
        return labels.values

    coo = sdf.sparse.to_coo()
    datasets = {
        'data': coo.data,
        'row': coo.row,
        'col': coo.col,
        'index': _storable(sdf.index),
        'columns': _storable(sdf.columns),
    }
    for name, values in datasets.items():
        hdf.create_dataset(name, data=values, **kwargs)
    hdf.attrs['shape'] = coo.shape
    return hdf
[docs]
def sp_hdf_load(hdf):
    ''' Load Sparse Pandas DataFrame from h5py object.

    Reads the COO triplet from the datasets `data`, `row` and `col`, the
    dimensions from the `shape` attribute, and the labels from the datasets
    `index` and `columns`.

    Labels stored as bytes are decoded as UTF-8. Labels of any other type
    (for instance the default integer labels of a frame built with
    `from_spmatrix`) are returned as stored instead of raising.

    Usage:
    ```python
    import h5py
    import pandas as pd
    import scipy.sparse as sp_sparse
    f = h5py.File('sparse.h5', 'r')
    sdf = sp_hdf_load(f)
    f.close()
    ```

    :param hdf: readable object exposing the datasets and `attrs` described
      above (an h5py File or Group)
    :return: sparse pandas DataFrame
    '''
    import pandas as pd

    def _labels(dataset):
        values = np.asarray(dataset)
        # Only bytes ('S') or object arrays can hold encoded text; numeric
        # labels have nothing to decode.
        if values.dtype.kind not in ('S', 'O'):
            return pd.Index(values)
        return pd.Index([
            v.decode('utf8') if isinstance(v, bytes) else v
            for v in values.tolist()
        ])

    matrix = sp_sparse.coo_array(
        (hdf['data'], (hdf['row'], hdf['col'])),
        shape=tuple(hdf.attrs['shape']),
    )
    return pd.DataFrame.sparse.from_spmatrix(
        matrix,
        index=_labels(hdf['index']),
        columns=_labels(hdf['columns']),
    )
[docs]
def sp_std(X_ij, ddof=1):
    r''' Standard deviation for a matrix compatible with sparse matrices.

    i is the row index, j is the column index, N is the number of rows.

    \sigma_j = \sqrt{\frac{\sum_i (x_{ij} - \mu_j)^2}{N - ddof}}

    For sparse input the matrix is never densified: the squared deviations
    of the stored entries are summed per column, and each of the remaining
    (implicitly zero) entries of a column contributes \mu_j^2.

    :param X_ij: 2-D scipy sparse matrix/array, or a dense array-like
      supporting `.sum(axis=0)` and broadcasting arithmetic
    :param ddof: delta degrees of freedom subtracted from N in the divisor
    :return: per-column standard deviation; a 1-D numpy array for sparse input
    '''
    # The reduction runs over rows (axis=0), so the sample size is the row count.
    n_rows = X_ij.shape[0]
    denom = n_rows - ddof
    if sp_sparse.issparse(X_ij):
        # Work on a private COO copy so merging duplicates never touches the input.
        coo = X_ij.tocoo(copy=True)
        coo.sum_duplicates()
        n_cols = coo.shape[1]
        mu_j = np.bincount(coo.col, weights=coo.data, minlength=n_cols) / n_rows
        stored_j = np.bincount(coo.col, minlength=n_cols)
        num_j = np.bincount(
            coo.col,
            weights=(coo.data - mu_j[coo.col]) ** 2,
            minlength=n_cols,
        )
        # Entries that are not stored are zeros: (0 - mu_j)^2 each.
        num_j = num_j + (n_rows - stored_j) * mu_j ** 2
        return (num_j / denom) ** (1 / 2)
    mu_j = X_ij.sum(axis=0) / n_rows
    num_j = ((X_ij - mu_j) ** 2).sum(axis=0)
    return (num_j / denom) ** (1 / 2)
[docs]
def sp_nanpercentile(sp, q, axis=None, method='linear'):
    ''' nanpercentile for a sparse matrix, basically we use np.percentile on the underlying data.

    Only stored entries take part; entries that are not stored are treated
    as missing. Stored NaN values are dropped as well. A row/column (or a
    whole matrix) left with no values yields NaN, shaped like `q`.

    :param sp: anything accepted by `scipy.sparse.coo_array`
    :param q: percentile or sequence of percentiles, passed to `np.percentile`
    :param axis: None for all stored values, 0 for one result per column,
      1 for one result per row
    :param method: interpolation method passed to `np.percentile`
    :return: scalar/array for `axis=None`, otherwise an array with one entry
      per column (axis=0) or per row (axis=1)
    :raises NotImplementedError: for any other `axis`
    '''
    coo = sp_sparse.coo_array(sp)

    def _percentile(values):
        values = values[~np.isnan(values)]
        if values.size == 0:
            # Nothing to rank: report NaN with the same shape np.percentile would use.
            return np.full(np.shape(q), np.nan)[()]
        return np.percentile(values, q, method=method)

    if axis is None:
        return _percentile(coo.data)
    if axis == 0:
        keys, n_groups = coo.col, coo.shape[1]
    elif axis == 1:
        keys, n_groups = coo.row, coo.shape[0]
    else:
        raise NotImplementedError

    # One stable sort groups the stored values by row/column, instead of
    # rescanning every coordinate once per row/column.
    order = np.argsort(keys, kind='stable')
    grouped = coo.data[order]
    bounds = np.searchsorted(keys[order], np.arange(n_groups + 1))
    return np.array([
        _percentile(grouped[lo:hi])
        for lo, hi in zip(bounds[:-1], bounds[1:])
    ])