Source code for maayanlab_bioinformatics.parse.suerat

import os
import pandas as pd
import scipy.sparse as sp_sparse
from maayanlab_bioinformatics.utils import merge

def suerat_load(base_dir):
    ''' Files prepared for suerat are quite common, this function will load
    them given the directory that contains `barcodes.tsv.gz`,
    `features.tsv.gz`, and `matrix.mtx.gz`.

    :param base_dir: path of the directory holding the three gzipped files
    :return: tuple `(df_features, df_barcodes, df_expression)`;
      `df_expression` is a dense DataFrame whose index is the feature index
      and whose columns are the barcode index
    :raises ValueError: (from pandas) if the matrix dimensions do not match
      the number of features / barcodes
    '''
    # Function-scope import: scipy is already a dependency of this module
    # (scipy.sparse), this only pulls in its Matrix Market reader.
    from scipy.io import mmread

    df_barcodes = pd.read_csv(
        os.path.join(base_dir, 'barcodes.tsv.gz'),
        index_col=0,
        header=None,
        sep='\t',
    )
    df_features = pd.read_csv(
        os.path.join(base_dir, 'features.tsv.gz'),
        header=None,
        names=['symbol', 'type'],
        index_col=0,
        sep='\t',
    )
    # Parse the matrix as Matrix Market instead of as a space separated table
    # with a fixed `skiprows`: the line following the `%` comments is the
    # `rows cols nnz` size line, not a data entry, and reading it as data
    # adds `nnz` to the bottom-right cell. mmread honours the size line (so
    # trailing all-zero rows/columns are kept), tolerates any number of
    # comment lines, converts to 0 based indexing and opens `.gz` paths
    # transparently.
    matrix = mmread(os.path.join(base_dir, 'matrix.mtx.gz'))
    # mmread yields a sparse COO matrix for `coordinate` files and a plain
    # ndarray for `array` files; normalise so both can be densified.
    dense = sp_sparse.coo_matrix(matrix).toarray()
    df_expression = pd.DataFrame(dense)
    df_expression.index = df_features.index
    df_expression.columns = df_barcodes.index
    return df_features, df_barcodes, df_expression
def suerat_load_multiple(base_dirs):
    ''' Sets of suerat directories that are meant to be analyzed together are
    quite common, providing all those directories to this function (each one
    laid out as expected by `suerat_load`) will load each individually and
    return a merged version.

    Every barcode is prefixed with the position of its directory in
    `base_dirs` (`'{ind}:{barcode}'`); the original barcode is kept in a
    `barcode` column and the position is recorded as `'File {ind}'` in a
    `file` column.

    :param base_dirs: iterable of directories, each loadable by `suerat_load`
    :return: tuple `(df_features, df_barcodes, df_expression)` combining all
      directories
    '''
    all_df_features = []
    all_df_barcodes = []
    all_df_expression = []
    #
    for ind, base_dir in enumerate(base_dirs):
        df_features, df_barcodes, df_expression = suerat_load(base_dir)
        # Keep the raw barcode and its origin before the index is rewritten.
        df_barcodes['barcode'] = df_barcodes.index
        df_barcodes['file'] = f'File {ind}'
        # Prefix with the directory position so identical barcodes coming
        # from different directories do not collide.
        df_barcodes.index = df_barcodes.index.map(lambda s, ind=ind: f'{ind}:{s}')
        df_expression.columns = df_barcodes.index
        all_df_features.append(df_features)
        all_df_barcodes.append(df_barcodes)
        all_df_expression.append(df_expression)
    #
    # NOTE(review): `merge` is the project helper from
    # maayanlab_bioinformatics.utils; its join semantics are not visible
    # here. The suffixed duplicates of `symbol`/`type` are discarded when
    # present; errors='ignore' keeps this from raising KeyError when the
    # merge produced no suffixed columns (presumably the single-directory
    # case — confirm against the helper).
    df_features = merge(*all_df_features, how='left', suffixes=('', '_')).drop(
        ['symbol_', 'type_'], axis=1, errors='ignore',
    )
    df_barcodes = pd.concat(all_df_barcodes)
    df_expression = merge(*all_df_expression)
    #
    return df_features, df_barcodes, df_expression