From e081d1716d1f2a11ec924112d6b0447904595ab3 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 11 Jul 2024 17:20:25 +0200
Subject: [PATCH] add dataset_processor script

---
 src/api/comp_data_processor.yaml              |  4 +-
 .../process_dataset/config.vsh.yaml           |  2 +-
 src/data_processors/process_dataset/script.py | 78 +++++++++++++++++++
 3 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100644 src/data_processors/process_dataset/script.py

diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml
index 019c9ba..184bc54 100644
--- a/src/api/comp_data_processor.yaml
+++ b/src/api/comp_data_processor.yaml
@@ -12,11 +12,11 @@ arguments:
     direction: input
     required: true
   - name: "--output_train"
-    __merge__: file_train.yaml
+    __merge__: file_train_h5ad.yaml
     direction: output
     required: true
   - name: "--output_test"
-    __merge__: file_test.yaml
+    __merge__: file_test_h5ad.yaml
     direction: output
     required: true
   - name: "--output_solution"
diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml
index 1a0b36b..6f35e96 100644
--- a/src/data_processors/process_dataset/config.vsh.yaml
+++ b/src/data_processors/process_dataset/config.vsh.yaml
@@ -21,7 +21,7 @@ arguments:
 resources:
   - type: python_script
     path: script.py
-  - path: common/helper_functions/subset_anndata.py
+  - path: /common/helper_functions/subset_anndata.py
 
 engines:
   - type: docker
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
new file mode 100644
index 0000000..4348dc8
--- /dev/null
+++ b/src/data_processors/process_dataset/script.py
@@ -0,0 +1,78 @@
+import sys
+import random
+import numpy as np
+import anndata as ad
+
+## VIASH START
+par = {
+    'input': 'resources_test/common/pancreas/dataset.h5ad',
+    'method': 'batch',
+    'seed': None,
+    'obs_batch': 'batch',
+    'obs_label': 'cell_type',
+    'output_train': 'train.h5ad',
+    'output_test': 'test.h5ad',
+    'output_solution': 'solution.h5ad'
+}
+meta = {
+    'resources_dir': 'data_processors/process_dataset',
+    'config': 'data_processors/process_dataset/.config.vsh.yaml'
+}
+## VIASH END
+
+# import helper functions
+sys.path.append(meta['resources_dir'])
+from subset_anndata import read_config_slots_info, subset_anndata
+
+# set seed if need be ("is not None" so that seed=0 is honoured too)
+if par["seed"] is not None:
+    print(f">> Setting seed to {par['seed']}", flush=True)
+    random.seed(par["seed"]); np.random.seed(par["seed"])  # seed both RNGs used below
+
+print(">> Load data", flush=True)
+adata = ad.read_h5ad(par["input"])
+print("input:", adata, flush=True)
+
+print(f">> Process data using {par['method']} method", flush=True)
+if par["method"] == "batch":
+    batch_info = adata.obs[par["obs_batch"]]
+    batch_categories = batch_info.dtype.categories  # NOTE(review): assumes a categorical column -- confirm
+    test_batches = random.sample(list(batch_categories), 1)
+    is_test = [ x in test_batches for x in batch_info ]
+elif par["method"] == "random":
+    train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False)
+    is_test = list(~np.isin(np.arange(adata.n_obs), train_ix))  # vectorised; avoids O(n^2) lookup
+
+# subset the different adatas
+print(">> Figuring which data needs to be copied to which output file", flush=True)
+# use par arguments to look for label and batch value in different slots
+slot_mapping = {
+    "obs": {
+        "label": par["obs_label"],
+        "batch": par["obs_batch"],
+    }
+}
+slot_info = read_config_slots_info(meta["config"], slot_mapping)
+
+print(">> Creating train data", flush=True)
+output_train = subset_anndata(
+    adata[[not x for x in is_test]],
+    slot_info["output_train"]
+)
+
+print(">> Creating test data", flush=True)
+output_test = subset_anndata(
+    adata[is_test],
+    slot_info["output_test"]
+)
+
+print(">> Creating solution data", flush=True)
+output_solution = subset_anndata(
+    adata[is_test],
+    slot_info['output_solution']
+)
+
+print(">> Writing data", flush=True)
+output_train.write_h5ad(par["output_train"])
+output_test.write_h5ad(par["output_test"])
+output_solution.write_h5ad(par["output_solution"])