From 1fbb9619d3bd3d88459bc91102795248c6147af0 Mon Sep 17 00:00:00 2001
From: Kai Waldrant <kai@data-intuitive.com>
Date: Fri, 28 Jun 2024 12:50:14 +0200
Subject: [PATCH] add process dataset api file

---
 src/api/comp_process_dataset.yaml | 32 ++++++++++++++++++++++++
 src/api/file_common_dataset.yaml  | 41 +++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 src/api/comp_process_dataset.yaml
 create mode 100644 src/api/file_common_dataset.yaml

diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml
new file mode 100644
index 0000000..90bf545
--- /dev/null
+++ b/src/api/comp_process_dataset.yaml
@@ -0,0 +1,32 @@
+# API for "process_dataset" components in the label_projection namespace.
+functionality:
+  namespace: "label_projection"
+  info:
+    type: process_dataset
+    type_info:
+      label: Data processor
+      summary: A label projection dataset processor.
+      description: |
+        A component for processing a Common Dataset into a task-specific dataset.
+  arguments:  # each argument pulls its file specification in through __merge__
+    - name: "--input"
+      __merge__: file_common_dataset.yaml  # src/api/file_common_dataset.yaml
+      direction: input
+      required: true
+    - name: "--output_train"
+      __merge__: file_train.yaml  # NOTE(review): not added by this patch - confirm it exists
+      direction: output
+      required: true
+    - name: "--output_test"
+      __merge__: file_test.yaml  # NOTE(review): not added by this patch - confirm it exists
+      direction: output
+      required: true
+    - name: "--output_solution"
+      __merge__: file_solution.yaml  # NOTE(review): not added by this patch - confirm it exists
+      direction: output
+      required: true
+  test_resources:
+    - path: /resources_test/common/pancreas  # no explicit type; presumably a plain file/dir resource - verify
+      dest: resources_test/common/pancreas
+    - type: python_script
+      path: /common/component_tests/run_and_check_output.py
diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml
new file mode 100644
index 0000000..0a5a05f
--- /dev/null
+++ b/src/api/file_common_dataset.yaml
@@ -0,0 +1,41 @@
+# TODO: Update the slots below to the required and/or optional fields of the AnnData file.
+type: file  # merged into the --input argument of comp_process_dataset.yaml
+example: "resources_test/common/pancreas/dataset.h5ad"
+info:
+  label: "Common Dataset"
+  summary: A subset of the common dataset.
+  slots:  # presumably AnnData slots (see TODO above) - verify
+    layers:
+      - name: counts
+        type: integer
+        description: Raw counts
+        required: true
+    uns:
+      - name: dataset_id
+        type: string
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - name: dataset_url
+        type: string
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false