
Commit

WIP: sample sheets...
jmchilton committed Dec 9, 2024
1 parent 2982f2c commit 6432ea2
Showing 8 changed files with 118 additions and 8 deletions.
11 changes: 10 additions & 1 deletion lib/galaxy/managers/collections.py
@@ -176,6 +176,8 @@ def create(
completed_job=None,
output_name=None,
fields=None,
column_definitions=None,
rows=None,
):
"""
PRECONDITION: security checks on ability to add to parent
@@ -201,6 +203,8 @@ def create(
copy_elements=copy_elements,
history=history,
fields=fields,
column_definitions=column_definitions,
rows=rows,
)

implicit_inputs = []
@@ -288,6 +292,8 @@ def create_dataset_collection(
copy_elements=False,
history=None,
fields=None,
column_definitions=None,
rows=None,
):
# Make sure at least one of these is None.
assert element_identifiers is None or elements is None
@@ -324,9 +330,12 @@

if elements is not self.ELEMENTS_UNINITIALIZED:
type_plugin = collection_type_description.rank_type_plugin()
dataset_collection = builder.build_collection(type_plugin, elements, fields=fields)
dataset_collection = builder.build_collection(
type_plugin, elements, fields=fields, column_definitions=column_definitions, rows=rows
)
else:
# TODO: Pass fields here - need test case first.
# TODO: same with column definitions I think.
dataset_collection = model.DatasetCollection(populated=False)
dataset_collection.collection_type = collection_type
return dataset_collection
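
For reference, the two new arguments threaded through create and create_dataset_collection above are plain JSON-serializable values. A minimal sketch of their expected shapes, inferred from the schema types added later in this commit (the identifiers and column meanings here are illustrative, not part of any API):

column_definitions = [
    {"type": "int"},     # one definition per column; only a "type" key exists so far
    {"type": "string"},
]
rows = {
    # keyed by element identifier; each value is one row of column values
    "sample1": [1, "treated"],
    "sample2": [2, "control"],
}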
2 changes: 2 additions & 0 deletions lib/galaxy/managers/collections_util.py
@@ -40,6 +40,8 @@ def api_payload_to_create_params(payload):
hide_source_items=string_as_bool(payload.get("hide_source_items", False)),
copy_elements=string_as_bool(payload.get("copy_elements", False)),
fields=payload.get("fields", None),
column_definitions=payload.get("column_definitions", None),
rows=payload.get("rows", None),
)
return params

16 changes: 14 additions & 2 deletions lib/galaxy/model/__init__.py
@@ -181,6 +181,8 @@
DatasetValidatedState,
InvocationsStateCounts,
JobState,
SampleSheetColumnDefinitions,
SampleSheetRow,
ToolRequestState,
)
from galaxy.schema.workflow.comments import WorkflowCommentModel
@@ -260,6 +262,7 @@ class ConfigurationTemplateEnvironmentVariable(TypedDict):
CONFIGURATION_TEMPLATE_CONFIGURATION_VARIABLES_TYPE = Dict[str, CONFIGURATION_TEMPLATE_CONFIGURATION_VALUE_TYPE]
CONFIGURATION_TEMPLATE_CONFIGURATION_SECRET_NAMES_TYPE = List[str]
CONFIGURATION_TEMPLATE_DEFINITION_TYPE = Dict[str, Any]
DATA_COLLECTION_FIELDS = List[Dict[str, Any]]


class TransformAction(TypedDict):
@@ -6521,6 +6524,10 @@ class DatasetCollection(Base, Dictifiable, UsesAnnotations, Serializable):
element_count: Mapped[Optional[int]]
create_time: Mapped[datetime] = mapped_column(default=now, nullable=True)
update_time: Mapped[datetime] = mapped_column(default=now, onupdate=now, nullable=True)
# if collection_type is 'record' (heterogeneous collection)
fields: Mapped[Optional[DATA_COLLECTION_FIELDS]] = mapped_column(JSONType)
# if collection_type is 'sample_sheet' (a collection of datasets acting as rows, with extra column metadata)
column_definitions: Mapped[Optional[SampleSheetColumnDefinitions]] = mapped_column(JSONType)

elements: Mapped[List["DatasetCollectionElement"]] = relationship(
primaryjoin=(lambda: DatasetCollection.id == DatasetCollectionElement.dataset_collection_id),
@@ -6540,14 +6547,15 @@ def __init__(
populated=True,
element_count=None,
fields=None,
column_definitions=None,
):
self.id = id
self.collection_type = collection_type
if not populated:
self.populated_state = DatasetCollection.populated_states.NEW
self.element_count = element_count
# TODO: persist fields...
self.fields = fields
self.column_definitions = column_definitions

def _build_nested_collection_attributes_stmt(
self,
@@ -6956,6 +6964,7 @@ def _base_to_dict(self, view):
name=self.name,
collection_id=self.collection_id,
collection_type=self.collection.collection_type,
column_definitions=self.collection.column_definitions,
populated=self.populated,
populated_state=self.collection.populated_state,
populated_state_message=self.collection.populated_state_message,
@@ -7443,6 +7452,7 @@ class DatasetCollectionElement(Base, Dictifiable, Serializable):
# Element index and identifier to define this parent-child relationship.
element_index: Mapped[Optional[int]]
element_identifier: Mapped[Optional[str]] = mapped_column(Unicode(255))
columns: Mapped[Optional[SampleSheetRow]] = mapped_column(JSONType)

hda = relationship(
"HistoryDatasetAssociation",
@@ -7463,7 +7473,7 @@ class DatasetCollectionElement(Base, Dictifiable, Serializable):

# actionable dataset id needs to be available via API...
dict_collection_visible_keys = ["id", "element_type", "element_index", "element_identifier"]
dict_element_visible_keys = ["id", "element_type", "element_index", "element_identifier"]
dict_element_visible_keys = ["id", "element_type", "element_index", "element_identifier", "columns"]

UNINITIALIZED_ELEMENT = object()

@@ -7474,6 +7484,7 @@ def __init__(
element=None,
element_index=None,
element_identifier=None,
columns: Optional[SampleSheetRow] = None,
):
if isinstance(element, HistoryDatasetAssociation):
self.hda = element
@@ -7489,6 +7500,7 @@ def __init__(
self.collection = collection
self.element_index = element_index
self.element_identifier = element_identifier or str(element_index)
self.columns = columns

def __strict_check_before_flush__(self):
if self.collection.populated_optimized:
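
The persistence side of this change amounts to three JSON columns: fields and column_definitions on DatasetCollection, and columns on DatasetCollectionElement. The following is a self-contained SQLAlchemy 2.0 sketch of the same shape (heavily simplified, not Galaxy's actual model, which uses its own JSONType and carries many more columns and relationships):

from typing import Any, List, Optional

from sqlalchemy import JSON, ForeignKey, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship

class Base(DeclarativeBase):
    pass

class DatasetCollection(Base):
    __tablename__ = "dataset_collection"
    id: Mapped[int] = mapped_column(primary_key=True)
    collection_type: Mapped[str]
    # only populated when collection_type is "sample_sheet"
    column_definitions: Mapped[Optional[List[dict]]] = mapped_column(JSON)
    elements: Mapped[List["DatasetCollectionElement"]] = relationship(back_populates="collection")

class DatasetCollectionElement(Base):
    __tablename__ = "dataset_collection_element"
    id: Mapped[int] = mapped_column(primary_key=True)
    dataset_collection_id: Mapped[int] = mapped_column(ForeignKey("dataset_collection.id"))
    element_identifier: Mapped[str]
    # one sample-sheet row of column values; None for other collection types
    columns: Mapped[Optional[List[Any]]] = mapped_column(JSON)
    collection: Mapped["DatasetCollection"] = relationship(back_populates="elements")

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    sheet = DatasetCollection(collection_type="sample_sheet", column_definitions=[{"type": "int"}])
    sheet.elements.append(DatasetCollectionElement(element_identifier="sample1", columns=[42]))
    session.add(sheet)
    session.commit()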
22 changes: 17 additions & 5 deletions lib/galaxy/model/dataset_collections/builder.py
@@ -4,27 +4,39 @@
from .type_description import COLLECTION_TYPE_DESCRIPTION_FACTORY


def build_collection(type, dataset_instances, collection=None, associated_identifiers=None, fields=None):
def build_collection(
type,
dataset_instances,
collection=None,
associated_identifiers=None,
fields=None,
column_definitions=None,
rows=None,
):
"""
Build DatasetCollection with populated DatasetCollectionElement objects
corresponding to the supplied dataset instances or throw exception if
this is not a valid collection of the specified type.
"""
dataset_collection = collection or model.DatasetCollection(fields=fields)
dataset_collection = collection or model.DatasetCollection(fields=fields, column_definitions=column_definitions)
associated_identifiers = associated_identifiers or set()
set_collection_elements(dataset_collection, type, dataset_instances, associated_identifiers, fields=fields)
set_collection_elements(
dataset_collection, type, dataset_instances, associated_identifiers, fields=fields, rows=rows
)
return dataset_collection


def set_collection_elements(dataset_collection, type, dataset_instances, associated_identifiers, fields=None):
def set_collection_elements(
dataset_collection, type, dataset_instances, associated_identifiers, fields=None, rows=None
):
new_element_keys = OrderedSet(dataset_instances.keys()) - associated_identifiers
new_dataset_instances = {k: dataset_instances[k] for k in new_element_keys}
dataset_collection.element_count = dataset_collection.element_count or 0
element_index = dataset_collection.element_count
elements = []
if fields == "auto":
fields = guess_fields(dataset_instances)
for element in type.generate_elements(new_dataset_instances, fields=fields):
for element in type.generate_elements(new_dataset_instances, fields=fields, rows=rows):
element.element_index = element_index
add_object_to_object_session(element, dataset_collection)
element.collection = dataset_collection
2 changes: 2 additions & 0 deletions lib/galaxy/model/dataset_collections/registry.py
@@ -3,12 +3,14 @@
list,
paired,
record,
sample_sheet,
)

PLUGIN_CLASSES = [
list.ListDatasetCollectionType,
paired.PairedDatasetCollectionType,
record.RecordDatasetCollectionType,
sample_sheet.SampleSheetDatasetCollectionType,
]


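
The sample_sheet type module itself is not included in this diff. Judging from how builder.py calls type.generate_elements(new_dataset_instances, fields=fields, rows=rows), its core job is presumably pairing each element identifier with its row of column values, roughly like the sketch below (a guess at the logic, not the actual plugin; the real implementation would be a method yielding model.DatasetCollectionElement instances):

def generate_elements(dataset_instances, rows=None, **kwds):
    # dataset_instances maps element identifier -> dataset instance;
    # rows maps element identifier -> list of column values
    for identifier, dataset_instance in dataset_instances.items():
        yield {
            "element_identifier": identifier,
            "element": dataset_instance,
            # row of column values aligned with the collection's column_definitions
            "columns": rows.get(identifier) if rows else None,
        }

elements = list(generate_elements({"sample1": "<hda>"}, rows={"sample1": [42]}))
assert elements[0]["columns"] == [42]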
[new database migration adding the three columns below]
@@ -28,10 +28,12 @@
def upgrade():
with transaction():
add_column(dataset_collection_table, Column("column_definitions", JSONType(), default=None))
add_column(dataset_collection_table, Column("fields", JSONType(), default=None))
add_column(dataset_collection_element_table, Column("columns", JSONType(), default=None))


def downgrade():
with transaction():
drop_column(dataset_collection_table, "column_definitions")
drop_column(dataset_collection_table, "fields")
drop_column(dataset_collection_element_table, "columns")
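
transaction, add_column, and drop_column above are Galaxy's migration helpers, and JSONType is its JSON column type. For readers more familiar with plain Alembic, the same schema change would look roughly like this (a sketch using SQLAlchemy's generic JSON type in place of Galaxy's JSONType):

import sqlalchemy as sa
from alembic import op

def upgrade():
    op.add_column("dataset_collection", sa.Column("column_definitions", sa.JSON(), nullable=True))
    op.add_column("dataset_collection", sa.Column("fields", sa.JSON(), nullable=True))
    op.add_column("dataset_collection_element", sa.Column("columns", sa.JSON(), nullable=True))

def downgrade():
    op.drop_column("dataset_collection", "column_definitions")
    op.drop_column("dataset_collection", "fields")
    op.drop_column("dataset_collection_element", "columns")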
34 changes: 34 additions & 0 deletions lib/galaxy/schema/schema.py
@@ -33,6 +33,7 @@
from typing_extensions import (
Annotated,
Literal,
TypedDict,
)

from galaxy.schema import partial_model
@@ -347,6 +348,20 @@ class LimitedUserModel(Model):
MaybeLimitedUserModel = Union[UserModel, LimitedUserModel]


class SampleSheetColumnDefinition(TypedDict):
# named for compatibility with CWL - trying to keep CWL fields in mind with
# this implementation. https://www.commonwl.org/user_guide/topics/inputs.html#inputs
# wrapping this up in a dict because I think we will want constraints and such
# in the future
type: Literal["string", "int", "float", "boolean"] # excluding "long" and "double" and composite types from CWL


SampleSheetColumnValueT = Union[str, int, float, bool]
SampleSheetColumnDefinitions = List[SampleSheetColumnDefinition]
SampleSheetRow = List[SampleSheetColumnValueT]
SampleSheetRows = Dict[str, SampleSheetRow]


class DiskUsageUserModel(Model):
total_disk_usage: float = TotalDiskUsageField
nice_total_disk_usage: str = NiceTotalDiskUsageField
@@ -997,6 +1012,11 @@ class DCESummary(Model, WithModelClass):
title="Object",
description="The element's specific data depending on the value of `element_type`.",
)
columns: Optional[SampleSheetRow] = Field(
None,
title="Columns",
description="A row (or list of columns) of data associated with this element",
)


DCObject.model_rebuild()
@@ -1141,6 +1161,10 @@ class HDCADetailed(HDCASummary):
None,
description="Encoded ID for the ICJ object describing the collection of jobs corresponding to this collection",
)
column_definitions: Optional[SampleSheetColumnDefinitions] = Field(
None,
description="Column data associated with each element of this collection.",
)


class HistoryContentItemBase(Model):
@@ -1654,6 +1678,16 @@ class CreateNewCollectionPayload(Model):
title="Element Identifiers",
description="List of elements that should be in the new collection.",
)
column_definitions: Optional[SampleSheetColumnDefinitions] = Field(
default=None,
title="Column Definitions",
description="Specify definitions for row data if collection_type if sample_sheet",
)
rows: Optional[SampleSheetRows] = Field(
default=None,
title="Row data",
description="Specify rows of metadata data corresponding to an indentifier if collection_type is sample_sheet",
)
name: Optional[str] = Field(
default=None,
title="Name",
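
Nothing in this commit validates row values against the column definitions yet (see the TODOs in the test file below), but the type literals map directly onto Python types. A minimal sketch of what such a check could look like (assumed semantics, not Galaxy code):

from typing import Any, Dict, List

PYTHON_TYPES = {"string": str, "int": int, "float": float, "boolean": bool}

def validate_row(row: List[Any], column_definitions: List[Dict[str, str]]) -> None:
    if len(row) != len(column_definitions):
        raise ValueError(f"expected {len(column_definitions)} columns, got {len(row)}")
    for index, (value, definition) in enumerate(zip(row, column_definitions)):
        expected = PYTHON_TYPES[definition["type"]]
        # bool is a subclass of int in Python, so reject it explicitly for int columns
        if (expected is int and isinstance(value, bool)) or not isinstance(value, expected):
            raise ValueError(f"column {index}: {value!r} is not a {definition['type']}")

validate_row([42], [{"type": "int"}])      # passes
# validate_row(["42"], [{"type": "int"}])  # would raise ValueError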
37 changes: 37 additions & 0 deletions lib/galaxy_test/api/test_dataset_collections.py
@@ -201,6 +201,43 @@ def test_record_field_validation(self, history_id):
create_response = self._post("dataset_collections", payload)
self._assert_status_code_is(create_response, 400)

def test_sample_sheet_requires_columns(self, history_id):
contents = [
("sample1", "1\t2\t3"),
("sample2", "4\t5\t6"),
]
sample_sheet_identifiers = self.dataset_collection_populator.list_identifiers(history_id, contents)
payload = dict(
name="my cool sample sheet",
instance_type="history",
history_id=history_id,
element_identifiers=sample_sheet_identifiers,
collection_type="sample_sheet",
column_definitions=[{"type": "int"}],
rows={"sample1": [42], "sample2": [45]},
)
create_response = self._post("dataset_collections", payload, json=True)
dataset_collection = self._check_create_response(create_response)

self._assert_has_keys(dataset_collection, "collection_type", "column_definitions")
assert dataset_collection["collection_type"] == "sample_sheet"
assert dataset_collection["name"] == "my cool sample sheet"
returned_collections = dataset_collection["elements"]
assert len(returned_collections) == 2, dataset_collection
sheet_row_0_element = returned_collections[0]
self._assert_has_keys(sheet_row_0_element, "element_index", "columns")
record_pos_0_object = sheet_row_0_element["object"]
self._assert_has_keys(record_pos_0_object, "name", "history_content_type")
row_0 = sheet_row_0_element["columns"]
assert row_0[0] == 42

sheet_row_1_element = returned_collections[1]
self._assert_has_keys(sheet_row_1_element, "element_index", "columns")
row_1 = sheet_row_1_element["columns"]
assert row_1[0] == 45
# TODO: test case where column definition does not match supplied data
# TODO: test case without column definition, implement definition inference based on supplied datatypes

def test_list_download(self):
with self.dataset_populator.test_history(require_new=False) as history_id:
fetch_response = self.dataset_collection_populator.create_list_in_history(
