From 6432ea2b848a854950437e631193db2202fece0b Mon Sep 17 00:00:00 2001 From: John Chilton Date: Mon, 9 Dec 2024 11:34:06 -0500 Subject: [PATCH] WIP: sample sheets... --- lib/galaxy/managers/collections.py | 11 +++++- lib/galaxy/managers/collections_util.py | 2 + lib/galaxy/model/__init__.py | 16 +++++++- .../model/dataset_collections/builder.py | 22 ++++++++--- .../model/dataset_collections/registry.py | 2 + .../ec25b23b08e2_implement_sample_sheets.py | 2 + lib/galaxy/schema/schema.py | 34 +++++++++++++++++ .../api/test_dataset_collections.py | 37 +++++++++++++++++++ 8 files changed, 118 insertions(+), 8 deletions(-) diff --git a/lib/galaxy/managers/collections.py b/lib/galaxy/managers/collections.py index e9042e3aac55..f2c80ddea3ef 100644 --- a/lib/galaxy/managers/collections.py +++ b/lib/galaxy/managers/collections.py @@ -176,6 +176,8 @@ def create( completed_job=None, output_name=None, fields=None, + column_definitions=None, + rows=None, ): """ PRECONDITION: security checks on ability to add to parent @@ -201,6 +203,8 @@ def create( copy_elements=copy_elements, history=history, fields=fields, + column_definitions=column_definitions, + rows=rows, ) implicit_inputs = [] @@ -288,6 +292,8 @@ def create_dataset_collection( copy_elements=False, history=None, fields=None, + column_definitions=None, + rows=None, ): # Make sure at least one of these is None. assert element_identifiers is None or elements is None @@ -324,9 +330,12 @@ def create_dataset_collection( if elements is not self.ELEMENTS_UNINITIALIZED: type_plugin = collection_type_description.rank_type_plugin() - dataset_collection = builder.build_collection(type_plugin, elements, fields=fields) + dataset_collection = builder.build_collection( + type_plugin, elements, fields=fields, column_definitions=column_definitions, rows=rows + ) else: # TODO: Pass fields here - need test case first. + # TODO: same with column definitions I think. 
dataset_collection = model.DatasetCollection(populated=False) dataset_collection.collection_type = collection_type return dataset_collection diff --git a/lib/galaxy/managers/collections_util.py b/lib/galaxy/managers/collections_util.py index 7f129992c754..25c676de66b7 100644 --- a/lib/galaxy/managers/collections_util.py +++ b/lib/galaxy/managers/collections_util.py @@ -40,6 +40,8 @@ def api_payload_to_create_params(payload): hide_source_items=string_as_bool(payload.get("hide_source_items", False)), copy_elements=string_as_bool(payload.get("copy_elements", False)), fields=payload.get("fields", None), + column_definitions=payload.get("column_definitions", None), + rows=payload.get("rows", None), ) return params diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 7123cace4990..e3c1657e8426 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -181,6 +181,8 @@ DatasetValidatedState, InvocationsStateCounts, JobState, + SampleSheetColumnDefinitions, + SampleSheetRow, ToolRequestState, ) from galaxy.schema.workflow.comments import WorkflowCommentModel @@ -260,6 +262,7 @@ class ConfigurationTemplateEnvironmentVariable(TypedDict): CONFIGURATION_TEMPLATE_CONFIGURATION_VARIABLES_TYPE = Dict[str, CONFIGURATION_TEMPLATE_CONFIGURATION_VALUE_TYPE] CONFIGURATION_TEMPLATE_CONFIGURATION_SECRET_NAMES_TYPE = List[str] CONFIGURATION_TEMPLATE_DEFINITION_TYPE = Dict[str, Any] +DATA_COLLECTION_FIELDS = List[Dict[str, Any]] class TransformAction(TypedDict): @@ -6521,6 +6524,10 @@ class DatasetCollection(Base, Dictifiable, UsesAnnotations, Serializable): element_count: Mapped[Optional[int]] create_time: Mapped[datetime] = mapped_column(default=now, nullable=True) update_time: Mapped[datetime] = mapped_column(default=now, onupdate=now, nullable=True) + # if collection_type is 'record' (heterogenous collection) + fields: Mapped[Optional[DATA_COLLECTION_FIELDS]] = mapped_column(JSONType) + # if collection_type is 'sample_sheet' (collection 
of rows that pair datasets with extra column metadata) + column_definitions: Mapped[Optional[SampleSheetColumnDefinitions]] = mapped_column(JSONType) elements: Mapped[List["DatasetCollectionElement"]] = relationship( primaryjoin=(lambda: DatasetCollection.id == DatasetCollectionElement.dataset_collection_id), @@ -6540,14 +6547,15 @@ def __init__( populated=True, element_count=None, fields=None, + column_definitions=None, ): self.id = id self.collection_type = collection_type if not populated: self.populated_state = DatasetCollection.populated_states.NEW self.element_count = element_count - # TODO: persist fields... self.fields = fields + self.column_definitions = column_definitions def _build_nested_collection_attributes_stmt( self, @@ -6956,6 +6964,7 @@ def _base_to_dict(self, view): name=self.name, collection_id=self.collection_id, collection_type=self.collection.collection_type, + column_definitions=self.collection.column_definitions, populated=self.populated, populated_state=self.collection.populated_state, populated_state_message=self.collection.populated_state_message, @@ -7443,6 +7452,7 @@ class DatasetCollectionElement(Base, Dictifiable, Serializable): # Element index and identifier to define this parent-child relationship. element_index: Mapped[Optional[int]] element_identifier: Mapped[Optional[str]] = mapped_column(Unicode(255)) + columns: Mapped[Optional[SampleSheetRow]] = mapped_column(JSONType) hda = relationship( "HistoryDatasetAssociation", @@ -7463,7 +7473,7 @@ class DatasetCollectionElement(Base, Dictifiable, Serializable): # actionable dataset id needs to be available via API... 
dict_collection_visible_keys = ["id", "element_type", "element_index", "element_identifier"] - dict_element_visible_keys = ["id", "element_type", "element_index", "element_identifier"] + dict_element_visible_keys = ["id", "element_type", "element_index", "element_identifier", "columns"] UNINITIALIZED_ELEMENT = object() @@ -7474,6 +7484,7 @@ def __init__( element=None, element_index=None, element_identifier=None, + columns: Optional[SampleSheetRow] = None, ): if isinstance(element, HistoryDatasetAssociation): self.hda = element @@ -7489,6 +7500,7 @@ def __init__( self.collection = collection self.element_index = element_index self.element_identifier = element_identifier or str(element_index) + self.columns = columns def __strict_check_before_flush__(self): if self.collection.populated_optimized: diff --git a/lib/galaxy/model/dataset_collections/builder.py b/lib/galaxy/model/dataset_collections/builder.py index 73af774904fe..fb827e603593 100644 --- a/lib/galaxy/model/dataset_collections/builder.py +++ b/lib/galaxy/model/dataset_collections/builder.py @@ -4,19 +4,31 @@ from .type_description import COLLECTION_TYPE_DESCRIPTION_FACTORY -def build_collection(type, dataset_instances, collection=None, associated_identifiers=None, fields=None): +def build_collection( + type, + dataset_instances, + collection=None, + associated_identifiers=None, + fields=None, + column_definitions=None, + rows=None, +): """ Build DatasetCollection with populated DatasetcollectionElement objects corresponding to the supplied dataset instances or throw exception if this is not a valid collection of the specified type. 
""" - dataset_collection = collection or model.DatasetCollection(fields=fields) + dataset_collection = collection or model.DatasetCollection(fields=fields, column_definitions=column_definitions) associated_identifiers = associated_identifiers or set() - set_collection_elements(dataset_collection, type, dataset_instances, associated_identifiers, fields=fields) + set_collection_elements( + dataset_collection, type, dataset_instances, associated_identifiers, fields=fields, rows=rows + ) return dataset_collection -def set_collection_elements(dataset_collection, type, dataset_instances, associated_identifiers, fields=None): +def set_collection_elements( + dataset_collection, type, dataset_instances, associated_identifiers, fields=None, rows=None +): new_element_keys = OrderedSet(dataset_instances.keys()) - associated_identifiers new_dataset_instances = {k: dataset_instances[k] for k in new_element_keys} dataset_collection.element_count = dataset_collection.element_count or 0 @@ -24,7 +36,7 @@ def set_collection_elements(dataset_collection, type, dataset_instances, associa elements = [] if fields == "auto": fields = guess_fields(dataset_instances) - for element in type.generate_elements(new_dataset_instances, fields=fields): + for element in type.generate_elements(new_dataset_instances, fields=fields, rows=rows): element.element_index = element_index add_object_to_object_session(element, dataset_collection) element.collection = dataset_collection diff --git a/lib/galaxy/model/dataset_collections/registry.py b/lib/galaxy/model/dataset_collections/registry.py index bd148edafd2d..3ba42faed8d8 100644 --- a/lib/galaxy/model/dataset_collections/registry.py +++ b/lib/galaxy/model/dataset_collections/registry.py @@ -3,12 +3,14 @@ list, paired, record, + sample_sheet, ) PLUGIN_CLASSES = [ list.ListDatasetCollectionType, paired.PairedDatasetCollectionType, record.RecordDatasetCollectionType, + sample_sheet.SampleSheetDatasetCollectionType, ] diff --git 
a/lib/galaxy/model/migrations/alembic/versions_gxy/ec25b23b08e2_implement_sample_sheets.py b/lib/galaxy/model/migrations/alembic/versions_gxy/ec25b23b08e2_implement_sample_sheets.py index 558a485020cc..435bdce0ab6e 100644 --- a/lib/galaxy/model/migrations/alembic/versions_gxy/ec25b23b08e2_implement_sample_sheets.py +++ b/lib/galaxy/model/migrations/alembic/versions_gxy/ec25b23b08e2_implement_sample_sheets.py @@ -28,10 +28,12 @@ def upgrade(): with transaction(): add_column(dataset_collection_table, Column("column_definitions", JSONType(), default=None)) + add_column(dataset_collection_table, Column("fields", JSONType(), default=None)) add_column(dataset_collection_element_table, Column("columns", JSONType(), default=None)) def downgrade(): with transaction(): drop_column(dataset_collection_table, "column_definitions") + drop_column(dataset_collection_table, "fields") drop_column(dataset_collection_element_table, "columns") diff --git a/lib/galaxy/schema/schema.py b/lib/galaxy/schema/schema.py index fe8d280f7515..847289d327b7 100644 --- a/lib/galaxy/schema/schema.py +++ b/lib/galaxy/schema/schema.py @@ -33,6 +33,7 @@ from typing_extensions import ( Annotated, Literal, + TypedDict, ) from galaxy.schema import partial_model @@ -347,6 +348,20 @@ class LimitedUserModel(Model): MaybeLimitedUserModel = Union[UserModel, LimitedUserModel] +class SampleSheetColumnDefinition(TypedDict): + # named in compatibility with CWL - trying to keep CWL fields in mind with + # this implementation. 
https://www.commonwl.org/user_guide/topics/inputs.html#inputs + # wrapping this up in a dict because I think we will want constraints and such + # in the future + type: Literal["string", "int", "float", "boolean"] # excluding "long" and "double" and composite types from CWL + + +SampleSheetColumnValueT = Union[str, int, float, bool] +SampleSheetColumnDefinitions = List[SampleSheetColumnDefinition] +SampleSheetRow = List[SampleSheetColumnValueT] +SampleSheetRows = Dict[str, SampleSheetRow] + + class DiskUsageUserModel(Model): total_disk_usage: float = TotalDiskUsageField nice_total_disk_usage: str = NiceTotalDiskUsageField @@ -997,6 +1012,11 @@ class DCESummary(Model, WithModelClass): title="Object", description="The element's specific data depending on the value of `element_type`.", ) + columns: Optional[SampleSheetRow] = Field( + None, + title="Columns", + description="A row (or list of columns) of data associated with this element", + ) DCObject.model_rebuild() @@ -1141,6 +1161,10 @@ class HDCADetailed(HDCASummary): None, description="Encoded ID for the ICJ object describing the collection of jobs corresponding to this collection", ) + column_definitions: Optional[SampleSheetColumnDefinitions] = Field( + None, + description="Column data associated with each element of this collection.", + ) class HistoryContentItemBase(Model): @@ -1654,6 +1678,16 @@ class CreateNewCollectionPayload(Model): title="Element Identifiers", description="List of elements that should be in the new collection.", ) + column_definitions: Optional[SampleSheetColumnDefinitions] = Field( + default=None, + title="Column Definitions", + description="Specify definitions for row data if collection_type if sample_sheet", + ) + rows: Optional[SampleSheetRows] = Field( + default=None, + title="Row data", + description="Specify rows of metadata data corresponding to an indentifier if collection_type is sample_sheet", + ) name: Optional[str] = Field( default=None, title="Name", diff --git 
a/lib/galaxy_test/api/test_dataset_collections.py b/lib/galaxy_test/api/test_dataset_collections.py index a4ba01877e5e..dfb4536109df 100644 --- a/lib/galaxy_test/api/test_dataset_collections.py +++ b/lib/galaxy_test/api/test_dataset_collections.py @@ -201,6 +201,43 @@ def test_record_field_validation(self, history_id): create_response = self._post("dataset_collections", payload) self._assert_status_code_is(create_response, 400) + def test_sample_sheet_requires_columns(self, history_id): + contents = [ + ("sample1", "1\t2\t3"), + ("sample2", "4\t5\t6"), + ] + sample_sheet_identifiers = self.dataset_collection_populator.list_identifiers(history_id, contents) + payload = dict( + name="my cool sample sheet", + instance_type="history", + history_id=history_id, + element_identifiers=sample_sheet_identifiers, + collection_type="sample_sheet", + column_definitions=[{"type": "int"}], + rows={"sample1": [42], "sample2": [45]}, + ) + create_response = self._post("dataset_collections", payload, json=True) + dataset_collection = self._check_create_response(create_response) + + self._assert_has_keys(dataset_collection, "collection_type", "column_definitions") + assert dataset_collection["collection_type"] == "sample_sheet" + assert dataset_collection["name"] == "my cool sample sheet" + returned_collections = dataset_collection["elements"] + assert len(returned_collections) == 2, dataset_collection + sheet_row_0_element = returned_collections[0] + self._assert_has_keys(sheet_row_0_element, "element_index", "columns") + record_pos_0_object = sheet_row_0_element["object"] + self._assert_has_keys(record_pos_0_object, "name", "history_content_type") + row_0 = sheet_row_0_element["columns"] + assert row_0[0] == 42 + + sheet_row_1_element = returned_collections[1] + self._assert_has_keys(sheet_row_1_element, "element_index", "columns") + row_1 = sheet_row_1_element["columns"] + assert row_1[0] == 45 + # TODO: test case where column definition does not match supplied data + # TODO: test 
case without column definition, implement definition inference based on supplied datatypes + def test_list_download(self): with self.dataset_populator.test_history(require_new=False) as history_id: fetch_response = self.dataset_collection_populator.create_list_in_history(