
Commit

WIP: sample sheets...
jmchilton committed Dec 9, 2024
1 parent 2982f2c commit 6432ea2
Showing 8 changed files with 118 additions and 8 deletions.
11 changes: 10 additions & 1 deletion lib/galaxy/managers/collections.py
@@ -176,6 +176,8 @@ def create(
completed_job=None,
output_name=None,
fields=None,
column_definitions=None,
rows=None,
):
"""
PRECONDITION: security checks on ability to add to parent
@@ -201,6 +203,8 @@ def create(
copy_elements=copy_elements,
history=history,
fields=fields,
column_definitions=column_definitions,
rows=rows,
)

implicit_inputs = []
@@ -288,6 +292,8 @@ def create_dataset_collection(
copy_elements=False,
history=None,
fields=None,
column_definitions=None,
rows=None,
):
# Make sure at least one of these is None.
assert element_identifiers is None or elements is None
@@ -324,9 +330,12 @@

if elements is not self.ELEMENTS_UNINITIALIZED:
type_plugin = collection_type_description.rank_type_plugin()
dataset_collection = builder.build_collection(type_plugin, elements, fields=fields)
dataset_collection = builder.build_collection(
type_plugin, elements, fields=fields, column_definitions=column_definitions, rows=rows
)
else:
# TODO: Pass fields here - need test case first.
# TODO: same with column definitions I think.
dataset_collection = model.DatasetCollection(populated=False)
dataset_collection.collection_type = collection_type
return dataset_collection
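
For reference, the two new arguments threaded through create and create_dataset_collection above are plain JSON-serializable values. A minimal sketch of their expected shapes, inferred from the schema types added later in this commit (the identifiers and column meanings here are illustrative, not part of any API):

column_definitions = [
    {"type": "int"},     # one definition per column; only a "type" key exists so far
    {"type": "string"},
]
rows = {
    # keyed by element identifier; each value is one row of column values
    "sample1": [1, "treated"],
    "sample2": [2, "control"],
}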
2 changes: 2 additions & 0 deletions lib/galaxy/managers/collections_util.py
@@ -40,6 +40,8 @@ def api_payload_to_create_params(payload):
hide_source_items=string_as_bool(payload.get("hide_source_items", False)),
copy_elements=string_as_bool(payload.get("copy_elements", False)),
fields=payload.get("fields", None),
column_definitions=payload.get("column_definitions", None),
rows=payload.get("rows", None),
)
return params

16 changes: 14 additions & 2 deletions lib/galaxy/model/__init__.py
@@ -181,6 +181,8 @@
DatasetValidatedState,
InvocationsStateCounts,
JobState,
SampleSheetColumnDefinitions,
SampleSheetRow,
ToolRequestState,
)
from galaxy.schema.workflow.comments import WorkflowCommentModel
@@ -260,6 +262,7 @@ class ConfigurationTemplateEnvironmentVariable(TypedDict):
CONFIGURATION_TEMPLATE_CONFIGURATION_VARIABLES_TYPE = Dict[str, CONFIGURATION_TEMPLATE_CONFIGURATION_VALUE_TYPE]
CONFIGURATION_TEMPLATE_CONFIGURATION_SECRET_NAMES_TYPE = List[str]
CONFIGURATION_TEMPLATE_DEFINITION_TYPE = Dict[str, Any]
DATA_COLLECTION_FIELDS = List[Dict[str, Any]]


class TransformAction(TypedDict):
@@ -6521,6 +6524,10 @@ class DatasetCollection(Base, Dictifiable, UsesAnnotations, Serializable):
element_count: Mapped[Optional[int]]
create_time: Mapped[datetime] = mapped_column(default=now, nullable=True)
update_time: Mapped[datetime] = mapped_column(default=now, onupdate=now, nullable=True)
# if collection_type is 'record' (heterogeneous collection)
fields: Mapped[Optional[DATA_COLLECTION_FIELDS]] = mapped_column(JSONType)
# if collection_type is 'sample_sheet' (a collection of datasets acting as rows, with extra column metadata)
column_definitions: Mapped[Optional[SampleSheetColumnDefinitions]] = mapped_column(JSONType)

elements: Mapped[List["DatasetCollectionElement"]] = relationship(
primaryjoin=(lambda: DatasetCollection.id == DatasetCollectionElement.dataset_collection_id),
@@ -6540,14 +6547,15 @@ def __init__(
populated=True,
element_count=None,
fields=None,
column_definitions=None,
):
self.id = id
self.collection_type = collection_type
if not populated:
self.populated_state = DatasetCollection.populated_states.NEW
self.element_count = element_count
# TODO: persist fields...
self.fields = fields
self.column_definitions = column_definitions

def _build_nested_collection_attributes_stmt(
self,
@@ -6956,6 +6964,7 @@ def _base_to_dict(self, view):
name=self.name,
collection_id=self.collection_id,
collection_type=self.collection.collection_type,
column_definitions=self.collection.column_definitions,
populated=self.populated,
populated_state=self.collection.populated_state,
populated_state_message=self.collection.populated_state_message,
@@ -7443,6 +7452,7 @@ class DatasetCollectionElement(Base, Dictifiable, Serializable):
# Element index and identifier to define this parent-child relationship.
element_index: Mapped[Optional[int]]
element_identifier: Mapped[Optional[str]] = mapped_column(Unicode(255))
columns: Mapped[Optional[SampleSheetRow]] = mapped_column(JSONType)

hda = relationship(
"HistoryDatasetAssociation",
@@ -7463,7 +7473,7 @@ class DatasetCollectionElement(Base, Dictifiable, Serializable):

# actionable dataset id needs to be available via API...
dict_collection_visible_keys = ["id", "element_type", "element_index", "element_identifier"]
dict_element_visible_keys = ["id", "element_type", "element_index", "element_identifier"]
dict_element_visible_keys = ["id", "element_type", "element_index", "element_identifier", "columns"]

UNINITIALIZED_ELEMENT = object()

@@ -7474,6 +7484,7 @@ def __init__(
element=None,
element_index=None,
element_identifier=None,
columns: Optional[SampleSheetRow] = None,
):
if isinstance(element, HistoryDatasetAssociation):
self.hda = element
@@ -7489,6 +7500,7 @@ def __init__(
self.collection = collection
self.element_index = element_index
self.element_identifier = element_identifier or str(element_index)
self.columns = columns

def __strict_check_before_flush__(self):
if self.collection.populated_optimized:
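
The persistence side of this change amounts to three JSON columns: fields and column_definitions on DatasetCollection, and columns on DatasetCollectionElement. The following is a self-contained SQLAlchemy 2.0 sketch of the same shape (heavily simplified, not Galaxy's actual model, which uses its own JSONType and carries many more columns and relationships):

from typing import Any, List, Optional

from sqlalchemy import JSON, ForeignKey, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship

class Base(DeclarativeBase):
    pass

class DatasetCollection(Base):
    __tablename__ = "dataset_collection"
    id: Mapped[int] = mapped_column(primary_key=True)
    collection_type: Mapped[str]
    # only populated when collection_type is "sample_sheet"
    column_definitions: Mapped[Optional[List[dict]]] = mapped_column(JSON)
    elements: Mapped[List["DatasetCollectionElement"]] = relationship(back_populates="collection")

class DatasetCollectionElement(Base):
    __tablename__ = "dataset_collection_element"
    id: Mapped[int] = mapped_column(primary_key=True)
    dataset_collection_id: Mapped[int] = mapped_column(ForeignKey("dataset_collection.id"))
    element_identifier: Mapped[str]
    # one sample-sheet row of column values; None for other collection types
    columns: Mapped[Optional[List[Any]]] = mapped_column(JSON)
    collection: Mapped["DatasetCollection"] = relationship(back_populates="elements")

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    sheet = DatasetCollection(collection_type="sample_sheet", column_definitions=[{"type": "int"}])
    sheet.elements.append(DatasetCollectionElement(element_identifier="sample1", columns=[42]))
    session.add(sheet)
    session.commit()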
22 changes: 17 additions & 5 deletions lib/galaxy/model/dataset_collections/builder.py
@@ -4,27 +4,39 @@
from .type_description import COLLECTION_TYPE_DESCRIPTION_FACTORY


def build_collection(type, dataset_instances, collection=None, associated_identifiers=None, fields=None):
def build_collection(
type,
dataset_instances,
collection=None,
associated_identifiers=None,
fields=None,
column_definitions=None,
rows=None,
):
"""
Build DatasetCollection with populated DatasetCollectionElement objects
corresponding to the supplied dataset instances or throw exception if
this is not a valid collection of the specified type.
"""
dataset_collection = collection or model.DatasetCollection(fields=fields)
dataset_collection = collection or model.DatasetCollection(fields=fields, column_definitions=column_definitions)
associated_identifiers = associated_identifiers or set()
set_collection_elements(dataset_collection, type, dataset_instances, associated_identifiers, fields=fields)
set_collection_elements(
dataset_collection, type, dataset_instances, associated_identifiers, fields=fields, rows=rows
)
return dataset_collection


def set_collection_elements(dataset_collection, type, dataset_instances, associated_identifiers, fields=None):
def set_collection_elements(
dataset_collection, type, dataset_instances, associated_identifiers, fields=None, rows=None
):
new_element_keys = OrderedSet(dataset_instances.keys()) - associated_identifiers
new_dataset_instances = {k: dataset_instances[k] for k in new_element_keys}
dataset_collection.element_count = dataset_collection.element_count or 0
element_index = dataset_collection.element_count
elements = []
if fields == "auto":
fields = guess_fields(dataset_instances)
for element in type.generate_elements(new_dataset_instances, fields=fields):
for element in type.generate_elements(new_dataset_instances, fields=fields, rows=rows):
element.element_index = element_index
add_object_to_object_session(element, dataset_collection)
element.collection = dataset_collection
2 changes: 2 additions & 0 deletions lib/galaxy/model/dataset_collections/registry.py
@@ -3,12 +3,14 @@
list,
paired,
record,
sample_sheet,
)

PLUGIN_CLASSES = [
list.ListDatasetCollectionType,
paired.PairedDatasetCollectionType,
record.RecordDatasetCollectionType,
sample_sheet.SampleSheetDatasetCollectionType,
]


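
The sample_sheet type module itself is not included in this diff. Judging from how builder.py calls type.generate_elements(new_dataset_instances, fields=fields, rows=rows), its core job is presumably pairing each element identifier with its row of column values, roughly like the sketch below (a guess at the logic, not the actual plugin; the real implementation would be a method yielding model.DatasetCollectionElement instances):

def generate_elements(dataset_instances, rows=None, **kwds):
    # dataset_instances maps element identifier -> dataset instance;
    # rows maps element identifier -> list of column values
    for identifier, dataset_instance in dataset_instances.items():
        yield {
            "element_identifier": identifier,
            "element": dataset_instance,
            # row of column values aligned with the collection's column_definitions
            "columns": rows.get(identifier) if rows else None,
        }

elements = list(generate_elements({"sample1": "<hda>"}, rows={"sample1": [42]}))
assert elements[0]["columns"] == [42]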
[new database migration adding the three columns below]
@@ -28,10 +28,12 @@
def upgrade():
with transaction():
add_column(dataset_collection_table, Column("column_definitions", JSONType(), default=None))
add_column(dataset_collection_table, Column("fields", JSONType(), default=None))
add_column(dataset_collection_element_table, Column("columns", JSONType(), default=None))


def downgrade():
with transaction():
drop_column(dataset_collection_table, "column_definitions")
drop_column(dataset_collection_table, "fields")
drop_column(dataset_collection_element_table, "columns")
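
transaction, add_column, and drop_column above are Galaxy's migration helpers, and JSONType is its JSON column type. For readers more familiar with plain Alembic, the same schema change would look roughly like this (a sketch using SQLAlchemy's generic JSON type in place of Galaxy's JSONType):

import sqlalchemy as sa
from alembic import op

def upgrade():
    op.add_column("dataset_collection", sa.Column("column_definitions", sa.JSON(), nullable=True))
    op.add_column("dataset_collection", sa.Column("fields", sa.JSON(), nullable=True))
    op.add_column("dataset_collection_element", sa.Column("columns", sa.JSON(), nullable=True))

def downgrade():
    op.drop_column("dataset_collection", "column_definitions")
    op.drop_column("dataset_collection", "fields")
    op.drop_column("dataset_collection_element", "columns")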
34 changes: 34 additions & 0 deletions lib/galaxy/schema/schema.py
@@ -33,6 +33,7 @@
from typing_extensions import (
Annotated,
Literal,
TypedDict,
)

from galaxy.schema import partial_model
@@ -347,6 +348,20 @@ class LimitedUserModel(Model):
MaybeLimitedUserModel = Union[UserModel, LimitedUserModel]


class SampleSheetColumnDefinition(TypedDict):
# named for compatibility with CWL - trying to keep CWL fields in mind with
# this implementation. https://www.commonwl.org/user_guide/topics/inputs.html#inputs
# wrapping this up in a dict because I think we will want constraints and such
# in the future
type: Literal["string", "int", "float", "boolean"] # excluding "long" and "double" and composite types from CWL


SampleSheetColumnValueT = Union[str, int, float, bool]
SampleSheetColumnDefinitions = List[SampleSheetColumnDefinition]
SampleSheetRow = List[SampleSheetColumnValueT]
SampleSheetRows = Dict[str, SampleSheetRow]


class DiskUsageUserModel(Model):
total_disk_usage: float = TotalDiskUsageField
nice_total_disk_usage: str = NiceTotalDiskUsageField
@@ -997,6 +1012,11 @@ class DCESummary(Model, WithModelClass):
title="Object",
description="The element's specific data depending on the value of `element_type`.",
)
columns: Optional[SampleSheetRow] = Field(
None,
title="Columns",
description="A row (or list of columns) of data associated with this element",
)


DCObject.model_rebuild()
@@ -1141,6 +1161,10 @@ class HDCADetailed(HDCASummary):
None,
description="Encoded ID for the ICJ object describing the collection of jobs corresponding to this collection",
)
column_definitions: Optional[SampleSheetColumnDefinitions] = Field(
None,
description="Column data associated with each element of this collection.",
)


class HistoryContentItemBase(Model):
@@ -1654,6 +1678,16 @@ class CreateNewCollectionPayload(Model):
title="Element Identifiers",
description="List of elements that should be in the new collection.",
)
column_definitions: Optional[SampleSheetColumnDefinitions] = Field(
default=None,
title="Column Definitions",
description="Specify definitions for row data if collection_type if sample_sheet",
)
rows: Optional[SampleSheetRows] = Field(
default=None,
title="Row data",
description="Specify rows of metadata data corresponding to an indentifier if collection_type is sample_sheet",
)
name: Optional[str] = Field(
default=None,
title="Name",
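
Nothing in this commit validates row values against the column definitions yet (see the TODOs in the test file below), but the type literals map directly onto Python types. A minimal sketch of what such a check could look like (assumed semantics, not Galaxy code):

from typing import Any, Dict, List

PYTHON_TYPES = {"string": str, "int": int, "float": float, "boolean": bool}

def validate_row(row: List[Any], column_definitions: List[Dict[str, str]]) -> None:
    if len(row) != len(column_definitions):
        raise ValueError(f"expected {len(column_definitions)} columns, got {len(row)}")
    for index, (value, definition) in enumerate(zip(row, column_definitions)):
        expected = PYTHON_TYPES[definition["type"]]
        # bool is a subclass of int in Python, so reject it explicitly for int columns
        if (expected is int and isinstance(value, bool)) or not isinstance(value, expected):
            raise ValueError(f"column {index}: {value!r} is not a {definition['type']}")

validate_row([42], [{"type": "int"}])      # passes
# validate_row(["42"], [{"type": "int"}])  # would raise ValueError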
37 changes: 37 additions & 0 deletions lib/galaxy_test/api/test_dataset_collections.py
@@ -201,6 +201,43 @@ def test_record_field_validation(self, history_id):
create_response = self._post("dataset_collections", payload)
self._assert_status_code_is(create_response, 400)

def test_sample_sheet_requires_columns(self, history_id):
contents = [
("sample1", "1\t2\t3"),
("sample2", "4\t5\t6"),
]
sample_sheet_identifiers = self.dataset_collection_populator.list_identifiers(history_id, contents)
payload = dict(
name="my cool sample sheet",
instance_type="history",
history_id=history_id,
element_identifiers=sample_sheet_identifiers,
collection_type="sample_sheet",
column_definitions=[{"type": "int"}],
rows={"sample1": [42], "sample2": [45]},
)
create_response = self._post("dataset_collections", payload, json=True)
dataset_collection = self._check_create_response(create_response)

self._assert_has_keys(dataset_collection, "collection_type", "column_definitions")
assert dataset_collection["collection_type"] == "sample_sheet"
assert dataset_collection["name"] == "my cool sample sheet"
returned_collections = dataset_collection["elements"]
assert len(returned_collections) == 2, dataset_collection
sheet_row_0_element = returned_collections[0]
self._assert_has_keys(sheet_row_0_element, "element_index", "columns")
record_pos_0_object = sheet_row_0_element["object"]
self._assert_has_keys(record_pos_0_object, "name", "history_content_type")
row_0 = sheet_row_0_element["columns"]
assert row_0[0] == 42

sheet_row_1_element = returned_collections[1]
self._assert_has_keys(sheet_row_1_element, "element_index", "columns")
row_1 = sheet_row_1_element["columns"]
assert row_1[0] == 45
# TODO: test case where column definition does not match supplied data
# TODO: test case without column definition, implement definition inference based on supplied datatypes

def test_list_download(self):
with self.dataset_populator.test_history(require_new=False) as history_id:
fetch_response = self.dataset_collection_populator.create_list_in_history(
