From cfa534c4afbec853b1ffe89511a00e1fc4534fae Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Fri, 6 Dec 2024 19:35:44 -0500 Subject: [PATCH 1/9] extend CWLProv utilities --- cwltool/cwlprov/provenance_profile.py | 23 +--------- cwltool/cwlprov/ro.py | 60 ++++++++++++++++++++++++--- cwltool/executors.py | 3 +- cwltool/workflow.py | 3 +- 4 files changed, 58 insertions(+), 31 deletions(-) diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py index ce8d63ad4..341a7d3be 100644 --- a/cwltool/cwlprov/provenance_profile.py +++ b/cwltool/cwlprov/provenance_profile.py @@ -5,7 +5,6 @@ import uuid from io import BytesIO from pathlib import PurePath, PurePosixPath -from socket import getfqdn from typing import ( TYPE_CHECKING, Any, @@ -35,7 +34,6 @@ ACCOUNT_UUID, CWLPROV, ENCODING, - FOAF, METADATA, ORE, PROVENANCE, @@ -119,25 +117,6 @@ def __str__(self) -> str: def generate_prov_doc(self) -> Tuple[str, ProvDocument]: """Add basic namespaces.""" - - def host_provenance(document: ProvDocument) -> None: - """Record host provenance.""" - document.add_namespace(CWLPROV) - document.add_namespace(UUID) - document.add_namespace(FOAF) - - hostname = getfqdn() - # won't have a foaf:accountServiceHomepage for unix hosts, but - # we can at least provide hostname - document.agent( - ACCOUNT_UUID, - { - PROV_TYPE: FOAF["OnlineAccount"], - "prov:location": hostname, - CWLPROV["hostname"]: hostname, - }, - ) - self.cwltool_version = f"cwltool {versionstring().split()[-1]}" self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#") # document.add_namespace('prov', 'http://www.w3.org/ns/prov#') @@ -192,7 +171,7 @@ def host_provenance(document: ProvDocument) -> None: self.document.actedOnBehalfOf(account, agent) else: if self.host_provenance: - host_provenance(self.document) + self.research_object.host_provenance(self.document) if self.user_provenance: self.research_object.user_provenance(self.document) # The execution of 
cwltool diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index 7c6eaf5d6..882e14a77 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -8,6 +8,7 @@ import urllib import uuid from pathlib import Path, PurePosixPath +from socket import getfqdn from typing import ( IO, Any, @@ -23,7 +24,7 @@ ) import prov.model as provM -from prov.model import PROV, ProvDocument +from prov.model import ProvDocument from ..loghandler import _logger from ..stdfsaccess import StdFsAccess @@ -38,6 +39,7 @@ from . import Aggregate, Annotation, AuthoredBy, _valid_orcid, _whoami, checksum_copy from .provenance_constants import ( ACCOUNT_UUID, + CWLPROV, CWLPROV_VERSION, DATA, ENCODING, @@ -56,6 +58,7 @@ WORKFLOW, Hasher, ) +from .provenance_profile import ProvenanceProfile class ResearchObject: @@ -93,6 +96,26 @@ def __init__( self._initialize() _logger.debug("[provenance] Temporary research object: %s", self.folder) + def initialize_provenance( + self, + full_name: str, + host_provenance: bool, + user_provenance: bool, + orcid: str, + fsaccess: StdFsAccess, + run_uuid: Optional[uuid.UUID] = None, + ): + """Hook function allowing calling code to extend the provenance details if needed.""" + return ProvenanceProfile( + research_object=self, + full_name=full_name, + host_provenance=host_provenance, + user_provenance=user_provenance, + orcid=orcid, + fsaccess=fsaccess, + run_uuid=run_uuid, + ) + def self_check(self) -> None: """Raise ValueError if this RO is closed.""" if self.closed: @@ -128,10 +151,14 @@ def _initialize_bagit(self) -> None: bag_it_file.write("BagIt-Version: 0.97\n") bag_it_file.write(f"Tag-File-Character-Encoding: {ENCODING}\n") + def resolve_user(self) -> tuple[str, str]: + """Hook function in case the calling code can provide a better resolution.""" + return _whoami() + def user_provenance(self, document: ProvDocument) -> None: """Add the user provenance.""" self.self_check() - (username, fullname) = _whoami() + (username, fullname) = 
self.resolve_user() if not self.full_name: self.full_name = fullname @@ -143,7 +170,7 @@ def user_provenance(self, document: ProvDocument) -> None: ACCOUNT_UUID, { provM.PROV_TYPE: FOAF["OnlineAccount"], - "prov:label": username, + provM.PROV_LABEL: username, FOAF["accountName"]: username, }, ) @@ -151,8 +178,8 @@ def user_provenance(self, document: ProvDocument) -> None: user = document.agent( self.orcid or USER_UUID, { - provM.PROV_TYPE: PROV["Person"], - "prov:label": self.full_name, + provM.PROV_TYPE: provM.PROV["Person"], + provM.PROV_LABEL: self.full_name, FOAF["name"]: self.full_name, FOAF["account"]: account, }, @@ -167,6 +194,29 @@ def user_provenance(self, document: ProvDocument) -> None: # get their name wrong!) document.actedOnBehalfOf(account, user) + def resolve_host(self) -> tuple[str, str]: + """Hook function in case the calling code can provide a better resolution.""" + fqdn = getfqdn() + return fqdn, fqdn # allow for (fqdn, uri) to be distinct, but the same by default + + def host_provenance(self, document: ProvDocument) -> None: + """Record host provenance.""" + document.add_namespace(CWLPROV) + document.add_namespace(UUID) + document.add_namespace(FOAF) + + hostname, uri = self.resolve_host() + # won't have a foaf:accountServiceHomepage for unix hosts, but + # we can at least provide hostname + document.agent( + ACCOUNT_UUID, + { + provM.PROV_TYPE: FOAF["OnlineAccount"], + provM.PROV_LOCATION: uri, + CWLPROV["hostname"]: hostname, + }, + ) + def add_tagfile(self, path: str, timestamp: Optional[datetime.datetime] = None) -> None: """Add tag files to our research object.""" self.self_check() diff --git a/cwltool/executors.py b/cwltool/executors.py index bfc87f9c7..4c295cfad 100644 --- a/cwltool/executors.py +++ b/cwltool/executors.py @@ -203,8 +203,7 @@ def run_jobs( # define provenance profile for single commandline tool if not isinstance(process, Workflow) and runtime_context.research_obj is not None: - process.provenance_object = 
ProvenanceProfile( - runtime_context.research_obj, + process.provenance_object = runtime_context.research_obj.initialize_provenance( full_name=runtime_context.cwl_full_name, host_provenance=False, user_provenance=False, diff --git a/cwltool/workflow.py b/cwltool/workflow.py index 982ec7e70..45a085262 100644 --- a/cwltool/workflow.py +++ b/cwltool/workflow.py @@ -80,8 +80,7 @@ def __init__( if is_main: run_uuid = loadingContext.research_obj.ro_uuid - self.provenance_object = ProvenanceProfile( - loadingContext.research_obj, + self.provenance_object = loadingContext.research_obj.initialize_provenance( full_name=loadingContext.cwl_full_name, host_provenance=loadingContext.host_provenance, user_provenance=loadingContext.user_provenance, From a57b2a45c4ad6b5800d8260ae7e3f3ac3593af25 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Mon, 9 Dec 2024 12:15:37 -0500 Subject: [PATCH 2/9] fix circular import --- cwltool/cwlprov/ro.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index ef8b55ecd..f57e167d4 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -47,7 +47,6 @@ WORKFLOW, Hasher, ) -from .provenance_profile import ProvenanceProfile class ResearchObject: @@ -95,6 +94,8 @@ def initialize_provenance( run_uuid: Optional[uuid.UUID] = None, ): """Hook function allowing calling code to extend the provenance details if needed.""" + from .provenance_profile import ProvenanceProfile + return ProvenanceProfile( research_object=self, full_name=full_name, From fe6b706a02081df1063e33803473d827b5db70fc Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Mon, 9 Dec 2024 14:00:09 -0500 Subject: [PATCH 3/9] fix mypy typing --- cwltool/cwlprov/ro.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index f57e167d4..08dd72d47 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -10,7 +10,7 @@ from 
collections.abc import MutableMapping, MutableSequence from pathlib import Path, PurePosixPath from socket import getfqdn -from typing import IO, Any, Optional, Union, cast +from typing import TYPE_CHECKING, IO, Any, Optional, Union, cast import prov.model as provM from prov.model import ProvDocument @@ -47,6 +47,8 @@ WORKFLOW, Hasher, ) +if TYPE_CHECKING: + from .provenance_profile import ProvenanceProfile class ResearchObject: @@ -92,7 +94,7 @@ def initialize_provenance( orcid: str, fsaccess: StdFsAccess, run_uuid: Optional[uuid.UUID] = None, - ): + ) -> "ProvenanceProfile": """Hook function allowing calling code to extend the provenance details if needed.""" from .provenance_profile import ProvenanceProfile From f6154034b7765ac25f2580a5e05c20a15e830477 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Mon, 9 Dec 2024 14:04:53 -0500 Subject: [PATCH 4/9] fix docs linting --- cwltool/cwlprov/ro.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index 08dd72d47..dfd340491 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -95,7 +95,7 @@ def initialize_provenance( fsaccess: StdFsAccess, run_uuid: Optional[uuid.UUID] = None, ) -> "ProvenanceProfile": - """Hook function allowing calling code to extend the provenance details if needed.""" + """Provide a provenance profile initialization hook function to extend details as needed.""" from .provenance_profile import ProvenanceProfile return ProvenanceProfile( @@ -144,7 +144,7 @@ def _initialize_bagit(self) -> None: bag_it_file.write(f"Tag-File-Character-Encoding: {ENCODING}\n") def resolve_user(self) -> tuple[str, str]: - """Hook function in case the calling code can provide a better resolution.""" + """Provide a user provenance hook function in case the calling code can provide a better resolution.""" return _whoami() def user_provenance(self, document: ProvDocument) -> None: @@ -187,7 +187,7 @@ def user_provenance(self, 
document: ProvDocument) -> None: document.actedOnBehalfOf(account, user) def resolve_host(self) -> tuple[str, str]: - """Hook function in case the calling code can provide a better resolution.""" + """Provide a host provenance hook function in case the calling code can provide a better resolution.""" fqdn = getfqdn() return fqdn, fqdn # allow for (fqdn, uri) to be distinct, but the same by default From 525c1cce70886516bb178a87c23dbe581998deb0 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Mon, 9 Dec 2024 14:24:42 -0500 Subject: [PATCH 5/9] more linting fixes --- .gitignore | 1 + cwltool/cwlprov/ro.py | 29 +++++++++++++++++++++++++---- cwltool/executors.py | 1 - 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index fbe4b24fc..b4cab0e66 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ eggs/ *.egg .tox/ .pytest_cache +*.so # Editor Temps .*.sw? diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index dfd340491..98295f512 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -47,8 +47,9 @@ WORKFLOW, Hasher, ) + if TYPE_CHECKING: - from .provenance_profile import ProvenanceProfile + from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import class ResearchObject: @@ -95,7 +96,13 @@ def initialize_provenance( fsaccess: StdFsAccess, run_uuid: Optional[uuid.UUID] = None, ) -> "ProvenanceProfile": - """Provide a provenance profile initialization hook function to extend details as needed.""" + """ + Provide a provenance profile initialization hook function. + + Allows overriding the default strategy to define the + provenance profile concepts and associations to extend + details as needed. 
+ """ from .provenance_profile import ProvenanceProfile return ProvenanceProfile( @@ -144,7 +151,15 @@ def _initialize_bagit(self) -> None: bag_it_file.write(f"Tag-File-Character-Encoding: {ENCODING}\n") def resolve_user(self) -> tuple[str, str]: - """Provide a user provenance hook function in case the calling code can provide a better resolution.""" + """ + Provide a user provenance hook function. + + Allows overriding the default strategy to retrieve user provenance + in case the calling code can provide a better resolution. + The function must return the (username, fullname) tuple + that identifies the user. The resolved user account is linked + to any provided ORCID or full name by agent association. + """ return _whoami() def user_provenance(self, document: ProvDocument) -> None: @@ -187,7 +202,13 @@ def user_provenance(self, document: ProvDocument) -> None: document.actedOnBehalfOf(account, user) def resolve_host(self) -> tuple[str, str]: - """Provide a host provenance hook function in case the calling code can provide a better resolution.""" + """ + Provide a host provenance hook function. + + Allows overriding the default strategy to retrieve host provenance + in case the calling code can provide a better resolution. + The function must return the (fqdn, uri) tuple that identifies the host. 
+ """ fqdn = getfqdn() return fqdn, fqdn # allow for (fqdn, uri) to be distinct, but the same by default diff --git a/cwltool/executors.py b/cwltool/executors.py index 9064019c9..eef5e857b 100644 --- a/cwltool/executors.py +++ b/cwltool/executors.py @@ -19,7 +19,6 @@ from .command_line_tool import CallbackJob, ExpressionJob from .context import RuntimeContext, getdefault from .cuda import cuda_version_and_device_count -from .cwlprov.provenance_profile import ProvenanceProfile from .errors import WorkflowException from .job import JobBase from .loghandler import _logger From a5f603ebdd05d6413c38407d4320982c4ad3cb20 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Wed, 11 Dec 2024 16:55:51 -0500 Subject: [PATCH 6/9] test extra provenance options and validate resolved agent/user association --- tests/test_provenance.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/tests/test_provenance.py b/tests/test_provenance.py index e8d8416be..a49b0157e 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -34,10 +34,18 @@ OA = Namespace("http://www.w3.org/ns/oa#") +TEST_ORCID = "https://orcid.org/0000-0003-4862-3349" + + def cwltool(tmp_path: Path, *args: Any) -> Path: prov_folder = tmp_path / "provenance" prov_folder.mkdir() - new_args = ["--provenance", str(prov_folder)] + new_args = [ + "--enable-user-provenance", + "--enable-host-provenance", + "--orcid", TEST_ORCID, + "--provenance", str(prov_folder) + ] new_args.extend(args) # Run within a temporary directory to not pollute git checkout tmp_dir = tmp_path / "cwltool-run" @@ -485,7 +493,6 @@ def check_prov( # the has_provenance annotations in manifest.json instead # run should have been started by a wf engine - engines = set(g.subjects(RDF.type, WFPROV.WorkflowEngine)) assert engines, "Could not find WorkflowEngine" assert len(engines) == 1, "Found too many WorkflowEngines: %s" % engines @@ -502,6 +509,30 @@ def check_prov( 
PROV.SoftwareAgent, ) in g, "Engine not declared as SoftwareAgent" + # run should be associated to the user + people = set(g.subjects(RDF.type, SCHEMA.Person)) + assert len(people) == 1, "Can't find associated person in workflow run" + person = people.pop() + assert person == URIRef(TEST_ORCID) + + # find the random UUID assigned to cwltool + tool_agents = set(g.subjects(RDF.type, PROV.SoftwareAgent)) + n_all_agents = 2 + len(tool_agents) + agents = set(g.subjects(RDF.type, PROV.Agent)) + assert len(agents) == n_all_agents, ( + "There should be 1 agent per tool (engine), 1 user agent, and 1 cwltool agent" + ) + agents.remove(person) + agents.remove(engine) # the main tool + remain_agents = agents - tool_agents + assert len(remain_agents) == 1 + cwltool_agent = remain_agents.pop() + assert ( + cwltool_agent, + PROV.actedOnBehalfOf, + person + ) in g, "Association of cwltool agent acting for user is missing" + if single_tool: activities = set(g.subjects(RDF.type, PROV.Activity)) assert len(activities) == 1, "Too many activities: %s" % activities From ea2a0b998baba6ea6bad89d5ea846248da890706 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Wed, 11 Dec 2024 16:58:57 -0500 Subject: [PATCH 7/9] fix linting --- cwltool/cwlprov/ro.py | 2 +- tests/test_provenance.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index 98295f512..39a06a473 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -10,7 +10,7 @@ from collections.abc import MutableMapping, MutableSequence from pathlib import Path, PurePosixPath from socket import getfqdn -from typing import TYPE_CHECKING, IO, Any, Optional, Union, cast +from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast import prov.model as provM from prov.model import ProvDocument diff --git a/tests/test_provenance.py b/tests/test_provenance.py index a49b0157e..94ed62603 100644 --- a/tests/test_provenance.py +++ 
b/tests/test_provenance.py @@ -43,8 +43,10 @@ def cwltool(tmp_path: Path, *args: Any) -> Path: new_args = [ "--enable-user-provenance", "--enable-host-provenance", - "--orcid", TEST_ORCID, - "--provenance", str(prov_folder) + "--orcid", + TEST_ORCID, + "--provenance", + str(prov_folder), ] new_args.extend(args) # Run within a temporary directory to not pollute git checkout @@ -519,9 +521,9 @@ def check_prov( tool_agents = set(g.subjects(RDF.type, PROV.SoftwareAgent)) n_all_agents = 2 + len(tool_agents) agents = set(g.subjects(RDF.type, PROV.Agent)) - assert len(agents) == n_all_agents, ( - "There should be 1 agent per tool (engine), 1 user agent, and 1 cwltool agent" - ) + assert ( + len(agents) == n_all_agents + ), "There should be 1 agent per tool (engine), 1 user agent, and 1 cwltool agent" agents.remove(person) agents.remove(engine) # the main tool remain_agents = agents - tool_agents @@ -530,7 +532,7 @@ def check_prov( assert ( cwltool_agent, PROV.actedOnBehalfOf, - person + person, ) in g, "Association of cwltool agent acting for user is missing" if single_tool: From 78ef52de19140650e0d3c1d547100f92f93dcd29 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Wed, 11 Dec 2024 22:37:01 -0500 Subject: [PATCH 8/9] test with/without orcid prov + fix missing user prov when running CommandLineTool directly --- cwltool/context.py | 2 + cwltool/cwlprov/provenance_profile.py | 24 +----- cwltool/cwlprov/ro.py | 15 ++-- cwltool/executors.py | 7 +- cwltool/main.py | 5 ++ tests/test_provenance.py | 115 +++++++++++++++++++------- 6 files changed, 112 insertions(+), 56 deletions(-) diff --git a/cwltool/context.py b/cwltool/context.py index 237a90968..bb281fd88 100644 --- a/cwltool/context.py +++ b/cwltool/context.py @@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None: self.orcid: str = "" self.cwl_full_name: str = "" self.process_run_id: Optional[str] = None + self.prov_host: bool = False + self.prov_user: bool = False 
self.prov_obj: Optional[ProvenanceProfile] = None self.mpi_config: MpiConfig = MpiConfig() self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py index 5a0f12f94..e8538e51b 100644 --- a/cwltool/cwlprov/provenance_profile.py +++ b/cwltool/cwlprov/provenance_profile.py @@ -27,7 +27,6 @@ ORE, PROVENANCE, RO, - SCHEMA, SHA1, SHA256, TEXT_PLAIN, @@ -144,25 +143,10 @@ def generate_prov_doc(self) -> tuple[str, ProvDocument]: # .. but we always know cwltool was launched (directly or indirectly) # by a user account, as cwltool is a command line tool account = self.document.agent(ACCOUNT_UUID) - if self.orcid or self.full_name: - person: dict[Union[str, Identifier], Any] = { - PROV_TYPE: PROV["Person"], - "prov:type": SCHEMA["Person"], - } - if self.full_name: - person["prov:label"] = self.full_name - person["foaf:name"] = self.full_name - person["schema:name"] = self.full_name - else: - # TODO: Look up name from ORCID API? 
- pass - agent = self.document.agent(self.orcid or uuid.uuid4().urn, person) - self.document.actedOnBehalfOf(account, agent) - else: - if self.host_provenance: - self.research_object.host_provenance(self.document) - if self.user_provenance: - self.research_object.user_provenance(self.document) + if self.host_provenance: + self.research_object.host_provenance(self.document) + if self.user_provenance or self.orcid or self.full_name: + self.research_object.user_provenance(self.document) # The execution of cwltool wfengine = self.document.agent( self.engine_uuid, diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index 39a06a473..f58919a6b 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -37,6 +37,7 @@ METADATA, ORCID, PROVENANCE, + SCHEMA, SHA1, SHA256, SHA512, @@ -184,12 +185,14 @@ def user_provenance(self, document: ProvDocument) -> None: user = document.agent( self.orcid or USER_UUID, - { - provM.PROV_TYPE: provM.PROV["Person"], - provM.PROV_LABEL: self.full_name, - FOAF["name"]: self.full_name, - FOAF["account"]: account, - }, + [ + (provM.PROV_TYPE, SCHEMA["Person"]), + (provM.PROV_TYPE, provM.PROV["Person"]), + (provM.PROV_LABEL, self.full_name), + (FOAF["name"], self.full_name), + (FOAF["account"], account), + (SCHEMA["name"], self.full_name), + ], ) # cwltool may be started on the shell (directly by user), # by shell script (indirectly by user) diff --git a/cwltool/executors.py b/cwltool/executors.py index eef5e857b..33198d854 100644 --- a/cwltool/executors.py +++ b/cwltool/executors.py @@ -195,8 +195,11 @@ def run_jobs( if not isinstance(process, Workflow) and runtime_context.research_obj is not None: process.provenance_object = runtime_context.research_obj.initialize_provenance( full_name=runtime_context.cwl_full_name, - host_provenance=False, - user_provenance=False, + # following are only set from main when directly command line tool + # when nested in a workflow, they should be disabled since they would + # already have been 
provided/initialized by the parent workflow prov-obj + host_provenance=runtime_context.prov_host, + user_provenance=runtime_context.prov_user, orcid=runtime_context.orcid, # single tool execution, so RO UUID = wf UUID = tool UUID run_uuid=runtime_context.research_obj.ro_uuid, diff --git a/cwltool/main.py b/cwltool/main.py index 17ccb11ce..2649d0a77 100755 --- a/cwltool/main.py +++ b/cwltool/main.py @@ -1060,6 +1060,11 @@ def main( loadingContext = setup_loadingContext(loadingContext, runtimeContext, args) + if loadingContext.research_obj: + # early forward parameters required for a single command line tool + runtimeContext.prov_host = loadingContext.host_provenance + runtimeContext.prov_user = loadingContext.user_provenance + uri, tool_file_uri = resolve_tool_uri( args.workflow, resolver=loadingContext.resolver, diff --git a/tests/test_provenance.py b/tests/test_provenance.py index 94ed62603..e89660b43 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -32,22 +32,23 @@ SCHEMA = Namespace("http://schema.org/") CWLPROV = Namespace("https://w3id.org/cwl/prov#") OA = Namespace("http://www.w3.org/ns/oa#") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") TEST_ORCID = "https://orcid.org/0000-0003-4862-3349" -def cwltool(tmp_path: Path, *args: Any) -> Path: +def cwltool(tmp_path: Path, *args: Any, with_orcid: bool = False) -> Path: prov_folder = tmp_path / "provenance" prov_folder.mkdir() new_args = [ "--enable-user-provenance", "--enable-host-provenance", - "--orcid", - TEST_ORCID, "--provenance", str(prov_folder), ] + if with_orcid: + new_args.extend(["--orcid", TEST_ORCID]) new_args.extend(args) # Run within a temporary directory to not pollute git checkout tmp_dir = tmp_path / "cwltool-run" @@ -59,61 +60,81 @@ def cwltool(tmp_path: Path, *args: Any) -> Path: @needs_docker -def test_hello_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_hello_workflow(tmp_path: Path, with_orcid: bool) -> None: 
check_provenance( cwltool( tmp_path, get_data("tests/wf/hello-workflow.cwl"), "--usermessage", "Hello workflow", - ) + with_orcid=with_orcid, + ), + with_orcid=with_orcid, ) @needs_docker -def test_hello_single_tool(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_hello_single_tool(tmp_path: Path, with_orcid: bool) -> None: check_provenance( cwltool( tmp_path, get_data("tests/wf/hello_single_tool.cwl"), "--message", "Hello tool", + with_orcid=with_orcid, ), single_tool=True, + with_orcid=with_orcid, ) @needs_docker -def test_revsort_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_revsort_workflow(tmp_path: Path, with_orcid: bool) -> None: folder = cwltool( tmp_path, get_data("tests/wf/revsort.cwl"), get_data("tests/wf/revsort-job.json"), + with_orcid=with_orcid, ) check_output_object(folder) - check_provenance(folder) + check_provenance(folder, with_orcid=with_orcid) @needs_docker -def test_revsort_workflow_shortcut(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_revsort_workflow_shortcut(tmp_path: Path, with_orcid: bool) -> None: """Confirm that using 'cwl:tool' shortcut still snapshots the CWL files.""" folder = cwltool( tmp_path, get_data("tests/wf/revsort-job-shortcut.json"), + with_orcid=with_orcid, ) check_output_object(folder) - check_provenance(folder) + check_provenance(folder, with_orcid=with_orcid) assert not (folder / "snapshot" / "revsort-job-shortcut.json").exists() assert len(list((folder / "snapshot").iterdir())) == 4 @needs_docker -def test_nested_workflow(tmp_path: Path) -> None: - check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True) +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_nested_workflow(tmp_path: Path, with_orcid: bool) -> None: + check_provenance( + cwltool( + tmp_path, + get_data("tests/wf/nested.cwl"), + with_orcid=with_orcid, + ), + nested=True, + 
with_orcid=with_orcid, + ) @needs_docker -def test_secondary_files_implicit(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_implicit(tmp_path: Path, with_orcid: bool) -> None: file1 = tmp_path / "foo1.txt" file1idx = tmp_path / "foo1.txt.idx" @@ -123,13 +144,20 @@ def test_secondary_files_implicit(tmp_path: Path) -> None: f.write("bar") # secondary will be picked up by .idx - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), "--file1", str(file1)) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf.cwl"), + "--file1", + str(file1), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) check_secondary_files(folder) @needs_docker -def test_secondary_files_explicit(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_explicit(tmp_path: Path, with_orcid: bool) -> None: # Deliberately do NOT have common basename or extension file1dir = tmp_path / "foo" file1dir.mkdir() @@ -164,22 +192,33 @@ def test_secondary_files_explicit(tmp_path: Path) -> None: j = json.dumps(job, ensure_ascii=True) fp.write(j.encode("ascii")) - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), str(jobJson)) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf.cwl"), + str(jobJson), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) check_secondary_files(folder) @needs_docker -def test_secondary_files_output(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_output(tmp_path: Path, with_orcid: bool) -> None: # secondary will be picked up by .idx - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf-out.cwl")) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + 
get_data("tests/wf/sec-wf-out.cwl"), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) # Skipped, not the same secondary files as above # self.check_secondary_files() @needs_docker -def test_directory_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_directory_workflow(tmp_path: Path, with_orcid: bool) -> None: dir2 = tmp_path / "dir2" dir2.mkdir() sha1 = { @@ -195,8 +234,14 @@ def test_directory_workflow(tmp_path: Path) -> None: with open(dir2 / x, "w", encoding="ascii") as f: f.write(x) - folder = cwltool(tmp_path, get_data("tests/wf/directory.cwl"), "--dir", str(dir2)) - check_provenance(folder, directory=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/directory.cwl"), + "--dir", + str(dir2), + with_orcid=with_orcid, + ) + check_provenance(folder, directory=True, with_orcid=with_orcid) # Output should include ls stdout of filenames a b c on each line file_list = ( @@ -219,10 +264,12 @@ def test_directory_workflow(tmp_path: Path) -> None: @needs_docker -def test_no_data_files(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_no_data_files(tmp_path: Path, with_orcid: bool) -> None: folder = cwltool( tmp_path, get_data("tests/wf/conditional_step_no_inputs.cwl"), + with_orcid=with_orcid, ) check_bagit(folder) @@ -273,6 +320,7 @@ def check_provenance( single_tool: bool = False, directory: bool = False, secondary_files: bool = False, + with_orcid: bool = False, ) -> None: check_folders(base_path) check_bagit(base_path) @@ -283,6 +331,7 @@ def check_provenance( single_tool=single_tool, directory=directory, secondary_files=secondary_files, + with_orcid=with_orcid, ) @@ -473,6 +522,7 @@ def check_prov( single_tool: bool = False, directory: bool = False, secondary_files: bool = False, + with_orcid: bool = False, ) -> None: prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt" assert prov_file.is_file(), 
f"Can't find {prov_file}" @@ -512,10 +562,20 @@ def check_prov( ) in g, "Engine not declared as SoftwareAgent" # run should be associated to the user + accounts = set(g.subjects(RDF.type, FOAF.OnlineAccount)) + assert len(accounts) == 1 + account = accounts.pop() people = set(g.subjects(RDF.type, SCHEMA.Person)) assert len(people) == 1, "Can't find associated person in workflow run" person = people.pop() - assert person == URIRef(TEST_ORCID) + if with_orcid: + assert person == URIRef(TEST_ORCID) + else: + account_names = set(g.objects(account, FOAF.accountName)) + assert len(account_names) == 1 + account_name = account_names.pop() + machine_user = provenance._whoami()[0] + assert account_name.value == machine_user # find the random UUID assigned to cwltool tool_agents = set(g.subjects(RDF.type, PROV.SoftwareAgent)) @@ -528,9 +588,8 @@ def check_prov( agents.remove(engine) # the main tool remain_agents = agents - tool_agents assert len(remain_agents) == 1 - cwltool_agent = remain_agents.pop() assert ( - cwltool_agent, + account, PROV.actedOnBehalfOf, person, ) in g, "Association of cwltool agent acting for user is missing" From 365b4f2a6890b0e215c5ee8376bb7df1ab2714e5 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Wed, 11 Dec 2024 22:51:00 -0500 Subject: [PATCH 9/9] fix prov graph literal type value property --- mypy-stubs/rdflib/graph.pyi | 4 ++-- tests/test_provenance.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mypy-stubs/rdflib/graph.pyi b/mypy-stubs/rdflib/graph.pyi index d3e6f2f54..9764972b2 100644 --- a/mypy-stubs/rdflib/graph.pyi +++ b/mypy-stubs/rdflib/graph.pyi @@ -16,7 +16,7 @@ from rdflib import query from rdflib.collection import Collection from rdflib.paths import Path from rdflib.resource import Resource -from rdflib.term import BNode, Identifier, Node +from rdflib.term import BNode, Identifier, Literal, Node class Graph(Node): base: Any = ... @@ -66,7 +66,7 @@ class Graph(Node): ) -> Iterable[Node]: ... 
def objects( self, subject: Optional[Any] = ..., predicate: Optional[Any] = ... - ) -> Iterable[Identifier]: ... + ) -> Iterable[Union[Identifier, Literal]]: ... def subject_predicates(self, object: Optional[Any] = ...) -> None: ... def subject_objects(self, predicate: Optional[Any] = ...) -> None: ... def predicate_objects(self, subject: Optional[Any] = ...) -> None: ... diff --git a/tests/test_provenance.py b/tests/test_provenance.py index e89660b43..d7a2a698b 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -573,7 +573,7 @@ def check_prov( else: account_names = set(g.objects(account, FOAF.accountName)) assert len(account_names) == 1 - account_name = account_names.pop() + account_name = cast(Literal, account_names.pop()) machine_user = provenance._whoami()[0] assert account_name.value == machine_user