Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proposal: Improved ProvenanceProfile definition #2082

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ eggs/
*.egg
.tox/
.pytest_cache
*.so

# Editor Temps
.*.sw?
Expand Down
2 changes: 2 additions & 0 deletions cwltool/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None:
self.orcid: str = ""
self.cwl_full_name: str = ""
self.process_run_id: Optional[str] = None
self.prov_host: bool = False
self.prov_user: bool = False
self.prov_obj: Optional[ProvenanceProfile] = None
self.mpi_config: MpiConfig = MpiConfig()
self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None
Expand Down
45 changes: 4 additions & 41 deletions cwltool/cwlprov/provenance_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from collections.abc import MutableMapping, MutableSequence, Sequence
from io import BytesIO
from pathlib import PurePath, PurePosixPath
from socket import getfqdn
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from prov.identifier import Identifier, QualifiedName
Expand All @@ -24,12 +23,10 @@
ACCOUNT_UUID,
CWLPROV,
ENCODING,
FOAF,
METADATA,
ORE,
PROVENANCE,
RO,
SCHEMA,
SHA1,
SHA256,
TEXT_PLAIN,
Expand Down Expand Up @@ -108,25 +105,6 @@

def generate_prov_doc(self) -> tuple[str, ProvDocument]:
"""Add basic namespaces."""

def host_provenance(document: ProvDocument) -> None:
"""Record host provenance."""
document.add_namespace(CWLPROV)
document.add_namespace(UUID)
document.add_namespace(FOAF)

hostname = getfqdn()
# won't have a foaf:accountServiceHomepage for unix hosts, but
# we can at least provide hostname
document.agent(
ACCOUNT_UUID,
{
PROV_TYPE: FOAF["OnlineAccount"],
"prov:location": hostname,
CWLPROV["hostname"]: hostname,
},
)

self.cwltool_version = f"cwltool {versionstring().split()[-1]}"
self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
Expand Down Expand Up @@ -165,25 +143,10 @@
# .. but we always know cwltool was launched (directly or indirectly)
# by a user account, as cwltool is a command line tool
account = self.document.agent(ACCOUNT_UUID)
if self.orcid or self.full_name:
person: dict[Union[str, Identifier], Any] = {
PROV_TYPE: PROV["Person"],
"prov:type": SCHEMA["Person"],
}
if self.full_name:
person["prov:label"] = self.full_name
person["foaf:name"] = self.full_name
person["schema:name"] = self.full_name
else:
# TODO: Look up name from ORCID API?
pass
agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
self.document.actedOnBehalfOf(account, agent)
else:
if self.host_provenance:
host_provenance(self.document)
if self.user_provenance:
self.research_object.user_provenance(self.document)
if self.host_provenance:
self.research_object.host_provenance(self.document)

Check warning on line 147 in cwltool/cwlprov/provenance_profile.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/provenance_profile.py#L147

Added line #L147 was not covered by tests
if self.user_provenance or self.orcid or self.full_name:
self.research_object.user_provenance(self.document)

Check warning on line 149 in cwltool/cwlprov/provenance_profile.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/provenance_profile.py#L149

Added line #L149 was not covered by tests
# The execution of cwltool
wfengine = self.document.agent(
self.engine_uuid,
Expand Down
97 changes: 87 additions & 10 deletions cwltool/cwlprov/ro.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
import uuid
from collections.abc import MutableMapping, MutableSequence
from pathlib import Path, PurePosixPath
from typing import IO, Any, Optional, Union, cast
from socket import getfqdn
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast

import prov.model as provM
from prov.model import PROV, ProvDocument
from prov.model import ProvDocument

from ..loghandler import _logger
from ..stdfsaccess import StdFsAccess
Expand All @@ -27,6 +28,7 @@
from . import Aggregate, Annotation, AuthoredBy, _valid_orcid, _whoami, checksum_copy
from .provenance_constants import (
ACCOUNT_UUID,
CWLPROV,
CWLPROV_VERSION,
DATA,
ENCODING,
Expand All @@ -35,6 +37,7 @@
METADATA,
ORCID,
PROVENANCE,
SCHEMA,
SHA1,
SHA256,
SHA512,
Expand All @@ -46,6 +49,9 @@
Hasher,
)

if TYPE_CHECKING:
from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import


class ResearchObject:
"""CWLProv Research Object."""
Expand Down Expand Up @@ -82,6 +88,34 @@
self._initialize()
_logger.debug("[provenance] Temporary research object: %s", self.folder)

def initialize_provenance(
    self,
    full_name: str,
    host_provenance: bool,
    user_provenance: bool,
    orcid: str,
    fsaccess: StdFsAccess,
    run_uuid: Optional[uuid.UUID] = None,
) -> "ProvenanceProfile":
    """
    Create the provenance profile bound to this research object.

    This is a hook: override it to return a customised profile when
    the provenance concepts and associations need to be extended.
    The default implementation forwards every argument unchanged to
    ``ProvenanceProfile``, passing this research object as its
    ``research_object``.
    """
    # Imported locally: at module level ProvenanceProfile is only
    # imported under TYPE_CHECKING (presumably to avoid a circular
    # import -- confirm against provenance_profile.py).
    from .provenance_profile import ProvenanceProfile

    profile = ProvenanceProfile(
        self,
        full_name=full_name,
        host_provenance=host_provenance,
        user_provenance=user_provenance,
        orcid=orcid,
        fsaccess=fsaccess,
        run_uuid=run_uuid,
    )
    return profile

def self_check(self) -> None:
"""Raise ValueError if this RO is closed."""
if self.closed:
Expand Down Expand Up @@ -117,10 +151,22 @@
bag_it_file.write("BagIt-Version: 0.97\n")
bag_it_file.write(f"Tag-File-Character-Encoding: {ENCODING}\n")

def resolve_user(self) -> tuple[str, str]:
    """
    Resolve the user account for user provenance (hook function).

    Override this when the calling code has a better way to identify
    the user than the default lookup. The result must be a
    ``(username, fullname)`` tuple. ``user_provenance`` records the
    username on the account agent and uses the full name only when no
    full name was provided; the account is then associated with the
    person agent (identified by ORCID when one is given).
    """
    identity = _whoami()
    return identity

Check warning on line 164 in cwltool/cwlprov/ro.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/ro.py#L164

Added line #L164 was not covered by tests

def user_provenance(self, document: ProvDocument) -> None:
"""Add the user provenance."""
self.self_check()
(username, fullname) = _whoami()
(username, fullname) = self.resolve_user()

Check warning on line 169 in cwltool/cwlprov/ro.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/ro.py#L169

Added line #L169 was not covered by tests

if not self.full_name:
self.full_name = fullname
Expand All @@ -132,19 +178,21 @@
ACCOUNT_UUID,
{
provM.PROV_TYPE: FOAF["OnlineAccount"],
"prov:label": username,
provM.PROV_LABEL: username,
FOAF["accountName"]: username,
},
)

user = document.agent(
self.orcid or USER_UUID,
{
provM.PROV_TYPE: PROV["Person"],
"prov:label": self.full_name,
FOAF["name"]: self.full_name,
FOAF["account"]: account,
},
[
(provM.PROV_TYPE, SCHEMA["Person"]),
(provM.PROV_TYPE, provM.PROV["Person"]),
(provM.PROV_LABEL, self.full_name),
(FOAF["name"], self.full_name),
(FOAF["account"], account),
(SCHEMA["name"], self.full_name),
],
)
# cwltool may be started on the shell (directly by user),
# by shell script (indirectly by user)
Expand All @@ -156,6 +204,35 @@
# get their name wrong!)
document.actedOnBehalfOf(account, user)

def resolve_host(self) -> tuple[str, str]:
    """
    Resolve the host for host provenance (hook function).

    Override this when the calling code has a better way to identify
    the host. The result must be a ``(fqdn, uri)`` tuple.
    """
    hostname = getfqdn()
    # The hook lets fqdn and uri differ; by default both are the
    # fully qualified domain name.
    return (hostname, hostname)

Check warning on line 216 in cwltool/cwlprov/ro.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/ro.py#L215-L216

Added lines #L215 - L216 were not covered by tests

def host_provenance(self, document: ProvDocument) -> None:
    """
    Record host provenance in the given PROV document.

    Registers the CWLPROV, UUID and FOAF namespaces, then declares the
    ``ACCOUNT_UUID`` agent as a ``foaf:OnlineAccount`` whose location
    and ``cwlprov:hostname`` come from ``resolve_host()``.
    """
    for namespace in (CWLPROV, UUID, FOAF):
        document.add_namespace(namespace)

    (hostname, uri) = self.resolve_host()
    # won't have a foaf:accountServiceHomepage for unix hosts, but
    # we can at least provide hostname
    account_attributes = {
        provM.PROV_TYPE: FOAF["OnlineAccount"],
        provM.PROV_LOCATION: uri,
        CWLPROV["hostname"]: hostname,
    }
    document.agent(ACCOUNT_UUID, account_attributes)

def add_tagfile(self, path: str, timestamp: Optional[datetime.datetime] = None) -> None:
"""Add tag files to our research object."""
self.self_check()
Expand Down
11 changes: 6 additions & 5 deletions cwltool/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from .command_line_tool import CallbackJob, ExpressionJob
from .context import RuntimeContext, getdefault
from .cuda import cuda_version_and_device_count
from .cwlprov.provenance_profile import ProvenanceProfile
from .errors import WorkflowException
from .job import JobBase
from .loghandler import _logger
Expand Down Expand Up @@ -194,11 +193,13 @@ def run_jobs(

# define provenance profile for single commandline tool
if not isinstance(process, Workflow) and runtime_context.research_obj is not None:
process.provenance_object = ProvenanceProfile(
runtime_context.research_obj,
process.provenance_object = runtime_context.research_obj.initialize_provenance(
full_name=runtime_context.cwl_full_name,
host_provenance=False,
user_provenance=False,
# the following are only set from main when running a single command line tool
# directly; when nested in a workflow, they should be disabled since they would
# already have been provided/initialized by the parent workflow's prov-obj
host_provenance=runtime_context.prov_host,
user_provenance=runtime_context.prov_user,
orcid=runtime_context.orcid,
# single tool execution, so RO UUID = wf UUID = tool UUID
run_uuid=runtime_context.research_obj.ro_uuid,
Expand Down
5 changes: 5 additions & 0 deletions cwltool/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,11 @@ def main(

loadingContext = setup_loadingContext(loadingContext, runtimeContext, args)

if loadingContext.research_obj:
# early forward parameters required for a single command line tool
runtimeContext.prov_host = loadingContext.host_provenance
runtimeContext.prov_user = loadingContext.user_provenance

uri, tool_file_uri = resolve_tool_uri(
args.workflow,
resolver=loadingContext.resolver,
Expand Down
3 changes: 1 addition & 2 deletions cwltool/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@
if is_main:
run_uuid = loadingContext.research_obj.ro_uuid

self.provenance_object = ProvenanceProfile(
loadingContext.research_obj,
self.provenance_object = loadingContext.research_obj.initialize_provenance(

Check warning on line 75 in cwltool/workflow.py

View check run for this annotation

Codecov / codecov/patch

cwltool/workflow.py#L75

Added line #L75 was not covered by tests
full_name=loadingContext.cwl_full_name,
host_provenance=loadingContext.host_provenance,
user_provenance=loadingContext.user_provenance,
Expand Down
4 changes: 2 additions & 2 deletions mypy-stubs/rdflib/graph.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ from rdflib import query
from rdflib.collection import Collection
from rdflib.paths import Path
from rdflib.resource import Resource
from rdflib.term import BNode, Identifier, Node
from rdflib.term import BNode, Identifier, Literal, Node

class Graph(Node):
base: Any = ...
Expand Down Expand Up @@ -66,7 +66,7 @@ class Graph(Node):
) -> Iterable[Node]: ...
def objects(
self, subject: Optional[Any] = ..., predicate: Optional[Any] = ...
) -> Iterable[Identifier]: ...
) -> Iterable[Union[Identifier, Literal]]: ...
def subject_predicates(self, object: Optional[Any] = ...) -> None: ...
def subject_objects(self, predicate: Optional[Any] = ...) -> None: ...
def predicate_objects(self, subject: Optional[Any] = ...) -> None: ...
Expand Down
Loading
Loading