Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proposal: Improved ProvenanceProfile definition #2082

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ eggs/
*.egg
.tox/
.pytest_cache
*.so

# Editor Temps
.*.sw?
Expand Down
2 changes: 2 additions & 0 deletions cwltool/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None:
self.orcid: str = ""
self.cwl_full_name: str = ""
self.process_run_id: Optional[str] = None
self.prov_host: bool = False
self.prov_user: bool = False
self.prov_obj: Optional[ProvenanceProfile] = None
self.mpi_config: MpiConfig = MpiConfig()
self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None
Expand Down
45 changes: 4 additions & 41 deletions cwltool/cwlprov/provenance_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from collections.abc import MutableMapping, MutableSequence, Sequence
from io import BytesIO
from pathlib import PurePath, PurePosixPath
from socket import getfqdn
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from prov.identifier import Identifier, QualifiedName
Expand All @@ -24,12 +23,10 @@
ACCOUNT_UUID,
CWLPROV,
ENCODING,
FOAF,
METADATA,
ORE,
PROVENANCE,
RO,
SCHEMA,
SHA1,
SHA256,
TEXT_PLAIN,
Expand Down Expand Up @@ -108,25 +105,6 @@ def __str__(self) -> str:

def generate_prov_doc(self) -> tuple[str, ProvDocument]:
"""Add basic namespaces."""

def host_provenance(document: ProvDocument) -> None:
"""Record host provenance."""
document.add_namespace(CWLPROV)
document.add_namespace(UUID)
document.add_namespace(FOAF)

hostname = getfqdn()
# won't have a foaf:accountServiceHomepage for unix hosts, but
# we can at least provide hostname
document.agent(
ACCOUNT_UUID,
{
PROV_TYPE: FOAF["OnlineAccount"],
"prov:location": hostname,
CWLPROV["hostname"]: hostname,
},
)

self.cwltool_version = f"cwltool {versionstring().split()[-1]}"
self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
Expand Down Expand Up @@ -165,25 +143,10 @@ def host_provenance(document: ProvDocument) -> None:
# .. but we always know cwltool was launched (directly or indirectly)
# by a user account, as cwltool is a command line tool
account = self.document.agent(ACCOUNT_UUID)
if self.orcid or self.full_name:
person: dict[Union[str, Identifier], Any] = {
PROV_TYPE: PROV["Person"],
"prov:type": SCHEMA["Person"],
}
if self.full_name:
person["prov:label"] = self.full_name
person["foaf:name"] = self.full_name
person["schema:name"] = self.full_name
else:
# TODO: Look up name from ORCID API?
pass
agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
self.document.actedOnBehalfOf(account, agent)
else:
if self.host_provenance:
host_provenance(self.document)
if self.user_provenance:
self.research_object.user_provenance(self.document)
if self.host_provenance:
self.research_object.host_provenance(self.document)
if self.user_provenance or self.orcid or self.full_name:
self.research_object.user_provenance(self.document)
# The execution of cwltool
wfengine = self.document.agent(
self.engine_uuid,
Expand Down
97 changes: 87 additions & 10 deletions cwltool/cwlprov/ro.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
import uuid
from collections.abc import MutableMapping, MutableSequence
from pathlib import Path, PurePosixPath
from typing import IO, Any, Optional, Union, cast
from socket import getfqdn
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast

import prov.model as provM
from prov.model import PROV, ProvDocument
from prov.model import ProvDocument

from ..loghandler import _logger
from ..stdfsaccess import StdFsAccess
Expand All @@ -27,6 +28,7 @@
from . import Aggregate, Annotation, AuthoredBy, _valid_orcid, _whoami, checksum_copy
from .provenance_constants import (
ACCOUNT_UUID,
CWLPROV,
CWLPROV_VERSION,
DATA,
ENCODING,
Expand All @@ -35,6 +37,7 @@
METADATA,
ORCID,
PROVENANCE,
SCHEMA,
SHA1,
SHA256,
SHA512,
Expand All @@ -46,6 +49,9 @@
Hasher,
)

if TYPE_CHECKING:
from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import


class ResearchObject:
"""CWLProv Research Object."""
Expand Down Expand Up @@ -82,6 +88,34 @@ def __init__(
self._initialize()
_logger.debug("[provenance] Temporary research object: %s", self.folder)

def initialize_provenance(
    self,
    full_name: str,
    host_provenance: bool,
    user_provenance: bool,
    orcid: str,
    fsaccess: StdFsAccess,
    run_uuid: Optional[uuid.UUID] = None,
) -> "ProvenanceProfile":
    """
    Build the provenance profile bound to this research object.

    This is a hook: subclasses may override it to return a customized
    profile that defines additional provenance concepts and associations.
    The default forwards every argument unchanged to ``ProvenanceProfile``
    and passes this research object as its ``research_object``.
    """
    # Imported at call time rather than module level; the module-level
    # import is guarded by TYPE_CHECKING (presumably to avoid an import
    # cycle with provenance_profile -- confirm before moving it).
    from .provenance_profile import ProvenanceProfile

    profile = ProvenanceProfile(
        self,
        full_name=full_name,
        host_provenance=host_provenance,
        user_provenance=user_provenance,
        orcid=orcid,
        fsaccess=fsaccess,
        run_uuid=run_uuid,
    )
    return profile

def self_check(self) -> None:
"""Raise ValueError if this RO is closed."""
if self.closed:
Expand Down Expand Up @@ -117,10 +151,22 @@ def _initialize_bagit(self) -> None:
bag_it_file.write("BagIt-Version: 0.97\n")
bag_it_file.write(f"Tag-File-Character-Encoding: {ENCODING}\n")

def resolve_user(self) -> tuple[str, str]:
    """
    Return the ``(username, fullname)`` pair identifying the current user.

    This is a hook: override it when the calling code can resolve the
    user better than the default strategy, which delegates to
    ``_whoami()``. ``user_provenance`` uses the username for the account
    agent and falls back to the returned full name only when no full
    name was supplied.
    """
    identity = _whoami()
    return identity

def user_provenance(self, document: ProvDocument) -> None:
"""Add the user provenance."""
self.self_check()
(username, fullname) = _whoami()
(username, fullname) = self.resolve_user()

if not self.full_name:
self.full_name = fullname
Expand All @@ -132,19 +178,21 @@ def user_provenance(self, document: ProvDocument) -> None:
ACCOUNT_UUID,
{
provM.PROV_TYPE: FOAF["OnlineAccount"],
"prov:label": username,
provM.PROV_LABEL: username,
FOAF["accountName"]: username,
},
)

user = document.agent(
self.orcid or USER_UUID,
{
provM.PROV_TYPE: PROV["Person"],
"prov:label": self.full_name,
FOAF["name"]: self.full_name,
FOAF["account"]: account,
},
[
(provM.PROV_TYPE, SCHEMA["Person"]),
(provM.PROV_TYPE, provM.PROV["Person"]),
(provM.PROV_LABEL, self.full_name),
(FOAF["name"], self.full_name),
(FOAF["account"], account),
(SCHEMA["name"], self.full_name),
],
)
# cwltool may be started on the shell (directly by user),
# by shell script (indirectly by user)
Expand All @@ -156,6 +204,35 @@ def user_provenance(self, document: ProvDocument) -> None:
# get their name wrong!)
document.actedOnBehalfOf(account, user)

def resolve_host(self) -> tuple[str, str]:
    """
    Return the ``(fqdn, uri)`` pair identifying the executing host.

    This is a hook: override it when the calling code can resolve the
    host better than the default strategy. By default both elements are
    the fully qualified domain name reported by ``socket.getfqdn()``.
    """
    hostname = getfqdn()
    # An override may return a distinct URI; the default reuses the FQDN.
    return (hostname, hostname)

def host_provenance(self, document: ProvDocument) -> None:
    """
    Record the executing host in ``document``.

    Registers the CWLPROV, UUID and FOAF namespaces, then adds the
    ``ACCOUNT_UUID`` agent typed as ``foaf:OnlineAccount``. The agent
    carries the URI from ``resolve_host()`` as its PROV location and the
    host name as ``cwlprov:hostname``.
    """
    for namespace in (CWLPROV, UUID, FOAF):
        document.add_namespace(namespace)

    fqdn, location = self.resolve_host()
    # There is no foaf:accountServiceHomepage for unix hosts, but the
    # hostname can at least be provided.
    document.agent(
        ACCOUNT_UUID,
        {
            provM.PROV_TYPE: FOAF["OnlineAccount"],
            provM.PROV_LOCATION: location,
            CWLPROV["hostname"]: fqdn,
        },
    )

def add_tagfile(self, path: str, timestamp: Optional[datetime.datetime] = None) -> None:
"""Add tag files to our research object."""
self.self_check()
Expand Down
11 changes: 6 additions & 5 deletions cwltool/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from .command_line_tool import CallbackJob, ExpressionJob
from .context import RuntimeContext, getdefault
from .cuda import cuda_version_and_device_count
from .cwlprov.provenance_profile import ProvenanceProfile
from .errors import WorkflowException
from .job import JobBase
from .loghandler import _logger
Expand Down Expand Up @@ -194,11 +193,13 @@ def run_jobs(

# define provenance profile for single commandline tool
if not isinstance(process, Workflow) and runtime_context.research_obj is not None:
process.provenance_object = ProvenanceProfile(
runtime_context.research_obj,
process.provenance_object = runtime_context.research_obj.initialize_provenance(
full_name=runtime_context.cwl_full_name,
host_provenance=False,
user_provenance=False,
# the following two are only set from main when running a single command line tool directly;
# when the tool is nested in a workflow they should stay disabled, since they would
# already have been provided/initialized by the parent workflow's provenance object
host_provenance=runtime_context.prov_host,
user_provenance=runtime_context.prov_user,
orcid=runtime_context.orcid,
# single tool execution, so RO UUID = wf UUID = tool UUID
run_uuid=runtime_context.research_obj.ro_uuid,
Expand Down
5 changes: 5 additions & 0 deletions cwltool/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,11 @@ def main(

loadingContext = setup_loadingContext(loadingContext, runtimeContext, args)

if loadingContext.research_obj:
# early forward parameters required for a single command line tool
runtimeContext.prov_host = loadingContext.host_provenance
runtimeContext.prov_user = loadingContext.user_provenance

uri, tool_file_uri = resolve_tool_uri(
args.workflow,
resolver=loadingContext.resolver,
Expand Down
3 changes: 1 addition & 2 deletions cwltool/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ def __init__(
if is_main:
run_uuid = loadingContext.research_obj.ro_uuid

self.provenance_object = ProvenanceProfile(
loadingContext.research_obj,
self.provenance_object = loadingContext.research_obj.initialize_provenance(
full_name=loadingContext.cwl_full_name,
host_provenance=loadingContext.host_provenance,
user_provenance=loadingContext.user_provenance,
Expand Down
4 changes: 2 additions & 2 deletions mypy-stubs/rdflib/graph.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ from rdflib import query
from rdflib.collection import Collection
from rdflib.paths import Path
from rdflib.resource import Resource
from rdflib.term import BNode, Identifier, Node
from rdflib.term import BNode, Identifier, Literal, Node

class Graph(Node):
base: Any = ...
Expand Down Expand Up @@ -66,7 +66,7 @@ class Graph(Node):
) -> Iterable[Node]: ...
def objects(
self, subject: Optional[Any] = ..., predicate: Optional[Any] = ...
) -> Iterable[Identifier]: ...
) -> Iterable[Union[Identifier, Literal]]: ...
def subject_predicates(self, object: Optional[Any] = ...) -> None: ...
def subject_objects(self, predicate: Optional[Any] = ...) -> None: ...
def predicate_objects(self, subject: Optional[Any] = ...) -> None: ...
Expand Down
Loading
Loading