From 75617d61218a6fdbd46720efee208903a53823cf Mon Sep 17 00:00:00 2001 From: Gavin Date: Wed, 15 Jan 2025 15:29:42 -0800 Subject: [PATCH 1/2] Add a NERSC status checker. --- README.md | 3 +- cdmtaskservice/nersc/manager.py | 1 - cdmtaskservice/nersc/status.py | 82 +++++++++++++++++++++++++++++++++ test/nersc/nersc_status_test.py | 7 +++ 4 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 cdmtaskservice/nersc/status.py create mode 100644 test/nersc/nersc_status_test.py diff --git a/README.md b/README.md index 4987c10..863ab47 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ Enables running jobs on remote compute from the KBase CDM cluster. * Python 3.11+ * [crane](https://github.com/google/go-containerregistry/blob/main/cmd/crane/README.md) -* An s3 instance for use as a file store, but see "S3 requirements" below +* An S3 instance for use as a file store, but see "S3 requirements" below +* MongoDB 7+ ### S3 requirements diff --git a/cdmtaskservice/nersc/manager.py b/cdmtaskservice/nersc/manager.py index d11cf02..3ae4cdb 100644 --- a/cdmtaskservice/nersc/manager.py +++ b/cdmtaskservice/nersc/manager.py @@ -637,7 +637,6 @@ async def upload_JAWS_job_files( outputs = jaws_output.parse_outputs_json(outputs_json) container_files = list(outputs.output_files.keys()) presigns = await files_to_urls(container_files) - # TODO LOGGING upload job stdout and stderr logs return await self.upload_presigned_files( job.id, [os.path.join(jaws_output_dir, outputs.output_files[f]) for f in container_files], diff --git a/cdmtaskservice/nersc/status.py b/cdmtaskservice/nersc/status.py new file mode 100644 index 0000000..ecd505f --- /dev/null +++ b/cdmtaskservice/nersc/status.py @@ -0,0 +1,82 @@ +""" +Get the status of the NERSC perlmutter and DTN systems without authentication. 
+""" + +import datetime +from sfapi_client import AsyncClient, StatusValue +from sfapi_client.compute import Machine +from typing import NamedTuple +import asyncio + + +class Status(NamedTuple): + """ The status of the NERSC compute systems. """ + + ok: bool + """ True if all systems are available. """ + + perlmutter_up: bool + """ True if perlmutter is available. """ + + dtns_up: bool + """ True if the DTNs are available. """ + + perlmutter_available: datetime.datetime | None + """ A date for when perlmutter is expected to be up if available. """ + + dtns_available: datetime.datetime | None + """ A date for when the DTNs are expected to be up if available. """ + + perlmutter_description: str | None + """ A free text description of the perlmutter issue, if any. """ + + dtns_description: str | None + """ A free text description of the DTN issue, if any. """ + + @property + def systems_available(self) -> datetime.datetime | None: + """ A date for when perlmutter and the DTNs are expected to be up if available. """ + if self.perlmutter_available and self.dtns_available: + return max(self.perlmutter_available, self.dtns_available) + return self.dtns_available if self.dtns_available else self.perlmutter_available + + +class NERSCStatus: + """ Checks NERSC status. """ + + def __init__(self): + """ + Create the status checker. Does not confirm connectivity to NERSC at startup. 
+ """ + self._cli = AsyncClient() + + async def status(self) -> Status: + async with asyncio.TaskGroup() as tg: + perl = tg.create_task(self._get_status(Machine.perlmutter)) + dtns = tg.create_task(self._get_status(Machine.dtns)) + + perl_ok, perl_avail, perl_desc = perl.result() + dtns_ok, dtns_avail, dtns_desc = dtns.result() + return Status( + ok=perl_ok and dtns_ok, + perlmutter_up=perl_ok, + dtns_up=dtns_ok, + perlmutter_available=perl_avail, + dtns_available=dtns_avail, + perlmutter_description=perl_desc, + dtns_description=dtns_desc, + ) + + async def _get_status(self, m: Machine) -> tuple[bool, datetime.datetime | None, str | None]: + ac = await self._cli.compute(m) + if ac.status != StatusValue.active: + outages = await self._cli.resources.outages(ac.name) + for o in outages: + if o.status == "Active": + return False, o.end_at, o.description + raise ValueError(f"NERSC resource {m} is inactive but found no outage") + return True, None, None + + async def close(self): + """ Close the status client. Further calls will result in unspecified errors. """ + await self._cli.close() diff --git a/test/nersc/nersc_status_test.py b/test/nersc/nersc_status_test.py new file mode 100644 index 0000000..8a52cf1 --- /dev/null +++ b/test/nersc/nersc_status_test.py @@ -0,0 +1,7 @@ +# TODO TEST how to test this? Mock a nersc server? + +from cdmtaskservice.nersc import status # @UnusedImport + + +def test_noop(): + pass From 4a474b6a4bd8988465ea70002d552743c4964a60 Mon Sep 17 00:00:00 2001 From: Gavin Date: Sat, 18 Jan 2025 11:50:26 -0800 Subject: [PATCH 2/2] Add tests for the nersc Status class's single method --- test/nersc/nersc_status_test.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/test/nersc/nersc_status_test.py b/test/nersc/nersc_status_test.py index 8a52cf1..8331dd5 100644 --- a/test/nersc/nersc_status_test.py +++ b/test/nersc/nersc_status_test.py @@ -1,7 +1,31 @@ # TODO TEST how to test this? Mock a nersc server? 
-from cdmtaskservice.nersc import status # @UnusedImport +import datetime +from cdmtaskservice.nersc.status import Status -def test_noop(): - pass +def test_systems_available(): + dt1 = datetime.datetime(1984, 1, 26) + dt2 = datetime.datetime(1984, 1, 27) + _test_systems_available(None, None, None) + _test_systems_available(dt1, None, dt1) + _test_systems_available(None, dt2, dt2) + _test_systems_available(dt1, dt2, dt2) + _test_systems_available(dt2, dt1, dt2) + + +def _test_systems_available( + perl: datetime.datetime | None, + dtns: datetime.datetime | None, + expected: datetime.datetime | None, +): + s = Status( + ok=True, + perlmutter_up=True, + dtns_up=True, + perlmutter_available=perl, + dtns_available=dtns, + perlmutter_description=None, + dtns_description=None, + ) + assert s.systems_available == expected