Skip to content

Commit

Permalink
Merge pull request #183 from kbase/dev-service
Browse files Browse the repository at this point in the history
Add a NERSC status checker.
  • Loading branch information
MrCreosote authored Jan 18, 2025
2 parents 35e3705 + 4a474b6 commit b267cf9
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 2 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ Enables running jobs on remote compute from the KBase CDM cluster.

* Python 3.11+
* [crane](https://github.com/google/go-containerregistry/blob/main/cmd/crane/README.md)
* An s3 instance for use as a file store, but see "S3 requirements" below
* An S3 instance for use as a file store, but see "S3 requirements" below
* MongoDB 7+

### S3 requirements

Expand Down
1 change: 0 additions & 1 deletion cdmtaskservice/nersc/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,6 @@ async def upload_JAWS_job_files(
outputs = jaws_output.parse_outputs_json(outputs_json)
container_files = list(outputs.output_files.keys())
presigns = await files_to_urls(container_files)
# TODO LOGGING upload job stdout and stderr logs
return await self.upload_presigned_files(
job.id,
[os.path.join(jaws_output_dir, outputs.output_files[f]) for f in container_files],
Expand Down
82 changes: 82 additions & 0 deletions cdmtaskservice/nersc/status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
Get the status of the NERSC perlmutter and DTN systems without authentication.
"""

import datetime
from sfapi_client import AsyncClient, StatusValue
from sfapi_client.compute import Machine
from typing import NamedTuple
import asyncio


class Status(NamedTuple):
""" The status of the NERSC compute systems. """

ok: bool
""" True if all systems are available. """

perlmutter_up: bool
""" True if perlmutter is available. """

dtns_up: bool
""" True if the DTNs are available. """

perlmutter_available: datetime.datetime | None
""" A date for when perlmutter is expected to be up if available. """

dtns_available: datetime.datetime | None
""" A date for when the DTNs are expected to be up if available. """

perlmutter_description: str | None
""" A free text description of the perlmutter issue, if any. """

dtns_description: str | None
""" A free text description of the DTN issue, if any. """

@property
def systems_available(self) -> datetime.datetime | None:
""" A date for when perlmutter and the DTNs are expected to be up if available. """
if self.perlmutter_available and self.dtns_available:
return max(self.perlmutter_available, self.dtns_available)
return self.dtns_available if self.dtns_available else self.perlmutter_available


class NERSCStatus:
    """ Checks NERSC status. """

    def __init__(self):
        """
        Create the status checker. Does not confirm connectivity to NERSC at startup.
        """
        self._cli = AsyncClient()

    async def status(self) -> Status:
        """
        Get the current status of perlmutter and the DTNs.

        The two systems are checked as concurrent tasks in an asyncio.TaskGroup, so an
        error raised while checking either one propagates out of the task group
        (asyncio wraps such errors in an exception group).
        """
        async with asyncio.TaskGroup() as tg:
            perl = tg.create_task(self._get_status(Machine.perlmutter))
            dtns = tg.create_task(self._get_status(Machine.dtns))

        # Both tasks have completed once the task group block exits.
        perl_ok, perl_available, perl_desc = perl.result()
        dtns_ok, dtns_available, dtns_desc = dtns.result()
        return Status(
            ok=perl_ok and dtns_ok,
            perlmutter_up=perl_ok,
            dtns_up=dtns_ok,
            perlmutter_available=perl_available,
            dtns_available=dtns_available,
            perlmutter_description=perl_desc,
            dtns_description=dtns_desc,
        )

    async def _get_status(
        self, m: Machine
    ) -> tuple[bool, datetime.datetime | None, str | None]:
        """
        Check the status of a single machine.

        Returns a tuple of
        * whether the machine's status is active,
        * the end date of the first outage with an "Active" status, or None if the
          machine is active,
        * the description of that outage, or None if the machine is active.

        Raises ValueError if the machine is not active but none of its outages has
        an "Active" status.
        """
        ac = await self._cli.compute(m)
        if ac.status != StatusValue.active:
            outages = await self._cli.resources.outages(ac.name)
            for o in outages:
                if o.status == "Active":
                    return False, o.end_at, o.description
            raise ValueError(f"NERSC resource {m} is inactive but found no outage")
        return True, None, None

    async def close(self):
        """ Close the status client. Further calls will result in unspecified errors. """
        await self._cli.close()
31 changes: 31 additions & 0 deletions test/nersc/nersc_status_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# TODO TEST how to test this? Mock a nersc server?

import datetime

from cdmtaskservice.nersc.status import Status


def test_systems_available():
    """ Check systems_available for each combination of set and unset dates. """
    earlier = datetime.datetime(1984, 1, 26)
    later = datetime.datetime(1984, 1, 27)
    # Each case is (perlmutter date, DTN date, expected combined date).
    cases = [
        (None, None, None),
        (earlier, None, earlier),
        (None, later, later),
        (earlier, later, later),
        (later, earlier, later),
    ]
    for perl, dtns, expected in cases:
        _test_systems_available(perl, dtns, expected)



def _test_systems_available(
    perl: datetime.datetime | None,
    dtns: datetime.datetime | None,
    expected: datetime.datetime | None,
):
    """
    Build a Status with the given availability dates and check systems_available.

    perl - the value for perlmutter_available; None means no date is set.
    dtns - the value for dtns_available; None means no date is set.
    expected - the value systems_available is expected to return.
    """
    # Only the two availability dates feed systems_available; the rest are filler.
    s = Status(
        ok=True,
        perlmutter_up=True,
        dtns_up=True,
        perlmutter_available=perl,
        dtns_available=dtns,
        perlmutter_description=None,
        dtns_description=None,
    )
    assert s.systems_available == expected

0 comments on commit b267cf9

Please sign in to comment.