From 668c9ab9df2617631d60ca2b3ba403d09ad2ad79 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Sat, 4 Mar 2023 15:04:01 -0700 Subject: [PATCH 01/41] Add missing features, polish to `SynchronousComputeService` Closes #34. --- alchemiscale/compute/api.py | 33 ++++++++-- alchemiscale/compute/client.py | 10 +++ alchemiscale/compute/service.py | 111 ++++++++++++++++++++++++++------ alchemiscale/storage/models.py | 4 +- 4 files changed, 128 insertions(+), 30 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index 193a441e..64ca6ad2 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -91,6 +91,30 @@ async def list_scopes( return [str(scope) for scope in scopes] +@router.post("/computeservice/{identifier}/register") +async def register_computeservice( + identifier, + n4js: Neo4jStore = Depends(get_n4js_depends), +): + n4js.register_computeservice(identifier) + + +@router.post("/computeservice/{identifier}/deregister") +async def deregister_computeservice( + identifier, + n4js: Neo4jStore = Depends(get_n4js_depends), +): + n4js.deregister_computeservice(identifier) + + +@router.post("/computeserviceid/{identifier}/unclaim") +async def unclaim_tasks( + identifier, + n4js: Neo4jStore = Depends(get_n4js_depends), +): + n4js.unclaim_tasks(identifier) + + @router.get("/taskhubs") async def query_taskhubs( *, @@ -127,7 +151,7 @@ async def query_taskhubs( async def claim_taskhub_tasks( taskhub_scoped_key, *, - claimant: str = Body(), + computeserviceid: str = Body(), count: int = Body(), n4js: Neo4jStore = Depends(get_n4js_depends), token: TokenData = Depends(get_token_data_depends), @@ -136,7 +160,7 @@ async def claim_taskhub_tasks( validate_scopes(sk.scope, token) tasks = n4js.claim_taskhub_tasks( - taskhub=taskhub_scoped_key, claimant=claimant, count=count + taskhub=taskhub_scoped_key, computeservice=computeserviceid, count=count ) return [str(t) if t is not None else None for t in tasks] @@ -202,11 +226,6 @@ def set_task_result( return result_sk -@router.get("/chemicalsystems") -async def chemicalsystems(): - return {"message": "nothing yet"} - - ### add router app.include_router(router) diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 000f1d89..44f7feaa 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -31,6 +31,12 @@ class AlchemiscaleComputeClient(AlchemiscaleBaseClient): _exception = AlchemiscaleComputeClientError + def register(self): + ... + + def deregister(self): + ... + def list_scopes(self) -> List[Scope]: scopes = self._get_resource( f"/identities/{self.identifier}/scopes", @@ -67,6 +73,10 @@ def claim_taskhub_tasks( return [ScopedKey.from_str(t) if t is not None else None for t in tasks] + def unclaim_tasks(self): + """Drop all `Task` claims.""" + ... 
+ def get_task_transformation( self, task: ScopedKey ) -> Tuple[Transformation, Optional[ProtocolDAGResult]]: diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 40679d19..1f247429 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -7,6 +7,7 @@ import asyncio import sched import time +from uuid import uuid4 import random import threading from typing import Union, Optional, List, Dict, Tuple @@ -19,7 +20,7 @@ from gufe.protocols.protocoldag import execute_DAG, ProtocolDAG, ProtocolDAGResult from .client import AlchemiscaleComputeClient -from ..storage.models import Task, TaskHub +from ..storage.models import Task, TaskHub, ComputeServiceID from ..models import Scope, ScopedKey @@ -74,16 +75,42 @@ def __init__( identifier: str, key: str, name: str, - shared_path: Path, + shared_basedir: Path, + scratch_basedir: Path, + keep_scratch: bool = False, sleep_interval: int = 30, heartbeat_frequency: int = 30, scopes: Optional[List[Scope]] = None, limit: int = 1, ): - """ + """Create a `SynchronousComputeService` instance. Parameters ---------- + api_url + URL pointing to the compute API to execute Tasks for. + identifier + Identifier for the compute identity used for authentication. + key + Credential for the compute identity used for authentication. + name + The name to give this compute service; used for Task provenance, so + typically set to a distinct value to distinguish different compute + resources, e.g. different hosts or HPC clusters. + shared_basedir + Filesystem path to use for `ProtocolDAG` `shared` space. + scratch_basedir + Filesystem path to use for `ProtocolUnit` `scratch` space. + keep_scratch + If True, don't remove scratch directories for `ProtocolUnit`s after + completion. + sleep_interval + Time in seconds to sleep if no Tasks claimed from compute API. + heartbeat_frequency + Frequency at which to send heartbeats to compute API. + scopes + Scopes to limit Task claiming to; defaults to all Scopes accessible + by compute identity. limit Maximum number of Tasks to claim at a time from a TaskHub. @@ -99,16 +126,40 @@ def __init__( if scopes is None: self.scopes = [Scope()] - self.shared = shared_path + self.shared_basedir = shared_basedir + self.shared_basedir.mkdir(exist_ok=True) + + self.scratch_basedir = scratch_basedir + self.scratch_basedir.mkdir(exist_ok=True) + self.keep_scratch = keep_scratch + self.scheduler = sched.scheduler(time.monotonic, time.sleep) + self.counter = 0 + + self.computeserviceid = ComputeServiceID( + identifier=f"{self.name}-{uuid4()}") + self._stop = False + + def _register(self): + """Register this compute service with the compute API. + + """ + self.client.register(self.computeserviceid) + + def _deregister(self): + """Deregister this compute service with the compute API. + + """ + self.client.deregister(self.computeserviceid) + def heartbeat(self): """Deliver a heartbeat to the compute API, indicating this service is still alive.""" ... - def get_tasks(self, count=1) -> List[Optional[ScopedKey]]: + def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: """Get a Task to execute from compute API. Returns `None` if no Task was available matching service configuration. 
@@ -130,6 +181,9 @@ def get_tasks(self, count=1) -> List[Optional[ScopedKey]]: return tasks + def unclaim_tasks(self): + self.client.unclaim_tasks() + def task_to_protocoldag( self, task: ScopedKey ) -> Tuple[ProtocolDAG, Transformation, Optional[ProtocolDAGResult]]: @@ -140,7 +194,6 @@ def task_to_protocoldag( other Task is also given; otherwise `None` given. """ - ... transformation, extends_protocoldagresult = self.client.get_task_transformation( task @@ -158,12 +211,10 @@ def push_result( # TODO: this method should postprocess any paths, # leaf nodes in DAG for blob results that should go to object store - sk: ScopedKey = self.client.set_task_result(task, protocoldagresult) + # TODO: ship paths to object store - # TODO: remove claim on task, set to complete; remove from hubs - # TODO: if protocoldagresult.ok is False, need to handle this - # if protocoldagresult.ok(): - # self.client. + # finally, push ProtocolDAGResult + sk: ScopedKey = self.client.set_task_result(task, protocoldagresult) return sk @@ -178,9 +229,14 @@ def execute(self, task: ScopedKey) -> ScopedKey: # execute the task; this looks the same whether the ProtocolDAG is a # success or failure + shared = self.shared_basedir / str(protocoldag.key) / self.counter + shared.mkdir() + protocoldagresult = execute_DAG( protocoldag, - shared=self.shared, + shared=shared, + scratch_basdir = self.scratch_basedir, + keep_scratch=self.keep_scratch ) # push the result (or failure) back to the compute API @@ -194,10 +250,11 @@ def start(self, task_limit: Optional[int] = None): Parameters ---------- task_limit - Number of tasks to complete before exiting. + Number of Tasks to complete before exiting. If `None`, the service will continue until told to stop. """ + self._register() def scheduler_heartbeat(): self.heartbeat() @@ -205,37 +262,49 @@ def scheduler_heartbeat(): self.scheduler.enter(0, 2, scheduler_heartbeat) - counter = 0 while True: if task_limit is not None: - if counter >= task_limit: + if self.counter >= task_limit: break if self._stop: return - # get a task from the compute API - tasks: List[ScopedKey] = self.get_tasks(self.limit) + # claim tasks from the compute API + tasks: List[ScopedKey] = self.claim_tasks(self.limit) + # if no tasks claimed, sleep if all([task is None for task in tasks]): + if self._stop: + return time.sleep(self.sleep_interval) continue + # otherwise, process tasks for task in tasks: + if self._stop: + return + if task is None: continue + # execute each task self.execute(task) - - counter += 1 + self.counter += 1 def stop(self): self._stop = True - # Interrupt the scheduler (will finish if in the middle of an update or something, but will - # cancel running calculations) + # TODO: drop claims on tasks + self.unclaim_tasks() + + # Interrupt the scheduler (will finish if in the middle of an update or + # something, but will cancel running calculations) self.int_sleep.interrupt() + self._deregister() + + class AsynchronousComputeService(SynchronousComputeService): """Asynchronous compute service. 
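For orientation, the service loop above claims Tasks, executes each
ProtocolDAG under `shared_basedir`/`scratch_basedir`, and pushes results back
to the compute API. A minimal usage sketch of the new constructor arguments
(all values below are hypothetical; `api_url`, `identifier`, and `key` must
point at a deployed compute API and a registered compute identity):

    from pathlib import Path
    from alchemiscale.compute.service import SynchronousComputeService

    service = SynchronousComputeService(
        api_url="https://compute.example.org",  # hypothetical endpoint
        identifier="compute-identity",          # hypothetical identity
        key="compute-key",                      # hypothetical credential
        name="hpc-cluster-1",
        shared_basedir=Path("/data/shared"),    # ProtocolDAG `shared` space
        scratch_basedir=Path("/data/scratch"),  # ProtocolUnit `scratch` space
        keep_scratch=False,
        sleep_interval=30,
        limit=2,
    )

    # run until 10 Tasks have been executed; service.stop() also exits the loop
    service.start(task_limit=10)
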
diff --git a/alchemiscale/storage/models.py b/alchemiscale/storage/models.py
index ba95b1ba..54c66fc3 100644
--- a/alchemiscale/storage/models.py
+++ b/alchemiscale/storage/models.py
@@ -18,13 +18,13 @@
 from ..models import ScopedKey, Scope
 
 
-class ComputeKey(BaseModel):
+class ComputeServiceID(BaseModel):
     """Unique identifier for AlchemiscaleComputeService instances."""
 
     identifier: str
 
     def __repr__(self):  # pragma: no cover
-        return f"<ComputeKey('{str(self)}')>"
+        return f"<ComputeServiceID('{str(self)}')>"
 
     def __str__(self):
         return "-".join([self.identifier])
 

From da5fb7ab2c42319e30bcf794932d8921e3a725f0 Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Fri, 10 Mar 2023 00:36:09 -0700
Subject: [PATCH 02/41] ComputeServiceID additions

---
 alchemiscale/compute/api.py                   | 13 +++----------
 alchemiscale/compute/client.py                |  6 +++---
 alchemiscale/compute/service.py               |  2 +-
 alchemiscale/storage/models.py                |  2 +-
 alchemiscale/storage/statestore.py            | 12 ++++++------
 .../compute/client/test_compute_service.py    |  7 ++++---
 6 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py
index 64ca6ad2..79817dd4 100644
--- a/alchemiscale/compute/api.py
+++ b/alchemiscale/compute/api.py
@@ -28,7 +28,7 @@
 from ..settings import get_base_api_settings, get_compute_api_settings
 from ..storage.statestore import Neo4jStore
 from ..storage.objectstore import S3ObjectStore
-from ..storage.models import ProtocolDAGResultRef
+from ..storage.models import ProtocolDAGResultRef, ComputeServiceID
 from ..models import Scope, ScopedKey
 from ..security.auth import get_token_data, oauth2_scheme
 from ..security.models import (
@@ -140,18 +140,11 @@ async def query_taskhubs(
     return taskhubs_handler.format_return()
 
 
-# @app.get("/taskhubs/{scoped_key}")
-# async def get_taskhub(scoped_key: str,
-#                       *,
-#                       n4js: Neo4jStore = Depends(get_n4js_depends)):
-#     return
-
-
 @router.post("/taskhubs/{taskhub_scoped_key}/claim")
 async def claim_taskhub_tasks(
     taskhub_scoped_key,
     *,
-    computeserviceid: str = Body(),
+    computeserviceid: ComputeServiceID ,
     count: int = Body(),
     n4js: Neo4jStore = Depends(get_n4js_depends),
     token: TokenData = Depends(get_token_data_depends),
@@ -160,7 +153,7 @@ async def claim_taskhub_tasks(
     validate_scopes(sk.scope, token)
 
     tasks = n4js.claim_taskhub_tasks(
-        taskhub=taskhub_scoped_key, computeservice=computeserviceid, count=count
+        taskhub=taskhub_scoped_key, computeserviceid=computeserviceid, count=count
     )
 
     return [str(t) if t is not None else None for t in tasks]
diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py
index 44f7feaa..0e9906fa 100644
--- a/alchemiscale/compute/client.py
+++ b/alchemiscale/compute/client.py
@@ -19,7 +19,7 @@
 from ..base.client import AlchemiscaleBaseClient, AlchemiscaleBaseClientError
 from ..models import Scope, ScopedKey
-from ..storage.models import TaskHub, Task
+from ..storage.models import TaskHub, Task, ComputeServiceID
 
 
 class AlchemiscaleComputeClientError(AlchemiscaleBaseClientError):
@@ -65,10 +65,10 @@ def query_taskhubs(
         return taskhubs
 
     def claim_taskhub_tasks(
-        self, taskhub: ScopedKey, claimant: str, count: int = 1
+        self, taskhub: ScopedKey, computeserviceid: ComputeServiceID, count: int = 1
     ) -> Task:
         """Claim a `Task` from the specified `TaskHub`"""
-        data = dict(claimant=claimant, count=count)
+        data = dict(computeserviceid=computeserviceid, count=count)
         tasks = self._post_resource(f"taskhubs/{taskhub}/claim", data)
 
         return [ScopedKey.from_str(t) if t is not None else None for t in tasks]
diff --git a/alchemiscale/compute/service.py
b/alchemiscale/compute/service.py index 1f247429..b08a295a 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -176,7 +176,7 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: # claim tasks from the taskhub tasks = self.client.claim_taskhub_tasks( - taskhub, claimant=self.name, count=count + taskhub, computeserviceid=self.computeserviceid, count=count ) return tasks diff --git a/alchemiscale/storage/models.py b/alchemiscale/storage/models.py index 54c66fc3..55ee888d 100644 --- a/alchemiscale/storage/models.py +++ b/alchemiscale/storage/models.py @@ -31,7 +31,7 @@ def __str__(self): class TaskProvenance(BaseModel): - computekey: ComputeKey + computekey: ComputeServiceID datetime_start: datetime datetime_end: datetime diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index c1e9960c..0cacc218 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -23,7 +23,7 @@ from py2neo.errors import ClientError from .models import ( - ComputeKey, + ComputeServiceID, Task, TaskHub, TaskArchive, @@ -91,7 +91,7 @@ def _select_task_from_taskpool(taskpool: Subgraph) -> Union[ScopedKey, None]: return chosen_one[0] -def _generate_claim_query(task_sk: ScopedKey, claimant: str) -> str: +def _generate_claim_query(task_sk: ScopedKey, computeserviceid: ComputeServiceID) -> str: """ Generate a query to claim a single Task. Parameters @@ -108,7 +108,7 @@ def _generate_claim_query(task_sk: ScopedKey, claimant: str) -> str: """ query = f""" MATCH (t:Task {{_scoped_key: '{task_sk}'}}) - SET t.status = 'running', t.claim = '{claimant}' + SET t.status = 'running', t.claim = '{computeserviceid.identifier}' RETURN t """ return query @@ -1086,7 +1086,7 @@ def get_taskhub_unclaimed_tasks( return [ScopedKey.from_str(t["_scoped_key"]) for t in tasks] def claim_taskhub_tasks( - self, taskhub: ScopedKey, claimant: str, count: int = 1 + self, taskhub: ScopedKey, computeserviceid: ComputeServiceID, count: int = 1 ) -> List[Union[ScopedKey, None]]: """Claim a TaskHub Task. @@ -1153,7 +1153,7 @@ def claim_taskhub_tasks( tasks.append(None) else: chosen_one = _select_task_from_taskpool(taskpool) - claim_query = _generate_claim_query(chosen_one, claimant) + claim_query = _generate_claim_query(chosen_one, computeserviceid) tasks.append(tx.run(claim_query).to_subgraph()) tx.run( @@ -1468,7 +1468,7 @@ def set_task_waiting(self, task: ScopedKey, clear_claim=True): with self.transaction() as tx: tx.run(q) - def set_task_running(self, task: ScopedKey, computekey: ComputeKey): + def set_task_running(self, task: ScopedKey, computeserviceid: ComputeServiceID): ... 
def set_task_complete( diff --git a/alchemiscale/tests/integration/compute/client/test_compute_service.py b/alchemiscale/tests/integration/compute/client/test_compute_service.py index 6bf8b295..a800f1e1 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_service.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_service.py @@ -20,13 +20,14 @@ def service(self, n4js_preloaded, compute_client, tmpdir): identifier=compute_client.identifier, key=compute_client.key, name="test_compute_service", - shared_path=Path(".").absolute(), + shared_basedir=Path("shared").absolute(), + scratch_basedir=Path("scratch").absolute(), ) - def test_get_tasks(self, n4js_preloaded, service): + def test_claim_tasks(self, n4js_preloaded, service): n4js: Neo4jStore = n4js_preloaded - task_sks: List[Optional[ScopedKey]] = service.get_tasks(count=2) + task_sks: List[Optional[ScopedKey]] = service.claim_tasks(count=2) # should have 2 tasks assert len(task_sks) == 2 From fbd78844685ac7f4f6530592c2b56691dcb645a7 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Fri, 10 Mar 2023 18:51:50 -0700 Subject: [PATCH 03/41] Added ComputeServiceID handling to state store Still need to modify claiming code yet. --- alchemiscale/compute/api.py | 20 +++----- alchemiscale/compute/client.py | 4 -- alchemiscale/compute/service.py | 6 +-- alchemiscale/storage/statestore.py | 74 +++++++++++++++++++++++++++++- 4 files changed, 80 insertions(+), 24 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index 5cf2da7c..7229c112 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -92,28 +92,20 @@ async def list_scopes( return [str(scope) for scope in scopes] -@router.post("/computeservice/{identifier}/register") +@router.post("/computeservice/{computeservice_identifier}/register") async def register_computeservice( - identifier, + computeservice_identifier, n4js: Neo4jStore = Depends(get_n4js_depends), ): - n4js.register_computeservice(identifier) + n4js.register_computeservice(computeservice_identifier) -@router.post("/computeservice/{identifier}/deregister") +@router.post("/computeservice/{computeservice_identifier}/deregister") async def deregister_computeservice( - identifier, + computeservice_identifier, n4js: Neo4jStore = Depends(get_n4js_depends), ): - n4js.deregister_computeservice(identifier) - - -@router.post("/computeserviceid/{identifier}/unclaim") -async def unclaim_tasks( - identifier, - n4js: Neo4jStore = Depends(get_n4js_depends), -): - n4js.unclaim_tasks(identifier) + n4js.deregister_computeservice(computeservice_identifier) @router.get("/taskhubs") diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 11d00431..1daba72a 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -76,10 +76,6 @@ def claim_taskhub_tasks( return [ScopedKey.from_str(t) if t is not None else None for t in tasks] - def unclaim_tasks(self): - """Drop all `Task` claims.""" - ... 
- def get_task_transformation( self, task: ScopedKey ) -> Tuple[Transformation, Optional[ProtocolDAGResult]]: diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index b08a295a..11c9d536 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -181,9 +181,6 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: return tasks - def unclaim_tasks(self): - self.client.unclaim_tasks() - def task_to_protocoldag( self, task: ScopedKey ) -> Tuple[ProtocolDAG, Transformation, Optional[ProtocolDAGResult]]: @@ -296,12 +293,13 @@ def stop(self): self._stop = True # TODO: drop claims on tasks - self.unclaim_tasks() + #self.unclaim_tasks() # Interrupt the scheduler (will finish if in the middle of an update or # something, but will cancel running calculations) self.int_sleep.interrupt() + # remove ComputeServiceID, drop all claims self._deregister() diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index bd384496..df113b40 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -120,7 +120,6 @@ class Neo4jStore(AlchemiscaleStateStore): # with that label constraints = { "GufeTokenizable": {"name": "scoped_key", "property": "_scoped_key"}, - "Settings": {"name": "settings_content", "property": "content"}, "CredentialedUserIdentity": { "name": "user_identifier", "property": "identifier", @@ -730,6 +729,50 @@ def set_strategy( """Set the compute Strategy for the given AlchemicalNetwork.""" ... + def register_computeservice( + self, + computeserviceid: ComputeServiceID + ): + """Register a ComputeServiceID uniquely identifying a running + ComputeService. + + A ComputeServiceID node is used for CLAIMS relationships on Tasks to + avoid collisions in Task execution. + + """ + + node = Node("ComputeServiceID", **computeserviceid.dict()) + + with self.transaction() as tx: + tx.merge( + node, primary_label="ComputeServiceID", primary_key="identifier" + ) + + def deregister_computeservice( + self, + computeserviceid: ComputeServiceID + ): + """Remove the given ComputeServiceID from the state store. + + This wil remove the ComputeServiceID node, and all its CLAIMS + relationships to Tasks. + + All Tasks with CLAIMS relationships to the ComputeServiceID and with + status `running` will have their status set to `waiting`. 
+ + """ + q = f""" + MATCH (n:ComputeServiceID {{identifier: '{computeserviceid.identifier}'}}) + + OPTIONAL MATCH (n)-[cl:CLAIMS]->(t:Task {{status: 'running'}}) + SET t.status = 'waiting' + + DETACH DELETE n + """ + + with self.transaction() as tx: + tx.run(q) + ## task hubs def create_taskhub( @@ -1566,7 +1609,14 @@ def q(t): OPTIONAL MATCH (t_:Task {{_scoped_key: '{t}'}}) WHERE t_.status IN ['waiting', 'running', 'error'] - SET t_.status = '{TaskStatusEnum.waiting.value}', t_.claimant = null + SET t_.status = '{TaskStatusEnum.waiting.value}' + + WITH t, t_ + + // if we changed the status to waiting, + // drop CLAIMS relationship + OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csid:ComputeServiceID) + DELETE cl RETURN t, t_ """ @@ -1625,6 +1675,11 @@ def q(t): OPTIONAL MATCH (t_)<-[ar:ACTIONS]-(th:TaskHub) DELETE ar + // if we changed the status to complete, + // drop CLAIMS relationship + OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csid:ComputeServiceID) + DELETE cl + RETURN t, t_ """ @@ -1650,6 +1705,13 @@ def q(t): WHERE t_.status IN ['error', 'running'] SET t_.status = '{TaskStatusEnum.error.value}' + WITH t, t_ + + // if we changed the status to error, + // drop CLAIMS relationship + OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csid:ComputeServiceID) + DELETE cl + RETURN t, t_ """ @@ -1687,6 +1749,10 @@ def set_task_invalid( DELETE ar DELETE are + + // drop CLAIMS relationship if present + OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csid:ComputeServiceID) + DELETE cl """ tx.run(q) @@ -1721,6 +1787,10 @@ def set_task_deleted( DELETE ar DELETE are + + // drop CLAIMS relationship if present + OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csid:ComputeServiceID) + DELETE cl """ tx.run(q) From f6b3c35d1946472cb5b222872ec8d70e77fee1b1 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Mon, 13 Mar 2023 22:05:58 -0700 Subject: [PATCH 04/41] Finished converting claimant -> CLAIMS relationships --- alchemiscale/compute/api.py | 27 +++++--- alchemiscale/compute/client.py | 12 ++-- alchemiscale/compute/service.py | 9 ++- alchemiscale/storage/models.py | 12 +++- alchemiscale/storage/statestore.py | 66 +++++++++++-------- .../compute/client/test_compute_client.py | 8 +-- 6 files changed, 81 insertions(+), 53 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index 7229c112..ea41a007 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -8,6 +8,7 @@ from typing import Any, Dict, List import os import json +from datetime import datetime from fastapi import FastAPI, APIRouter, Body, Depends, HTTPException, status from gufe.tokenization import GufeTokenizable, JSON_HANDLER @@ -29,7 +30,7 @@ from ..settings import get_base_api_settings, get_compute_api_settings from ..storage.statestore import Neo4jStore from ..storage.objectstore import S3ObjectStore -from ..storage.models import ProtocolDAGResultRef, ComputeServiceID, TaskStatusEnum +from ..storage.models import ProtocolDAGResultRef, ComputeServiceID, ComputeServiceRegistration, TaskStatusEnum from ..models import Scope, ScopedKey from ..security.auth import get_token_data, oauth2_scheme from ..security.models import ( @@ -92,21 +93,29 @@ async def list_scopes( return [str(scope) for scope in scopes] -@router.post("/computeservice/{computeservice_identifier}/register") +@router.post("/computeservice/{compute_service_id}/register") async def register_computeservice( - computeservice_identifier, + compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - n4js.register_computeservice(computeservice_identifier) + now = datetime.utcnow() + 
csid = ComputeServiceRegistration(identitfier=ComputeServiceID(compute_service_id), + registered=now, + heartbeat=now) + n4js.register_computeservice(csid) -@router.post("/computeservice/{computeservice_identifier}/deregister") + return compute_service_id + + +@router.post("/computeservice/{compute_service_id}/deregister") async def deregister_computeservice( - computeservice_identifier, + compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - n4js.deregister_computeservice(computeservice_identifier) + n4js.deregister_computeservice(ComputeServiceID(compute_service_id)) + return compute_service_id @router.get("/taskhubs") async def query_taskhubs( @@ -137,7 +146,7 @@ async def query_taskhubs( async def claim_taskhub_tasks( taskhub_scoped_key, *, - computeserviceid: ComputeServiceID , + compute_service_id: str, count: int = Body(), n4js: Neo4jStore = Depends(get_n4js_depends), token: TokenData = Depends(get_token_data_depends), @@ -146,7 +155,7 @@ async def claim_taskhub_tasks( validate_scopes(sk.scope, token) tasks = n4js.claim_taskhub_tasks( - taskhub=taskhub_scoped_key, computeserviceid=computeserviceid, count=count + taskhub=taskhub_scoped_key, compute_service_id=compute_service_id, count=count ) return [str(t) if t is not None else None for t in tasks] diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 1daba72a..3b2a7e2a 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -35,11 +35,11 @@ class AlchemiscaleComputeClient(AlchemiscaleBaseClient): _exception = AlchemiscaleComputeClientError - def register(self): - ... + def register(self, compute_service_id: ComputeServiceID): + return self._post_resource(f"computeservice/{compute_service_id}/register", {}) - def deregister(self): - ... + def deregister(self, compute_service_id, ComputeServiceID): + return self._post_resource(f"computeservice/{compute_service_id}/deregister", {}) def list_scopes(self) -> List[Scope]: scopes = self._get_resource( @@ -68,10 +68,10 @@ def query_taskhubs( return taskhubs def claim_taskhub_tasks( - self, taskhub: ScopedKey, computeserviceid: ComputeServiceID, count: int = 1 + self, taskhub: ScopedKey, compute_service_id: str, count: int = 1 ) -> Task: """Claim a `Task` from the specified `TaskHub`""" - data = dict(computeserviceid=computeserviceid, count=count) + data = dict(compute_service_id=compute_service_id, count=count) tasks = self._post_resource(f"taskhubs/{taskhub}/claim", data) return [ScopedKey.from_str(t) if t is not None else None for t in tasks] diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 11c9d536..84e84ebc 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -137,8 +137,7 @@ def __init__( self.counter = 0 - self.computeserviceid = ComputeServiceID( - identifier=f"{self.name}-{uuid4()}") + self.compute_service_id = ComputeServiceID(f"{self.name}-{uuid4()}") self._stop = False @@ -147,13 +146,13 @@ def _register(self): """Register this compute service with the compute API. """ - self.client.register(self.computeserviceid) + self.client.register(self.compute_service_id) def _deregister(self): """Deregister this compute service with the compute API. 
""" - self.client.deregister(self.computeserviceid) + self.client.deregister(self.compute_service_id) def heartbeat(self): """Deliver a heartbeat to the compute API, indicating this service is still alive.""" @@ -176,7 +175,7 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: # claim tasks from the taskhub tasks = self.client.claim_taskhub_tasks( - taskhub, computeserviceid=self.computeserviceid, count=count + taskhub, compute_service_id=self.compute_service_id, count=count ) return tasks diff --git a/alchemiscale/storage/models.py b/alchemiscale/storage/models.py index b546344e..a3ae7fcc 100644 --- a/alchemiscale/storage/models.py +++ b/alchemiscale/storage/models.py @@ -18,10 +18,16 @@ from ..models import ScopedKey, Scope -class ComputeServiceID(BaseModel): - """Unique identifier for AlchemiscaleComputeService instances.""" +class ComputeServiceID(str): + ... + + +class ComputeServiceRegistration(BaseModel): + """Registration for AlchemiscaleComputeService instances.""" - identifier: str + identifier: ComputeServiceID + registered: datetime + heartbeat: datetime def __repr__(self): # pragma: no cover return f"" diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index df113b40..d9f3eee0 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -24,6 +24,7 @@ from .models import ( ComputeServiceID, + ComputeServiceRegistration, Task, TaskHub, TaskArchive, @@ -91,15 +92,15 @@ def _select_task_from_taskpool(taskpool: Subgraph) -> Union[ScopedKey, None]: return chosen_one[0] -def _generate_claim_query(task_sk: ScopedKey, computeserviceid: ComputeServiceID) -> str: +def _generate_claim_query(task_sk: ScopedKey, compute_service_id: ComputeServiceID) -> str: """ Generate a query to claim a single Task. Parameters ---------- - task_sk: ScopedKey + task_sk The ScopedKey of the Task to claim. - claimant: str - The name of the claimant. + compute_service_id + ComputeServiceID of the claiming service. Returns ------- @@ -107,8 +108,15 @@ def _generate_claim_query(task_sk: ScopedKey, computeserviceid: ComputeServiceID The Cypher query to claim the Task. """ query = f""" + // only match the task if it doesn't have an existing CLAIMS relationship MATCH (t:Task {{_scoped_key: '{task_sk}'}}) - SET t.status = 'running', t.claim = '{computeserviceid.identifier}' + WHERE NOT (t)<-[:CLAIMS]-(:ComputeServiceRegistration) + SET t.status = 'running' + + // create CLAIMS relationship with given compute service + MATCH (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + CREATE (t)<-[cl:CLAIMS {{claimed: datetime({datetime.utcnow()})}}]-(csreg) + RETURN t """ return query @@ -128,6 +136,10 @@ class Neo4jStore(AlchemiscaleStateStore): "name": "compute_identifier", "property": "identifier", }, + "ComputeServiceRegistration": { + "name": "compute_service_registration_identifier", + "property": "identifier" + } } def __init__(self, graph: "py2neo.Graph"): @@ -731,38 +743,40 @@ def set_strategy( def register_computeservice( self, - computeserviceid: ComputeServiceID + compute_service_registration: ComputeServiceRegistration ): - """Register a ComputeServiceID uniquely identifying a running + """Register a ComputeServiceRegistration uniquely identifying a running ComputeService. - A ComputeServiceID node is used for CLAIMS relationships on Tasks to - avoid collisions in Task execution. + A ComputeServiceRegistration node is used for CLAIMS relationships on + Tasks to avoid collisions in Task execution. 
""" - node = Node("ComputeServiceID", **computeserviceid.dict()) + node = Node("ComputeServiceRegistration", + **compute_service_registration.dict()) with self.transaction() as tx: tx.merge( - node, primary_label="ComputeServiceID", primary_key="identifier" + node, primary_label="ComputeServiceRegistration", primary_key="identifier" ) def deregister_computeservice( self, - computeserviceid: ComputeServiceID + compute_service_id: ComputeServiceID ): - """Remove the given ComputeServiceID from the state store. + """Remove the registration for the given ComputeServiceID from the + state store. - This wil remove the ComputeServiceID node, and all its CLAIMS + This wil remove the ComputeServiceRegistration node, and all its CLAIMS relationships to Tasks. - All Tasks with CLAIMS relationships to the ComputeServiceID and with - status `running` will have their status set to `waiting`. + All Tasks with CLAIMS relationships to the ComputeServiceRegistration + and with status `running` will have their status set to `waiting`. """ q = f""" - MATCH (n:ComputeServiceID {{identifier: '{computeserviceid.identifier}'}}) + MATCH (n:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) OPTIONAL MATCH (n)-[cl:CLAIMS]->(t:Task {{status: 'running'}}) SET t.status = 'waiting' @@ -1133,7 +1147,7 @@ def get_taskhub_unclaimed_tasks( return [ScopedKey.from_str(t["_scoped_key"]) for t in tasks] def claim_taskhub_tasks( - self, taskhub: ScopedKey, computeserviceid: ComputeServiceID, count: int = 1 + self, taskhub: ScopedKey, compute_service_id: ComputeServiceID, count: int = 1 ) -> List[Union[ScopedKey, None]]: """Claim a TaskHub Task. @@ -1149,8 +1163,8 @@ def claim_taskhub_tasks( Parameters ---------- - claimant - Unique identifier for the entity claiming the Tasks for execution. + compute_service_id + Unique identifier for the compute service claiming the Tasks for execution. count Claim the given number of Tasks in a single transaction. 
@@ -1200,7 +1214,7 @@ def claim_taskhub_tasks( tasks.append(None) else: chosen_one = _select_task_from_taskpool(taskpool) - claim_query = _generate_claim_query(chosen_one, computeserviceid) + claim_query = _generate_claim_query(chosen_one, compute_service_id) tasks.append(tx.run(claim_query).to_subgraph()) tx.run( @@ -1615,7 +1629,7 @@ def q(t): // if we changed the status to waiting, // drop CLAIMS relationship - OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csid:ComputeServiceID) + OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) DELETE cl RETURN t, t_ @@ -1677,7 +1691,7 @@ def q(t): // if we changed the status to complete, // drop CLAIMS relationship - OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csid:ComputeServiceID) + OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) DELETE cl RETURN t, t_ @@ -1709,7 +1723,7 @@ def q(t): // if we changed the status to error, // drop CLAIMS relationship - OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csid:ComputeServiceID) + OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) DELETE cl RETURN t, t_ @@ -1751,7 +1765,7 @@ def set_task_invalid( DELETE are // drop CLAIMS relationship if present - OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csid:ComputeServiceID) + OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) DELETE cl """ tx.run(q) @@ -1789,7 +1803,7 @@ def set_task_deleted( DELETE are // drop CLAIMS relationship if present - OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csid:ComputeServiceID) + OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) DELETE cl """ tx.run(q) diff --git a/alchemiscale/tests/integration/compute/client/test_compute_client.py b/alchemiscale/tests/integration/compute/client/test_compute_client.py index 95485563..6cd5e1f1 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_client.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_client.py @@ -88,7 +88,7 @@ def test_claim_taskhub_task( taskhub_sks = compute_client.query_taskhubs([scope_test]) # claim a single task; should get highest priority task - task_sks = compute_client.claim_taskhub_tasks(taskhub_sks[0], claimant="me") + task_sks = compute_client.claim_taskhub_tasks(taskhub_sks[0], compute_service_id="me-123") all_tasks = n4js_preloaded.get_taskhub_tasks(taskhub_sks[0], return_gufe=True) assert len(task_sks) == 1 @@ -100,7 +100,7 @@ def test_claim_taskhub_task( remaining_tasks = n4js_preloaded.get_taskhub_unclaimed_tasks(taskhub_sks[0]) # claim two more tasks task_sks2 = compute_client.claim_taskhub_tasks( - taskhub_sks[0], count=2, claimant="me" + taskhub_sks[0], count=2, compute_service_id="me-123" ) assert task_sks2[0] in remaining_tasks assert task_sks2[1] in remaining_tasks @@ -119,7 +119,7 @@ def test_get_task_transformation( taskhub_sk = n4js_preloaded.get_taskhub(an_sk) # claim our first task - task_sks = compute_client.claim_taskhub_tasks(taskhub_sk, claimant="me") + task_sks = compute_client.claim_taskhub_tasks(taskhub_sk, compute_service_id="me-123") # get the transformation corresponding to this task ( @@ -223,7 +223,7 @@ def test_set_task_result( taskhub_sk = n4js_preloaded.get_taskhub(an_sk) # claim our first task - task_sks = compute_client.claim_taskhub_tasks(taskhub_sk, claimant="me") + task_sks = compute_client.claim_taskhub_tasks(taskhub_sk, compute_service_id="me-123") # get the transformation corresponding to this task ( From a6c1e83fc3722be8829b94aa07a4dc97fdc2edb6 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Mon, 13 Mar 2023 22:22:40 -0700 Subject: [PATCH 05/41] Black and bugfix --- 
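For reference, the registration model that PATCH 04 introduced composes as
below; a minimal sketch (names and values here are hypothetical; services
build their identifier as f"{name}-{uuid4()}"):

    from datetime import datetime
    from uuid import uuid4

    from alchemiscale.storage.models import (
        ComputeServiceID,
        ComputeServiceRegistration,
    )

    # ComputeServiceID is now a plain str subclass
    compute_service_id = ComputeServiceID(f"hpc-cluster-1-{uuid4()}")

    # the record the compute API persists in the state store on registration
    now = datetime.utcnow()
    csreg = ComputeServiceRegistration(
        identifier=compute_service_id,
        registered=now,
        heartbeat=now,
    )
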
alchemiscale/compute/api.py | 14 ++++++--- alchemiscale/compute/client.py | 4 ++- alchemiscale/compute/service.py | 18 ++++-------- alchemiscale/storage/statestore.py | 29 +++++++++++-------- .../compute/client/test_compute_client.py | 12 ++++++-- 5 files changed, 45 insertions(+), 32 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index ea41a007..4efa846e 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -30,7 +30,12 @@ from ..settings import get_base_api_settings, get_compute_api_settings from ..storage.statestore import Neo4jStore from ..storage.objectstore import S3ObjectStore -from ..storage.models import ProtocolDAGResultRef, ComputeServiceID, ComputeServiceRegistration, TaskStatusEnum +from ..storage.models import ( + ProtocolDAGResultRef, + ComputeServiceID, + ComputeServiceRegistration, + TaskStatusEnum, +) from ..models import Scope, ScopedKey from ..security.auth import get_token_data, oauth2_scheme from ..security.models import ( @@ -99,9 +104,9 @@ async def register_computeservice( n4js: Neo4jStore = Depends(get_n4js_depends), ): now = datetime.utcnow() - csid = ComputeServiceRegistration(identitfier=ComputeServiceID(compute_service_id), - registered=now, - heartbeat=now) + csid = ComputeServiceRegistration( + identitfier=ComputeServiceID(compute_service_id), registered=now, heartbeat=now + ) n4js.register_computeservice(csid) @@ -117,6 +122,7 @@ async def deregister_computeservice( return compute_service_id + @router.get("/taskhubs") async def query_taskhubs( *, diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 3b2a7e2a..50461b1b 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -39,7 +39,9 @@ def register(self, compute_service_id: ComputeServiceID): return self._post_resource(f"computeservice/{compute_service_id}/register", {}) def deregister(self, compute_service_id, ComputeServiceID): - return self._post_resource(f"computeservice/{compute_service_id}/deregister", {}) + return self._post_resource( + f"computeservice/{compute_service_id}/deregister", {} + ) def list_scopes(self) -> List[Scope]: scopes = self._get_resource( diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 84e84ebc..3a8b31c5 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -96,7 +96,7 @@ def __init__( name The name to give this compute service; used for Task provenance, so typically set to a distinct value to distinguish different compute - resources, e.g. different hosts or HPC clusters. + resources, e.g. different hosts or HPC clusters. shared_basedir Filesystem path to use for `ProtocolDAG` `shared` space. scratch_basedir @@ -141,17 +141,12 @@ def __init__( self._stop = False - def _register(self): - """Register this compute service with the compute API. - - """ + """Register this compute service with the compute API.""" self.client.register(self.compute_service_id) def _deregister(self): - """Deregister this compute service with the compute API. 
- - """ + """Deregister this compute service with the compute API.""" self.client.deregister(self.compute_service_id) def heartbeat(self): @@ -231,8 +226,8 @@ def execute(self, task: ScopedKey) -> ScopedKey: protocoldagresult = execute_DAG( protocoldag, shared=shared, - scratch_basdir = self.scratch_basedir, - keep_scratch=self.keep_scratch + scratch_basdir=self.scratch_basedir, + keep_scratch=self.keep_scratch, ) # push the result (or failure) back to the compute API @@ -292,7 +287,7 @@ def stop(self): self._stop = True # TODO: drop claims on tasks - #self.unclaim_tasks() + # self.unclaim_tasks() # Interrupt the scheduler (will finish if in the middle of an update or # something, but will cancel running calculations) @@ -302,7 +297,6 @@ def stop(self): self._deregister() - class AsynchronousComputeService(SynchronousComputeService): """Asynchronous compute service. diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index d9f3eee0..68a70b70 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -92,7 +92,9 @@ def _select_task_from_taskpool(taskpool: Subgraph) -> Union[ScopedKey, None]: return chosen_one[0] -def _generate_claim_query(task_sk: ScopedKey, compute_service_id: ComputeServiceID) -> str: +def _generate_claim_query( + task_sk: ScopedKey, compute_service_id: ComputeServiceID +) -> str: """ Generate a query to claim a single Task. Parameters @@ -138,8 +140,8 @@ class Neo4jStore(AlchemiscaleStateStore): }, "ComputeServiceRegistration": { "name": "compute_service_registration_identifier", - "property": "identifier" - } + "property": "identifier", + }, } def __init__(self, graph: "py2neo.Graph"): @@ -742,8 +744,7 @@ def set_strategy( ... def register_computeservice( - self, - compute_service_registration: ComputeServiceRegistration + self, compute_service_registration: ComputeServiceRegistration ): """Register a ComputeServiceRegistration uniquely identifying a running ComputeService. @@ -753,18 +754,16 @@ def register_computeservice( """ - node = Node("ComputeServiceRegistration", - **compute_service_registration.dict()) + node = Node("ComputeServiceRegistration", **compute_service_registration.dict()) with self.transaction() as tx: tx.merge( - node, primary_label="ComputeServiceRegistration", primary_key="identifier" + node, + primary_label="ComputeServiceRegistration", + primary_key="identifier", ) - def deregister_computeservice( - self, - compute_service_id: ComputeServiceID - ): + def deregister_computeservice(self, compute_service_id: ComputeServiceID): """Remove the registration for the given ComputeServiceID from the state store. 
@@ -1689,6 +1688,8 @@ def q(t): OPTIONAL MATCH (t_)<-[ar:ACTIONS]-(th:TaskHub) DELETE ar + WITH t, t_ + // if we changed the status to complete, // drop CLAIMS relationship OPTIONAL MATCH (t_)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) @@ -1764,6 +1765,8 @@ def set_task_invalid( DELETE ar DELETE are + WITH t + // drop CLAIMS relationship if present OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) DELETE cl @@ -1802,6 +1805,8 @@ def set_task_deleted( DELETE ar DELETE are + WITH t + // drop CLAIMS relationship if present OPTIONAL MATCH (t)<-[cl:CLAIMS]-(csreg:ComputeServiceRegistration) DELETE cl diff --git a/alchemiscale/tests/integration/compute/client/test_compute_client.py b/alchemiscale/tests/integration/compute/client/test_compute_client.py index 6cd5e1f1..254dd7db 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_client.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_client.py @@ -88,7 +88,9 @@ def test_claim_taskhub_task( taskhub_sks = compute_client.query_taskhubs([scope_test]) # claim a single task; should get highest priority task - task_sks = compute_client.claim_taskhub_tasks(taskhub_sks[0], compute_service_id="me-123") + task_sks = compute_client.claim_taskhub_tasks( + taskhub_sks[0], compute_service_id="me-123" + ) all_tasks = n4js_preloaded.get_taskhub_tasks(taskhub_sks[0], return_gufe=True) assert len(task_sks) == 1 @@ -119,7 +121,9 @@ def test_get_task_transformation( taskhub_sk = n4js_preloaded.get_taskhub(an_sk) # claim our first task - task_sks = compute_client.claim_taskhub_tasks(taskhub_sk, compute_service_id="me-123") + task_sks = compute_client.claim_taskhub_tasks( + taskhub_sk, compute_service_id="me-123" + ) # get the transformation corresponding to this task ( @@ -223,7 +227,9 @@ def test_set_task_result( taskhub_sk = n4js_preloaded.get_taskhub(an_sk) # claim our first task - task_sks = compute_client.claim_taskhub_tasks(taskhub_sk, compute_service_id="me-123") + task_sks = compute_client.claim_taskhub_tasks( + taskhub_sk, compute_service_id="me-123" + ) # get the transformation corresponding to this task ( From cf9062250220c6082a8875cdc284edb0bd903544 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Mon, 13 Mar 2023 22:37:52 -0700 Subject: [PATCH 06/41] Added heartbeats for compute services --- alchemiscale/compute/api.py | 15 +++++++++++++-- alchemiscale/compute/client.py | 7 ++++++- alchemiscale/compute/service.py | 1 + alchemiscale/storage/statestore.py | 13 +++++++++++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index 4efa846e..65ad8f06 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -104,11 +104,11 @@ async def register_computeservice( n4js: Neo4jStore = Depends(get_n4js_depends), ): now = datetime.utcnow() - csid = ComputeServiceRegistration( + csreg = ComputeServiceRegistration( identitfier=ComputeServiceID(compute_service_id), registered=now, heartbeat=now ) - n4js.register_computeservice(csid) + n4js.register_computeservice(csreg) return compute_service_id @@ -123,6 +123,17 @@ async def deregister_computeservice( return compute_service_id +@router.post("/computeservice/{compute_service_id}/heartbeat") +async def heartbeat_computeservice( + compute_service_id, + n4js: Neo4jStore = Depends(get_n4js_depends), +): + now = datetime.utcnow() + n4js.heartbeat_computeservice(compute_service_id, now) + + return compute_service_id + + @router.get("/taskhubs") async def query_taskhubs( *, diff 
--git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 50461b1b..789b94c3 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -38,11 +38,16 @@ class AlchemiscaleComputeClient(AlchemiscaleBaseClient): def register(self, compute_service_id: ComputeServiceID): return self._post_resource(f"computeservice/{compute_service_id}/register", {}) - def deregister(self, compute_service_id, ComputeServiceID): + def deregister(self, compute_service_id: ComputeServiceID): return self._post_resource( f"computeservice/{compute_service_id}/deregister", {} ) + def heartbeat(self, compute_service_id: ComputeServiceID): + return self._post_resource( + f"computeservice/{compute_service_id}/heartbeat", {} + ) + def list_scopes(self) -> List[Scope]: scopes = self._get_resource( f"/identities/{self.identifier}/scopes", diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 3a8b31c5..99e34884 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -152,6 +152,7 @@ def _deregister(self): def heartbeat(self): """Deliver a heartbeat to the compute API, indicating this service is still alive.""" ... + self.client.heartbeat(self.compute_service_id) def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: """Get a Task to execute from compute API. diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index 68a70b70..071ef184 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -786,6 +786,19 @@ def deregister_computeservice(self, compute_service_id: ComputeServiceID): with self.transaction() as tx: tx.run(q) + def heartbeat_computeservice(self, compute_service_id: ComputeServiceID, heartbeat: datetime): + """Update the heartbeat for the given ComputeServiceID. + + """ + q = f""" + MATCH (n:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + SET n.heartbeat = datetime({heartbeat}) + + """ + + with self.transaction() as tx: + tx.run(q) + ## task hubs def create_taskhub( From 56aa0f8bb182d073e05155ea2a472904cc50ef0b Mon Sep 17 00:00:00 2001 From: David Dotson Date: Mon, 13 Mar 2023 22:38:28 -0700 Subject: [PATCH 07/41] Black! --- alchemiscale/compute/client.py | 4 +- alchemiscale/storage/statestore.py | 8 +- versioneer.py | 271 +++++++++++++++++------------ 3 files changed, 165 insertions(+), 118 deletions(-) diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 789b94c3..0b749483 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -44,9 +44,7 @@ def deregister(self, compute_service_id: ComputeServiceID): ) def heartbeat(self, compute_service_id: ComputeServiceID): - return self._post_resource( - f"computeservice/{compute_service_id}/heartbeat", {} - ) + return self._post_resource(f"computeservice/{compute_service_id}/heartbeat", {}) def list_scopes(self) -> List[Scope]: scopes = self._get_resource( diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index 071ef184..31e6282f 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -786,10 +786,10 @@ def deregister_computeservice(self, compute_service_id: ComputeServiceID): with self.transaction() as tx: tx.run(q) - def heartbeat_computeservice(self, compute_service_id: ComputeServiceID, heartbeat: datetime): - """Update the heartbeat for the given ComputeServiceID. 
- - """ + def heartbeat_computeservice( + self, compute_service_id: ComputeServiceID, heartbeat: datetime + ): + """Update the heartbeat for the given ComputeServiceID.""" q = f""" MATCH (n:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) SET n.heartbeat = datetime({heartbeat}) diff --git a/versioneer.py b/versioneer.py index a142bf53..f1c45727 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,3 @@ - # Version: 0.22 """The Versioneer - like a rocketeer, but for versions. @@ -310,11 +309,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -327,8 +328,10 @@ def get_root(): me_dir = os.path.normcase(os.path.splitext(my_path)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(my_path), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(my_path), versioneer_py) + ) except NameError: pass return root @@ -373,15 +376,16 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" HANDLERS.setdefault(vcs, {})[method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -397,10 +401,14 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen([command] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None), **popen_kwargs) + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) break except OSError: e = sys.exc_info()[1] @@ -423,7 +431,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, process.returncode -LONG_VERSION_PY['git'] = r''' +LONG_VERSION_PY[ + "git" +] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). 
Distribution tarballs (built by setup.py sdist) and build @@ -1139,7 +1149,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1148,7 +1158,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} + tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1156,24 +1166,31 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') - if not re.match(r'\d', r): + if not re.match(r"\d", r): continue if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1195,8 +1212,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1206,9 +1222,11 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", *MATCH_ARGS], - cwd=root) + describe_out, rc = runner( + GITS, + ["describe", "--tags", "--dirty", "--always", "--long", *MATCH_ARGS], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1223,8 +1241,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], - cwd=root) + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # 
--abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") @@ -1264,17 +1281,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -1283,10 +1299,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1359,15 +1377,21 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1396,11 +1420,13 @@ def versions_from_file(filename): contents = f.read() except OSError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1409,8 +1435,7 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1442,8 +1467,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if 
pieces["dirty"]: rendered += ".dirty" return rendered @@ -1472,8 +1496,7 @@ def render_pep440_branch(pieces): rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1501,7 +1524,7 @@ def render_pep440_pre(pieces): tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: - rendered += ".post%d.dev%d" % (post_version+1, pieces["distance"]) + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: @@ -1634,11 +1657,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -1662,9 +1687,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } class VersioneerBadRootError(Exception): @@ -1687,8 +1716,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1742,9 +1772,13 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } def get_version(): @@ -1800,6 +1834,7 @@ def run(self): print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1818,8 +1853,8 @@ def run(self): # setup.py egg_info -> ? 
# we override different "build_py" commands for both environments - if 'build_py' in cmds: - _build_py = cmds['build_py'] + if "build_py" in cmds: + _build_py = cmds["build_py"] elif "setuptools" in sys.modules: from setuptools.command.build_py import build_py as _build_py else: @@ -1834,14 +1869,14 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py - if 'build_ext' in cmds: - _build_ext = cmds['build_ext'] + if "build_ext" in cmds: + _build_ext = cmds["build_ext"] elif "setuptools" in sys.modules: from setuptools.command.build_ext import build_ext as _build_ext else: @@ -1861,14 +1896,15 @@ def run(self): return # now locate _version.py in the new build/ directory and replace # it with an updated value - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_ext"] = cmd_build_ext if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ @@ -1889,17 +1925,21 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if 'py2exe' in sys.modules: # py2exe enabled? + if "py2exe" in sys.modules: # py2exe enabled? 
from py2exe.distutils_buildexe import py2exe as _py2exe class cmd_py2exe(_py2exe): @@ -1915,18 +1955,22 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments - if 'sdist' in cmds: - _sdist = cmds['sdist'] + if "sdist" in cmds: + _sdist = cmds["sdist"] elif "setuptools" in sys.modules: from setuptools.command.sdist import sdist as _sdist else: @@ -1950,8 +1994,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -2011,11 +2057,9 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (OSError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -2024,15 +2068,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -2080,8 +2127,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: From 66066bff36658a183dca21116c8f45a46260b7fa Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 10:00:32 -0700 Subject: [PATCH 08/41] Fixing broken tests --- alchemiscale/compute/api.py | 18 +++++++++++------- alchemiscale/compute/client.py | 4 ++-- alchemiscale/compute/service.py | 6 +++--- alchemiscale/storage/statestore.py | 8 +++++--- .../compute/client/test_compute_client.py | 8 ++++++-- 5 files changed, 27 insertions(+), 17 deletions(-) diff --git 
a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index 65ad8f06..b811ca32 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List import os import json -from datetime import datetime +from datetime import datetime, timezone from fastapi import FastAPI, APIRouter, Body, Depends, HTTPException, status from gufe.tokenization import GufeTokenizable, JSON_HANDLER @@ -103,9 +103,9 @@ async def register_computeservice( compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - now = datetime.utcnow() + now = datetime.now(timezone.utc) csreg = ComputeServiceRegistration( - identitfier=ComputeServiceID(compute_service_id), registered=now, heartbeat=now + identifier=compute_service_id, registered=now, heartbeat=now ) n4js.register_computeservice(csreg) @@ -128,7 +128,7 @@ async def heartbeat_computeservice( compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - now = datetime.utcnow() + now = datetime.now(timezone.utc) n4js.heartbeat_computeservice(compute_service_id, now) return compute_service_id @@ -163,7 +163,7 @@ async def query_taskhubs( async def claim_taskhub_tasks( taskhub_scoped_key, *, - compute_service_id: str, + compute_service_id: str = Body(), count: int = Body(), n4js: Neo4jStore = Depends(get_n4js_depends), token: TokenData = Depends(get_token_data_depends), @@ -172,7 +172,7 @@ async def claim_taskhub_tasks( validate_scopes(sk.scope, token) tasks = n4js.claim_taskhub_tasks( - taskhub=taskhub_scoped_key, compute_service_id=compute_service_id, count=count + taskhub=taskhub_scoped_key, compute_service_id=ComputeServiceID(compute_service_id), count=count ) return [str(t) if t is not None else None for t in tasks] @@ -232,8 +232,12 @@ def set_task_result( task=task_sk, protocoldagresultref=protocoldagresultref ) - # TODO: if success, set task complete, remove from all hubs + # if success, set task complete, remove from all hubs # otherwise, set as errored, leave in hubs + if protocoldagresultref.ok: + n4js.set_task_complete(tasks=[task_sk]) + else: + n4js.set_task_error(tasks=[task_sk]) return result_sk diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 0b749483..5835ce15 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -73,10 +73,10 @@ def query_taskhubs( return taskhubs def claim_taskhub_tasks( - self, taskhub: ScopedKey, compute_service_id: str, count: int = 1 + self, taskhub: ScopedKey, compute_service_id: ComputeServiceID, count: int = 1 ) -> Task: """Claim a `Task` from the specified `TaskHub`""" - data = dict(compute_service_id=compute_service_id, count=count) + data = dict(compute_service_id=str(compute_service_id), count=count) tasks = self._post_resource(f"taskhubs/{taskhub}/claim", data) return [ScopedKey.from_str(t) if t is not None else None for t in tasks] diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 99e34884..3d69d877 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -221,13 +221,13 @@ def execute(self, task: ScopedKey) -> ScopedKey: # execute the task; this looks the same whether the ProtocolDAG is a # success or failure - shared = self.shared_basedir / str(protocoldag.key) / self.counter - shared.mkdir() + shared = self.shared_basedir / str(protocoldag.key) / str(self.counter) + shared.mkdir(parents=True) protocoldagresult = execute_DAG( protocoldag, shared=shared, - scratch_basdir=self.scratch_basedir, + 
scratch_basedir=self.scratch_basedir, keep_scratch=self.keep_scratch, ) diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index 31e6282f..c931b99f 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -5,7 +5,7 @@ """ import abc -from datetime import datetime +from datetime import datetime, timezone from contextlib import contextmanager import json from functools import lru_cache @@ -115,9 +115,11 @@ def _generate_claim_query( WHERE NOT (t)<-[:CLAIMS]-(:ComputeServiceRegistration) SET t.status = 'running' + WITH t + // create CLAIMS relationship with given compute service MATCH (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) - CREATE (t)<-[cl:CLAIMS {{claimed: datetime({datetime.utcnow()})}}]-(csreg) + CREATE (t)<-[cl:CLAIMS {{claimed: datetime('{datetime.now(timezone.utc).isoformat()}')}}]-(csreg) RETURN t """ @@ -792,7 +794,7 @@ def heartbeat_computeservice( """Update the heartbeat for the given ComputeServiceID.""" q = f""" MATCH (n:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) - SET n.heartbeat = datetime({heartbeat}) + SET n.heartbeat = datetime('{heartbeat.isoformat()}') """ diff --git a/alchemiscale/tests/integration/compute/client/test_compute_client.py b/alchemiscale/tests/integration/compute/client/test_compute_client.py index 254dd7db..0d857ef3 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_client.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_client.py @@ -5,7 +5,7 @@ from alchemiscale.models import ScopedKey from alchemiscale.compute import client -from alchemiscale.storage.models import TaskStatusEnum +from alchemiscale.storage.models import TaskStatusEnum, ComputeServiceID from alchemiscale.tests.integration.compute.utils import get_compute_settings_override @@ -87,9 +87,13 @@ def test_claim_taskhub_task( ): taskhub_sks = compute_client.query_taskhubs([scope_test]) + # register compute service id + compute_service_id = ComputeServiceID('me-123') + compute_client.register(compute_service_id) + # claim a single task; should get highest priority task task_sks = compute_client.claim_taskhub_tasks( - taskhub_sks[0], compute_service_id="me-123" + taskhub_sks[0], compute_service_id=compute_service_id ) all_tasks = n4js_preloaded.get_taskhub_tasks(taskhub_sks[0], return_gufe=True) From ac5778a138c25ada6df84c3d69d531fa03a36871 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 22:26:03 -0700 Subject: [PATCH 09/41] Test suite appears fixed. 
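
This also gives `ComputeServiceRegistration` a `from_now` constructor and
`to_dict`/`from_dict` methods so a registration can round-trip through the
property dict stored on its Neo4j node, and task claims now go through a
registered compute service. A minimal sketch of the intended round-trip,
assuming only the models touched in this patch (the service id string here
is made up, and the final equality check relies on pydantic's default
field-wise comparison):

    from alchemiscale.storage.models import (
        ComputeServiceID,
        ComputeServiceRegistration,
    )

    # hypothetical identifier for a single compute service instance
    compute_service_id = ComputeServiceID("my-compute-service-123")

    # registration stamped with the current UTC time for both
    # `registered` and `heartbeat`
    registration = ComputeServiceRegistration.from_now(compute_service_id)

    # dict form used as Neo4j node properties; the identifier is
    # flattened to a plain string
    props = registration.to_dict()
    assert props["identifier"] == str(compute_service_id)

    # `from_dict` restores the ComputeServiceID wrapper
    assert ComputeServiceRegistration.from_dict(props) == registration

Claiming then follows the pattern used in the updated tests: register
first, then claim with the same id, e.g.
`n4js.claim_taskhub_tasks(taskhub_sk, compute_service_id)`.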
--- alchemiscale/compute/api.py | 6 +- alchemiscale/storage/models.py | 21 ++++++ alchemiscale/storage/statestore.py | 10 +-- .../integration/compute/client/conftest.py | 4 +- .../compute/client/test_compute_client.py | 22 ++++-- alchemiscale/tests/integration/conftest.py | 6 ++ .../integration/storage/test_statestore.py | 71 +++++++++++++------ 7 files changed, 103 insertions(+), 37 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index b811ca32..f74d988d 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List import os import json -from datetime import datetime, timezone +from datetime import datetime from fastapi import FastAPI, APIRouter, Body, Depends, HTTPException, status from gufe.tokenization import GufeTokenizable, JSON_HANDLER @@ -103,7 +103,7 @@ async def register_computeservice( compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - now = datetime.now(timezone.utc) + now = datetime.utcnow() csreg = ComputeServiceRegistration( identifier=compute_service_id, registered=now, heartbeat=now ) @@ -128,7 +128,7 @@ async def heartbeat_computeservice( compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - now = datetime.now(timezone.utc) + now = datetime.utcnow() n4js.heartbeat_computeservice(compute_service_id, now) return compute_service_id diff --git a/alchemiscale/storage/models.py b/alchemiscale/storage/models.py index a3ae7fcc..2a6da915 100644 --- a/alchemiscale/storage/models.py +++ b/alchemiscale/storage/models.py @@ -35,6 +35,27 @@ def __repr__(self): # pragma: no cover def __str__(self): return "-".join([self.identifier]) + @classmethod + def from_now(cls, identifier: ComputeServiceID): + now = datetime.utcnow() + return cls(identifier=identifier, + registered=now, + heartbeat=now) + + + def to_dict(self): + dct = self.dict() + dct['identifier'] = str(self.identifier) + + return dct + + @classmethod + def from_dict(cls, dct): + dct_ = copy(dct) + dct_['identifier'] = ComputeServiceID(dct_['identifier']) + + return cls(**dct_) + class TaskProvenance(BaseModel): computekey: ComputeServiceID diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index c931b99f..b8e95092 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -5,7 +5,7 @@ """ import abc -from datetime import datetime, timezone +from datetime import datetime from contextlib import contextmanager import json from functools import lru_cache @@ -119,7 +119,7 @@ def _generate_claim_query( // create CLAIMS relationship with given compute service MATCH (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) - CREATE (t)<-[cl:CLAIMS {{claimed: datetime('{datetime.now(timezone.utc).isoformat()}')}}]-(csreg) + CREATE (t)<-[cl:CLAIMS {{claimed: localdatetime('{datetime.utcnow().isoformat()}')}}]-(csreg) RETURN t """ @@ -756,7 +756,7 @@ def register_computeservice( """ - node = Node("ComputeServiceRegistration", **compute_service_registration.dict()) + node = Node("ComputeServiceRegistration", **compute_service_registration.to_dict()) with self.transaction() as tx: tx.merge( @@ -794,7 +794,7 @@ def heartbeat_computeservice( """Update the heartbeat for the given ComputeServiceID.""" q = f""" MATCH (n:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) - SET n.heartbeat = datetime('{heartbeat.isoformat()}') + SET n.heartbeat = localdatetime('{heartbeat.isoformat()}') """ @@ -1140,7 +1140,7 @@ 
def get_taskhub_unclaimed_tasks( q = f""" // get list of all unclaimed tasks in the hub MATCH (th:TaskHub {{_scoped_key: '{taskhub}'}})-[:ACTIONS]->(task:Task) - WHERE task.claim IS NULL + WHERE NOT (task)<-[:CLAIMS]-(:ComputeServiceRegistration) RETURN task """ with self.transaction() as tx: diff --git a/alchemiscale/tests/integration/compute/client/conftest.py b/alchemiscale/tests/integration/compute/client/conftest.py index ff71d742..ec3b9040 100644 --- a/alchemiscale/tests/integration/compute/client/conftest.py +++ b/alchemiscale/tests/integration/compute/client/conftest.py @@ -8,6 +8,7 @@ from alchemiscale.settings import get_base_api_settings from alchemiscale.base.api import get_n4js_depends, get_s3os_depends from alchemiscale.compute import api, client +from alchemiscale.storage.models import ComputeServiceID from alchemiscale.tests.integration.compute.utils import get_compute_settings_override from alchemiscale.tests.integration.utils import running_service @@ -49,7 +50,7 @@ def uvicorn_server(compute_api): @pytest.fixture(scope="module") def compute_client( - uvicorn_server, compute_identity, single_scoped_credentialed_compute + uvicorn_server, compute_identity, single_scoped_credentialed_compute, compute_service_id ): return client.AlchemiscaleComputeClient( api_url="http://127.0.0.1:8000/", @@ -60,6 +61,7 @@ def compute_client( ) + @pytest.fixture(scope="module") def compute_client_wrong_credential(uvicorn_server, compute_identity): return client.AlchemiscaleComputeClient( diff --git a/alchemiscale/tests/integration/compute/client/test_compute_client.py b/alchemiscale/tests/integration/compute/client/test_compute_client.py index 0d857ef3..a921e486 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_client.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_client.py @@ -83,14 +83,14 @@ def test_claim_taskhub_task( scope_test, n4js_preloaded, compute_client: client.AlchemiscaleComputeClient, + compute_service_id, uvicorn_server, ): - taskhub_sks = compute_client.query_taskhubs([scope_test]) - # register compute service id - compute_service_id = ComputeServiceID('me-123') compute_client.register(compute_service_id) - + + taskhub_sks = compute_client.query_taskhubs([scope_test]) + # claim a single task; should get highest priority task task_sks = compute_client.claim_taskhub_tasks( taskhub_sks[0], compute_service_id=compute_service_id @@ -106,7 +106,7 @@ def test_claim_taskhub_task( remaining_tasks = n4js_preloaded.get_taskhub_unclaimed_tasks(taskhub_sks[0]) # claim two more tasks task_sks2 = compute_client.claim_taskhub_tasks( - taskhub_sks[0], count=2, compute_service_id="me-123" + taskhub_sks[0], count=2, compute_service_id=compute_service_id ) assert task_sks2[0] in remaining_tasks assert task_sks2[1] in remaining_tasks @@ -116,17 +116,21 @@ def test_get_task_transformation( scope_test, n4js_preloaded, compute_client: client.AlchemiscaleComputeClient, + compute_service_id, network_tyk2, transformation, uvicorn_server, ): + # register compute service id + compute_client.register(compute_service_id) + an_sk = ScopedKey(gufe_key=network_tyk2.key, **scope_test.dict()) taskhub_sk = n4js_preloaded.get_taskhub(an_sk) # claim our first task task_sks = compute_client.claim_taskhub_tasks( - taskhub_sk, compute_service_id="me-123" + taskhub_sk, compute_service_id=compute_service_id ) # get the transformation corresponding to this task @@ -221,18 +225,22 @@ def test_set_task_result( scope_test, n4js_preloaded, compute_client: 
client.AlchemiscaleComputeClient, + compute_service_id, network_tyk2, transformation, protocoldagresults, uvicorn_server, ): + # register compute service id + compute_client.register(compute_service_id) + an_sk = ScopedKey(gufe_key=network_tyk2.key, **scope_test.dict()) tf_sk = ScopedKey(gufe_key=transformation.key, **scope_test.dict()) taskhub_sk = n4js_preloaded.get_taskhub(an_sk) # claim our first task task_sks = compute_client.claim_taskhub_tasks( - taskhub_sk, compute_service_id="me-123" + taskhub_sk, compute_service_id=compute_service_id ) # get the transformation corresponding to this task diff --git a/alchemiscale/tests/integration/conftest.py b/alchemiscale/tests/integration/conftest.py index a52b69fc..fbab1d18 100644 --- a/alchemiscale/tests/integration/conftest.py +++ b/alchemiscale/tests/integration/conftest.py @@ -26,6 +26,7 @@ from alchemiscale.models import Scope from alchemiscale.settings import Neo4jStoreSettings, S3ObjectStoreSettings from alchemiscale.storage import Neo4jStore, S3ObjectStore, get_s3os +from alchemiscale.storage.models import ComputeServiceID from alchemiscale.protocols import FAHOpenmmNonEquilibriumCyclingProtocol @@ -342,3 +343,8 @@ def multiple_scopes(scope_test): ] ) return scopes + + +@fixture(scope="module") +def compute_service_id(): + return ComputeServiceID('compute-service-123') diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index f6ec3b93..8a01a328 100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -14,6 +14,8 @@ TaskHub, ProtocolDAGResultRef, TaskStatusEnum, + ComputeServiceID, + ComputeServiceRegistration ) from alchemiscale.models import Scope, ScopedKey from alchemiscale.security.models import ( @@ -492,11 +494,13 @@ def test_action_task_extends(self, n4js: Neo4jStore, network_tyk2, scope_test): actioned_task_sks = n4js.action_tasks(collected_sks, taskhub_sk) assert set(actioned_task_sks) == set(collected_sks) - def test_get_unclaimed_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test): + def test_get_unclaimed_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test, compute_service_id): an = network_tyk2 network_sk = n4js.create_network(an, scope_test) taskhub_sk: ScopedKey = n4js.create_taskhub(network_sk) + n4js.register_computeservice(ComputeServiceRegistration.from_now(compute_service_id)) + transformation = list(an.edges)[0] transformation_sk = n4js.get_scoped_key(transformation, scope_test) @@ -508,7 +512,7 @@ def test_get_unclaimed_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test): # claim a single task; There is no deterministic ordering of tasks, so # simply test that the claimed task is one of the actioned tasks - claimed = n4js.claim_taskhub_tasks(taskhub_sk, "the best task handler") + claimed = n4js.claim_taskhub_tasks(taskhub_sk, compute_service_id) assert claimed[0] in task_sks @@ -612,7 +616,9 @@ def test_claim_taskhub_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test): random.shuffle(task_sks) # try to claim from an empty hub - nothing = n4js.claim_taskhub_tasks(taskhub_sk, "early bird task handler") + csid = ComputeServiceID("early bird task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + nothing = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert nothing[0] is None @@ -621,7 +627,9 @@ def test_claim_taskhub_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test): # claim a single task; there is no 
deterministic ordering of tasks, so # simply test that the claimed task is one of the actioned tasks - claimed = n4js.claim_taskhub_tasks(taskhub_sk, "the best task handler") + csid = ComputeServiceID("the best task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + claimed = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert claimed[0] in task_sks @@ -635,28 +643,34 @@ def test_claim_taskhub_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test): n4js.set_task_priority(task_sk, 5) n4js.set_task_priority(remaining_tasks[0], 1) - claimed2 = n4js.claim_taskhub_tasks(taskhub_sk, "another task handler") + csid = ComputeServiceID("another task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + claimed2 = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert claimed2[0] == remaining_tasks[0] remaining_tasks = n4js.get_taskhub_unclaimed_tasks(taskhub_sk) # next task claimed should be one of the remaining tasks - claimed3 = n4js.claim_taskhub_tasks(taskhub_sk, "yet another task handler") + csid = ComputeServiceID("yet another task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + claimed3 = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert claimed3[0] in remaining_tasks remaining_tasks = n4js.get_taskhub_unclaimed_tasks(taskhub_sk) # try to claim multiple tasks - claimed4 = n4js.claim_taskhub_tasks(taskhub_sk, "last task handler", count=4) + csid = ComputeServiceID("last task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + claimed4 = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=4) assert len(claimed4) == 4 for sk in claimed4: assert sk in remaining_tasks # exhaust the hub - claimed5 = n4js.claim_taskhub_tasks(taskhub_sk, "last task handler", count=3) + claimed5 = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=3) # try to claim from a hub with no tasks available - claimed6 = n4js.claim_taskhub_tasks(taskhub_sk, "last task handler", count=2) + claimed6 = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=2) assert claimed6 == [None] * 2 def test_action_claim_task_extends( @@ -684,13 +698,16 @@ def test_action_claim_task_extends( actioned_task_sks = n4js.action_tasks(collected_sks, taskhub_sk) assert set(actioned_task_sks) == set(collected_sks) + csid = ComputeServiceID("task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + # claim the first task - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler") + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert claimed_task_sks == collected_sks[:1] # claim the next 9 tasks - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler", count=9) + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=9) # oops the extends task is still running! 
assert claimed_task_sks == [None] * 9 @@ -698,7 +715,7 @@ def test_action_claim_task_extends( n4js.set_task_complete([first_task]) # claim the next task again - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler", count=1) + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=1) assert claimed_task_sks == collected_sks[1:2] def test_action_claim_task_extends_non_extends( @@ -732,15 +749,18 @@ def test_action_claim_task_extends_non_extends( actioned_task_sks = n4js.action_tasks(collected_sks, taskhub_sk) assert set(actioned_task_sks) == set(collected_sks) + csid = ComputeServiceID("task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + # claim the first task **3** tasks, this set should be the first extends # task and the two non-extends tasks - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler", count=3) + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=3) assert set(claimed_task_sks) == set([first_task] + extra_tasks) # claim the next 10 tasks claimed_task_sks = n4js.claim_taskhub_tasks( - taskhub_sk, "task handler", count=10 + taskhub_sk, csid, count=10 ) # oops the extends task is still running and there should be no other tasks to grab assert claimed_task_sks == [None] * 10 @@ -749,7 +769,7 @@ def test_action_claim_task_extends_non_extends( n4js.set_task_complete([first_task]) # claim the next task again - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler", count=1) + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=1) assert claimed_task_sks == collected_sks[1:2] def test_action_claim_task_extends_bifuricating( @@ -789,22 +809,25 @@ def test_action_claim_task_extends_bifuricating( actioned_task_sks = n4js.action_tasks(collected_sks, taskhub_sk) assert set(actioned_task_sks) == set(collected_sks) + csid = ComputeServiceID("task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + # claim the first task - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler") + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert claimed_task_sks == [first_task] # complete the first task n4js.set_task_complete([first_task]) # claim the next layer of tasks, should be all of layer two - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler", count=2) + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=2) assert set(claimed_task_sks) == set([layer_two_1, layer_two_2]) # complete the layer two tasks n4js.set_task_complete([layer_two_1, layer_two_2]) # claim the next layer of tasks, should be all of layer three - claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, "task handler", count=4) + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=4) assert set(claimed_task_sks) == set( [layer_three_1, layer_three_2, layer_three_3, layer_three_4] ) @@ -834,12 +857,15 @@ def test_claim_task_byweight(self, n4js: Neo4jStore, network_tyk2, scope_test): weight_dict = {task_sks[0]: 10} n4js.set_task_weights(weight_dict, taskhub_sk) + csid = ComputeServiceID("the best task handler") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) + # check that the claimed task is the first task - claimed = n4js.claim_taskhub_tasks(taskhub_sk, "the best task handler") + claimed = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert claimed[0] == task_sks[0] # claim again; should get None as no other tasks have any weight - claimed_again = 
n4js.claim_taskhub_tasks(taskhub_sk, "the best task handler") + claimed_again = n4js.claim_taskhub_tasks(taskhub_sk, csid) assert claimed_again[0] == None def test_get_task_transformation( @@ -1586,9 +1612,12 @@ def test_set_task_status_removes_actions_relationship( task_sks = [n4js.create_task(transformation_sk) for i in range(3)] n4js.action_tasks(task_sks, taskhub_sk) + + csid = ComputeServiceID("claimer") + n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) # claim all the tasks - n4js.claim_taskhub_tasks(taskhub_sk, "claimer", count=3) + n4js.claim_taskhub_tasks(taskhub_sk, csid, count=3) q = f""" MATCH (taskhub:TaskHub {{_scoped_key: '{taskhub_sk}'}}) From 05b4c1928dc84501017594d751373768fed01f93 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 22:26:27 -0700 Subject: [PATCH 10/41] Black! --- alchemiscale/compute/api.py | 4 +++- alchemiscale/storage/models.py | 9 +++------ alchemiscale/storage/statestore.py | 4 +++- .../tests/integration/compute/client/conftest.py | 6 ++++-- alchemiscale/tests/integration/conftest.py | 2 +- .../tests/integration/storage/test_statestore.py | 16 +++++++++------- 6 files changed, 23 insertions(+), 18 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index f74d988d..d3d28543 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -172,7 +172,9 @@ async def claim_taskhub_tasks( validate_scopes(sk.scope, token) tasks = n4js.claim_taskhub_tasks( - taskhub=taskhub_scoped_key, compute_service_id=ComputeServiceID(compute_service_id), count=count + taskhub=taskhub_scoped_key, + compute_service_id=ComputeServiceID(compute_service_id), + count=count, ) return [str(t) if t is not None else None for t in tasks] diff --git a/alchemiscale/storage/models.py b/alchemiscale/storage/models.py index 2a6da915..fd4c71ee 100644 --- a/alchemiscale/storage/models.py +++ b/alchemiscale/storage/models.py @@ -38,21 +38,18 @@ def __str__(self): @classmethod def from_now(cls, identifier: ComputeServiceID): now = datetime.utcnow() - return cls(identifier=identifier, - registered=now, - heartbeat=now) - + return cls(identifier=identifier, registered=now, heartbeat=now) def to_dict(self): dct = self.dict() - dct['identifier'] = str(self.identifier) + dct["identifier"] = str(self.identifier) return dct @classmethod def from_dict(cls, dct): dct_ = copy(dct) - dct_['identifier'] = ComputeServiceID(dct_['identifier']) + dct_["identifier"] = ComputeServiceID(dct_["identifier"]) return cls(**dct_) diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index b8e95092..b492384b 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -756,7 +756,9 @@ def register_computeservice( """ - node = Node("ComputeServiceRegistration", **compute_service_registration.to_dict()) + node = Node( + "ComputeServiceRegistration", **compute_service_registration.to_dict() + ) with self.transaction() as tx: tx.merge( diff --git a/alchemiscale/tests/integration/compute/client/conftest.py b/alchemiscale/tests/integration/compute/client/conftest.py index ec3b9040..a2ba6bf0 100644 --- a/alchemiscale/tests/integration/compute/client/conftest.py +++ b/alchemiscale/tests/integration/compute/client/conftest.py @@ -50,7 +50,10 @@ def uvicorn_server(compute_api): @pytest.fixture(scope="module") def compute_client( - uvicorn_server, compute_identity, single_scoped_credentialed_compute, compute_service_id + uvicorn_server, + compute_identity, + 
single_scoped_credentialed_compute, + compute_service_id, ): return client.AlchemiscaleComputeClient( api_url="http://127.0.0.1:8000/", @@ -61,7 +64,6 @@ def compute_client( ) - @pytest.fixture(scope="module") def compute_client_wrong_credential(uvicorn_server, compute_identity): return client.AlchemiscaleComputeClient( diff --git a/alchemiscale/tests/integration/conftest.py b/alchemiscale/tests/integration/conftest.py index fbab1d18..51c14fa0 100644 --- a/alchemiscale/tests/integration/conftest.py +++ b/alchemiscale/tests/integration/conftest.py @@ -347,4 +347,4 @@ def multiple_scopes(scope_test): @fixture(scope="module") def compute_service_id(): - return ComputeServiceID('compute-service-123') + return ComputeServiceID("compute-service-123") diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index 8a01a328..977ad5fb 100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -15,7 +15,7 @@ ProtocolDAGResultRef, TaskStatusEnum, ComputeServiceID, - ComputeServiceRegistration + ComputeServiceRegistration, ) from alchemiscale.models import Scope, ScopedKey from alchemiscale.security.models import ( @@ -494,12 +494,16 @@ def test_action_task_extends(self, n4js: Neo4jStore, network_tyk2, scope_test): actioned_task_sks = n4js.action_tasks(collected_sks, taskhub_sk) assert set(actioned_task_sks) == set(collected_sks) - def test_get_unclaimed_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test, compute_service_id): + def test_get_unclaimed_tasks( + self, n4js: Neo4jStore, network_tyk2, scope_test, compute_service_id + ): an = network_tyk2 network_sk = n4js.create_network(an, scope_test) taskhub_sk: ScopedKey = n4js.create_taskhub(network_sk) - n4js.register_computeservice(ComputeServiceRegistration.from_now(compute_service_id)) + n4js.register_computeservice( + ComputeServiceRegistration.from_now(compute_service_id) + ) transformation = list(an.edges)[0] transformation_sk = n4js.get_scoped_key(transformation, scope_test) @@ -759,9 +763,7 @@ def test_action_claim_task_extends_non_extends( assert set(claimed_task_sks) == set([first_task] + extra_tasks) # claim the next 10 tasks - claimed_task_sks = n4js.claim_taskhub_tasks( - taskhub_sk, csid, count=10 - ) + claimed_task_sks = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=10) # oops the extends task is still running and there should be no other tasks to grab assert claimed_task_sks == [None] * 10 @@ -1612,7 +1614,7 @@ def test_set_task_status_removes_actions_relationship( task_sks = [n4js.create_task(transformation_sk) for i in range(3)] n4js.action_tasks(task_sks, taskhub_sk) - + csid = ComputeServiceID("claimer") n4js.register_computeservice(ComputeServiceRegistration.from_now(csid)) From 9d5a8e44aeafd3399d3bf5df60717b204a3e5e1e Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 22:35:28 -0700 Subject: [PATCH 11/41] Unblackify versioneer.py --- versioneer.py | 271 +++++++++++++++++++++----------------------------- 1 file changed, 111 insertions(+), 160 deletions(-) diff --git a/versioneer.py b/versioneer.py index f1c45727..a142bf53 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,3 +1,4 @@ + # Version: 0.22 """The Versioneer - like a rocketeer, but for versions. 
@@ -309,13 +310,11 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ( - "Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND')." - ) + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -328,10 +327,8 @@ def get_root(): me_dir = os.path.normcase(os.path.splitext(my_path)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: - print( - "Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(my_path), versioneer_py) - ) + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(my_path), versioneer_py)) except NameError: pass return root @@ -376,16 +373,15 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): """Store f in HANDLERS[vcs][method].""" HANDLERS.setdefault(vcs, {})[method] = f return f - return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -401,14 +397,10 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen( - [command] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - **popen_kwargs, - ) + process = subprocess.Popen([command] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None), **popen_kwargs) break except OSError: e = sys.exc_info()[1] @@ -431,9 +423,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= return stdout, process.returncode -LONG_VERSION_PY[ - "git" -] = r''' +LONG_VERSION_PY['git'] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -1149,7 +1139,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1158,7 +1148,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. 
By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r"\d", r)} + tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1166,31 +1156,24 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix) :] + r = ref[len(tag_prefix):] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') - if not re.match(r"\d", r): + if not re.match(r'\d', r): continue if verbose: print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") @@ -1212,7 +1195,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1222,11 +1206,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner( - GITS, - ["describe", "--tags", "--dirty", "--always", "--long", *MATCH_ARGS], - cwd=root, - ) + describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", *MATCH_ARGS], + cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1241,7 +1223,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") @@ -1281,16 +1264,17 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] + git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparsable. 
Maybe git-describe is misbehaving? - pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) return pieces # tag @@ -1299,12 +1283,10 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] + pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1377,21 +1359,15 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1420,13 +1396,11 @@ def versions_from_file(filename): contents = f.read() except OSError: raise NotThisMethod("unable to read _version.py") - mo = re.search( - r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) if not mo: - mo = re.search( - r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1435,7 +1409,8 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1467,7 +1442,8 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1496,7 +1472,8 @@ def render_pep440_branch(pieces): rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + rendered += "+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1524,7 +1501,7 @@ def render_pep440_pre(pieces): tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: - rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + rendered += 
".post%d.dev%d" % (post_version+1, pieces["distance"]) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: @@ -1657,13 +1634,11 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} if not style or style == "default": style = "pep440" # the default @@ -1687,13 +1662,9 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} class VersioneerBadRootError(Exception): @@ -1716,9 +1687,8 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert ( - cfg.versionfile_source is not None - ), "please set versioneer.versionfile_source" + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1772,13 +1742,9 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} def get_version(): @@ -1834,7 +1800,6 @@ def run(self): print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) - cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1853,8 +1818,8 @@ def run(self): # setup.py egg_info -> ? 
# we override different "build_py" commands for both environments - if "build_py" in cmds: - _build_py = cmds["build_py"] + if 'build_py' in cmds: + _build_py = cmds['build_py'] elif "setuptools" in sys.modules: from setuptools.command.build_py import build_py as _build_py else: @@ -1869,14 +1834,14 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) - cmds["build_py"] = cmd_build_py - if "build_ext" in cmds: - _build_ext = cmds["build_ext"] + if 'build_ext' in cmds: + _build_ext = cmds['build_ext'] elif "setuptools" in sys.modules: from setuptools.command.build_ext import build_ext as _build_ext else: @@ -1896,15 +1861,14 @@ def run(self): return # now locate _version.py in the new build/ directory and replace # it with an updated value - target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) - cmds["build_ext"] = cmd_build_ext if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe - # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ @@ -1925,21 +1889,17 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if "py2exe" in sys.modules: # py2exe enabled? + if 'py2exe' in sys.modules: # py2exe enabled? 
from py2exe.distutils_buildexe import py2exe as _py2exe class cmd_py2exe(_py2exe): @@ -1955,22 +1915,18 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments - if "sdist" in cmds: - _sdist = cmds["sdist"] + if 'sdist' in cmds: + _sdist = cmds['sdist'] elif "setuptools" in sys.modules: from setuptools.command.sdist import sdist as _sdist else: @@ -1994,10 +1950,8 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file( - target_versionfile, self._versioneer_generated_versions - ) - + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist return cmds @@ -2057,9 +2011,11 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: + except (OSError, configparser.NoSectionError, + configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", file=sys.stderr) + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -2068,18 +2024,15 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -2127,10 +2080,8 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print( - " appending versionfile_source ('%s') to MANIFEST.in" - % cfg.versionfile_source - ) + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: From b0e67de244ee93eddaeee9367840e02709a5e12c Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 22:56:12 -0700 Subject: [PATCH 12/41] Switching to WIP branch for gufe changes used here --- devtools/conda-envs/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/test.yml b/devtools/conda-envs/test.yml index 59aa8d56..c950268f 100644 --- a/devtools/conda-envs/test.yml +++ b/devtools/conda-envs/test.yml @@ -56,7 +56,7 @@ dependencies: - pip: 
- git+https://github.com/dotsdl/grolt@relax-cryptography # neo4j test server deployment - - git+https://github.com/OpenFreeEnergy/gufe + - git+https://github.com/OpenFreeEnergy/gufe@alchemiscale-compute - git+https://github.com/OpenFreeEnergy/openfe - git+https://github.com/dotsdl/openfe-benchmarks@ligandnetwork - git+https://github.com/mikemhenry/openff-models.git@support_nested_models From 1800c6f65d5c34a3a7e8ad4a9158ce92c62c73e5 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 22:58:15 -0700 Subject: [PATCH 13/41] Added state store tests for computeserviceregistration --- .../integration/storage/test_statestore.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index 977ad5fb..775fc7a6 100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -1,3 +1,4 @@ +from datetime import datetime, timedelta import random from time import sleep from typing import List, Dict @@ -272,6 +273,67 @@ def test_get_transformation_failures( ### compute + def test_register_computeservice(self, n4js, compute_service_id): + now = datetime.utcnow() + registration = ComputeServiceRegistration(identifier=compute_service_id, + registered=now, + heartbeat=now) + + n4js.register_computeservice(registration) + + csreg = n4js.graph.run( + f""" + match (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + return csreg + """ + ).to_subgraph() + + assert csreg['identifier'] == compute_service_id + assert csreg['registered'] == now + assert csreg['heartbeat'] == now + + def test_deregister(self, n4js, compute_service_id): + now = datetime.utcnow() + registration = ComputeServiceRegistration(identifier=compute_service_id, + registered=now, + heartbeat=now) + + n4js.register_computeservice(registration) + + # try deregistering + n4js.deregister_computeservice(compute_service_id) + + csreg = n4js.graph.run( + f""" + match (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + return csreg + """ + ).to_subgraph() + + assert csreg is None + + def test_heartbeat(self, n4js, compute_service_id): + now = datetime.utcnow() + registration = ComputeServiceRegistration(identifier=compute_service_id, + registered=now, + heartbeat=now) + + n4js.register_computeservice(registration) + + # perform a heartbeat + tomorrow = now + timedelta(days=1) + n4js.heartbeat_computeservice(compute_service_id, tomorrow) + + csreg = n4js.graph.run( + f""" + match (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + return csreg + """ + ).to_subgraph() + + assert csreg['registered'] == now + assert csreg['heartbeat'] == tomorrow + def test_create_task(self, n4js, network_tyk2, scope_test): # add alchemical network, then try generating task an = network_tyk2 From e722c8373c3cd9c6140ea3aaa18bb04f92fc6d7d Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 23:16:57 -0700 Subject: [PATCH 14/41] Build container with temporary gufe branch --- devtools/conda-envs/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/docker.yml b/devtools/conda-envs/docker.yml index 0c1d2c70..2ef0722e 100644 --- a/devtools/conda-envs/docker.yml +++ b/devtools/conda-envs/docker.yml @@ -64,7 +64,7 @@ dependencies: - pip: #- git+https://github.com/dotsdl/grolt@relax-cryptography # neo4j test server deployment - - 
git+https://github.com/OpenFreeEnergy/gufe + - git+https://github.com/OpenFreeEnergy/gufe@alchemiscale-compute - git+https://github.com/OpenFreeEnergy/openfe - git+https://github.com/OpenFreeEnergy/openfe-benchmarks - git+https://github.com/mikemhenry/openff-models.git@support_nested_models From 2c554b857d2bb4251d269db1cba64f7e48701b09 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 23:17:53 -0700 Subject: [PATCH 15/41] Black! --- .../integration/storage/test_statestore.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index 775fc7a6..852d52fa 100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -275,9 +275,9 @@ def test_get_transformation_failures( def test_register_computeservice(self, n4js, compute_service_id): now = datetime.utcnow() - registration = ComputeServiceRegistration(identifier=compute_service_id, - registered=now, - heartbeat=now) + registration = ComputeServiceRegistration( + identifier=compute_service_id, registered=now, heartbeat=now + ) n4js.register_computeservice(registration) @@ -288,15 +288,15 @@ def test_register_computeservice(self, n4js, compute_service_id): """ ).to_subgraph() - assert csreg['identifier'] == compute_service_id - assert csreg['registered'] == now - assert csreg['heartbeat'] == now + assert csreg["identifier"] == compute_service_id + assert csreg["registered"] == now + assert csreg["heartbeat"] == now def test_deregister(self, n4js, compute_service_id): now = datetime.utcnow() - registration = ComputeServiceRegistration(identifier=compute_service_id, - registered=now, - heartbeat=now) + registration = ComputeServiceRegistration( + identifier=compute_service_id, registered=now, heartbeat=now + ) n4js.register_computeservice(registration) @@ -314,9 +314,9 @@ def test_deregister(self, n4js, compute_service_id): def test_heartbeat(self, n4js, compute_service_id): now = datetime.utcnow() - registration = ComputeServiceRegistration(identifier=compute_service_id, - registered=now, - heartbeat=now) + registration = ComputeServiceRegistration( + identifier=compute_service_id, registered=now, heartbeat=now + ) n4js.register_computeservice(registration) @@ -331,8 +331,8 @@ def test_heartbeat(self, n4js, compute_service_id): """ ).to_subgraph() - assert csreg['registered'] == now - assert csreg['heartbeat'] == tomorrow + assert csreg["registered"] == now + assert csreg["heartbeat"] == tomorrow def test_create_task(self, n4js, network_tyk2, scope_test): # add alchemical network, then try generating task From d5e81b37d727941707a8145f0201ac015b830ef7 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 23:35:38 -0700 Subject: [PATCH 16/41] Added ComputeServiceRegistration tests to ComputeClient tests --- alchemiscale/compute/client.py | 9 ++- .../compute/client/test_compute_client.py | 64 +++++++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 5835ce15..53a08cce 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -36,15 +36,18 @@ class AlchemiscaleComputeClient(AlchemiscaleBaseClient): _exception = AlchemiscaleComputeClientError def register(self, compute_service_id: ComputeServiceID): - return self._post_resource(f"computeservice/{compute_service_id}/register", 
{}) + res = self._post_resource(f"computeservice/{compute_service_id}/register", {}) + return ComputeServiceID(res) def deregister(self, compute_service_id: ComputeServiceID): - return self._post_resource( + res = self._post_resource( f"computeservice/{compute_service_id}/deregister", {} ) + return ComputeServiceID(res) def heartbeat(self, compute_service_id: ComputeServiceID): - return self._post_resource(f"computeservice/{compute_service_id}/heartbeat", {}) + res = self._post_resource(f"computeservice/{compute_service_id}/heartbeat", {}) + return ComputeServiceID(res) def list_scopes(self) -> List[Scope]: scopes = self._get_resource( diff --git a/alchemiscale/tests/integration/compute/client/test_compute_client.py b/alchemiscale/tests/integration/compute/client/test_compute_client.py index a921e486..49b81674 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_client.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_client.py @@ -51,6 +51,70 @@ def test_api_check( ): compute_client._api_check() + def test_register( + self, + n4js_preloaded, + compute_client: client.AlchemiscaleComputeClient, + uvicorn_server, + compute_service_id + ): + out = compute_client.register(compute_service_id) + assert out == compute_service_id + + csreg = n4js_preloaded.graph.run( + f""" + match (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + return csreg + """ + ).to_subgraph() + + assert csreg is not None + assert csreg['registered'] == csreg['heartbeat'] + + def test_deregister( + self, + n4js_preloaded, + compute_client: client.AlchemiscaleComputeClient, + uvicorn_server, + compute_service_id + ): + out = compute_client.register(compute_service_id) + assert out == compute_service_id + + out = compute_client.deregister(compute_service_id) + assert out == compute_service_id + + csreg = n4js_preloaded.graph.run( + f""" + match (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + return csreg + """ + ).to_subgraph() + + assert csreg is None + + def test_heartbeat( + self, + n4js_preloaded, + compute_client: client.AlchemiscaleComputeClient, + uvicorn_server, + compute_service_id + ): + compute_client.register(compute_service_id) + + out = compute_client.heartbeat(compute_service_id) + assert out == compute_service_id + + csreg = n4js_preloaded.graph.run( + f""" + match (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + return csreg + """ + ).to_subgraph() + + assert csreg is not None + assert csreg['registered'] < csreg['heartbeat'] + def test_list_scope( self, n4js_preloaded, From a8e6e00045250dc0706f0a278a59c59b613ad9e3 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 15 Mar 2023 23:36:09 -0700 Subject: [PATCH 17/41] Black! 
--- alchemiscale/compute/client.py | 6 ++---- .../compute/client/test_compute_client.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 53a08cce..9b543288 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -36,13 +36,11 @@ class AlchemiscaleComputeClient(AlchemiscaleBaseClient): _exception = AlchemiscaleComputeClientError def register(self, compute_service_id: ComputeServiceID): - res = self._post_resource(f"computeservice/{compute_service_id}/register", {}) + res = self._post_resource(f"computeservice/{compute_service_id}/register", {}) return ComputeServiceID(res) def deregister(self, compute_service_id: ComputeServiceID): - res = self._post_resource( - f"computeservice/{compute_service_id}/deregister", {} - ) + res = self._post_resource(f"computeservice/{compute_service_id}/deregister", {}) return ComputeServiceID(res) def heartbeat(self, compute_service_id: ComputeServiceID): diff --git a/alchemiscale/tests/integration/compute/client/test_compute_client.py b/alchemiscale/tests/integration/compute/client/test_compute_client.py index 49b81674..ca50b50c 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_client.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_client.py @@ -56,8 +56,8 @@ def test_register( n4js_preloaded, compute_client: client.AlchemiscaleComputeClient, uvicorn_server, - compute_service_id - ): + compute_service_id, + ): out = compute_client.register(compute_service_id) assert out == compute_service_id @@ -69,15 +69,15 @@ def test_register( ).to_subgraph() assert csreg is not None - assert csreg['registered'] == csreg['heartbeat'] + assert csreg["registered"] == csreg["heartbeat"] def test_deregister( self, n4js_preloaded, compute_client: client.AlchemiscaleComputeClient, uvicorn_server, - compute_service_id - ): + compute_service_id, + ): out = compute_client.register(compute_service_id) assert out == compute_service_id @@ -98,8 +98,8 @@ def test_heartbeat( n4js_preloaded, compute_client: client.AlchemiscaleComputeClient, uvicorn_server, - compute_service_id - ): + compute_service_id, + ): compute_client.register(compute_service_id) out = compute_client.heartbeat(compute_service_id) @@ -113,7 +113,7 @@ def test_heartbeat( ).to_subgraph() assert csreg is not None - assert csreg['registered'] < csreg['heartbeat'] + assert csreg["registered"] < csreg["heartbeat"] def test_list_scope( self, From 1ecaa16dcbb4d83ccc8a0e9adfc7c401f3d0690a Mon Sep 17 00:00:00 2001 From: David Dotson Date: Fri, 17 Mar 2023 20:18:25 -0700 Subject: [PATCH 18/41] CLI entrypoint in place for SynchronousComputeService Performing live tests. 
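For reference, the `--config-file` option added here expects a YAML mapping
whose keys mirror the `SynchronousComputeService` constructor arguments,
since the parsed dict is passed straight through as keyword arguments. A
minimal sketch of such a file (every value below is a placeholder, not a
tested deployment; see the constructor signature in the diff for the full
set of keys):

    api_url: "https://compute.alchemiscale-instance.example.org"
    identifier: my-compute-identity
    key: my-compute-identity-key
    name: hpc-cluster-worker-0
    shared_basedir: "/scratch/alchemiscale/shared"
    scratch_basedir: "/scratch/alchemiscale/scratch"
    keep_scratch: false
    sleep_interval: 30
    loglevel: "INFO"
    scopes:
      - "my_org-my_campaign-my_project"

Scope strings are converted with `Scope.from_str`; everything else is
passed through untouched.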
--- alchemiscale/cli.py | 48 +++++++++++--- alchemiscale/compute/service.py | 113 ++++++++++++++++++++------------ alchemiscale/settings.py | 12 ++-- docker/docker-compose.yml | 5 ++ 4 files changed, 121 insertions(+), 57 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 2bbb1543..35671d2d 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -5,11 +5,12 @@ """ import click +import yaml +import signal import gunicorn.app.base from typing import Type -from .models import Scope -from .security.auth import hash_key, authenticate, AuthenticationError +from .security.auth import hash_key from .security.models import ( CredentialedEntity, CredentialedUserIdentity, @@ -228,7 +229,7 @@ def cli(): name="api", help="Start the user-facing API service", ) -@api_starting_params("FA_API_HOST", "FA_API_PORT", "FA_API_LOGLEVEL") +@api_starting_params("ALCHEMISCALE_API_HOST", "ALCHEMISCALE_API_PORT", "ALCHEMISCALE_API_LOGLEVEL") @db_params @s3os_params @jwt_params @@ -268,7 +269,7 @@ def get_settings_override(): app.dependency_overrides[get_base_api_settings] = get_settings_override - start_api(app, workers, host["FA_API_HOST"], port["FA_API_PORT"]) + start_api(app, workers, host["ALCHEMISCALE_API_HOST"], port["ALCHEMISCALE_API_PORT"]) @cli.group(help="Subcommands for the compute service") @@ -278,7 +279,7 @@ def compute(): @compute.command(help="Start the compute API service.") @api_starting_params( - "FA_COMPUTE_API_HOST", "FA_COMPUTE_API_PORT", "FA_COMPUTE_API_LOGLEVEL" + "ALCHEMISCALE_COMPUTE_API_HOST", "ALCHEMISCALE_COMPUTE_API_PORT", "ALCHEMISCALE_COMPUTE_API_LOGLEVEL" ) @db_params @s3os_params @@ -316,12 +317,41 @@ def get_settings_override(): app.dependency_overrides[get_base_api_settings] = get_settings_override - start_api(app, workers, host["FA_COMPUTE_API_HOST"], port["FA_COMPUTE_API_PORT"]) + start_api(app, workers, host["ALCHEMISCALE_COMPUTE_API_HOST"], port["ALCHEMISCALE_COMPUTE_API_PORT"]) @compute.command(help="Start the synchronous compute service.") -def synchronous(): - ... 
+@click.option( + "--config-file", + "-c", + type=click.File(), + help="YAML-based configuration file giving the settings for this service", + required=True + ) +def synchronous(config_file): + from alchemiscale.models import Scope + from alchemiscale.compute.service import SynchronousComputeService + + params = yaml.load(config_file, Loader=yaml.Loader) + + if 'scopes' in params: + params['scopes'] = [Scope.from_str(scope) for scope in params['scopes']] + + service = SynchronousComputeService(**params) + + # add signal handling + for signame in {"SIGHUP", "SIGINT", "SIGTERM"}: + + def stop(*args, **kwargs): + service.stop() + raise KeyboardInterrupt() + + signal.signal(getattr(signal, signame), stop) + + try: + service.start() + except KeyboardInterrupt: + pass @cli.group(help="Subcommands for the database") @@ -491,6 +521,7 @@ def remove(url, user, password, dbname, identity_type, identifier): @scope def add_scope(url, user, password, dbname, identity_type, identifier, scope): """Add a scope for the given identity.""" + from .models import Scope from .storage.statestore import get_n4js from .settings import Neo4jStoreSettings @@ -532,6 +563,7 @@ def list_scope(url, user, password, dbname, identity_type, identifier): @scope def remove_scope(url, user, password, dbname, identity_type, identifier, scope): """Remove a scope for the given identity(s).""" + from .models import Scope from .storage.statestore import get_n4js from .settings import Neo4jStoreSettings diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 3d69d877..d93940ab 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -4,9 +4,11 @@ """ +import os import asyncio import sched import time +import logging from uuid import uuid4 import random import threading @@ -75,20 +77,21 @@ def __init__( identifier: str, key: str, name: str, - shared_basedir: Path, - scratch_basedir: Path, + shared_basedir: os.PathLike, + scratch_basedir: os.PathLike, keep_scratch: bool = False, sleep_interval: int = 30, heartbeat_frequency: int = 30, scopes: Optional[List[Scope]] = None, limit: int = 1, + loglevel='WARN', ): """Create a `SynchronousComputeService` instance. Parameters ---------- api_url - URL pointing to the compute API to execute Tasks for. + URL of the compute API to execute Tasks for. identifier Identifier for the compute identity used for authentication. 
key @@ -125,11 +128,13 @@ def __init__( if scopes is None: self.scopes = [Scope()] + else: + self.scopes = scopes - self.shared_basedir = shared_basedir + self.shared_basedir = Path(shared_basedir) self.shared_basedir.mkdir(exist_ok=True) - self.scratch_basedir = scratch_basedir + self.scratch_basedir = Path(scratch_basedir) self.scratch_basedir.mkdir(exist_ok=True) self.keep_scratch = keep_scratch @@ -139,7 +144,12 @@ def __init__( self.compute_service_id = ComputeServiceID(f"{self.name}-{uuid4()}") - self._stop = False + self.int_sleep = InterruptableSleep() + self.logger = logging.getLogger("AlchemiscaleSynchronousComputeService") + self.logger.setLevel(loglevel) + + self.logger.addHandler(logging.StreamHandler()) + def _register(self): """Register this compute service with the compute API.""" @@ -164,6 +174,9 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: scopes=self.scopes, return_gufe=True ) + if len(taskhubs) == 0: + return [] + # based on weights, choose taskhub to draw from taskhub: List[ScopedKey] = random.choices( list(taskhubs.keys()), weights=[tq.weight for tq in taskhubs.values()] @@ -229,6 +242,7 @@ def execute(self, task: ScopedKey) -> ScopedKey: shared=shared, scratch_basedir=self.scratch_basedir, keep_scratch=self.keep_scratch, + raise_error=False ) # push the result (or failure) back to the compute API @@ -236,6 +250,35 @@ def execute(self, task: ScopedKey) -> ScopedKey: return result_sk + def cycle(self, task_limit): + if task_limit is not None: + if self.counter >= task_limit: + self.logger.info("Performed %s tasks; beyond task limit %s", self.counter, task_limit) + return + + # claim tasks from the compute API + self.logger.info("Claiming tasks") + tasks: List[ScopedKey] = self.claim_tasks(self.limit) + self.logger.info("Claimed %d tasks", len([t for t in tasks if t is not None])) + + # if no tasks claimed, sleep + if all([task is None for task in tasks]): + self.logger.info("No tasks claimed; sleeping for %d seconds", self.sleep_interval) + time.sleep(self.sleep_interval) + return + + # otherwise, process tasks + self.logger.info("Executing tasks...") + for task in tasks: + if task is None: + continue + + # execute each task + self.logger.info("Executing task '%s'...", task) + self.execute(task) + self.logger.info("Completed task '%s'", task) + self.counter += 1 + def start(self, task_limit: Optional[int] = None): """Start the service. @@ -246,57 +289,41 @@ def start(self, task_limit: Optional[int] = None): If `None`, the service will continue until told to stop. 
""" + # add ComputeServiceRegistration + self.logger.info("Starting up service '%s'", self.name) self._register() + self.logger.info("Registered service with registration '%s'", + str(self.compute_service_id)) + + def scheduler_cycle(): + self.cycle(task_limit) + self.scheduler.enter(0, 1, scheduler_cycle) def scheduler_heartbeat(): self.heartbeat() self.scheduler.enter(self.heartbeat_frequency, 1, scheduler_heartbeat) + self.scheduler.enter(0, 1, scheduler_cycle) self.scheduler.enter(0, 2, scheduler_heartbeat) - while True: - if task_limit is not None: - if self.counter >= task_limit: - break - - if self._stop: - return - - # claim tasks from the compute API - tasks: List[ScopedKey] = self.claim_tasks(self.limit) - - # if no tasks claimed, sleep - if all([task is None for task in tasks]): - if self._stop: - return - time.sleep(self.sleep_interval) - continue - - # otherwise, process tasks - for task in tasks: - if self._stop: - return - - if task is None: - continue - - # execute each task - self.execute(task) - self.counter += 1 + try: + self.logger.info("Starting main loop") + self.scheduler.run() + except KeyboardInterrupt: + self.logger.info("Caught SIGINT/Keyboard interrupt.") + except SleepInterrupted: + self.logger.info("Service stopping.") + finally: + # remove ComputeServiceRegistration, drop all claims + self._deregister() + self.logger.info("Deregistered service with registration '%s'", + str(self.compute_service_id)) def stop(self): - self._stop = True - - # TODO: drop claims on tasks - # self.unclaim_tasks() - # Interrupt the scheduler (will finish if in the middle of an update or # something, but will cancel running calculations) self.int_sleep.interrupt() - # remove ComputeServiceID, drop all claims - self._deregister() - class AsynchronousComputeService(SynchronousComputeService): """Asynchronous compute service. 
diff --git a/alchemiscale/settings.py b/alchemiscale/settings.py index d70e1dab..5f7f61a4 100644 --- a/alchemiscale/settings.py +++ b/alchemiscale/settings.py @@ -66,9 +66,9 @@ class APISettings(BaseAPISettings): """ - FA_API_HOST: str = "127.0.0.1" - FA_API_PORT: int = 80 - FA_API_LOGLEVEL: str = "info" + ALCHEMISCALE_API_HOST: str = "127.0.0.1" + ALCHEMISCALE_API_PORT: int = 80 + ALCHEMISCALE_API_LOGLEVEL: str = "info" class ComputeAPISettings(BaseAPISettings): @@ -77,9 +77,9 @@ class ComputeAPISettings(BaseAPISettings): """ - FA_COMPUTE_API_HOST: str = "127.0.0.1" - FA_COMPUTE_API_PORT: int = 80 - FA_COMPUTE_API_LOGLEVEL: str = "info" + ALCHEMISCALE_COMPUTE_API_HOST: str = "127.0.0.1" + ALCHEMISCALE_COMPUTE_API_PORT: int = 80 + ALCHEMISCALE_COMPUTE_API_LOGLEVEL: str = "info" @lru_cache() diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 2caa78b9..035daa24 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -176,6 +176,11 @@ services: - internal - web # Enables the web UI and tells Traefik to listen to docker + depends_on: + alchemiscale-client-api: + condition: service_healthy + alchemiscale-compute-api: + condition: service_healthy command: - "--log.level=DEBUG" - "--providers.docker" From 35a70cb29534e19cb2e504a5ebcc775f1f88f989 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Mon, 20 Mar 2023 23:13:20 -0700 Subject: [PATCH 19/41] SynchronousComputeService now hits multiple taskhubs when claiming if necessary --- alchemiscale/compute/service.py | 28 ++++++++++++++++++++-------- devtools/conda-envs/docker.yml | 1 + 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index d93940ab..2ed4f7eb 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -170,6 +170,9 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: Returns `None` if no Task was available matching service configuration. 
""" + # list of tasks to return + tasks = [] + taskhubs: Dict[ScopedKey, TaskHub] = self.client.query_taskhubs( scopes=self.scopes, return_gufe=True ) @@ -177,15 +180,24 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: if len(taskhubs) == 0: return [] - # based on weights, choose taskhub to draw from - taskhub: List[ScopedKey] = random.choices( - list(taskhubs.keys()), weights=[tq.weight for tq in taskhubs.values()] - )[0] + while len(tasks) < count and len(taskhubs) > 0: + # based on weights, choose taskhub to draw from + taskhub: List[ScopedKey] = random.choices( + list(taskhubs.keys()), weights=[tq.weight for tq in taskhubs.values()] + )[0] - # claim tasks from the taskhub - tasks = self.client.claim_taskhub_tasks( - taskhub, compute_service_id=self.compute_service_id, count=count - ) + # claim tasks from the taskhub + claimed_tasks = self.client.claim_taskhub_tasks( + taskhub, compute_service_id=self.compute_service_id, count=count + ) + + # gather up claimed tasks, if present + for t in claimed_tasks: + if t is not None: + tasks.append(t) + + # remove this taskhub from the options available; repeat + taskhubs.pop(taskhub) return tasks diff --git a/devtools/conda-envs/docker.yml b/devtools/conda-envs/docker.yml index 2ef0722e..98de551e 100644 --- a/devtools/conda-envs/docker.yml +++ b/devtools/conda-envs/docker.yml @@ -61,6 +61,7 @@ dependencies: - dask - distributed - numba + - pymbar >=3.0.6,<4 - pip: #- git+https://github.com/dotsdl/grolt@relax-cryptography # neo4j test server deployment From a1b668a3c9accc735e76a77a56e97d509ed38618 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 22 Mar 2023 10:37:30 -0700 Subject: [PATCH 20/41] Added check for Transformation, Task in create_task --- alchemiscale/cli.py | 37 +++++++++++++++++++----------- alchemiscale/compute/service.py | 32 +++++++++++++++++--------- alchemiscale/storage/statestore.py | 8 +++++++ docker/docker-compose.yml | 4 ++-- 4 files changed, 55 insertions(+), 26 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 35671d2d..f64b56b0 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -229,7 +229,9 @@ def cli(): name="api", help="Start the user-facing API service", ) -@api_starting_params("ALCHEMISCALE_API_HOST", "ALCHEMISCALE_API_PORT", "ALCHEMISCALE_API_LOGLEVEL") +@api_starting_params( + "ALCHEMISCALE_API_HOST", "ALCHEMISCALE_API_PORT", "ALCHEMISCALE_API_LOGLEVEL" +) @db_params @s3os_params @jwt_params @@ -269,17 +271,21 @@ def get_settings_override(): app.dependency_overrides[get_base_api_settings] = get_settings_override - start_api(app, workers, host["ALCHEMISCALE_API_HOST"], port["ALCHEMISCALE_API_PORT"]) + start_api( + app, workers, host["ALCHEMISCALE_API_HOST"], port["ALCHEMISCALE_API_PORT"] + ) -@cli.group(help="Subcommands for the compute service") +@cli.group(help="Subcommands for compute services") def compute(): ... 
@compute.command(help="Start the compute API service.") @api_starting_params( - "ALCHEMISCALE_COMPUTE_API_HOST", "ALCHEMISCALE_COMPUTE_API_PORT", "ALCHEMISCALE_COMPUTE_API_LOGLEVEL" + "ALCHEMISCALE_COMPUTE_API_HOST", + "ALCHEMISCALE_COMPUTE_API_PORT", + "ALCHEMISCALE_COMPUTE_API_LOGLEVEL", ) @db_params @s3os_params @@ -317,25 +323,30 @@ def get_settings_override(): app.dependency_overrides[get_base_api_settings] = get_settings_override - start_api(app, workers, host["ALCHEMISCALE_COMPUTE_API_HOST"], port["ALCHEMISCALE_COMPUTE_API_PORT"]) + start_api( + app, + workers, + host["ALCHEMISCALE_COMPUTE_API_HOST"], + port["ALCHEMISCALE_COMPUTE_API_PORT"], + ) @compute.command(help="Start the synchronous compute service.") @click.option( - "--config-file", - "-c", - type=click.File(), - help="YAML-based configuration file giving the settings for this service", - required=True - ) + "--config-file", + "-c", + type=click.File(), + help="YAML-based configuration file giving the settings for this service", + required=True, +) def synchronous(config_file): from alchemiscale.models import Scope from alchemiscale.compute.service import SynchronousComputeService params = yaml.load(config_file, Loader=yaml.Loader) - if 'scopes' in params: - params['scopes'] = [Scope.from_str(scope) for scope in params['scopes']] + if "scopes" in params: + params["scopes"] = [Scope.from_str(scope) for scope in params["scopes"]] service = SynchronousComputeService(**params) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 2ed4f7eb..eea0c935 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -84,7 +84,7 @@ def __init__( heartbeat_frequency: int = 30, scopes: Optional[List[Scope]] = None, limit: int = 1, - loglevel='WARN', + loglevel="WARN", ): """Create a `SynchronousComputeService` instance. @@ -150,7 +150,6 @@ def __init__( self.logger.addHandler(logging.StreamHandler()) - def _register(self): """Register this compute service with the compute API.""" self.client.register(self.compute_service_id) @@ -161,8 +160,8 @@ def _deregister(self): def heartbeat(self): """Deliver a heartbeat to the compute API, indicating this service is still alive.""" - ... self.client.heartbeat(self.compute_service_id) + self.logger.info("Updated heartbeat") def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: """Get a Task to execute from compute API. 
@@ -180,6 +179,8 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]:
         if len(taskhubs) == 0:
             return []
 
+        # claim tasks from taskhubs based on weight; keep going till we hit our
+        # total desired task count, or we run out of taskhubs to draw from
         while len(tasks) < count and len(taskhubs) > 0:
             # based on weights, choose taskhub to draw from
             taskhub: List[ScopedKey] = random.choices(
@@ -188,7 +189,9 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]:
 
             # claim tasks from the taskhub
             claimed_tasks = self.client.claim_taskhub_tasks(
-                taskhub, compute_service_id=self.compute_service_id, count=count
+                taskhub,
+                compute_service_id=self.compute_service_id,
+                count=(count - len(tasks)),
             )
 
             # gather up claimed tasks, if present
@@ -254,7 +257,7 @@ def execute(self, task: ScopedKey) -> ScopedKey:
                 shared=shared,
                 scratch_basedir=self.scratch_basedir,
                 keep_scratch=self.keep_scratch,
-                raise_error=False
+                raise_error=False,
             )
 
         # push the result (or failure) back to the compute API
@@ -265,7 +268,9 @@ def cycle(self, task_limit):
     def cycle(self, task_limit):
         if task_limit is not None:
             if self.counter >= task_limit:
-                self.logger.info("Performed %s tasks; beyond task limit %s", self.counter, task_limit)
+                self.logger.info(
+                    "Performed %s tasks; beyond task limit %s", self.counter, task_limit
+                )
                 return
 
         # claim tasks from the compute API
@@ -275,7 +280,9 @@ def cycle(self, task_limit):
 
         # if no tasks claimed, sleep
         if all([task is None for task in tasks]):
-            self.logger.info("No tasks claimed; sleeping for %d seconds", self.sleep_interval)
+            self.logger.info(
+                "No tasks claimed; sleeping for %d seconds", self.sleep_interval
+            )
             time.sleep(self.sleep_interval)
             return
 
@@ -304,8 +311,9 @@ def start(self, task_limit: Optional[int] = None):
 
         # add ComputeServiceRegistration
        self.logger.info("Starting up service '%s'", self.name)
         self._register()
-        self.logger.info("Registered service with registration '%s'",
-                         str(self.compute_service_id))
+        self.logger.info(
+            "Registered service with registration '%s'", str(self.compute_service_id)
+        )
 
         def scheduler_cycle():
@@ -328,8 +336,10 @@ def start(self, task_limit: Optional[int] = None):
             self.logger.info("Service stopping.")
         finally:
             # remove ComputeServiceRegistration, drop all claims
             self._deregister()
-            self.logger.info("Deregistered service with registration '%s'",
-                             str(self.compute_service_id))
+            self.logger.info(
+                "Deregistered service with registration '%s'",
+                str(self.compute_service_id),
+            )
 
     def stop(self):
diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py
index b492384b..462d6a6f 100644
--- a/alchemiscale/storage/statestore.py
+++ b/alchemiscale/storage/statestore.py
@@ -1272,6 +1272,14 @@ def create_task(
         """
         scope = transformation.scope
 
+        if "Transformation" not in transformation.qualname:
+            raise ValueError(
+                "`transformation` ScopedKey does not correspond to a `Transformation`"
+            )
+
+        if extends is not None and transformation.qualname != "Task":
+            raise ValueError("`extends` ScopedKey does not correspond to a `Task`")
+
         transformation_node = self._get_node(transformation)
 
         # create a new task for the supplied transformation
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 035daa24..be88b87c 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -177,9 +177,9 @@
       - web # Enables the web UI and tells Traefik to listen to docker
       depends_on:
-        alchemiscale-client-api:
+        alchemiscale-client-API:
           condition: service_healthy
-        alchemiscale-compute-api:
+        alchemiscale-compute-API:
           condition: service_healthy
       command:
       - "--log.level=DEBUG"
       - "--providers.docker"

From 4ab9d4cd64b4026e62a75e60d65f67986a9a3605 Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Wed, 22 Mar 2023 22:45:58 -0700
Subject: [PATCH 21/41] Dropped use of sched in SynchronousComputeService;
 using thread for heartbeat

We need at least a separate thread for the heartbeat to avoid it being
blocked by a long calculation.

---
 alchemiscale/compute/service.py | 47 +++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py
index eea0c935..0d7a5210 100644
--- a/alchemiscale/compute/service.py
+++ b/alchemiscale/compute/service.py
@@ -81,7 +81,7 @@ def __init__(
         scratch_basedir: os.PathLike,
         keep_scratch: bool = False,
         sleep_interval: int = 30,
-        heartbeat_frequency: int = 30,
+        heartbeat_interval: int = 30,
         scopes: Optional[List[Scope]] = None,
         limit: int = 1,
         loglevel="WARN",
@@ -109,7 +109,7 @@ def __init__(
             completion.
         sleep_interval
             Time in seconds to sleep if no Tasks claimed from compute API.
-        heartbeat_frequency
+        heartbeat_interval
             Frequency at which to send heartbeats to compute API.
         scopes
             Scopes to limit Task claiming to; defaults to all Scopes accessible
@@ -121,7 +121,7 @@ def __init__(
         self.api_url = api_url
         self.name = name
         self.sleep_interval = sleep_interval
-        self.heartbeat_frequency = heartbeat_frequency
+        self.heartbeat_interval = heartbeat_interval
         self.limit = limit
 
         self.client = AlchemiscaleComputeClient(api_url, identifier, key)
@@ -150,6 +150,8 @@ def __init__(
 
         self.logger.addHandler(logging.StreamHandler())
 
+        self._stop = False
+
     def _register(self):
         """Register this compute service with the compute API."""
         self.client.register(self.compute_service_id)
@@ -158,11 +160,19 @@ def _deregister(self):
         """Deregister this compute service with the compute API."""
         self.client.deregister(self.compute_service_id)
 
-    def heartbeat(self):
+    def beat(self):
         """Deliver a heartbeat to the compute API, indicating this service is still alive."""
         self.client.heartbeat(self.compute_service_id)
         self.logger.info("Updated heartbeat")
 
+    def heartbeat(self):
+        """Start up the heartbeat, sleeping for `self.heartbeat_interval`"""
+        while True:
+            if self._stop:
+                break
+            self.beat()
+            time.sleep(self.heartbeat_interval)
+
     def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]:
         """Get a Task to execute from compute API.
@@ -315,25 +325,29 @@ def start(self, task_limit: Optional[int] = None): "Registered service with registration '%s'", str(self.compute_service_id) ) - def scheduler_cycle(): - self.cycle(task_limit) - self.scheduler.enter(0, 1, scheduler_cycle) - - def scheduler_heartbeat(): - self.heartbeat() - self.scheduler.enter(self.heartbeat_frequency, 1, scheduler_heartbeat) - - self.scheduler.enter(0, 1, scheduler_cycle) - self.scheduler.enter(0, 2, scheduler_heartbeat) + # start up heartbeat thread + self.heartbeat_thread = threading.Thread(target=self.heartbeat, daemon=True) + self.heartbeat_thread.run() try: self.logger.info("Starting main loop") - self.scheduler.run() + while True: + # check that heartbeat is still alive; if not, resurrect it + if not self.heartbeat_thread.is_alive(): + self.heartbeat_thread = threading.Thread( + target=self.heartbeat, daemon=True + ) + self.heartbeat_thread.run() + + # perform main loop cycle + self.cycle(task_limit=task_limit) except KeyboardInterrupt: self.logger.info("Caught SIGINT/Keyboard interrupt.") except SleepInterrupted: self.logger.info("Service stopping.") finally: + self.heartbeat_thread.join() + # remove ComputeServiceRegistration, drop all claims self._deregister() self.logger.info( @@ -342,9 +356,8 @@ def scheduler_heartbeat(): ) def stop(self): - # Interrupt the scheduler (will finish if in the middle of an update or - # something, but will cancel running calculations) self.int_sleep.interrupt() + self._stop = True class AsynchronousComputeService(SynchronousComputeService): From 64d3b4011bed07f828df983b9dcab10618f9e281 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 22 Mar 2023 22:49:33 -0700 Subject: [PATCH 22/41] Thread.run -> Thread.start :P --- alchemiscale/compute/service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 0d7a5210..92b8620d 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -327,7 +327,7 @@ def start(self, task_limit: Optional[int] = None): # start up heartbeat thread self.heartbeat_thread = threading.Thread(target=self.heartbeat, daemon=True) - self.heartbeat_thread.run() + self.heartbeat_thread.start() try: self.logger.info("Starting main loop") @@ -337,7 +337,7 @@ def start(self, task_limit: Optional[int] = None): self.heartbeat_thread = threading.Thread( target=self.heartbeat, daemon=True ) - self.heartbeat_thread.run() + self.heartbeat_thread.start() # perform main loop cycle self.cycle(task_limit=task_limit) From 38d5bbcb3c3a2ec3bf2512a6812c6251a01a8d96 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 22 Mar 2023 22:52:00 -0700 Subject: [PATCH 23/41] Remove thread join; not necessary if daemon --- alchemiscale/compute/service.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 92b8620d..a68b6955 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -346,8 +346,6 @@ def start(self, task_limit: Optional[int] = None): except SleepInterrupted: self.logger.info("Service stopping.") finally: - self.heartbeat_thread.join() - # remove ComputeServiceRegistration, drop all claims self._deregister() self.logger.info( From c2e94e095953de4192570984cb907a765d0a5d73 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 00:14:38 -0700 Subject: [PATCH 24/41] Added expiry periodic to compute API --- alchemiscale/cli.py | 18 ++++++++++--- alchemiscale/compute/api.py | 25 
+++++++++++++------ alchemiscale/settings.py | 1 + alchemiscale/storage/statestore.py | 18 +++++++++++++ .../integration/storage/test_statestore.py | 24 ++++++++++++++++++ 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index f64b56b0..2e19e405 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -45,7 +45,7 @@ def get_settings_from_options(kwargs, settings_cls): return settings_cls(**update) -def api_starting_params(envvar_host, envvar_port, envvar_loglevel): +def api_starting_params(envvar_host, envvar_port, envvar_loglevel, envvar_registration_expire_seconds): def inner(func): workers = click.option( "--workers", type=int, help="number of workers", default=1 @@ -72,7 +72,16 @@ def inner(func): envvar=envvar_loglevel, **SETTINGS_OPTION_KWARGS, ) - return workers(host(port(loglevel(func)))) + registration_expire_seconds = click.option( + "--registration-expire-seconds", + type=int, + default=3600, + help="number of seconds since last heartbeat at which to expire a compute service registration", + envvar=envvar_registration_expire_seconds, + **SETTINGS_OPTION_KWARGS, + ) + + return workers(host(port(loglevel(registration_expire_seconds(func))))) return inner @@ -286,12 +295,13 @@ def compute(): "ALCHEMISCALE_COMPUTE_API_HOST", "ALCHEMISCALE_COMPUTE_API_PORT", "ALCHEMISCALE_COMPUTE_API_LOGLEVEL", + "ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS", ) @db_params @s3os_params @jwt_params def api( - workers, host, port, loglevel, # API + workers, host, port, loglevel, registration_expire_seconds, # API url, user, password, dbname, # DB jwt_secret, jwt_expire_seconds, jwt_algorithm, #JWT access_key_id, secret_access_key, session_token, s3_bucket, s3_prefix, default_region # AWS @@ -306,7 +316,7 @@ def api( def get_settings_override(): # inject settings from CLI arguments - api_dict = host | port | loglevel + api_dict = host | port | loglevel | registration_expire_seconds jwt_dict = jwt_secret | jwt_expire_seconds | jwt_algorithm db_dict = url | user | password | dbname s3_dict = ( diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index d3d28543..9832c562 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -4,11 +4,11 @@ """ - +import asyncio from typing import Any, Dict, List import os import json -from datetime import datetime +from datetime import datetime, timedelta from fastapi import FastAPI, APIRouter, Body, Depends, HTTPException, status from gufe.tokenization import GufeTokenizable, JSON_HANDLER @@ -28,7 +28,7 @@ gufe_to_json, ) from ..settings import get_base_api_settings, get_compute_api_settings -from ..storage.statestore import Neo4jStore +from ..storage.statestore import Neo4jStore, get_n4js from ..storage.objectstore import S3ObjectStore from ..storage.models import ( ProtocolDAGResultRef, @@ -45,14 +45,25 @@ ) +app = FastAPI(title="AlchemiscaleComputeAPI") +app.dependency_overrides[get_base_api_settings] = get_compute_api_settings +app.include_router(base_router) + + # TODO: # - add periodic removal of task claims from compute services that are no longer alive # - can be done with an asyncio.sleeping task added to event loop: https://stackoverflow.com/questions/67154839/fastapi-best-way-to-run-continuous-get-requests-in-the-background # - on startup, - -app = FastAPI(title="AlchemiscaleComputeAPI") -app.dependency_overrides[get_base_api_settings] = get_compute_api_settings -app.include_router(base_router) +@app.on_event("startup") +async def 
expire_stale_compute_service_registrations() -> None: + settings = get_compute_api_settings() + n4js = get_n4js(settings) + while True: + now = datetime.utcnow() + expire_delta = timedelta(seconds=settings.ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS) + expire_time = now - expire_delta + n4js.expire_registrations(expire_time) + asyncio.sleep(60) def get_cred_compute(): diff --git a/alchemiscale/settings.py b/alchemiscale/settings.py index 5f7f61a4..d64399f8 100644 --- a/alchemiscale/settings.py +++ b/alchemiscale/settings.py @@ -80,6 +80,7 @@ class ComputeAPISettings(BaseAPISettings): ALCHEMISCALE_COMPUTE_API_HOST: str = "127.0.0.1" ALCHEMISCALE_COMPUTE_API_PORT: int = 80 ALCHEMISCALE_COMPUTE_API_LOGLEVEL: str = "info" + ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS: int = 3600 @lru_cache() diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index 462d6a6f..0af3ea9d 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -803,6 +803,24 @@ def heartbeat_computeservice( with self.transaction() as tx: tx.run(q) + def expire_registrations(self, expire_time: datetime): + """Remove all registrations with last heartbeat prior to the given `expire_time`. + + """ + q = f""" + MATCH (n:ComputeServiceRegistration) + WHERE n.heartbeat < localdatetime({expire_time}) + + WITH n + + OPTIONAL MATCH (n)-[cl:CLAIMS]->(t:Task {{status: 'running'}}) + SET t.status = 'waiting' + + DETACH DELETE n + """ + with self.transaction() as tx: + tx.run(q) + ## task hubs def create_taskhub( diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index 852d52fa..2f84bd97 100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -334,6 +334,30 @@ def test_heartbeat(self, n4js, compute_service_id): assert csreg["registered"] == now assert csreg["heartbeat"] == tomorrow + def test_expire_registrations(self, n4js, compute_service_id): + now = datetime.utcnow() + yesterday = now - timedelta(days=1) + an_hour_ago = now - timedelta(hours=1) + registration = ComputeServiceRegistration( + identifier=compute_service_id, registered=yesterday, heartbeat=an_hour_ago + ) + + n4js.register_computeservice(registration) + + # expire any compute service that had a heartbeat more than 30 mins ago + thirty_mins_ago = now - timedelta(minutes=30) + + n4js.expire_registrations(expire_time=thirty_mins_ago) + + csreg = n4js.graph.run( + f""" + match (csreg:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) + return csreg + """ + ).to_subgraph() + + assert csreg is None + def test_create_task(self, n4js, network_tyk2, scope_test): # add alchemical network, then try generating task an = network_tyk2 From 7fe4c12af01786fc4672a67f02b3259a704e7832 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 00:21:06 -0700 Subject: [PATCH 25/41] Expiration test in place for state store --- alchemiscale/cli.py | 4 +++- alchemiscale/compute/api.py | 4 +++- alchemiscale/storage/statestore.py | 6 ++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 2e19e405..2fa2ace4 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -45,7 +45,9 @@ def get_settings_from_options(kwargs, settings_cls): return settings_cls(**update) -def api_starting_params(envvar_host, envvar_port, envvar_loglevel, envvar_registration_expire_seconds): +def 
api_starting_params( + envvar_host, envvar_port, envvar_loglevel, envvar_registration_expire_seconds +): def inner(func): workers = click.option( "--workers", type=int, help="number of workers", default=1 diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index 9dea8d82..93ebff35 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -60,7 +60,9 @@ async def expire_stale_compute_service_registrations() -> None: n4js = get_n4js(settings) while True: now = datetime.utcnow() - expire_delta = timedelta(seconds=settings.ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS) + expire_delta = timedelta( + seconds=settings.ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS + ) expire_time = now - expire_delta n4js.expire_registrations(expire_time) asyncio.sleep(60) diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index 0af3ea9d..ca1e3143 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -804,12 +804,10 @@ def heartbeat_computeservice( tx.run(q) def expire_registrations(self, expire_time: datetime): - """Remove all registrations with last heartbeat prior to the given `expire_time`. - - """ + """Remove all registrations with last heartbeat prior to the given `expire_time`.""" q = f""" MATCH (n:ComputeServiceRegistration) - WHERE n.heartbeat < localdatetime({expire_time}) + WHERE n.heartbeat < localdatetime('{expire_time.isoformat()}') WITH n From 8b337eb98322781596eec548510f8dc5b99a2d5a Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 01:34:47 -0700 Subject: [PATCH 26/41] Attempt at periodic expiry failed; causing API service to hang on requests Instead, making expiry part of the heartbeat API call; should be fairly cheap, and should still get the desired effect. 
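For the record, the hang had a clear mechanism: the expiry loop ran
directly inside a FastAPI `startup` event handler, so application startup
never completed, and the `asyncio.sleep(60)` at the bottom of the loop was
never awaited, meaning the `while True` never yielded control back to the
event loop at all. Keeping a periodic expiry would have required handing
the loop off as a background task instead; a hypothetical sketch of that
alternative (not what this commit adopts), assuming `app`, `settings`, and
`n4js` are constructed as in alchemiscale/compute/api.py:

    import asyncio
    from datetime import datetime, timedelta

    @app.on_event("startup")
    async def start_expiry_loop() -> None:
        async def expire_forever() -> None:
            delta = timedelta(
                seconds=settings.ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS
            )
            while True:
                n4js.expire_registrations(datetime.utcnow() - delta)
                # awaited, so control returns to the event loop between passes
                await asyncio.sleep(60)

        # schedule the loop and return immediately, letting startup complete
        asyncio.create_task(expire_forever())

Expiring on each heartbeat instead avoids this extra machinery entirely.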
--- alchemiscale/cli.py | 22 +++++++------- alchemiscale/compute/api.py | 29 +++++++------------ alchemiscale/storage/statestore.py | 2 +- .../integration/compute/client/conftest.py | 8 ++--- .../compute/client/test_compute_service.py | 2 ++ .../tests/integration/compute/utils.py | 5 ++-- .../integration/interface/client/conftest.py | 8 ++--- .../tests/integration/interface/utils.py | 4 +-- 8 files changed, 36 insertions(+), 44 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 2fa2ace4..8d9f1672 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -46,7 +46,7 @@ def get_settings_from_options(kwargs, settings_cls): def api_starting_params( - envvar_host, envvar_port, envvar_loglevel, envvar_registration_expire_seconds + envvar_host, envvar_port, envvar_loglevel ): def inner(func): workers = click.option( @@ -74,16 +74,7 @@ def inner(func): envvar=envvar_loglevel, **SETTINGS_OPTION_KWARGS, ) - registration_expire_seconds = click.option( - "--registration-expire-seconds", - type=int, - default=3600, - help="number of seconds since last heartbeat at which to expire a compute service registration", - envvar=envvar_registration_expire_seconds, - **SETTINGS_OPTION_KWARGS, - ) - - return workers(host(port(loglevel(registration_expire_seconds(func))))) + return workers(host(port(loglevel(func)))) return inner @@ -297,8 +288,15 @@ def compute(): "ALCHEMISCALE_COMPUTE_API_HOST", "ALCHEMISCALE_COMPUTE_API_PORT", "ALCHEMISCALE_COMPUTE_API_LOGLEVEL", - "ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS", ) +@click.option( + "--registration-expire-seconds", + type=int, + default=3600, + help="number of seconds since last heartbeat at which to expire a compute service registration", + envvar="ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS", + **SETTINGS_OPTION_KWARGS, + ) @db_params @s3os_params @jwt_params diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index 93ebff35..c4653e1f 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -27,7 +27,7 @@ _check_store_connectivity, gufe_to_json, ) -from ..settings import get_base_api_settings, get_compute_api_settings +from ..settings import get_base_api_settings, get_compute_api_settings, ComputeAPISettings from ..storage.statestore import Neo4jStore, get_n4js from ..storage.objectstore import S3ObjectStore from ..storage.models import ( @@ -50,24 +50,6 @@ app.include_router(base_router) -# TODO: -# - add periodic removal of task claims from compute services that are no longer alive -# - can be done with an asyncio.sleeping task added to event loop: https://stackoverflow.com/questions/67154839/fastapi-best-way-to-run-continuous-get-requests-in-the-background -# - on startup, -@app.on_event("startup") -async def expire_stale_compute_service_registrations() -> None: - settings = get_compute_api_settings() - n4js = get_n4js(settings) - while True: - now = datetime.utcnow() - expire_delta = timedelta( - seconds=settings.ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS - ) - expire_time = now - expire_delta - n4js.expire_registrations(expire_time) - asyncio.sleep(60) - - def get_cred_compute(): return CredentialedComputeIdentity @@ -140,8 +122,17 @@ async def deregister_computeservice( async def heartbeat_computeservice( compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), + settings: ComputeAPISettings = Depends(get_base_api_settings), ): now = datetime.utcnow() + + # expire any stale registrations, along with their claims + expire_delta = timedelta( + 
seconds=settings.ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS + ) + expire_time = now - expire_delta + n4js.expire_registrations(expire_time) + n4js.heartbeat_computeservice(compute_service_id, now) return compute_service_id diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index ca1e3143..75def31b 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -1293,7 +1293,7 @@ def create_task( "`transformation` ScopedKey does not correspond to a `Transformation`" ) - if extends is not None and transformation.qualname != "Task": + if extends is not None and extends.qualname != "Task": raise ValueError("`extends` ScopedKey does not correspond to a `Task`") transformation_node = self._get_node(transformation) diff --git a/alchemiscale/tests/integration/compute/client/conftest.py b/alchemiscale/tests/integration/compute/client/conftest.py index a2ba6bf0..66d4d0af 100644 --- a/alchemiscale/tests/integration/compute/client/conftest.py +++ b/alchemiscale/tests/integration/compute/client/conftest.py @@ -33,9 +33,9 @@ def get_s3os_override(): def run_server(fastapi_app, settings): uvicorn.run( fastapi_app, - host=settings.FA_COMPUTE_API_HOST, - port=settings.FA_COMPUTE_API_PORT, - log_level=settings.FA_COMPUTE_API_LOGLEVEL, + host=settings.ALCHEMISCALE_COMPUTE_API_HOST, + port=settings.ALCHEMISCALE_COMPUTE_API_PORT, + log_level=settings.ALCHEMISCALE_COMPUTE_API_LOGLEVEL, ) @@ -43,7 +43,7 @@ def run_server(fastapi_app, settings): def uvicorn_server(compute_api): settings = get_compute_settings_override() with running_service( - run_server, port=settings.FA_COMPUTE_API_PORT, args=(compute_api, settings) + run_server, port=settings.ALCHEMISCALE_COMPUTE_API_PORT, args=(compute_api, settings) ): yield diff --git a/alchemiscale/tests/integration/compute/client/test_compute_service.py b/alchemiscale/tests/integration/compute/client/test_compute_service.py index a800f1e1..b5b70de3 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_service.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_service.py @@ -27,6 +27,8 @@ def service(self, n4js_preloaded, compute_client, tmpdir): def test_claim_tasks(self, n4js_preloaded, service): n4js: Neo4jStore = n4js_preloaded + service._register() + task_sks: List[Optional[ScopedKey]] = service.claim_tasks(count=2) # should have 2 tasks diff --git a/alchemiscale/tests/integration/compute/utils.py b/alchemiscale/tests/integration/compute/utils.py index ff6047d5..2af037db 100644 --- a/alchemiscale/tests/integration/compute/utils.py +++ b/alchemiscale/tests/integration/compute/utils.py @@ -7,8 +7,9 @@ def get_compute_settings_override(): NEO4J_USER="neo4j", NEO4J_PASS="password", NEO4J_URL="bolt://localhost:7687", - FA_COMPUTE_API_HOST="127.0.0.1", - FA_COMPUTE_API_PORT=8000, + ALCHEMISCALE_COMPUTE_API_HOST="127.0.0.1", + ALCHEMISCALE_COMPUTE_API_PORT=8000, + ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS = 3600, JWT_SECRET_KEY="98d11ba9ca329a4e5a6626faeffc6a9b9fb04e2745cff030f7d6793751bb8245", JWT_EXPIRE_SECONDS=10, AWS_ACCESS_KEY_ID="test-key-id", diff --git a/alchemiscale/tests/integration/interface/client/conftest.py b/alchemiscale/tests/integration/interface/client/conftest.py index 9925cfb7..7364b5a6 100644 --- a/alchemiscale/tests/integration/interface/client/conftest.py +++ b/alchemiscale/tests/integration/interface/client/conftest.py @@ -32,9 +32,9 @@ def get_s3os_override(): def run_server(fastapi_app, settings): uvicorn.run( fastapi_app, - 
host=settings.FA_API_HOST, - port=settings.FA_API_PORT, - log_level=settings.FA_API_LOGLEVEL, + host=settings.ALCHEMISCALE_API_HOST, + port=settings.ALCHEMISCALE_API_PORT, + log_level=settings.ALCHEMISCALE_API_LOGLEVEL, ) @@ -42,7 +42,7 @@ def run_server(fastapi_app, settings): def uvicorn_server(user_api): settings = get_user_settings_override() with running_service( - run_server, port=settings.FA_API_PORT, args=(user_api, settings) + run_server, port=settings.ALCHEMISCALE_API_PORT, args=(user_api, settings) ): yield diff --git a/alchemiscale/tests/integration/interface/utils.py b/alchemiscale/tests/integration/interface/utils.py index 25eadfe7..9c0d4ef7 100644 --- a/alchemiscale/tests/integration/interface/utils.py +++ b/alchemiscale/tests/integration/interface/utils.py @@ -7,8 +7,8 @@ def get_user_settings_override(): NEO4J_USER="neo4j", NEO4J_PASS="password", NEO4J_URL="bolt://localhost:7687", - FA_API_HOST="127.0.0.1", - FA_API_PORT=8000, + ALCHEMISCALE_API_HOST="127.0.0.1", + ALCHEMISCALE_API_PORT=8000, JWT_SECRET_KEY="3f072449f5f496d30c0e46e6bc116ba27937a1482c3a4e41195be899a299c7e4", JWT_EXPIRE_SECONDS=3, AWS_ACCESS_KEY_ID="test-key-id", From d1d86104e2d8f96f5edb48c9f5b39c3fd0202812 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 01:36:15 -0700 Subject: [PATCH 27/41] Black! --- alchemiscale/cli.py | 18 ++++++++---------- alchemiscale/compute/api.py | 6 +++++- .../integration/compute/client/conftest.py | 4 +++- .../tests/integration/compute/utils.py | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 8d9f1672..e5855fcc 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -45,9 +45,7 @@ def get_settings_from_options(kwargs, settings_cls): return settings_cls(**update) -def api_starting_params( - envvar_host, envvar_port, envvar_loglevel -): +def api_starting_params(envvar_host, envvar_port, envvar_loglevel): def inner(func): workers = click.option( "--workers", type=int, help="number of workers", default=1 @@ -290,13 +288,13 @@ def compute(): "ALCHEMISCALE_COMPUTE_API_LOGLEVEL", ) @click.option( - "--registration-expire-seconds", - type=int, - default=3600, - help="number of seconds since last heartbeat at which to expire a compute service registration", - envvar="ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS", - **SETTINGS_OPTION_KWARGS, - ) + "--registration-expire-seconds", + type=int, + default=3600, + help="number of seconds since last heartbeat at which to expire a compute service registration", + envvar="ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS", + **SETTINGS_OPTION_KWARGS, +) @db_params @s3os_params @jwt_params diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index c4653e1f..edd577dc 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -27,7 +27,11 @@ _check_store_connectivity, gufe_to_json, ) -from ..settings import get_base_api_settings, get_compute_api_settings, ComputeAPISettings +from ..settings import ( + get_base_api_settings, + get_compute_api_settings, + ComputeAPISettings, +) from ..storage.statestore import Neo4jStore, get_n4js from ..storage.objectstore import S3ObjectStore from ..storage.models import ( diff --git a/alchemiscale/tests/integration/compute/client/conftest.py b/alchemiscale/tests/integration/compute/client/conftest.py index 66d4d0af..f4c92f8a 100644 --- a/alchemiscale/tests/integration/compute/client/conftest.py +++ b/alchemiscale/tests/integration/compute/client/conftest.py @@ -43,7 +43,9 @@ 
def run_server(fastapi_app, settings): def uvicorn_server(compute_api): settings = get_compute_settings_override() with running_service( - run_server, port=settings.ALCHEMISCALE_COMPUTE_API_PORT, args=(compute_api, settings) + run_server, + port=settings.ALCHEMISCALE_COMPUTE_API_PORT, + args=(compute_api, settings), ): yield diff --git a/alchemiscale/tests/integration/compute/utils.py b/alchemiscale/tests/integration/compute/utils.py index 2af037db..f05de179 100644 --- a/alchemiscale/tests/integration/compute/utils.py +++ b/alchemiscale/tests/integration/compute/utils.py @@ -9,7 +9,7 @@ def get_compute_settings_override(): NEO4J_URL="bolt://localhost:7687", ALCHEMISCALE_COMPUTE_API_HOST="127.0.0.1", ALCHEMISCALE_COMPUTE_API_PORT=8000, - ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS = 3600, + ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS=3600, JWT_SECRET_KEY="98d11ba9ca329a4e5a6626faeffc6a9b9fb04e2745cff030f7d6793751bb8245", JWT_EXPIRE_SECONDS=10, AWS_ACCESS_KEY_ID="test-key-id", From 15704e346a857208eb804c18ac77faa4dd7e037f Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 19:31:17 -0700 Subject: [PATCH 28/41] Updated heartbeat interval default, along with expiry heartbeat interval for SynchronousComputeService set to 300 seconds (5 min); expiry set to 1800 seconds (30 minutes) --- alchemiscale/cli.py | 2 +- alchemiscale/compute/client.py | 4 ++-- alchemiscale/compute/service.py | 2 +- alchemiscale/settings.py | 2 +- alchemiscale/tests/integration/compute/utils.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index e5855fcc..191e11d6 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -290,7 +290,7 @@ def compute(): @click.option( "--registration-expire-seconds", type=int, - default=3600, + default=1800, help="number of seconds since last heartbeat at which to expire a compute service registration", envvar="ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS", **SETTINGS_OPTION_KWARGS, diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index 7319b888..baa19931 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -54,7 +54,7 @@ def list_scopes(self) -> List[Scope]: return [Scope.from_str(s) for s in scopes] def query_taskhubs( - self, scopes: List[Scope], return_gufe=False, limit=None, skip=None + self, scopes: List[Scope], return_gufe=False ) -> Union[List[ScopedKey], Dict[ScopedKey, TaskHub]]: """Return all `TaskHub`s corresponding to given `Scope`.""" if return_gufe: @@ -64,7 +64,7 @@ def query_taskhubs( for scope in scopes: params = dict( - return_gufe=return_gufe, limit=limit, skip=skip, **scope.dict() + return_gufe=return_gufe, **scope.dict() ) if return_gufe: taskhubs.update(self._query_resource("/taskhubs", params=params)) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index a68b6955..515c1b2b 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -81,7 +81,7 @@ def __init__( scratch_basedir: os.PathLike, keep_scratch: bool = False, sleep_interval: int = 30, - heartbeat_interval: int = 30, + heartbeat_interval: int = 300, scopes: Optional[List[Scope]] = None, limit: int = 1, loglevel="WARN", diff --git a/alchemiscale/settings.py b/alchemiscale/settings.py index d64399f8..3afc90ac 100644 --- a/alchemiscale/settings.py +++ b/alchemiscale/settings.py @@ -80,7 +80,7 @@ class ComputeAPISettings(BaseAPISettings): ALCHEMISCALE_COMPUTE_API_HOST: str = "127.0.0.1" 
ALCHEMISCALE_COMPUTE_API_PORT: int = 80 ALCHEMISCALE_COMPUTE_API_LOGLEVEL: str = "info" - ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS: int = 3600 + ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS: int = 1800 @lru_cache() diff --git a/alchemiscale/tests/integration/compute/utils.py b/alchemiscale/tests/integration/compute/utils.py index f05de179..a3349c8a 100644 --- a/alchemiscale/tests/integration/compute/utils.py +++ b/alchemiscale/tests/integration/compute/utils.py @@ -9,7 +9,7 @@ def get_compute_settings_override(): NEO4J_URL="bolt://localhost:7687", ALCHEMISCALE_COMPUTE_API_HOST="127.0.0.1", ALCHEMISCALE_COMPUTE_API_PORT=8000, - ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS=3600, + ALCHEMISCALE_COMPUTE_API_REGISTRATION_EXPIRE_SECONDS=1800, JWT_SECRET_KEY="98d11ba9ca329a4e5a6626faeffc6a9b9fb04e2745cff030f7d6793751bb8245", JWT_EXPIRE_SECONDS=10, AWS_ACCESS_KEY_ID="test-key-id", From 2c4418b0710d3669cbfd18b3a26464d26afe99c5 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 22:25:55 -0700 Subject: [PATCH 29/41] Small convenience fix to user client set_task_status --- alchemiscale/interface/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/alchemiscale/interface/client.py b/alchemiscale/interface/client.py index 6d9dd9f6..1da4ea12 100644 --- a/alchemiscale/interface/client.py +++ b/alchemiscale/interface/client.py @@ -301,6 +301,9 @@ def set_tasks_status( """ if isinstance(tasks, ScopedKey): tasks = [tasks] + + status = TaskStatusEnum(status) + task_sks = [self._set_task_status(t, status) for t in tasks] return task_sks From 172b54efbafea1cbcc91e181e260a25314560cb7 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 22:39:12 -0700 Subject: [PATCH 30/41] Black! --- alchemiscale/compute/client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index baa19931..e28c58dd 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -63,9 +63,7 @@ def query_taskhubs( taskhubs = [] for scope in scopes: - params = dict( - return_gufe=return_gufe, **scope.dict() - ) + params = dict(return_gufe=return_gufe, **scope.dict()) if return_gufe: taskhubs.update(self._query_resource("/taskhubs", params=params)) else: From cf5ca0d2d6831d0613c5df682b1e1adb27c2e980 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Fri, 24 Mar 2023 14:27:49 -0700 Subject: [PATCH 31/41] Update alchemiscale/cli.py yaml.load -> yaml.safe_load Co-authored-by: Hugo MacDermott-Opeskin --- alchemiscale/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 191e11d6..89520925 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -351,7 +351,7 @@ def synchronous(config_file): from alchemiscale.models import Scope from alchemiscale.compute.service import SynchronousComputeService - params = yaml.load(config_file, Loader=yaml.Loader) + params = yaml.safe_load(config_file, Loader=yaml.Loader) if "scopes" in params: params["scopes"] = [Scope.from_str(scope) for scope in params["scopes"]] From b271549819c2657888ab2822f8de72b7e07a8c3a Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 23 Mar 2023 22:55:06 -0700 Subject: [PATCH 32/41] Review fixes --- alchemiscale/storage/models.py | 4 ++-- alchemiscale/tests/integration/storage/test_statestore.py | 4 ++-- docker/docker-compose.yml | 6 ++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/alchemiscale/storage/models.py 
b/alchemiscale/storage/models.py
index fd4c71ee..b428d6e3 100644
--- a/alchemiscale/storage/models.py
+++ b/alchemiscale/storage/models.py
@@ -30,7 +30,7 @@ class ComputeServiceRegistration(BaseModel):
     heartbeat: datetime
 
     def __repr__(self):  # pragma: no cover
-        return f"<ComputeServiceID('{str(self)}')>"
+        return f"<ComputeServiceRegistration('{str(self)}')>"
 
     def __str__(self):
         return "-".join([self.identifier])
@@ -55,7 +55,7 @@ def from_dict(cls, dct):
 
 
 class TaskProvenance(BaseModel):
-    computekey: ComputeServiceID
+    computeserviceid: ComputeServiceID
 
     datetime_start: datetime
     datetime_end: datetime
 
diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py
index 2f84bd97..357abb7a 100644
--- a/alchemiscale/tests/integration/storage/test_statestore.py
+++ b/alchemiscale/tests/integration/storage/test_statestore.py
@@ -292,7 +292,7 @@ def test_register_computeservice(self, n4js, compute_service_id):
         assert csreg["registered"] == now
         assert csreg["heartbeat"] == now
 
-    def test_deregister(self, n4js, compute_service_id):
+    def test_deregister_computeservice(self, n4js, compute_service_id):
         now = datetime.utcnow()
         registration = ComputeServiceRegistration(
             identifier=compute_service_id, registered=now, heartbeat=now
@@ -312,7 +312,7 @@ def test_deregister(self, n4js, compute_service_id):
 
         assert csreg is None
 
-    def test_heartbeat(self, n4js, compute_service_id):
+    def test_heartbeat_computeservice(self, n4js, compute_service_id):
         now = datetime.utcnow()
         registration = ComputeServiceRegistration(
             identifier=compute_service_id, registered=now, heartbeat=now
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index be88b87c..91fa3d5c 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -177,10 +177,8 @@ services:
       - web # Enables the web UI and tells Traefik to listen to docker
     depends_on:
-      alchemiscale-client-API:
-        condition: service_healthy
-      alchemiscale-compute-API:
-        condition: service_healthy
+      - alchemiscale-client-API
+      - alchemiscale-compute-API
     command:
       - "--log.level=DEBUG"
      - "--providers.docker"

From e180ca98a81ddd3dd72c9e8e1aa5392c09959dc6 Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Fri, 24 Mar 2023 17:28:57 -0700
Subject: [PATCH 33/41] Added CLI test for synchronous compute service

Also put in better stop conditions for the service itself.
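With these stop conditions, a running service can be bounded by work
performed or by wall time. A rough usage sketch (all constructor values
below are placeholders, not part of this change):

    from alchemiscale.compute.service import SynchronousComputeService

    service = SynchronousComputeService(
        api_url="http://localhost:8000",
        identifier="compute-identity",
        key="compute-identity-key",
        name="my-compute-resource",
        shared_basedir="./shared",
        scratch_basedir="./scratch",
    )

    # exits after 100 executed Tasks or 3600 seconds, whichever comes first
    service.start(max_tasks=100, max_time=3600)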
--- alchemiscale/cli.py | 11 ++- alchemiscale/compute/service.py | 109 ++++++++++++--------- alchemiscale/tests/integration/test_cli.py | 91 ++++++++++++++++- 3 files changed, 158 insertions(+), 53 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 89520925..23ac0285 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -353,10 +353,13 @@ def synchronous(config_file): params = yaml.safe_load(config_file, Loader=yaml.Loader) - if "scopes" in params: - params["scopes"] = [Scope.from_str(scope) for scope in params["scopes"]] + params_init = params['init'] + params_start = params['start'] - service = SynchronousComputeService(**params) + if "scopes" in params_init: + params_init["scopes"] = [Scope.from_str(scope) for scope in params_init["scopes"]] + + service = SynchronousComputeService(**params_init) # add signal handling for signame in {"SIGHUP", "SIGINT", "SIGTERM"}: @@ -368,7 +371,7 @@ def stop(*args, **kwargs): signal.signal(getattr(signal, signame), stop) try: - service.start() + service.start(**params_start) except KeyboardInterrupt: pass diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 515c1b2b..0217acff 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -15,6 +15,7 @@ from typing import Union, Optional, List, Dict, Tuple from pathlib import Path from threading import Thread +import tempfile import requests @@ -79,11 +80,12 @@ def __init__( name: str, shared_basedir: os.PathLike, scratch_basedir: os.PathLike, + keep_shared: bool = False, keep_scratch: bool = False, sleep_interval: int = 30, heartbeat_interval: int = 300, scopes: Optional[List[Scope]] = None, - limit: int = 1, + claim_limit: int = 1, loglevel="WARN", ): """Create a `SynchronousComputeService` instance. @@ -104,6 +106,9 @@ def __init__( Filesystem path to use for `ProtocolDAG` `shared` space. scratch_basedir Filesystem path to use for `ProtocolUnit` `scratch` space. + keep_shared + If True, don't remove shared directories for `ProtocolDAG`s after + completion. keep_scratch If True, don't remove scratch directories for `ProtocolUnit`s after completion. @@ -114,7 +119,7 @@ def __init__( scopes Scopes to limit Task claiming to; defaults to all Scopes accessible by compute identity. - limit + claim_limit Maximum number of Tasks to claim at a time from a TaskHub. 
""" @@ -122,7 +127,7 @@ def __init__( self.name = name self.sleep_interval = sleep_interval self.heartbeat_interval = heartbeat_interval - self.limit = limit + self.claim_limit = claim_limit self.client = AlchemiscaleComputeClient(api_url, identifier, key) @@ -133,6 +138,7 @@ def __init__( self.shared_basedir = Path(shared_basedir) self.shared_basedir.mkdir(exist_ok=True) + self.keep_shared = keep_shared self.scratch_basedir = Path(scratch_basedir) self.scratch_basedir.mkdir(exist_ok=True) @@ -140,8 +146,6 @@ def __init__( self.scheduler = sched.scheduler(time.monotonic, time.sleep) - self.counter = 0 - self.compute_service_id = ComputeServiceID(f"{self.name}-{uuid4()}") self.int_sleep = InterruptableSleep() @@ -259,8 +263,10 @@ def execute(self, task: ScopedKey) -> ScopedKey: # execute the task; this looks the same whether the ProtocolDAG is a # success or failure - shared = self.shared_basedir / str(protocoldag.key) / str(self.counter) - shared.mkdir(parents=True) + shared_tmp = tempfile.TemporaryDirectory( + prefix=f"{str(protocoldag.key)}__", + dir=self.shared_basedir) + shared = Path(shared_tmp.name) protocoldagresult = execute_DAG( protocoldag, @@ -270,22 +276,39 @@ def execute(self, task: ScopedKey) -> ScopedKey: raise_error=False, ) + if not self.keep_shared: + shared_tmp.cleanup() + # push the result (or failure) back to the compute API result_sk = self.push_result(task, protocoldagresult) return result_sk - def cycle(self, task_limit): - if task_limit is not None: - if self.counter >= task_limit: + def _check_max_tasks(self, max_tasks): + if max_tasks is not None: + if self._tasks_counter >= max_tasks: self.logger.info( - "Performed %s tasks; beyond task limit %s", self.counter, task_limit + "Performed %s tasks; at or beyond max tasks = %s", self._tasks_counter, max_tasks ) - return + self._stop = True + + def _check_max_time(self, max_time): + if max_time is not None: + run_time = time.time() - self._start_time + if run_time >= max_time: + self.logger.info( + "Ran for %s seconds; at or beyond max time = %s seconds", run_time, max_time + ) + self._stop = True + + def cycle(self, max_tasks, max_time): + + self._check_max_tasks(max_tasks) + self._check_max_time(max_time) # claim tasks from the compute API self.logger.info("Claiming tasks") - tasks: List[ScopedKey] = self.claim_tasks(self.limit) + tasks: List[ScopedKey] = self.claim_tasks(self.claim_limit) self.logger.info("Claimed %d tasks", len([t for t in tasks if t is not None])) # if no tasks claimed, sleep @@ -306,16 +329,32 @@ def cycle(self, task_limit): self.logger.info("Executing task '%s'...", task) self.execute(task) self.logger.info("Completed task '%s'", task) - self.counter += 1 + self._tasks_counter += 1 + + # stop checks + self._check_max_tasks(max_tasks) + self._check_max_time(max_time) - def start(self, task_limit: Optional[int] = None): + self._check_max_tasks(max_tasks) + self._check_max_time(max_time) + + def start(self, + max_tasks: Optional[int] = None, + max_time: Optional[int] = None): """Start the service. + Limits to the maximum number of executed tasks or seconds to run for + can be set. The first maximum to be hit will trigger the service to + exit. + Parameters ---------- - task_limit - Number of Tasks to complete before exiting. - If `None`, the service will continue until told to stop. + max_tasks + Max number of Tasks to execute before exiting. + If `None`, the service will have no task limit. + max_time + Max number of seconds to run before exiting. + If `None`, the service will have no time limit. 
""" # add ComputeServiceRegistration @@ -329,9 +368,13 @@ def start(self, task_limit: Optional[int] = None): self.heartbeat_thread = threading.Thread(target=self.heartbeat, daemon=True) self.heartbeat_thread.start() + # stop conditions will use these + self._tasks_counter = 0 + self._start_time = time.time() + try: self.logger.info("Starting main loop") - while True: + while not self._stop: # check that heartbeat is still alive; if not, resurrect it if not self.heartbeat_thread.is_alive(): self.heartbeat_thread = threading.Thread( @@ -340,7 +383,7 @@ def start(self, task_limit: Optional[int] = None): self.heartbeat_thread.start() # perform main loop cycle - self.cycle(task_limit=task_limit) + self.cycle(max_tasks, max_time) except KeyboardInterrupt: self.logger.info("Caught SIGINT/Keyboard interrupt.") except SleepInterrupted: @@ -383,29 +426,3 @@ def start(self): def stop(self): self._stop = True - - -class AlchemiscaleComputeService(AsynchronousComputeService): - """Folding@Home-based compute service. - - This service is designed for production use with Folding@Home. - - """ - - def __init__(self, object_store, fah_work_server): - self.scheduler = sched.scheduler(time.time, self.int_sleep) - self.loop = asyncio.get_event_loop() - - self._stop = False - - async def get_new_tasks(self): - ... - - def start(self): - ... - while True: - if self._stop: - return - - def stop(self): - ... diff --git a/alchemiscale/tests/integration/test_cli.py b/alchemiscale/tests/integration/test_cli.py index 7ea81f1f..16fa8b80 100644 --- a/alchemiscale/tests/integration/test_cli.py +++ b/alchemiscale/tests/integration/test_cli.py @@ -1,9 +1,14 @@ import pytest from click.testing import CliRunner +import time import contextlib import os import traceback +import multiprocessing +from datetime import datetime, timedelta + +import yaml import requests from fastapi import FastAPI @@ -106,7 +111,8 @@ def test_api(n4js, s3os): assert response.json() == expected_ping -def test_compute_api(n4js, s3os): +@pytest.fixture +def compute_api_args(): workers = 2 host = "127.0.0.1" port = 50100 @@ -136,11 +142,38 @@ def test_compute_api(n4js, s3os): ] jwt_opts = [] # leaving empty, we have default behavior for these + return host, port, (command + api_opts + db_opts + s3_opts + jwt_opts) + + +@pytest.fixture +def compute_service_config(compute_api_args): + host, port, _ = compute_api_args + + config = {'init': { + 'api_url': f'http://{host}:{port}', + 'identifier': 'test-compute-user', + 'key': "test-comute-user-key", + 'name': 'test-compute-service', + 'shared_basedir': "./shared", + 'scratch_basedir': "./scratch", + 'loglevel': 'INFO' + }, + 'start': { + 'max_time': None + } + } + + return config + + +def test_compute_api(n4js, s3os, compute_api_args): + host, port, args = compute_api_args + expected_ping = {"api": "AlchemiscaleComputeAPI"} runner = CliRunner() with running_service( - runner.invoke, port, (cli, command + api_opts + db_opts + s3_opts + jwt_opts) + runner.invoke, port, (cli, args) ): response = requests.get(f"http://{host}:{port}/ping") @@ -148,6 +181,57 @@ def test_compute_api(n4js, s3os): assert response.json() == expected_ping +def test_compute_synchronous(n4js_fresh, s3os, compute_api_args, compute_service_config, tmpdir): + host, port, args = compute_api_args + n4js = n4js_fresh + + # create compute identity; add all scope access + identity = CredentialedComputeIdentity( + identifier=compute_service_config['init']['identifier'], + hashed_key=hash_key(compute_service_config['init']['key']), + ) + + 
n4js.create_credentialed_entity(identity) + n4js.add_scope(identity.identifier, CredentialedComputeIdentity, Scope()) + + # start up compute API + runner = CliRunner() + with running_service( + runner.invoke, port, (cli, args) + ): + # start up compute service + with tmpdir.as_cwd(): + command = ["compute", "synchronous"] + opts = ["--config-file", 'config.yaml'] + + with open('config.yaml', 'w') as f: + yaml.dump(compute_service_config, f) + + multiprocessing.set_start_method("fork", force=True) + proc = multiprocessing.Process(target=runner.invoke, + args=(cli, command + opts), + daemon=True) + proc.start() + + q = f""" + match (csreg:ComputeServiceRegistration) + where csreg.identifier =~ "{compute_service_config['init']['name']}.*" + return csreg + """ + # try 5 times to be safe; depends on running host as to how fast + # process comes up + for i in range(5): + csreg = n4js.graph.run(q).to_subgraph() + if csreg is None: + time.sleep(1) + else: + break + + assert csreg['registered'] > datetime.utcnow() - timedelta(seconds=30) + + proc.terminate() + proc.join() + @pytest.mark.parametrize( "cli_vars", [ @@ -183,7 +267,8 @@ def test_get_settings_from_options(cli_vars): assert expected[key] == settings_dict[key] -def test_database_init(n4js): +def test_database_init(n4js_fresh): + n4js = n4js_fresh # ensure the database is empty n4js.graph.run("MATCH (n) WHERE NOT n:NOPE DETACH DELETE n") From 20770f3cfd1e8569c47bc07e0b66b887b10a927d Mon Sep 17 00:00:00 2001 From: David Dotson Date: Fri, 24 Mar 2023 17:29:25 -0700 Subject: [PATCH 34/41] Black! --- alchemiscale/cli.py | 8 ++-- alchemiscale/compute/service.py | 17 +++---- alchemiscale/tests/integration/test_cli.py | 54 +++++++++++----------- 3 files changed, 40 insertions(+), 39 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index 23ac0285..b2419439 100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -353,11 +353,13 @@ def synchronous(config_file): params = yaml.safe_load(config_file, Loader=yaml.Loader) - params_init = params['init'] - params_start = params['start'] + params_init = params["init"] + params_start = params["start"] if "scopes" in params_init: - params_init["scopes"] = [Scope.from_str(scope) for scope in params_init["scopes"]] + params_init["scopes"] = [ + Scope.from_str(scope) for scope in params_init["scopes"] + ] service = SynchronousComputeService(**params_init) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 0217acff..b975c8b1 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -264,8 +264,8 @@ def execute(self, task: ScopedKey) -> ScopedKey: # execute the task; this looks the same whether the ProtocolDAG is a # success or failure shared_tmp = tempfile.TemporaryDirectory( - prefix=f"{str(protocoldag.key)}__", - dir=self.shared_basedir) + prefix=f"{str(protocoldag.key)}__", dir=self.shared_basedir + ) shared = Path(shared_tmp.name) protocoldagresult = execute_DAG( @@ -288,7 +288,9 @@ def _check_max_tasks(self, max_tasks): if max_tasks is not None: if self._tasks_counter >= max_tasks: self.logger.info( - "Performed %s tasks; at or beyond max tasks = %s", self._tasks_counter, max_tasks + "Performed %s tasks; at or beyond max tasks = %s", + self._tasks_counter, + max_tasks, ) self._stop = True @@ -297,12 +299,13 @@ def _check_max_time(self, max_time): run_time = time.time() - self._start_time if run_time >= max_time: self.logger.info( - "Ran for %s seconds; at or beyond max time = %s seconds", run_time, max_time + "Ran for %s 
seconds; at or beyond max time = %s seconds", + run_time, + max_time, ) self._stop = True def cycle(self, max_tasks, max_time): - self._check_max_tasks(max_tasks) self._check_max_time(max_time) @@ -338,9 +341,7 @@ def cycle(self, max_tasks, max_time): self._check_max_tasks(max_tasks) self._check_max_time(max_time) - def start(self, - max_tasks: Optional[int] = None, - max_time: Optional[int] = None): + def start(self, max_tasks: Optional[int] = None, max_time: Optional[int] = None): """Start the service. Limits to the maximum number of executed tasks or seconds to run for diff --git a/alchemiscale/tests/integration/test_cli.py b/alchemiscale/tests/integration/test_cli.py index 16fa8b80..851fe9af 100644 --- a/alchemiscale/tests/integration/test_cli.py +++ b/alchemiscale/tests/integration/test_cli.py @@ -149,19 +149,18 @@ def compute_api_args(): def compute_service_config(compute_api_args): host, port, _ = compute_api_args - config = {'init': { - 'api_url': f'http://{host}:{port}', - 'identifier': 'test-compute-user', - 'key': "test-comute-user-key", - 'name': 'test-compute-service', - 'shared_basedir': "./shared", - 'scratch_basedir': "./scratch", - 'loglevel': 'INFO' + config = { + "init": { + "api_url": f"http://{host}:{port}", + "identifier": "test-compute-user", + "key": "test-comute-user-key", + "name": "test-compute-service", + "shared_basedir": "./shared", + "scratch_basedir": "./scratch", + "loglevel": "INFO", }, - 'start': { - 'max_time': None - } - } + "start": {"max_time": None}, + } return config @@ -172,23 +171,23 @@ def test_compute_api(n4js, s3os, compute_api_args): expected_ping = {"api": "AlchemiscaleComputeAPI"} runner = CliRunner() - with running_service( - runner.invoke, port, (cli, args) - ): + with running_service(runner.invoke, port, (cli, args)): response = requests.get(f"http://{host}:{port}/ping") assert response.status_code == 200 assert response.json() == expected_ping -def test_compute_synchronous(n4js_fresh, s3os, compute_api_args, compute_service_config, tmpdir): +def test_compute_synchronous( + n4js_fresh, s3os, compute_api_args, compute_service_config, tmpdir +): host, port, args = compute_api_args n4js = n4js_fresh # create compute identity; add all scope access identity = CredentialedComputeIdentity( - identifier=compute_service_config['init']['identifier'], - hashed_key=hash_key(compute_service_config['init']['key']), + identifier=compute_service_config["init"]["identifier"], + hashed_key=hash_key(compute_service_config["init"]["key"]), ) n4js.create_credentialed_entity(identity) @@ -196,21 +195,19 @@ def test_compute_synchronous(n4js_fresh, s3os, compute_api_args, compute_service # start up compute API runner = CliRunner() - with running_service( - runner.invoke, port, (cli, args) - ): + with running_service(runner.invoke, port, (cli, args)): # start up compute service with tmpdir.as_cwd(): command = ["compute", "synchronous"] - opts = ["--config-file", 'config.yaml'] + opts = ["--config-file", "config.yaml"] - with open('config.yaml', 'w') as f: + with open("config.yaml", "w") as f: yaml.dump(compute_service_config, f) - + multiprocessing.set_start_method("fork", force=True) - proc = multiprocessing.Process(target=runner.invoke, - args=(cli, command + opts), - daemon=True) + proc = multiprocessing.Process( + target=runner.invoke, args=(cli, command + opts), daemon=True + ) proc.start() q = f""" @@ -227,11 +224,12 @@ def test_compute_synchronous(n4js_fresh, s3os, compute_api_args, compute_service else: break - assert csreg['registered'] > 
datetime.utcnow() - timedelta(seconds=30) + assert csreg["registered"] > datetime.utcnow() - timedelta(seconds=30) proc.terminate() proc.join() + @pytest.mark.parametrize( "cli_vars", [ From 09fb52719cca050ee3380682e9fc583fa4229b4d Mon Sep 17 00:00:00 2001 From: David Dotson Date: Mon, 27 Mar 2023 14:06:36 -0700 Subject: [PATCH 35/41] Added additional tests for SynchronousComputeService --- alchemiscale/compute/service.py | 6 +- .../compute/client/test_compute_service.py | 145 +++++++++++++++++- 2 files changed, 146 insertions(+), 5 deletions(-) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index b975c8b1..78a21f2d 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -305,7 +305,7 @@ def _check_max_time(self, max_time): ) self._stop = True - def cycle(self, max_tasks, max_time): + def cycle(self, max_tasks: Optional[int] = None, max_time: Optional[int] = None): self._check_max_tasks(max_tasks) self._check_max_time(max_time) @@ -332,7 +332,9 @@ def cycle(self, max_tasks, max_time): self.logger.info("Executing task '%s'...", task) self.execute(task) self.logger.info("Completed task '%s'", task) - self._tasks_counter += 1 + + if max_tasks is not None: + self._tasks_counter += 1 # stop checks self._check_max_tasks(max_tasks) diff --git a/alchemiscale/tests/integration/compute/client/test_compute_service.py b/alchemiscale/tests/integration/compute/client/test_compute_service.py index b5b70de3..988fc106 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_service.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_service.py @@ -1,11 +1,15 @@ from typing import List, Optional +import threading +import time +import os import pytest from pathlib import Path -from alchemiscale.models import ScopedKey +from alchemiscale.models import ScopedKey, Scope from alchemiscale.storage.statestore import Neo4jStore +from alchemiscale.storage.objectstore import S3ObjectStore from alchemiscale.compute.service import SynchronousComputeService @@ -22,8 +26,36 @@ def service(self, n4js_preloaded, compute_client, tmpdir): name="test_compute_service", shared_basedir=Path("shared").absolute(), scratch_basedir=Path("scratch").absolute(), + heartbeat_interval=1, + sleep_interval=1 ) + def test_heartbeat(self, n4js_preloaded, service): + n4js: Neo4jStore = n4js_preloaded + + # register service; normally happens on service start, but needed + # for heartbeats + service._register() + + # start up heartbeat thread + heartbeat_thread = threading.Thread(target=service.heartbeat, daemon=True) + heartbeat_thread.start() + + # give time for a heartbeat + time.sleep(2) + + q = f""" + match (csreg:ComputeServiceRegistration {{identifier: '{service.compute_service_id}'}}) + return csreg + """ + csreg = n4js.graph.run(q).to_subgraph() + assert csreg["registered"] < csreg["heartbeat"] + + # stop the service; should trigger heartbeat to stop + service.stop() + time.sleep(2) + assert not heartbeat_thread.is_alive() + def test_claim_tasks(self, n4js_preloaded, service): n4js: Neo4jStore = n4js_preloaded @@ -34,7 +66,15 @@ def test_claim_tasks(self, n4js_preloaded, service): # should have 2 tasks assert len(task_sks) == 2 - def test_get_task_transformation( + subgraph = n4js.graph.run(f""" + match (csreg:ComputeServiceRegistration {{identifier: '{service.compute_service_id}'}}), + (csreg)-[:CLAIMS]->(t:Task) + return csreg, t + """).to_subgraph() + + assert len([node for node in subgraph.nodes if 'Task' in node.labels]) == 2 + + def 
test_task_to_protocoldag(
         self, n4js_preloaded, service, network_tyk2, scope_test
     ):
         n4js: Neo4jStore = n4js_preloaded
@@ -49,10 +89,16 @@ def test_get_task_transformation(
 
         assert len(protocoldag.protocol_units) == 23
 
+    def test_push_result(self):
+        # already tested with the client test for `set_task_result`
+        # expand this test when we have result path handling added
+        ...
+
     def test_execute(
         self, n4js_preloaded, s3os_server_fresh, service, network_tyk2, scope_test
     ):
         n4js: Neo4jStore = n4js_preloaded
+        s3os: S3ObjectStore = s3os_server_fresh
 
         network_sk = n4js.get_scoped_key(network_tyk2, scope_test)
         tq_sk = n4js.get_taskhub(network_sk)
@@ -60,4 +106,97 @@ def test_execute(
 
         protocoldagresultref_sk = service.execute(task_sks[0])
 
-        # TODO: check that we can pull the result
+        # examine object metadata
+        protocoldagresultref = n4js.get_gufe(protocoldagresultref_sk)
+        objs = list(s3os.resource.Bucket(s3os.bucket).objects.all())
+        assert len(objs) == 1
+        assert objs[0].key == os.path.join(s3os.prefix, protocoldagresultref.location)
+
+    def test_cycle(self, n4js_preloaded, s3os_server_fresh, service):
+        n4js: Neo4jStore = n4js_preloaded
+        s3os: S3ObjectStore = s3os_server_fresh
+
+        service._register()
+
+        q = """
+        match (pdr:ProtocolDAGResultRef)
+        return pdr
+        """
+
+        # preconditions
+        protocoldagresultref = n4js.graph.run(q).to_subgraph()
+        assert protocoldagresultref is None
+
+        service.cycle()
+
+        # postconditions
+        protocoldagresultref = n4js.graph.run(q).to_subgraph()
+        assert protocoldagresultref is not None
+        assert protocoldagresultref['ok'] == True
+
+        task = n4js.graph.run("""
+            match (t:Task {status: 'complete'})
+            return t
+            """).to_subgraph()
+
+        assert task is not None
+
+    def test_cycle_max_tasks(self):
+        ...
+
+    def test_cycle_max_time(self):
+        ...
+
+    def test_start(self, n4js_preloaded, s3os_server_fresh, service):
+        n4js: Neo4jStore = n4js_preloaded
+        s3os: S3ObjectStore = s3os_server_fresh
+
+        # start up service in a thread; will register itself
+        service_thread = threading.Thread(target=service.start, daemon=True)
+        service_thread.start()
+
+        # give time for execution
+        time.sleep(2)
+
+        q = f"""
+        match (csreg:ComputeServiceRegistration {{identifier: '{service.compute_service_id}'}})
+        return csreg
+        """
+        csreg = n4js.graph.run(q).to_subgraph()
+        assert csreg["registered"] < csreg["heartbeat"]
+
+        # stop the service
+        service.stop()
+        while True:
+            if service_thread.is_alive():
+                time.sleep(1)
+            else:
+                break
+
+        task = n4js.graph.run("""
+            match (t:Task {status: 'complete'})
+            return t
+            """).to_subgraph()
+
+        assert task is not None
+
+        # service should now be deregistered
+        csreg = n4js.graph.run(q).to_subgraph()
+        assert csreg is None
+
+    def test_stop(self):
+        # tested as part of tests above to stop threaded components that
+        # otherwise run forever
+        ...
+
+    # init kwarg tests
+
+    def test_kwarg_keep_shared(self):
+        ...
+
+    def test_kwarg_keep_scratch(self):
+        ...
+
+    def test_kwarg_scopes(self):
+        # TODO: add test here with alternative settings to `service` fixture
+        scope = Scope('totally', 'different', 'scope')

From bdbd223fcc7b8e596258fa2142b8e4dc7824ca4e Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Mon, 27 Mar 2023 14:07:02 -0700
Subject: [PATCH 36/41] Black!
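Formatting pass only, no functional changes; presumably the result of
running black over the package, along the lines of:

    black alchemiscale

(the exact black version and configuration are not recorded in this
series).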
--- .../compute/client/test_compute_service.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/alchemiscale/tests/integration/compute/client/test_compute_service.py b/alchemiscale/tests/integration/compute/client/test_compute_service.py index 988fc106..535cd898 100644 --- a/alchemiscale/tests/integration/compute/client/test_compute_service.py +++ b/alchemiscale/tests/integration/compute/client/test_compute_service.py @@ -27,7 +27,7 @@ def service(self, n4js_preloaded, compute_client, tmpdir): shared_basedir=Path("shared").absolute(), scratch_basedir=Path("scratch").absolute(), heartbeat_interval=1, - sleep_interval=1 + sleep_interval=1, ) def test_heartbeat(self, n4js_preloaded, service): @@ -40,7 +40,7 @@ def test_heartbeat(self, n4js_preloaded, service): # start up heartbeat thread heartbeat_thread = threading.Thread(target=service.heartbeat, daemon=True) heartbeat_thread.start() - + # give time for a heartbeat time.sleep(2) @@ -66,13 +66,15 @@ def test_claim_tasks(self, n4js_preloaded, service): # should have 2 tasks assert len(task_sks) == 2 - subgraph = n4js.graph.run(f""" + subgraph = n4js.graph.run( + f""" match (csreg:ComputeServiceRegistration {{identifier: '{service.compute_service_id}'}}), (csreg)-[:CLAIMS]->(t:Task) return csreg, t - """).to_subgraph() + """ + ).to_subgraph() - assert len([node for node in subgraph.nodes if 'Task' in node.labels]) == 2 + assert len([node for node in subgraph.nodes if "Task" in node.labels]) == 2 def test_task_to_protocoldag( self, n4js_preloaded, service, network_tyk2, scope_test @@ -132,12 +134,14 @@ def test_cycle(self, n4js_preloaded, s3os_server_fresh, service): # postconditions protocoldagresultref = n4js.graph.run(q).to_subgraph() assert protocoldagresultref is not None - assert protocoldagresultref['ok'] == True + assert protocoldagresultref["ok"] == True - task = n4js.graph.run(""" + task = n4js.graph.run( + """ match (t:Task {status: 'complete'}) return t - """).to_subgraph() + """ + ).to_subgraph() assert task is not None @@ -154,7 +158,7 @@ def test_start(self, n4js_preloaded, s3os_server_fresh, service): # start up service in a thread; will register itself service_thread = threading.Thread(target=service.start, daemon=True) service_thread.start() - + # give time for execution time.sleep(2) @@ -173,10 +177,12 @@ def test_start(self, n4js_preloaded, s3os_server_fresh, service): else: break - task = n4js.graph.run(""" + task = n4js.graph.run( + """ match (t:Task {status: 'complete'}) return t - """).to_subgraph() + """ + ).to_subgraph() assert task is not None @@ -199,4 +205,4 @@ def test_kwarg_keep_scratch(self): def test_kwarg_scopes(self): # TODO: add test here with alternative settings to `service` fixture - scope = Scope('totally', 'different', 'scope') + scope = Scope("totally", "different", "scope") From c2c850c7687cfbb70be801fd576ac63a58d7fbb3 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Tue, 28 Mar 2023 00:08:15 -0700 Subject: [PATCH 37/41] Fix tests --- alchemiscale/cli.py | 6 +++--- alchemiscale/compute/api.py | 12 ++++++------ alchemiscale/compute/service.py | 2 +- alchemiscale/storage/statestore.py | 18 +++++++++++++++++- alchemiscale/tests/integration/conftest.py | 16 ++++++++++++---- .../interface/client/test_client.py | 9 ++++++++- .../integration/storage/test_statestore.py | 10 ++++++++-- alchemiscale/tests/integration/test_cli.py | 6 ++---- 8 files changed, 57 insertions(+), 22 deletions(-) diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py index b2419439..ab9c5cbc 
100644 --- a/alchemiscale/cli.py +++ b/alchemiscale/cli.py @@ -351,10 +351,10 @@ def synchronous(config_file): from alchemiscale.models import Scope from alchemiscale.compute.service import SynchronousComputeService - params = yaml.safe_load(config_file, Loader=yaml.Loader) + params = yaml.safe_load(config_file) - params_init = params["init"] - params_start = params["start"] + params_init = params.get("init", {}) + params_start = params.get("start", {}) if "scopes" in params_init: params_init["scopes"] = [ diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index edd577dc..e5d8272e 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -107,9 +107,9 @@ async def register_computeservice( identifier=compute_service_id, registered=now, heartbeat=now ) - n4js.register_computeservice(csreg) + compute_service_id_ = n4js.register_computeservice(csreg) - return compute_service_id + return compute_service_id_ @router.post("/computeservice/{compute_service_id}/deregister") @@ -117,9 +117,9 @@ async def deregister_computeservice( compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - n4js.deregister_computeservice(ComputeServiceID(compute_service_id)) + compute_service_id_ = n4js.deregister_computeservice(ComputeServiceID(compute_service_id)) - return compute_service_id + return compute_service_id_ @router.post("/computeservice/{compute_service_id}/heartbeat") @@ -137,9 +137,9 @@ async def heartbeat_computeservice( expire_time = now - expire_delta n4js.expire_registrations(expire_time) - n4js.heartbeat_computeservice(compute_service_id, now) + compute_service_id_ = n4js.heartbeat_computeservice(compute_service_id, now) - return compute_service_id + return compute_service_id_ @router.get("/taskhubs") diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 78a21f2d..a925674b 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -331,7 +331,7 @@ def cycle(self, max_tasks: Optional[int] = None, max_time: Optional[int] = None) # execute each task self.logger.info("Executing task '%s'...", task) self.execute(task) - self.logger.info("Completed task '%s'", task) + self.logger.info("Finished task '%s'", task) if max_tasks is not None: self._tasks_counter += 1 diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index 75def31b..8be357b7 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -767,6 +767,8 @@ def register_computeservice( primary_key="identifier", ) + return compute_service_registration.identifier + def deregister_computeservice(self, compute_service_id: ComputeServiceID): """Remove the registration for the given ComputeServiceID from the state store. 
@@ -790,6 +792,8 @@ def deregister_computeservice(self, compute_service_id: ComputeServiceID): with self.transaction() as tx: tx.run(q) + return compute_service_id + def heartbeat_computeservice( self, compute_service_id: ComputeServiceID, heartbeat: datetime ): @@ -803,6 +807,8 @@ def heartbeat_computeservice( with self.transaction() as tx: tx.run(q) + return compute_service_id + def expire_registrations(self, expire_time: datetime): """Remove all registrations with last heartbeat prior to the given `expire_time`.""" q = f""" @@ -814,10 +820,20 @@ def expire_registrations(self, expire_time: datetime): OPTIONAL MATCH (n)-[cl:CLAIMS]->(t:Task {{status: 'running'}}) SET t.status = 'waiting' + WITH n, n.identifier as ident + DETACH DELETE n + + RETURN ident """ with self.transaction() as tx: - tx.run(q) + res = tx.run(q) + + identities = set() + for rec in res: + identities.add(rec['ident']) + + return list(identities) ## task hubs diff --git a/alchemiscale/tests/integration/conftest.py b/alchemiscale/tests/integration/conftest.py index 7a9833e8..4f0653a8 100644 --- a/alchemiscale/tests/integration/conftest.py +++ b/alchemiscale/tests/integration/conftest.py @@ -277,7 +277,12 @@ def protocoldagresults(tmpdir_factory, transformation): # execute the task with tmpdir_factory.mktemp("protocol_dag").as_cwd(): - protocoldagresult = execute_DAG(protocoldag, shared=Path(".").absolute()) + shared = Path("shared").absolute() + shared.mkdir() + scratch_basedir = Path("scratch").absolute() + scratch_basedir.mkdir() + + protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir) pdrs.append(protocoldagresult) return pdrs @@ -313,9 +318,12 @@ def protocoldagresults_failure(tmpdir_factory, transformation_failure): # execute the task with tmpdir_factory.mktemp("protocol_dag").as_cwd(): - protocoldagresult = execute_DAG( - protocoldag, shared=Path(".").absolute(), raise_error=False - ) + shared = Path("shared").absolute() + shared.mkdir() + scratch_basedir = Path("scratch").absolute() + scratch_basedir.mkdir() + + protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir, raise_error=False) pdrs.append(protocoldagresult) return pdrs diff --git a/alchemiscale/tests/integration/interface/client/test_client.py b/alchemiscale/tests/integration/interface/client/test_client.py index 6fda0209..82d8413d 100644 --- a/alchemiscale/tests/integration/interface/client/test_client.py +++ b/alchemiscale/tests/integration/interface/client/test_client.py @@ -1,5 +1,6 @@ import pytest from time import sleep +from pathlib import Path from gufe import AlchemicalNetwork, ChemicalSystem, Transformation from gufe.tokenization import TOKENIZABLE_REGISTRY, GufeKey @@ -251,6 +252,12 @@ def test_cancel_tasks( @staticmethod def _execute_tasks(tasks, n4js, s3os_server): + + shared = Path("shared").absolute() + shared.mkdir() + scratch_basedir = Path("scratch").absolute() + scratch_basedir.mkdir() + protocoldagresults = [] for task_sk in tasks: if task_sk is None: @@ -267,7 +274,7 @@ def _execute_tasks(tasks, n4js, s3os_server): name=str(task_sk), ) - protocoldagresult = execute_DAG(protocoldag, raise_error=False) + protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir, raise_error=False) assert protocoldagresult.transformation_key == transformation.key if extends_protocoldagresult: diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index 357abb7a..acb2bbfb 
100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -347,7 +347,7 @@ def test_expire_registrations(self, n4js, compute_service_id): # expire any compute service that had a heartbeat more than 30 mins ago thirty_mins_ago = now - timedelta(minutes=30) - n4js.expire_registrations(expire_time=thirty_mins_ago) + identities = n4js.expire_registrations(expire_time=thirty_mins_ago) csreg = n4js.graph.run( f""" @@ -357,6 +357,7 @@ def test_expire_registrations(self, n4js, compute_service_id): ).to_subgraph() assert csreg is None + assert compute_service_id in identities def test_create_task(self, n4js, network_tyk2, scope_test): # add alchemical network, then try generating task @@ -1024,7 +1025,12 @@ def test_set_task_result(self, n4js: Neo4jStore, network_tyk2, scope_test, tmpdi # execute the task with tmpdir.as_cwd(): - protocoldagresult = execute_DAG(protocoldag, shared=Path(".").absolute()) + shared = Path("shared").absolute() + shared.mkdir() + scratch_basedir = Path("scratch").absolute() + scratch_basedir.mkdir() + + protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir) pdr_ref = ProtocolDAGResultRef( scope=task_sk.scope, obj_key=protocoldagresult.key, ok=True diff --git a/alchemiscale/tests/integration/test_cli.py b/alchemiscale/tests/integration/test_cli.py index 851fe9af..8e986316 100644 --- a/alchemiscale/tests/integration/test_cli.py +++ b/alchemiscale/tests/integration/test_cli.py @@ -215,9 +215,7 @@ def test_compute_synchronous( where csreg.identifier =~ "{compute_service_config['init']['name']}.*" return csreg """ - # try 5 times to be safe; depends on running host as to how fast - # process comes up - for i in range(5): + while True: csreg = n4js.graph.run(q).to_subgraph() if csreg is None: time.sleep(1) @@ -268,7 +266,7 @@ def test_get_settings_from_options(cli_vars): def test_database_init(n4js_fresh): n4js = n4js_fresh # ensure the database is empty - n4js.graph.run("MATCH (n) WHERE NOT n:NOPE DETACH DELETE n") + n4js.reset() with pytest.raises(Neo4JStoreError): n4js.check() From e4b8559ea8a4e8e7a1a7409af71597ea6abecf8d Mon Sep 17 00:00:00 2001 From: David Dotson Date: Tue, 28 Mar 2023 00:08:58 -0700 Subject: [PATCH 38/41] Black! 
--- alchemiscale/compute/api.py | 4 +++- alchemiscale/storage/statestore.py | 2 +- alchemiscale/tests/integration/conftest.py | 11 +++++++++-- .../tests/integration/interface/client/test_client.py | 8 ++++++-- .../tests/integration/storage/test_statestore.py | 4 +++- 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py index e5d8272e..28e5e2f4 100644 --- a/alchemiscale/compute/api.py +++ b/alchemiscale/compute/api.py @@ -117,7 +117,9 @@ async def deregister_computeservice( compute_service_id, n4js: Neo4jStore = Depends(get_n4js_depends), ): - compute_service_id_ = n4js.deregister_computeservice(ComputeServiceID(compute_service_id)) + compute_service_id_ = n4js.deregister_computeservice( + ComputeServiceID(compute_service_id) + ) return compute_service_id_ diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index 8be357b7..d3757f88 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -831,7 +831,7 @@ def expire_registrations(self, expire_time: datetime): identities = set() for rec in res: - identities.add(rec['ident']) + identities.add(rec["ident"]) return list(identities) diff --git a/alchemiscale/tests/integration/conftest.py b/alchemiscale/tests/integration/conftest.py index 4f0653a8..499058b5 100644 --- a/alchemiscale/tests/integration/conftest.py +++ b/alchemiscale/tests/integration/conftest.py @@ -282,7 +282,9 @@ def protocoldagresults(tmpdir_factory, transformation): scratch_basedir = Path("scratch").absolute() scratch_basedir.mkdir() - protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir) + protocoldagresult = execute_DAG( + protocoldag, shared=shared, scratch_basedir=scratch_basedir + ) pdrs.append(protocoldagresult) return pdrs @@ -323,7 +325,12 @@ def protocoldagresults_failure(tmpdir_factory, transformation_failure): scratch_basedir = Path("scratch").absolute() scratch_basedir.mkdir() - protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir, raise_error=False) + protocoldagresult = execute_DAG( + protocoldag, + shared=shared, + scratch_basedir=scratch_basedir, + raise_error=False, + ) pdrs.append(protocoldagresult) return pdrs diff --git a/alchemiscale/tests/integration/interface/client/test_client.py b/alchemiscale/tests/integration/interface/client/test_client.py index 82d8413d..32b5277d 100644 --- a/alchemiscale/tests/integration/interface/client/test_client.py +++ b/alchemiscale/tests/integration/interface/client/test_client.py @@ -252,7 +252,6 @@ def test_cancel_tasks( @staticmethod def _execute_tasks(tasks, n4js, s3os_server): - shared = Path("shared").absolute() shared.mkdir() scratch_basedir = Path("scratch").absolute() @@ -274,7 +273,12 @@ def _execute_tasks(tasks, n4js, s3os_server): name=str(task_sk), ) - protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir, raise_error=False) + protocoldagresult = execute_DAG( + protocoldag, + shared=shared, + scratch_basedir=scratch_basedir, + raise_error=False, + ) assert protocoldagresult.transformation_key == transformation.key if extends_protocoldagresult: diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index acb2bbfb..a5d8552a 100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -1030,7 +1030,9 @@ def 
test_set_task_result(self, n4js: Neo4jStore, network_tyk2, scope_test, tmpdi scratch_basedir = Path("scratch").absolute() scratch_basedir.mkdir() - protocoldagresult = execute_DAG(protocoldag, shared=shared, scratch_basedir=scratch_basedir) + protocoldagresult = execute_DAG( + protocoldag, shared=shared, scratch_basedir=scratch_basedir + ) pdr_ref = ProtocolDAGResultRef( scope=task_sk.scope, obj_key=protocoldagresult.key, ok=True From 5fca2269a6780c13c706a7a5e450513ded92a2ec Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 29 Mar 2023 22:29:19 -0700 Subject: [PATCH 39/41] Update alchemiscale/compute/service.py Co-authored-by: Hugo MacDermott-Opeskin --- alchemiscale/compute/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index a925674b..73183625 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -198,7 +198,7 @@ def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]: while len(tasks) < count and len(taskhubs) > 0: # based on weights, choose taskhub to draw from taskhub: List[ScopedKey] = random.choices( - list(taskhubs.keys()), weights=[tq.weight for tq in taskhubs.values()] + list(taskhubs.keys()), weights=[th.weight for th in taskhubs.values()] )[0] # claim tasks from the taskhub From af7f7eb2145776a9b42e527abf4aded917682eb3 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 30 Mar 2023 20:35:41 -0700 Subject: [PATCH 40/41] Additions from @hmacdope review --- alchemiscale/compute/service.py | 3 + alchemiscale/storage/statestore.py | 66 +++++++++++++++---- .../integration/storage/test_statestore.py | 8 ++- .../configs/synchronous-compute-settings.yaml | 57 ++++++++++++++++ 4 files changed, 120 insertions(+), 14 deletions(-) create mode 100644 devtools/configs/synchronous-compute-settings.yaml diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py index 73183625..f6dd4220 100644 --- a/alchemiscale/compute/service.py +++ b/alchemiscale/compute/service.py @@ -121,6 +121,9 @@ def __init__( by compute identity. claim_limit Maximum number of Tasks to claim at a time from a TaskHub. + loglevel + The loglevel at which to report via STDOUT; see the :mod:`logging` + docs for available levels. """ self.api_url = api_url diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index d3757f88..d5c519c3 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -160,6 +160,7 @@ def transaction(self, readonly=False, ignore_exceptions=False) -> Transaction: self.graph.rollback(tx) if not ignore_exceptions: raise + else: self.graph.commit(tx) @@ -377,7 +378,8 @@ def _subgraph_to_gufe( ) -> Dict[Node, GufeTokenizable]: """Get a Dict `GufeTokenizable` objects within the given subgraph. - Any `GufeTokenizable` that requires nodes or relationships missing from the subgraph will not be returned. + Any `GufeTokenizable` that requires nodes or relationships missing from + the subgraph will not be returned. """ nxg = self._subgraph_to_networkx(subgraph) @@ -743,7 +745,12 @@ def set_strategy( network: ScopedKey, ) -> ScopedKey: """Set the compute Strategy for the given AlchemicalNetwork.""" - ... 
+ + if network.qualname != "AlchemicalNetwork": + raise ValueError( + "`network` ScopedKey does not correspond to an `AlchemicalNetwork`" + ) + raise NotImplementedError def register_computeservice( self, compute_service_registration: ComputeServiceRegistration @@ -761,11 +768,7 @@ def register_computeservice( ) with self.transaction() as tx: - tx.merge( - node, - primary_label="ComputeServiceRegistration", - primary_key="identifier", - ) + tx.create(node) return compute_service_registration.identifier @@ -780,24 +783,31 @@ def deregister_computeservice(self, compute_service_id: ComputeServiceID): and with status `running` will have their status set to `waiting`. """ + q = f""" MATCH (n:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) OPTIONAL MATCH (n)-[cl:CLAIMS]->(t:Task {{status: 'running'}}) SET t.status = 'waiting' + WITH n, n.identifier as identifier + DETACH DELETE n + + RETURN identifier """ with self.transaction() as tx: - tx.run(q) + res = tx.run(q) + identifier = next(res)['identifier'] - return compute_service_id + return ComputeServiceID(identifier) def heartbeat_computeservice( self, compute_service_id: ComputeServiceID, heartbeat: datetime ): """Update the heartbeat for the given ComputeServiceID.""" + q = f""" MATCH (n:ComputeServiceRegistration {{identifier: '{compute_service_id}'}}) SET n.heartbeat = localdatetime('{heartbeat.isoformat()}') @@ -833,7 +843,7 @@ def expire_registrations(self, expire_time: datetime): for rec in res: identities.add(rec["ident"]) - return list(identities) + return [ComputeServiceID(i) for i in identities] ## task hubs @@ -851,6 +861,11 @@ def create_taskhub( either way. """ + if network.qualname != "AlchemicalNetwork": + raise ValueError( + "`network` ScopedKey does not correspond to an `AlchemicalNetwork`" + ) + scope = network.scope network_node = self._get_node(network) @@ -905,6 +920,11 @@ def get_taskhub( Otherwise, return a `ScopedKey`. """ + if network.qualname != "AlchemicalNetwork": + raise ValueError( + "`network` ScopedKey does not correspond to an `AlchemicalNetwork`" + ) + node = self.graph.run( f""" match (th:TaskHub {{network: "{network}"}})-[:PERFORMS]->(an:AlchemicalNetwork) @@ -922,6 +942,12 @@ def delete_taskhub( network: ScopedKey, ) -> ScopedKey: """Delete a TaskHub for a given AlchemicalNetwork.""" + + if network.qualname != "AlchemicalNetwork": + raise ValueError( + "`network` ScopedKey does not correspond to an `AlchemicalNetwork`" + ) + taskhub = self.get_taskhub(network) q = f""" @@ -933,6 +959,16 @@ def delete_taskhub( return taskhub def set_taskhub_weight(self, network: ScopedKey, weight: float): + """Set the weight for the TaskHub associated with the given + AlchemicalNetwork. + + """ + + if network.qualname != "AlchemicalNetwork": + raise ValueError( + "`network` ScopedKey does not correspond to an `AlchemicalNetwork`" + ) + q = f""" MATCH (th:TaskHub {{network: "{network}"}}) SET th.weight = {weight} @@ -1303,8 +1339,7 @@ def create_task( `extends` input for the Task's eventual call to `Protocol.create`. 
""" - scope = transformation.scope - if "Transformation" not in transformation.qualname: + if transformation.qualname not in ["Transformation", "NonTransformation"]: raise ValueError( "`transformation` ScopedKey does not correspond to a `Transformation`" ) @@ -1312,6 +1347,7 @@ def create_task( if extends is not None and extends.qualname != "Task": raise ValueError("`extends` ScopedKey does not correspond to a `Task`") + scope = transformation.scope transformation_node = self._get_node(transformation) # create a new task for the supplied transformation @@ -1537,6 +1573,12 @@ def set_task_result( self, task: ScopedKey, protocoldagresultref: ProtocolDAGResultRef ) -> ScopedKey: """Set a `ProtocolDAGResultRef` pointing to a `ProtocolDAGResult` for the given `Task`.""" + + if task.qualname != "Task": + raise ValueError( + "`task` ScopedKey does not correspond to a `Task`" + ) + scope = task.scope task_node = self._get_node(task) diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py index a5d8552a..bdc47968 100644 --- a/alchemiscale/tests/integration/storage/test_statestore.py +++ b/alchemiscale/tests/integration/storage/test_statestore.py @@ -279,7 +279,9 @@ def test_register_computeservice(self, n4js, compute_service_id): identifier=compute_service_id, registered=now, heartbeat=now ) - n4js.register_computeservice(registration) + compute_service_id_ = n4js.register_computeservice(registration) + + assert compute_service_id == compute_service_id_ csreg = n4js.graph.run( f""" @@ -301,7 +303,9 @@ def test_deregister_computeservice(self, n4js, compute_service_id): n4js.register_computeservice(registration) # try deregistering - n4js.deregister_computeservice(compute_service_id) + compute_service_id_ = n4js.deregister_computeservice(compute_service_id) + + assert compute_service_id == compute_service_id_ csreg = n4js.graph.run( f""" diff --git a/devtools/configs/synchronous-compute-settings.yaml b/devtools/configs/synchronous-compute-settings.yaml new file mode 100644 index 00000000..77e728b3 --- /dev/null +++ b/devtools/configs/synchronous-compute-settings.yaml @@ -0,0 +1,57 @@ +--- +# options for service initialization +init: + + # URL of the compute API to execute Tasks for. + api_url: https://compute.alchemiscale-instance.localdomain + + # Identifier for the compute identity used for authentication. + identifier: compute-identity + + # Credential for the compute identity used for authentication. + key: "compute-identity-key" + + # The name to give this compute service; used for Task provenance, so + # typically set to a distinct value to distinguish different compute + # resources, e.g. different hosts or HPC clusters. + name: compute-resource-name + + # Filesystem path to use for `ProtocolDAG` `shared` space. + shared_basedir: "./shared" + + # Filesystem path to use for `ProtocolUnit` `scratch` space. + scratch_basedir: "./scratch" + + # If True, don't remove shared directories for `ProtocolDAG`s after + # completion. + keep_shared: False + + # If True, don't remove scratch directories for `ProtocolUnit`s after + # completion. + keep_scratch: False + + # Time in seconds to sleep if no Tasks claimed from compute API. + sleep_interval: 30 + + # Frequency at which to send heartbeats to compute API. + heartbeat_interval: 300 + + # Scopes to limit Task claiming to; defaults to all Scopes accessible by + # compute identity. 
+ scopes: + - '*-*-*' + + # The loglevel at which to report via STDOUT; see the :mod:`logging` docs for + # available levels. + loglevel: 'WARN' + +# options for service execution +start: + + # Max number of Tasks to execute before exiting. If `null`, the service will + # have no task limit. + max_tasks: null + + # Max number of seconds to run before exiting. If `null`, the service will + # have no time limit. + max_time: null From 83a72bcb305109b08644838ce96b3d4f8c4f38f4 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 30 Mar 2023 20:36:02 -0700 Subject: [PATCH 41/41] Black! --- alchemiscale/storage/statestore.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py index d5c519c3..d5ad080e 100644 --- a/alchemiscale/storage/statestore.py +++ b/alchemiscale/storage/statestore.py @@ -160,7 +160,7 @@ def transaction(self, readonly=False, ignore_exceptions=False) -> Transaction: self.graph.rollback(tx) if not ignore_exceptions: raise - + else: self.graph.commit(tx) @@ -799,7 +799,7 @@ def deregister_computeservice(self, compute_service_id: ComputeServiceID): with self.transaction() as tx: res = tx.run(q) - identifier = next(res)['identifier'] + identifier = next(res)["identifier"] return ComputeServiceID(identifier) @@ -1575,9 +1575,7 @@ def set_task_result( """Set a `ProtocolDAGResultRef` pointing to a `ProtocolDAGResult` for the given `Task`.""" if task.qualname != "Task": - raise ValueError( - "`task` ScopedKey does not correspond to a `Task`" - ) + raise ValueError("`task` ScopedKey does not correspond to a `Task`") scope = task.scope task_node = self._get_node(task)