Add wait_for_rest_service fn to evaluate method
Signed-off-by: Abhishree <[email protected]>
athitten committed Sep 30, 2024
1 parent a1832c4 commit 30ee09e
Showing 3 changed files with 51 additions and 131 deletions.
47 changes: 46 additions & 1 deletion nemo/collections/llm/api.py
@@ -427,7 +427,51 @@ def evaluate(

 from lm_eval import tasks, evaluator
 from lm_eval.api.model import LM
+import time
+import requests
+from requests.exceptions import RequestException
+
+def wait_for_rest_service(rest_url, max_retries=30, retry_interval=2):
+    """
+    Wait for the REST service to be ready.
+
+    Args:
+        rest_url (str): URL of the REST service's health endpoint
+        max_retries (int): Maximum number of retry attempts
+        retry_interval (int): Time to wait between retries in seconds
+
+    Returns:
+        bool: True if the REST service is ready, False otherwise
+    """
+    for _ in range(max_retries):
+        rest_ready = check_service(rest_url)
+
+        if rest_ready:
+            print("REST service is ready.")
+            return True
+
+        print(f"REST service not ready yet. Retrying in {retry_interval} seconds...")
+        time.sleep(retry_interval)
+
+    print("Timeout: REST service did not become ready.")
+    return False
+
+def check_service(url):
+    """
+    Check if a service is ready by making a GET request to its health endpoint.
+
+    Args:
+        url (str): URL of the service's health endpoint
+
+    Returns:
+        bool: True if the service is ready, False otherwise
+    """
+    try:
+        response = requests.get(url, timeout=5)
+        return response.status_code == 200
+    except RequestException:
+        return False
+
 class CustomModel(LM):
     def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k):
         self.model_name = model_name
@@ -475,8 +519,9 @@ def generate_until(self, inputs):
             results.append(generated_text)
 
         return results
-model = CustomModel(model_name, url, temperature, top_p, top_k)
 
+wait_for_rest_service(rest_url=f"{url}/health")
+model = CustomModel(model_name, url, max_tokens_to_generate, temperature, top_p, top_k)
 results = evaluator.simple_evaluate(
     model=model,
     tasks=eval_task,
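Taken together, the new helpers gate evaluation on service readiness. A minimal sketch of the intended call sequence, reusing the helpers defined in the diff above; the base URL, model name, task, and sampling values here are illustrative assumptions, not values from the commit:

    # Hypothetical usage sketch; assumes the FastAPI service from
    # rest_model_api.py is reachable at this address.
    url = "http://0.0.0.0:8080/v1"  # assumed deployment address
    if not wait_for_rest_service(rest_url=f"{url}/health", max_retries=30, retry_interval=2):
        raise RuntimeError("REST service did not become ready; aborting evaluation.")
    model = CustomModel(
        model_name="megatron_model",  # assumed Triton model name
        api_url=url,
        max_tokens_to_generate=32,
        temperature=1.0,
        top_p=0.0,
        top_k=1,
    )
    results = evaluator.simple_evaluate(model=model, tasks=["mmlu"])  # illustrative task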
12 changes: 5 additions & 7 deletions nemo/deploy/service/rest_model_api.py
@@ -20,7 +20,6 @@

 from nemo.deploy.nlp import NemoQueryLLM
 
-
 class TritonSettings(BaseSettings):
     _triton_service_port: int
     _triton_service_ip: str
@@ -63,7 +62,6 @@ def openai_format_response(self):
 app = FastAPI()
 triton_settings = TritonSettings()
 
-
 class CompletionRequest(BaseModel):
     model: str
     prompt: str
@@ -76,15 +74,15 @@ class CompletionRequest(BaseModel):
     frequency_penalty: float = 1.0
 
 
-@app.get("/hello")
-def root():
-    return {"message": "Hello World"}
+@app.get("/v1/health")
+def health_check():
+    return {"status": "ok"}

@app.get("/triton_health")
@app.get("/v1/triton_health")
async def check_triton_health():
"""
This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application.
Verify by running: curl http://service_http_address:service_port/triton_health and the returned status should inform if the server is accessible.
Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should inform if the server is accessible.
"""
triton_url = f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready"
print(f"Attempting to connect to Triton server at: {triton_url}")
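Once the service is running, the renamed routes can be smoke-tested from a client. A small sketch, assuming the service listens on the address below (host and port are assumptions for illustration):

    # Hypothetical client-side check; adjust host/port to the actual deployment.
    import requests

    base = "http://0.0.0.0:8080"  # assumed service address
    print(requests.get(f"{base}/v1/health", timeout=5).json())         # expect {"status": "ok"}
    print(requests.get(f"{base}/v1/triton_health", timeout=5).json())  # Triton readiness status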
123 changes: 0 additions & 123 deletions scripts/export/convert_nemo2_for_export.py

This file was deleted.
