From 594675e1981696ea754751c2e73c4d71db0869c8 Mon Sep 17 00:00:00 2001 From: william Date: Fri, 22 Nov 2024 15:47:33 -0600 Subject: [PATCH 1/2] Hopefully improve reliability and debugging output a bit. --- .../mistral/modelgauge/suts/mistral_client.py | 4 +-- src/modelbench/benchmark_runner.py | 25 +++++++++++-------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/plugins/mistral/modelgauge/suts/mistral_client.py b/plugins/mistral/modelgauge/suts/mistral_client.py index 3c603c8c..0e683d1c 100644 --- a/plugins/mistral/modelgauge/suts/mistral_client.py +++ b/plugins/mistral/modelgauge/suts/mistral_client.py @@ -1,15 +1,13 @@ from mistralai import Mistral - from mistralai.models import HTTPValidationError, SDKError from mistralai.utils import BackoffStrategy, RetryConfig from modelgauge.secret_values import RequiredSecret, SecretDescription - BACKOFF_INITIAL_MILLIS = 500 BACKOFF_MAX_INTERVAL_MILLIS = 10_000 BACKOFF_EXPONENT = 1.1 -BACKOFF_MAX_ELAPSED_MILLIS = 60_000 +BACKOFF_MAX_ELAPSED_MILLIS = 120_000 class MistralAIAPIKey(RequiredSecret): diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py index fe53dfe3..7e178655 100644 --- a/src/modelbench/benchmark_runner.py +++ b/src/modelbench/benchmark_runner.py @@ -11,6 +11,14 @@ from multiprocessing.pool import ThreadPool from typing import Any, Iterable, Optional, Sequence +from pydantic import BaseModel +from tqdm import tqdm + +from modelbench.benchmark_runner_items import ModelgaugeTestWrapper, TestRunItem, Timer +from modelbench.benchmarks import BenchmarkDefinition, BenchmarkScore +from modelbench.cache import DiskCache, MBCache +from modelbench.run_journal import RunJournal +from modelbench.suts import ModelGaugeSut from modelgauge.annotator import CompletionAnnotator from modelgauge.annotator_registry import ANNOTATORS from modelgauge.base_test import PromptResponseTest, TestResult @@ -21,15 +29,6 @@ from modelgauge.single_turn_prompt_response import PromptWithContext, TestItem from modelgauge.sut import SUTCompletion, SUTResponse -from pydantic import BaseModel -from tqdm import tqdm - -from modelbench.benchmark_runner_items import ModelgaugeTestWrapper, TestRunItem, Timer -from modelbench.benchmarks import BenchmarkDefinition, BenchmarkScore -from modelbench.cache import DiskCache, MBCache -from modelbench.run_journal import RunJournal -from modelbench.suts import ModelGaugeSut - logger = logging.getLogger(__name__) @@ -297,7 +296,11 @@ def handle_item(self, item: TestRunItem): else: self._debug(f"cache entry not found; processing and saving") with Timer() as timer: - raw_response = mg_sut.evaluate(raw_request) + try: + raw_response = mg_sut.evaluate(raw_request) + except Exception as e: + logger.error(f"failure fetching sut {mg_sut.uid} on first try: {raw_request}", exc_info=True) + raw_response = mg_sut.evaluate(raw_request) self.cache[cache_key] = raw_response self.test_run.journal.item_entry( "fetched sut response", item, run_time=timer, request=raw_request, response=raw_response @@ -315,7 +318,7 @@ def handle_item(self, item: TestRunItem): pass item.exceptions.append(e) self.test_run.journal.item_exception_entry("sut exception", item, e, **extra_info) - logger.error(f"failure handling sut item {item}:", exc_info=e) + logger.error(f"failure handling sut item {item}:", exc_info=True) return item From eb7e87880930c197b5a044c10af0c13aaa15c369 Mon Sep 17 00:00:00 2001 From: william Date: Fri, 22 Nov 2024 15:56:22 -0600 Subject: [PATCH 2/2] Add ability to dump threads so we can debug when stuck. --- src/modelbench/run.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/modelbench/run.py b/src/modelbench/run.py index 592f654b..5c283f04 100644 --- a/src/modelbench/run.py +++ b/src/modelbench/run.py @@ -1,3 +1,5 @@ +import faulthandler +import io import json import logging import os @@ -5,6 +7,7 @@ import pkgutil import platform import random +import signal import sys import warnings from collections import defaultdict @@ -51,6 +54,11 @@ def load_local_plugins(_, __, path: pathlib.Path): @click.group() @local_plugin_dir_option def cli() -> None: + try: + faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True, chain=False) + except io.UnsupportedOperation: + pass # just an issue with some tests that capture sys.stderr + log_dir = pathlib.Path("run/logs") log_dir.mkdir(exist_ok=True, parents=True) logging.basicConfig(