From eb203c051ad23ef34522fc9d0c17019e67ebfa88 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 1 Jun 2024 17:26:21 -0700 Subject: [PATCH 1/3] feat - set custom AllowedFailsPolicy --- litellm/router.py | 59 +++++++++++++++++++++++++++++++++++++++-- litellm/types/router.py | 21 ++++++++++++++- 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/litellm/router.py b/litellm/router.py index 88eb54a04c7b..2a1e8a122c88 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -47,6 +47,7 @@ updateDeployment, updateLiteLLMParams, RetryPolicy, + AllowedFailsPolicy, AlertingConfig, DeploymentTypedDict, ModelGroupInfo, @@ -113,6 +114,9 @@ def __init__( allowed_fails: Optional[ int ] = None, # Number of times a deployment can failbefore being added to cooldown + allowed_fails_policy: Optional[ + AllowedFailsPolicy + ] = None, # set custom allowed fails policy cooldown_time: Optional[ float ] = None, # (seconds) time to cooldown a deployment after failure @@ -355,6 +359,7 @@ def __init__( self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = ( model_group_retry_policy ) + self.allowed_fails_policy: Optional[AllowedFailsPolicy] = allowed_fails_policy self.alerting_config: Optional[AlertingConfig] = alerting_config if self.alerting_config is not None: self._initialize_alerting() @@ -2350,6 +2355,7 @@ def deployment_callback_on_failure( deployment_id = _model_info.get("id", None) self._set_cooldown_deployments( exception_status=exception_status, + original_exception=exception, deployment=deployment_id, time_to_cooldown=_time_to_cooldown, ) # setting deployment_id in cooldown deployments @@ -2455,6 +2461,7 @@ def _is_cooldown_required(self, exception_status: Union[str, int]): def _set_cooldown_deployments( self, + original_exception: Any, exception_status: Union[str, int], deployment: Optional[str] = None, time_to_cooldown: Optional[float] = None, @@ -2473,6 +2480,12 @@ def _set_cooldown_deployments( if self._is_cooldown_required(exception_status=exception_status) == False: return + _allowed_fails = self.get_allowed_fails_from_policy( + exception=original_exception, + ) + + allowed_fails = _allowed_fails or self.allowed_fails + dt = get_utc_datetime() current_minute = dt.strftime("%H-%M") # get current fails for deployment @@ -2482,7 +2495,7 @@ def _set_cooldown_deployments( current_fails = self.failed_calls.get_cache(key=deployment) or 0 updated_fails = current_fails + 1 verbose_router_logger.debug( - f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}" + f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {allowed_fails}" ) cooldown_time = self.cooldown_time or 1 if time_to_cooldown is not None: @@ -2499,7 +2512,8 @@ def _set_cooldown_deployments( ) exception_status = 500 _should_retry = litellm._should_retry(status_code=exception_status) - if updated_fails > self.allowed_fails or _should_retry == False: + + if updated_fails > allowed_fails or _should_retry == False: # get the current cooldown list for that minute cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls cached_value = self.cache.get_cache(key=cooldown_key) @@ -2642,6 +2656,7 @@ async def async_routing_strategy_pre_call_checks(self, deployment: dict): except litellm.RateLimitError as e: self._set_cooldown_deployments( exception_status=e.status_code, + original_exception=e, deployment=deployment["model_info"]["id"], time_to_cooldown=self.cooldown_time, ) @@ -4334,6 +4349,46 @@ def get_num_retries_from_retry_policy( ): return retry_policy.ContentPolicyViolationErrorRetries + def get_allowed_fails_from_policy(self, exception: Exception): + """ + BadRequestErrorRetries: Optional[int] = None + AuthenticationErrorRetries: Optional[int] = None + TimeoutErrorRetries: Optional[int] = None + RateLimitErrorRetries: Optional[int] = None + ContentPolicyViolationErrorRetries: Optional[int] = None + """ + # if we can find the exception then in the retry policy -> return the number of retries + allowed_fails_policy: Optional[AllowedFailsPolicy] = self.allowed_fails_policy + + if allowed_fails_policy is None: + return None + + if ( + isinstance(exception, litellm.BadRequestError) + and allowed_fails_policy.BadRequestErrorAllowedFails is not None + ): + return allowed_fails_policy.BadRequestErrorAllowedFails + if ( + isinstance(exception, litellm.AuthenticationError) + and allowed_fails_policy.AuthenticationErrorAllowedFails is not None + ): + return allowed_fails_policy.AuthenticationErrorAllowedFails + if ( + isinstance(exception, litellm.Timeout) + and allowed_fails_policy.TimeoutErrorAllowedFails is not None + ): + return allowed_fails_policy.TimeoutErrorAllowedFails + if ( + isinstance(exception, litellm.RateLimitError) + and allowed_fails_policy.RateLimitErrorAllowedFails is not None + ): + return allowed_fails_policy.RateLimitErrorAllowedFails + if ( + isinstance(exception, litellm.ContentPolicyViolationError) + and allowed_fails_policy.ContentPolicyViolationErrorAllowedFails is not None + ): + return allowed_fails_policy.ContentPolicyViolationErrorAllowedFails + def _initialize_alerting(self): from litellm.integrations.slack_alerting import SlackAlerting diff --git a/litellm/types/router.py b/litellm/types/router.py index 8fed461cb628..4a1f4498c609 100644 --- a/litellm/types/router.py +++ b/litellm/types/router.py @@ -76,7 +76,9 @@ class ModelInfo(BaseModel): id: Optional[ str ] # Allow id to be optional on input, but it will always be present as a str in the model instance - db_model: bool = False # used for proxy - to separate models which are stored in the db vs. config. + db_model: bool = ( + False # used for proxy - to separate models which are stored in the db vs. config. + ) updated_at: Optional[datetime.datetime] = None updated_by: Optional[str] = None @@ -381,6 +383,23 @@ class RouterErrors(enum.Enum): no_deployments_available = "No deployments available for selected model" +class AllowedFailsPolicy(BaseModel): + """ + Use this to set a custom number of allowed_fails for each exception type before cooling down a deployment + If RateLimitErrorRetries = 3, then 3 retries will be made for RateLimitError + + Mapping of Exception type to allowed_fails for each exception + https://docs.litellm.ai/docs/exception_mapping + """ + + BadRequestErrorAllowedFails: Optional[int] = None + AuthenticationErrorAllowedFails: Optional[int] = None + TimeoutErrorAllowedFails: Optional[int] = None + RateLimitErrorAllowedFails: Optional[int] = None + ContentPolicyViolationErrorAllowedFails: Optional[int] = None + InternalServerErrorAllowedFails: Optional[int] = None + + class RetryPolicy(BaseModel): """ Use this to set a custom number of retries per exception type From 4eaeec2a2dfcd645182fde7b559e4adc6faf7fc2 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 1 Jun 2024 17:39:44 -0700 Subject: [PATCH 2/3] feat - set allowed fails policy --- litellm/tests/test_router_retries.py | 8 +++++++- litellm/types/router.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_router_retries.py b/litellm/tests/test_router_retries.py index 1c0c3e49f834..879af261abbb 100644 --- a/litellm/tests/test_router_retries.py +++ b/litellm/tests/test_router_retries.py @@ -128,12 +128,17 @@ async def test_router_retries_errors(sync_mode, error_type): ["AuthenticationErrorRetries", "ContentPolicyViolationErrorRetries"], # ) async def test_router_retry_policy(error_type): - from litellm.router import RetryPolicy + from litellm.router import RetryPolicy, AllowedFailsPolicy retry_policy = RetryPolicy( ContentPolicyViolationErrorRetries=3, AuthenticationErrorRetries=0 ) + allowed_fails_policy = AllowedFailsPolicy( + ContentPolicyViolationErrorAllowedFails=1000, + RateLimitErrorAllowedFails=100, + ) + router = Router( model_list=[ { @@ -156,6 +161,7 @@ async def test_router_retry_policy(error_type): }, ], retry_policy=retry_policy, + allowed_fails_policy=allowed_fails_policy, ) customHandler = MyCustomHandler() diff --git a/litellm/types/router.py b/litellm/types/router.py index 4a1f4498c609..38ddef361e73 100644 --- a/litellm/types/router.py +++ b/litellm/types/router.py @@ -385,8 +385,8 @@ class RouterErrors(enum.Enum): class AllowedFailsPolicy(BaseModel): """ - Use this to set a custom number of allowed_fails for each exception type before cooling down a deployment - If RateLimitErrorRetries = 3, then 3 retries will be made for RateLimitError + Use this to set a custom number of allowed fails/minute before cooling down a deployment + If `AuthenticationErrorAllowedFails = 1000`, then 1000 AuthenticationError will be allowed before cooling down a deployment Mapping of Exception type to allowed_fails for each exception https://docs.litellm.ai/docs/exception_mapping From 9f0ae21ef54d21b247e3b58cb78cc437a94ea36e Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 1 Jun 2024 17:56:57 -0700 Subject: [PATCH 3/3] docs - AllowedFailsPolicy --- docs/my-website/docs/routing.md | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 5ba3221c9764..d91912644fc8 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -713,26 +713,43 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -#### Retries based on Error Type +### [Advanced]: Custom Retries, Cooldowns based on Error Type -Use `RetryPolicy` if you want to set a `num_retries` based on the Exception receieved +- Use `RetryPolicy` if you want to set a `num_retries` based on the Exception receieved +- Use `AllowedFailsPolicy` to set a custom number of `allowed_fails`/minute before cooling down a deployment Example: -- 4 retries for `ContentPolicyViolationError` -- 0 retries for `RateLimitErrors` + +```python +retry_policy = RetryPolicy( + ContentPolicyViolationErrorRetries=3, # run 3 retries for ContentPolicyViolationErrors + AuthenticationErrorRetries=0, # run 0 retries for AuthenticationErrorRetries +) + +allowed_fails_policy = AllowedFailsPolicy( + ContentPolicyViolationErrorAllowedFails=1000, # Allow 1000 ContentPolicyViolationError before cooling down a deployment + RateLimitErrorAllowedFails=100, # Allow 100 RateLimitErrors before cooling down a deployment +) +``` Example Usage ```python -from litellm.router import RetryPolicy +from litellm.router import RetryPolicy, AllowedFailsPolicy + retry_policy = RetryPolicy( - ContentPolicyViolationErrorRetries=3, # run 3 retries for ContentPolicyViolationErrors - AuthenticationErrorRetries=0, # run 0 retries for AuthenticationErrorRetries + ContentPolicyViolationErrorRetries=3, # run 3 retries for ContentPolicyViolationErrors + AuthenticationErrorRetries=0, # run 0 retries for AuthenticationErrorRetries BadRequestErrorRetries=1, TimeoutErrorRetries=2, RateLimitErrorRetries=3, ) +allowed_fails_policy = AllowedFailsPolicy( + ContentPolicyViolationErrorAllowedFails=1000, # Allow 1000 ContentPolicyViolationError before cooling down a deployment + RateLimitErrorAllowedFails=100, # Allow 100 RateLimitErrors before cooling down a deployment +) + router = litellm.Router( model_list=[ { @@ -755,6 +772,7 @@ router = litellm.Router( }, ], retry_policy=retry_policy, + allowed_fails_policy=allowed_fails_policy, ) response = await router.acompletion(