From 16f619c5d1ef08788adfa1f9d29ae267277c0c79 Mon Sep 17 00:00:00 2001
From: Wauplin
Date: Thu, 13 Jun 2024 15:50:11 +0200
Subject: [PATCH] Support HF_TOKEN environment variable

---
 .github/workflows/build.yaml                  |  2 +-
 .github/workflows/load_test.yaml              |  2 +-
 .github/workflows/tests.yaml                  |  2 +-
 README.md                                     |  6 ++--
 benchmark/src/main.rs                         |  2 +-
 .../basic_tutorials/gated_model_access.md     |  8 ++---
 integration-tests/conftest.py                 | 36 +++++++++----------
 launcher/src/main.rs                          |  6 ++--
 router/src/main.rs                            |  2 +-
 9 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 22fa06e33bf..90fb9d45f21 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -178,6 +178,6 @@ jobs:
           export DOCKER_VOLUME=/mnt/cache
           export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
           export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           echo $DOCKER_IMAGE
           pytest -s -vv integration-tests
diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml
index fd22e395780..4afe9bbd348 100644
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -70,7 +70,7 @@ jobs:
 
       - name: Start starcoder
         run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
           sleep 10
           wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index a8074dddf9b..e21344d1041 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -72,7 +72,7 @@ jobs:
       - name: Run server tests
         run: |
           pip install pytest
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           pytest -s -vv server/tests
       - name: Pre-commit checks
         run: |
diff --git a/README.md b/README.md
index 74616748efa..3b54af45149 100644
--- a/README.md
+++ b/README.md
@@ -105,14 +105,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat
 
 ### Using a private or gated model
 
-You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
+You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
 `text-generation-inference`. This allows you to gain access to protected resources.
 
 For example, if you want to serve the gated Llama V2 model variants:
 
 1. Go to https://huggingface.co/settings/tokens
 2. Copy your cli READ token
-3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
+3. Export `HF_TOKEN=<your cli READ token>`
 
 or with Docker:
 
@@ -121,7 +121,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
 ```
 
 ### A note on Shared Memory (shm)
diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs
index b9d80b7a9a6..603b4087c52 100644
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@@ -147,7 +147,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     tracing::info!("Downloading tokenizer");
 
     // Parse Huggingface hub token
-    let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let auth_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();
 
     // Download and instantiate tokenizer
     // We need to download it outside of the Tokio runtime
diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md
index b49c59c92ef..ef3a1db7d2f 100644
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@@ -2,13 +2,13 @@
 
 If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
 
-If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
+If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
 
 ```
-export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
+export HF_TOKEN=<YOUR READ TOKEN>
 ```
 
-If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
+If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
 
 ```bash
 model=meta-llama/Llama-2-7b-chat-hf
@@ -17,7 +17,7 @@ token=<your READ token>
 
 docker run --gpus all \
     --shm-size 1g \
-    -e HUGGING_FACE_HUB_TOKEN=$token \
+    -e HF_TOKEN=$token \
     -p 8080:80 \
     -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
     --model-id $model
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 0b239484c9b..13337165094 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -1,38 +1,38 @@
-import sys
-import subprocess
-import contextlib
-import pytest
 import asyncio
-import os
-import docker
+import contextlib
 import json
 import math
+import os
+import random
+import re
 import shutil
+import subprocess
+import sys
 import tempfile
 import time
-import random
+from typing import Dict, List, Optional
 
+import docker
+import pytest
+from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
-from typing import Optional, List, Dict
 from syrupy.extensions.json import JSONSnapshotExtension
 
-from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
-
 from text_generation import AsyncClient
 from text_generation.types import (
-    Response,
-    Details,
-    InputToken,
-    Token,
     BestOfSequence,
-    Grammar,
     ChatComplete,
     ChatCompletionChunk,
     ChatCompletionComplete,
     Completion,
+    Details,
+    Grammar,
+    InputToken,
+    Response,
+    Token,
 )
 
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
 DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
@@ -447,8 +447,8 @@ def docker_launcher(
         if not use_flash_attention:
             env["USE_FLASH_ATTENTION"] = "false"
 
-        if HUGGING_FACE_HUB_TOKEN is not None:
-            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
+        if HF_TOKEN is not None:
+            env["HF_TOKEN"] = HF_TOKEN
 
         volumes = []
         if DOCKER_VOLUME:
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index e4d5bb85107..3e0c7a27a03 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -592,7 +592,7 @@ fn shard_manager(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Detect rope scaling
@@ -925,7 +925,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), LauncherError> {
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // If args.weights_cache_override is some, pass it to the download process
@@ -1227,7 +1227,7 @@ fn spawn_webserver(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Parse Compute type
diff --git a/router/src/main.rs b/router/src/main.rs
index c4203dbc248..013176f3693 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -156,7 +156,7 @@ async fn main() -> Result<(), RouterError> {
     });
 
     // Parse Huggingface hub token
-    let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let authorization_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();
 
     // Tokenizer instance
     // This will only be used to validate payloads
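
--
Note for reviewers: the benchmark and router call sites above share one lookup pattern: read the new `HF_TOKEN` variable first and fall back to the legacy `HUGGING_FACE_HUB_TOKEN`, so existing deployments keep working. Below is a minimal standalone sketch of that fallback; the helper name `resolve_hub_token` is illustrative only, as the patch inlines the expression at each call site rather than adding a helper.

```rust
use std::env;

/// Prefer the new `HF_TOKEN` variable, falling back to the legacy
/// `HUGGING_FACE_HUB_TOKEN` for backward compatibility.
fn resolve_hub_token() -> Option<String> {
    env::var("HF_TOKEN")
        .or_else(|_| env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok()
}

fn main() {
    match resolve_hub_token() {
        Some(_) => println!("Hub token found"),
        None => println!("no token set; only public models are reachable"),
    }
}
```

The launcher call sites are slightly different: they still read `HF_API_TOKEN` from the launcher's own environment and forward its value to the shard, download, and webserver processes, now under the `HF_TOKEN` name instead of `HUGGING_FACE_HUB_TOKEN`.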