Skip to content

Commit

Permalink
Support HF_TOKEN environment variable (#2066)
Browse files Browse the repository at this point in the history
* Support HF_TOKEN environment variable

* Load test.

---------

Co-authored-by: Nicolas Patry <[email protected]>
  • Loading branch information
Wauplin and Narsil authored Jun 25, 2024
1 parent 405765b commit 3447c72
Show file tree
Hide file tree
Showing 9 changed files with 34 additions and 100 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,6 @@ jobs:
export DOCKER_VOLUME=/mnt/cache
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
echo $DOCKER_IMAGE
pytest -s -vv integration-tests
70 changes: 2 additions & 68 deletions .github/workflows/load_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,66 +11,24 @@ on:
- 'main'

jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
env:
AWS_REGION: eu-central-1
EC2_AMI_ID: ami-0ab09c07cfd194259
EC2_INSTANCE_TYPE: g5.12xlarge
EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326
EC2_SECURITY_GROUP: sg-072f92ae3082936c6
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-tgi-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
load-tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
env:
DOCKER_VOLUME: /cache
steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Prepare disks
run: |
sudo mkfs -t ext4 /dev/nvme1n1
sudo mkdir ${{ env.DOCKER_VOLUME }}
sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
- name: Install k6
run: |
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
- name: Start starcoder
run: |
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
sleep 10
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
Expand All @@ -82,27 +40,3 @@ jobs:
if: ${{ always() }}
run: |
docker stop tgi-starcoder || true
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- load-tests
runs-on: ubuntu-latest
env:
AWS_REGION: eu-central-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ jobs:
- name: Run server tests
run: |
pip install pytest
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
pytest -s -vv server/tests
- name: Pre-commit checks
run: |
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat

### Using a private or gated model

You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
`text-generation-inference`. This allows you to gain access to protected resources.

For example, if you want to serve the gated Llama V2 model variants:

1. Go to https://huggingface.co/settings/tokens
2. Copy your cli READ token
3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
3. Export `HF_TOKEN=<your cli READ token>`

or with Docker:

Expand All @@ -121,7 +121,7 @@ model=meta-llama/Llama-2-7b-chat-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
token=<your cli READ token>

docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
```

### A note on Shared Memory (shm)
Expand Down
2 changes: 1 addition & 1 deletion benchmark/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing::info!("Downloading tokenizer");

// Parse Huggingface hub token
let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
let auth_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();

// Download and instantiate tokenizer
// We need to download it outside of the Tokio runtime
Expand Down
8 changes: 4 additions & 4 deletions docs/source/basic_tutorials/gated_model_access.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from the [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens).

If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
If you're using the CLI, set the `HF_TOKEN` environment variable. For example:

```
export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
export HF_TOKEN=<YOUR READ TOKEN>
```

If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.

```bash
model=meta-llama/Llama-2-7b-chat-hf
Expand All @@ -17,7 +17,7 @@ token=<your READ token>

docker run --gpus all \
--shm-size 1g \
-e HUGGING_FACE_HUB_TOKEN=$token \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
--model-id $model
Expand Down
36 changes: 18 additions & 18 deletions integration-tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
import sys
import subprocess
import contextlib
import pytest
import asyncio
import os
import docker
import contextlib
import json
import math
import os
import random
import re
import shutil
import subprocess
import sys
import tempfile
import time
import random
from typing import Dict, List, Optional

import docker
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from typing import Optional, List, Dict
from syrupy.extensions.json import JSONSnapshotExtension
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError

from text_generation import AsyncClient
from text_generation.types import (
Response,
Details,
InputToken,
Token,
BestOfSequence,
Grammar,
ChatComplete,
ChatCompletionChunk,
ChatCompletionComplete,
Completion,
Details,
Grammar,
InputToken,
Response,
Token,
)

DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
HF_TOKEN = os.getenv("HF_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")

Expand Down Expand Up @@ -447,8 +447,8 @@ def docker_launcher(
if not use_flash_attention:
env["USE_FLASH_ATTENTION"] = "false"

if HUGGING_FACE_HUB_TOKEN is not None:
env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
if HF_TOKEN is not None:
env["HF_TOKEN"] = HF_TOKEN

volumes = []
if DOCKER_VOLUME:
Expand Down
6 changes: 3 additions & 3 deletions launcher/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ fn shard_manager(

// Parse Inference API token
if let Ok(api_token) = env::var("HF_API_TOKEN") {
envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
envs.push(("HF_TOKEN".into(), api_token.into()))
};

// Detect rope scaling
Expand Down Expand Up @@ -925,7 +925,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L

// Parse Inference API token
if let Ok(api_token) = env::var("HF_API_TOKEN") {
envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
envs.push(("HF_TOKEN".into(), api_token.into()))
};

// If args.weights_cache_override is some, pass it to the download process
Expand Down Expand Up @@ -1227,7 +1227,7 @@ fn spawn_webserver(

// Parse Inference API token
if let Ok(api_token) = env::var("HF_API_TOKEN") {
envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
envs.push(("HF_TOKEN".into(), api_token.into()))
};

// Parse Compute type
Expand Down
2 changes: 1 addition & 1 deletion router/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ async fn main() -> Result<(), RouterError> {
});

// Parse Huggingface hub token
let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
let authorization_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();

// Tokenizer instance
// This will only be used to validate payloads
Expand Down

0 comments on commit 3447c72

Please sign in to comment.