diff --git a/.github/workflows/build_documentation.yaml b/.github/workflows/build_documentation.yaml index 1addd21d13..ce07564011 100644 --- a/.github/workflows/build_documentation.yaml +++ b/.github/workflows/build_documentation.yaml @@ -13,6 +13,6 @@ jobs: with: commit_sha: ${{ github.sha }} package: huggingface_hub - languages: cn de fr en hi ko + languages: cn de fr en hi ko tm secrets: hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yaml b/.github/workflows/build_pr_documentation.yaml index b41ac2b036..c2294b842e 100644 --- a/.github/workflows/build_pr_documentation.yaml +++ b/.github/workflows/build_pr_documentation.yaml @@ -14,4 +14,4 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: huggingface_hub - languages: cn de fr en hi ko + languages: cn de fr en hi ko tm diff --git a/.github/workflows/build_repocard_examples.yaml b/.github/workflows/build_repocard_examples.yaml index 03d9f91861..7cfcf37ea9 100644 --- a/.github/workflows/build_repocard_examples.yaml +++ b/.github/workflows/build_repocard_examples.yaml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.11 + python-version: 3.13 # Install dependencies - name: Configure and install dependencies diff --git a/.github/workflows/model_card_consistency_reminder.yml b/.github/workflows/model_card_consistency_reminder.yml index 302ab5af4e..c05547f207 100644 --- a/.github/workflows/model_card_consistency_reminder.yml +++ b/.github/workflows/model_card_consistency_reminder.yml @@ -19,7 +19,7 @@ jobs: Some content is duplicated among the following files. Please make sure that everything stays consistent. 
- [src/.../repocard.py](https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/repocard.py) - - [src/.../datasetcard_template.md](https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/templates/modelcard_template.md) + - [src/.../datasetcard_template.md](https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/templates/datasetcard_template.md) - [src/.../modelcard_template.md](https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/templates/modelcard_template.md) - [modelcard.md](https://github.com/huggingface/hub-docs/blob/main/modelcard.md) (`hub-docs` repo) - [docs/hub/model-cards.md](https://github.com/huggingface/hub-docs/blob/main/docs/hub/model-cards.md) (`hub-docs` repo) diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index 2bce55d2c6..8c3a5ca81c 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -21,26 +21,28 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.11"] + python-version: ["3.8", "3.13"] test_name: [ "Repository only", "Everything else", - "torch_1.11", - "torch_latest", + ] include: - - python-version: "3.11" # LFS not ran on 3.8 + - python-version: "3.13" # LFS not ran on 3.8 test_name: "lfs" - python-version: "3.8" - test_name: "fastai" # fastai not supported on 3.11 -> test it on 3.10 - - python-version: "3.10" + test_name: "fastai" + - python-version: "3.10" # fastai not supported on 3.12 and 3.11 -> test it on 3.10 test_name: "fastai" - python-version: "3.8" - test_name: "tensorflow" # Tensorflow not supported on 3.11 -> test it on 3.10 - - python-version: "3.10" test_name: "tensorflow" - + - python-version: "3.10" # tensorflow not supported on 3.12 -> test it on 3.10 + test_name: "tensorflow" + - python-version: "3.8" # test torch~=1.11 on python 3.8 only. 
+ test_name: "Python 3.8, torch_1.11" + - python-version: "3.12" # test torch latest on python 3.12 only. + test_name: "torch_latest" steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -81,7 +83,7 @@ jobs: uv pip install --upgrade torch ;; - torch_1.11) + "Python 3.8, torch_1.11") uv pip install "huggingface_hub[torch] @ ." uv pip install torch~=1.11 ;; @@ -132,7 +134,7 @@ jobs: eval "$PYTEST ../tests/test_serialization.py" ;; - torch_1.11 | torch_latest) + "Python 3.8, torch_1.11" | torch_latest) eval "$PYTEST ../tests/test_hub_mixin*" eval "$PYTEST ../tests/test_serialization.py" ;; diff --git a/docs/source/cn/concepts/git_vs_http.md b/docs/source/cn/concepts/git_vs_http.md index 8509309b76..b582b5f991 100644 --- a/docs/source/cn/concepts/git_vs_http.md +++ b/docs/source/cn/concepts/git_vs_http.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/cn/guides/repository.md b/docs/source/cn/guides/repository.md index aa1f3b3dff..0b5b294d45 100644 --- a/docs/source/cn/guides/repository.md +++ b/docs/source/cn/guides/repository.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/cn/index.md b/docs/source/cn/index.md index 1944c08f48..9a7d8db6a8 100644 --- a/docs/source/cn/index.md +++ b/docs/source/cn/index.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/cn/installation.md b/docs/source/cn/installation.md index 8c58425670..c800b4b173 100644 --- a/docs/source/cn/installation.md +++ b/docs/source/cn/installation.md @@ -1,4 +1,4 @@ - @@ -97,7 +97,7 @@ cd huggingface_hub pip install -e . 
``` -这些命令将你克隆存储库的文件夹与你的 Python 库路径链接起来。Python 现在将除了正常的库路径之外,还会在你克隆到的文件夹中查找。例如,如果你的 Python 包通常安装在`./.venv/lib/python3.11/site-packages/`中,Python 还会搜索你克隆的文件夹`./huggingface_hub/` +这些命令将你克隆存储库的文件夹与你的 Python 库路径链接起来。Python 现在将除了正常的库路径之外,还会在你克隆到的文件夹中查找。例如,如果你的 Python 包通常安装在`./.venv/lib/python3.13/site-packages/`中,Python 还会搜索你克隆的文件夹`./huggingface_hub/` ## 通过 conda 安装 diff --git a/docs/source/cn/quick-start.md b/docs/source/cn/quick-start.md index 861973a6f7..4a4a809a79 100644 --- a/docs/source/cn/quick-start.md +++ b/docs/source/cn/quick-start.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/concepts/git_vs_http.md b/docs/source/de/concepts/git_vs_http.md index 799cb974b2..978123762a 100644 --- a/docs/source/de/concepts/git_vs_http.md +++ b/docs/source/de/concepts/git_vs_http.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/community.md b/docs/source/de/guides/community.md index a2026436c2..c492330f5a 100644 --- a/docs/source/de/guides/community.md +++ b/docs/source/de/guides/community.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/download.md b/docs/source/de/guides/download.md index afd2d9d4f7..ec195d4035 100644 --- a/docs/source/de/guides/download.md +++ b/docs/source/de/guides/download.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/hf_file_system.md b/docs/source/de/guides/hf_file_system.md index 6afd8a705c..e33cc97cd4 100644 --- a/docs/source/de/guides/hf_file_system.md +++ b/docs/source/de/guides/hf_file_system.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/inference.md b/docs/source/de/guides/inference.md index dc6e35921f..e34103ac34 100644 --- a/docs/source/de/guides/inference.md +++ b/docs/source/de/guides/inference.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/integrations.md b/docs/source/de/guides/integrations.md index 3482ae6eac..d8f036ec4d 100644 --- a/docs/source/de/guides/integrations.md +++ b/docs/source/de/guides/integrations.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/manage-cache.md 
b/docs/source/de/guides/manage-cache.md index 2f8c029f8e..779c34b683 100644 --- a/docs/source/de/guides/manage-cache.md +++ b/docs/source/de/guides/manage-cache.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/manage-spaces.md b/docs/source/de/guides/manage-spaces.md index 3caab0f5ed..57a45e9e11 100644 --- a/docs/source/de/guides/manage-spaces.md +++ b/docs/source/de/guides/manage-spaces.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/model-cards.md b/docs/source/de/guides/model-cards.md index bcba30866c..01858c41b8 100644 --- a/docs/source/de/guides/model-cards.md +++ b/docs/source/de/guides/model-cards.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/overview.md b/docs/source/de/guides/overview.md index 7f761839a1..785689b989 100644 --- a/docs/source/de/guides/overview.md +++ b/docs/source/de/guides/overview.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/repository.md b/docs/source/de/guides/repository.md index b2f492717e..6e4187ae8e 100644 --- a/docs/source/de/guides/repository.md +++ b/docs/source/de/guides/repository.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/search.md b/docs/source/de/guides/search.md index 0cafd4e7b3..c40d91b9b8 100644 --- a/docs/source/de/guides/search.md +++ b/docs/source/de/guides/search.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/upload.md b/docs/source/de/guides/upload.md index e591afcb52..38db944c7e 100644 --- a/docs/source/de/guides/upload.md +++ b/docs/source/de/guides/upload.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/guides/webhooks_server.md b/docs/source/de/guides/webhooks_server.md index 223ff8ca31..d2214d6b72 100644 --- a/docs/source/de/guides/webhooks_server.md +++ b/docs/source/de/guides/webhooks_server.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/index.md b/docs/source/de/index.md index 320f098ab0..3514dcf384 100644 --- a/docs/source/de/index.md +++ b/docs/source/de/index.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/de/installation.md 
b/docs/source/de/installation.md index 9dc4d62ba5..3ba965bd4b 100644 --- a/docs/source/de/installation.md +++ b/docs/source/de/installation.md @@ -1,4 +1,4 @@ - @@ -90,7 +90,7 @@ cd huggingface_hub pip install -e . ``` -Diese Befehle verknüpfen den Ordner, in den Sie das Repository geklont haben, mit Ihren Python-Bibliothekspfaden. Python wird nun zusätzlich zu den normalen Bibliothekspfaden im geklonten Ordner suchen. Wenn Ihre Python-Pakete normalerweise in `./.venv/lib/python3.11/site-packages/` installiert sind, wird Python auch den geklonten Ordner `./huggingface_hub/` durchsuchen. +Diese Befehle verknüpfen den Ordner, in den Sie das Repository geklont haben, mit Ihren Python-Bibliothekspfaden. Python wird nun zusätzlich zu den normalen Bibliothekspfaden im geklonten Ordner suchen. Wenn Ihre Python-Pakete normalerweise in `./.venv/lib/python3.13/site-packages/` installiert sind, wird Python auch den geklonten Ordner `./huggingface_hub/` durchsuchen. ## Installieren mit conda diff --git a/docs/source/de/quick-start.md b/docs/source/de/quick-start.md index f78fa55e75..83ce0a24e0 100644 --- a/docs/source/de/quick-start.md +++ b/docs/source/de/quick-start.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/concepts/git_vs_http.md b/docs/source/en/concepts/git_vs_http.md index f581f1678b..e6eb755af5 100644 --- a/docs/source/en/concepts/git_vs_http.md +++ b/docs/source/en/concepts/git_vs_http.md @@ -1,11 +1,11 @@ - # Git vs HTTP paradigm The `huggingface_hub` library is a library for interacting with the Hugging Face Hub, which is a -collections of git-based repositories (models, datasets or Spaces). There are two main +collection of git-based repositories (models, datasets or Spaces). There are two main ways to access the Hub using `huggingface_hub`. The first approach, the so-called "git-based" approach, is led by the [`Repository`] class. 
diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 9fb355ae9d..35dc861ab4 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -1,4 +1,4 @@ - @@ -139,7 +139,7 @@ If you are not logged in, an error message will be printed. ## huggingface-cli logout -This commands logs you out. In practice, it will delete all tokens stored on your machine. If you want to remove a specific token, you can specify the token name as an argument. +This command logs you out. In practice, it will delete all tokens stored on your machine. If you want to remove a specific token, you can specify the token name as an argument. This command will not log you out if you are logged in using the `HF_TOKEN` environment variable (see [reference](../package_reference/environment_variables#hftoken)). If that is the case, you must unset the environment variable in your machine configuration. diff --git a/docs/source/en/guides/collections.md b/docs/source/en/guides/collections.md index 1fc1cd8da8..2a70f1dd94 100644 --- a/docs/source/en/guides/collections.md +++ b/docs/source/en/guides/collections.md @@ -1,4 +1,4 @@ - @@ -6,7 +6,7 @@ rendered properly in your Markdown viewer. A collection is a group of related items on the Hub (models, datasets, Spaces, papers) that are organized together on the same page. Collections are useful for creating your own portfolio, bookmarking content in categories, or presenting a curated list of items you want to share. Check out this [guide](https://huggingface.co/docs/hub/collections) to understand in more detail what collections are and how they look on the Hub. -You can directly manage collections in the browser, but in this guide, we will focus on how to manage it programmatically. +You can directly manage collections in the browser, but in this guide, we will focus on how to manage them programmatically. ## Fetch a collection @@ -115,7 +115,7 @@ Now that we know how to get a [`Collection`], let's create our own! 
Use [`create ... ) ``` -It will return a [`Collection`] object with the high-level metadata (title, description, owner, etc.) and an empty list of items. You will now be able to refer to this collection using it's `slug`. +It will return a [`Collection`] object with the high-level metadata (title, description, owner, etc.) and an empty list of items. You will now be able to refer to this collection using its `slug`. ```py >>> collection.slug diff --git a/docs/source/en/guides/community.md b/docs/source/en/guides/community.md index df4a2a28bb..8f55a761c4 100644 --- a/docs/source/en/guides/community.md +++ b/docs/source/en/guides/community.md @@ -1,4 +1,4 @@ - @@ -80,7 +80,7 @@ with more detailed information about the Discussion or Pull Request. Information and renames of the Discussion via [`DiscussionWithDetails.events`]. In case of a Pull Request, you can retrieve the raw git diff with [`DiscussionWithDetails.diff`]. All the commits of the -Pull Request are listed in [`DiscussionWithDetails.events`]. +Pull Request are listed in [`DiscussionWithDetails.events`]. ## Create and edit a Discussion or Pull Request programmatically diff --git a/docs/source/en/guides/download.md b/docs/source/en/guides/download.md index 1eb8250d20..254c72d165 100644 --- a/docs/source/en/guides/download.md +++ b/docs/source/en/guides/download.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/guides/hf_file_system.md b/docs/source/en/guides/hf_file_system.md index 6c4383952b..92184838d8 100644 --- a/docs/source/en/guides/hf_file_system.md +++ b/docs/source/en/guides/hf_file_system.md @@ -1,10 +1,18 @@ - # Interact with the Hub through the Filesystem API -In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub.
The [`HfFileSystem`] builds of top of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, and `put_file`. +In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub. The [`HfFileSystem`] builds on top of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, and `put_file`. + + + + [`HfFileSystem`] provides fsspec compatibility, which is useful for libraries that require it (e.g., reading + Hugging Face datasets directly with `pandas`). However, it introduces additional overhead due to this compatibility + layer. For better performance and reliability, it's recommended to use [`HfApi`] methods when possible. + + ## Usage @@ -17,7 +25,7 @@ In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSys ['datasets/my-username/my-dataset-repo/data/train.csv', 'datasets/my-username/my-dataset-repo/data/test.csv'] >>> # List all ".csv" files in a repo ->>> fs.glob("datasets/my-username/my-dataset-repo/**.csv") +>>> fs.glob("datasets/my-username/my-dataset-repo/**/*.csv") ['datasets/my-username/my-dataset-repo/data/train.csv', 'datasets/my-username/my-dataset-repo/data/test.csv'] >>> # Read a remote file diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md index 92207658cd..07a5ca5014 100644 --- a/docs/source/en/guides/inference.md +++ b/docs/source/en/guides/inference.md @@ -1,4 +1,4 @@ - @@ -77,7 +77,7 @@ ChatCompletionOutput( ) ``` -In this example, we specified which model we want to use (`"meta-llama/Meta-Llama-3-8B-Instruct"`). You can find a list of compatible models [on this page](https://huggingface.co/models?other=conversational&sort=likes). 
We then gave a list of messages to complete (here, a single question) and passed an additional parameter to API (`max_token=100`). The output is a `ChatCompletionOutput` object that follows the OpenAI specification. The generated content can be access with `output.choices[0].message.content`. For more details, check out the [`~InferenceClient.chat_completion`] documentation. +In this example, we specified which model we want to use (`"meta-llama/Meta-Llama-3-8B-Instruct"`). You can find a list of compatible models [on this page](https://huggingface.co/models?other=conversational&sort=likes). We then gave a list of messages to complete (here, a single question) and passed an additional parameter to API (`max_token=100`). The output is a `ChatCompletionOutput` object that follows the OpenAI specification. The generated content can be accessed with `output.choices[0].message.content`. For more details, check out the [`~InferenceClient.chat_completion`] documentation. @@ -176,7 +176,7 @@ for chunk in output: print(chunk.choices[0].delta.content) ``` -And that's it! The only required changes are to replace `from openai import OpenAI` by `from huggingface_hub import InferenceClient` and `client = OpenAI(...)` by `client = InferenceClient(...)`. You can chose any LLM model from the Hugging Face Hub by passing its model id as `model` parameter. [Here is a list](https://huggingface.co/models?pipeline_tag=text-generation&other=conversational,text-generation-inference&sort=trending) of supported models. For authentication, you should pass a valid [User Access Token](https://huggingface.co/settings/tokens) as `api_key` or authenticate using `huggingface_hub` (see the [authentication guide](https://huggingface.co/docs/huggingface_hub/quick-start#authentication)). +And that's it! The only required changes are to replace `from openai import OpenAI` by `from huggingface_hub import InferenceClient` and `client = OpenAI(...)` by `client = InferenceClient(...)`. 
You can choose any LLM model from the Hugging Face Hub by passing its model id as `model` parameter. [Here is a list](https://huggingface.co/models?pipeline_tag=text-generation&other=conversational,text-generation-inference&sort=trending) of supported models. For authentication, you should pass a valid [User Access Token](https://huggingface.co/settings/tokens) as `api_key` or authenticate using `huggingface_hub` (see the [authentication guide](https://huggingface.co/docs/huggingface_hub/quick-start#authentication)). All input parameters and output format are strictly the same. In particular, you can pass `stream=True` to receive tokens as they are generated. You can also use the [`AsyncInferenceClient`] to run inference using `asyncio`: @@ -201,7 +201,7 @@ asyncio.run(main()) ``` You might wonder why using [`InferenceClient`] instead of OpenAI's client? There are a few reasons for that: -1. [`InferenceClient`] is configured for Hugging Face services. You don't need to provide a `base_url` to run models on the serverless Inference API. You also don't need to provide a `token` or `api_key` if you machine is already correctly logged in. +1. [`InferenceClient`] is configured for Hugging Face services. You don't need to provide a `base_url` to run models on the serverless Inference API. You also don't need to provide a `token` or `api_key` if your machine is already correctly logged in. 2. [`InferenceClient`] is tailored for both Text-Generation-Inference (TGI) and `transformers` frameworks, meaning you are assured it will always be on-par with the latest updates. 3. [`InferenceClient`] is integrated with our Inference Endpoints service, making it easier to launch an Inference Endpoint, check its status and run inference on it. Check out the [Inference Endpoints](./inference_endpoints.md) guide for more details. @@ -285,7 +285,7 @@ After installation all async API endpoints are available via [`AsyncInferenceCli strictly the same as the sync-only version. 
```py -# Code must be run in a asyncio concurrent context. +# Code must be run in an asyncio concurrent context. # $ python -m asyncio >>> from huggingface_hub import AsyncInferenceClient >>> client = AsyncInferenceClient() diff --git a/docs/source/en/guides/integrations.md b/docs/source/en/guides/integrations.md index 1b7a8080c2..26018356c5 100644 --- a/docs/source/en/guides/integrations.md +++ b/docs/source/en/guides/integrations.md @@ -1,4 +1,4 @@ - @@ -39,7 +39,7 @@ Implementation can differ between libraries, but the workflow is often similar. ### from_pretrained -This is how a `from_pretrained` method usually look like: +This is what a `from_pretrained` method usually looks like: ```python def from_pretrained(model_id: str) -> MyModelClass: @@ -390,7 +390,7 @@ class VoiceCraft(nn.Module): ... ``` -One solution can be to update the `__init__` signature to `def __init__(self, pattern: str, hidden_size: int)` and update all snippets that instantiates your class. This is a perfectly valid way to fix it but it might break downstream applications using your library. +One solution can be to update the `__init__` signature to `def __init__(self, pattern: str, hidden_size: int)` and update all snippets that instantiate your class. This is a perfectly valid way to fix it but it might break downstream applications using your library. Another solution is to provide a simple encoder/decoder to convert `argparse.Namespace` to a dictionary. diff --git a/docs/source/en/guides/manage-cache.md b/docs/source/en/guides/manage-cache.md index aad3be96c8..521a50b21f 100644 --- a/docs/source/en/guides/manage-cache.md +++ b/docs/source/en/guides/manage-cache.md @@ -1,4 +1,4 @@ - @@ -101,13 +101,13 @@ on the Hub. Its structure is the same as the `snapshots` folder with 1 subfolder Unlike the `snapshots` folder, files are simple empty files (no symlinks). In this example, the file `"config_that_does_not_exist.json"` does not exist on the Hub for the revision `"aaaaaa"`.
-As it only stores empty files, this folder is neglectable is term of disk usage. +As it only stores empty files, this folder is negligible in terms of disk usage. So now you might wonder, why is this information even relevant? In some cases, a framework tries to load optional files for a model. Saving the non-existence of optional files makes it faster to load a model as it saves 1 HTTP call per possible optional file. This is for example the case in `transformers` where each tokenizer can support additional files. -The first time you load the tokenizer on your machine, it will cache which optional files exists (and +The first time you load the tokenizer on your machine, it will cache which optional files exist (and which doesn't) to make the loading time faster for the next initializations. To test if a file is cached locally (without making any HTTP request), you can use the [`try_to_load_from_cache`] diff --git a/docs/source/en/guides/manage-spaces.md b/docs/source/en/guides/manage-spaces.md index 898c8b1fb5..8890101fbb 100644 --- a/docs/source/en/guides/manage-spaces.md +++ b/docs/source/en/guides/manage-spaces.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/guides/model-cards.md b/docs/source/en/guides/model-cards.md index e1638e3f3a..0c9f52a2cc 100644 --- a/docs/source/en/guides/model-cards.md +++ b/docs/source/en/guides/model-cards.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/guides/overview.md b/docs/source/en/guides/overview.md index dd2813ea8d..501fa78d37 100644 --- a/docs/source/en/guides/overview.md +++ b/docs/source/en/guides/overview.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/guides/repository.md b/docs/source/en/guides/repository.md index adfc881298..6426fd136a 100644 --- a/docs/source/en/guides/repository.md +++ b/docs/source/en/guides/repository.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/guides/search.md b/docs/source/en/guides/search.md index e5e50e0c29..ce0f6c0bdc 100644 --- a/docs/source/en/guides/search.md +++
b/docs/source/en/guides/search.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/guides/upload.md b/docs/source/en/guides/upload.md index d65e803454..d12ff09990 100644 --- a/docs/source/en/guides/upload.md +++ b/docs/source/en/guides/upload.md @@ -1,4 +1,4 @@ - @@ -234,41 +234,8 @@ Future(...) ### Upload a folder by chunks [`upload_folder`] makes it easy to upload an entire folder to the Hub. However, for large folders (thousands of files or -hundreds of GB), it can still be challenging. If you have a folder with a lot of files, you might want to upload -it in several commits. If you experience an error or a connection issue during the upload, you would not have to resume -the process from the beginning. - -To upload a folder in multiple commits, just pass `multi_commits=True` as argument. Under the hood, `huggingface_hub` -will list the files to upload/delete and split them in several commits. The "strategy" (i.e. how to split the commits) -is based on the number and size of the files to upload. A PR is open on the Hub to push all the commits. Once the PR is -ready, the commits are squashed into a single commit. If the process is interrupted before completing, you can rerun -your script to resume the upload. The created PR will be automatically detected and the upload will resume from where -it stopped. It is recommended to pass `multi_commits_verbose=True` to get a better understanding of the upload and its -progress. - -The example below will upload the checkpoints folder to a dataset in multiple commits. A PR will be created on the Hub -and merged automatically once the upload is complete. If you prefer the PR to stay open and review it manually, you can -pass `create_pr=True`. +hundreds of GB), we recommend using [`upload_large_folder`], which splits the upload into multiple commits. See the [Upload a large folder](#upload-a-large-folder) section for more details. -```py ->>> upload_folder( -... folder_path="local/checkpoints", -... 
repo_id="username/my-dataset", -... repo_type="dataset", -... multi_commits=True, -... multi_commits_verbose=True, -... ) -``` - -If you want a better control on the upload strategy (i.e. the commits that are created), you can have a look at the -low-level [`plan_multi_commits`] and [`create_commits_on_pr`] methods. - - - -`multi_commits` is still an experimental feature. Its API and behavior is subject to change in the future without prior -notice. - - ### Scheduled uploads diff --git a/docs/source/en/guides/webhooks.md b/docs/source/en/guides/webhooks.md index 3d9856c920..064a6cfa99 100644 --- a/docs/source/en/guides/webhooks.md +++ b/docs/source/en/guides/webhooks.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 0095c26700..900d26b346 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index 47e672a576..9af8a32676 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -1,4 +1,4 @@ - @@ -104,7 +104,7 @@ pip install -e . These commands will link the folder you cloned the repository to and your Python library paths. Python will now look inside the folder you cloned to in addition to the normal library paths. -For example, if your Python packages are typically installed in `./.venv/lib/python3.11/site-packages/`, +For example, if your Python packages are typically installed in `./.venv/lib/python3.13/site-packages/`, Python will also search the folder you cloned `./huggingface_hub/`. ## Install with conda @@ -145,7 +145,7 @@ encounter any undocumented problem by opening [an issue on Github](https://githu - `huggingface_hub`'s cache system relies on symlinks to efficiently cache files downloaded from the Hub. On Windows, you must activate developer mode or run your script as admin to -enable symlinks. 
If they are not activated, the cache-system still works but in an non-optimized +enable symlinks. If they are not activated, the cache-system still works but in a non-optimized manner. Please read [the cache limitations](./guides/manage-cache#limitations) section for more details. - Filepaths on the Hub can have special characters (e.g. `"path/to?/my/file"`). Windows is more restrictive on [special characters](https://learn.microsoft.com/en-us/windows/win32/intl/character-sets-used-in-file-names) diff --git a/docs/source/en/package_reference/authentication.md b/docs/source/en/package_reference/authentication.md index 1577b5bc29..84c7344d22 100644 --- a/docs/source/en/package_reference/authentication.md +++ b/docs/source/en/package_reference/authentication.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/cache.md b/docs/source/en/package_reference/cache.md index ea8cad29c7..83bb1b956a 100644 --- a/docs/source/en/package_reference/cache.md +++ b/docs/source/en/package_reference/cache.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/cards.md b/docs/source/en/package_reference/cards.md index 99fc1196d2..c3ccfbe1ad 100644 --- a/docs/source/en/package_reference/cards.md +++ b/docs/source/en/package_reference/cards.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/collections.md b/docs/source/en/package_reference/collections.md index 8b54d5a744..1678a88d0c 100644 --- a/docs/source/en/package_reference/collections.md +++ b/docs/source/en/package_reference/collections.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/community.md b/docs/source/en/package_reference/community.md index d41d45c8a6..50ae75e054 100644 --- a/docs/source/en/package_reference/community.md +++ b/docs/source/en/package_reference/community.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/environment_variables.md b/docs/source/en/package_reference/environment_variables.md index 15fe3ad3b8..2930a462fa 100644 --- 
a/docs/source/en/package_reference/environment_variables.md +++ b/docs/source/en/package_reference/environment_variables.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/file_download.md b/docs/source/en/package_reference/file_download.md index 1bc5b21478..26a033c5f1 100644 --- a/docs/source/en/package_reference/file_download.md +++ b/docs/source/en/package_reference/file_download.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/hf_api.md b/docs/source/en/package_reference/hf_api.md index 3f883a014e..4c77b44aaa 100644 --- a/docs/source/en/package_reference/hf_api.md +++ b/docs/source/en/package_reference/hf_api.md @@ -1,4 +1,4 @@ - @@ -31,8 +31,6 @@ models = hf_api.list_models() [[autodoc]] HfApi -[[autodoc]] plan_multi_commits - ## API Dataclasses ### AccessRequest diff --git a/docs/source/en/package_reference/hf_file_system.md b/docs/source/en/package_reference/hf_file_system.md index 097c798fa7..89fcf23887 100644 --- a/docs/source/en/package_reference/hf_file_system.md +++ b/docs/source/en/package_reference/hf_file_system.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/inference_client.md b/docs/source/en/package_reference/inference_client.md index 2a6b37c61b..5cfef6be0d 100644 --- a/docs/source/en/package_reference/inference_client.md +++ b/docs/source/en/package_reference/inference_client.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md index aa63c64b68..368a716cf3 100644 --- a/docs/source/en/package_reference/inference_types.md +++ b/docs/source/en/package_reference/inference_types.md @@ -1,5 +1,5 @@ - diff --git a/docs/source/en/package_reference/mixins.md b/docs/source/en/package_reference/mixins.md index 04e66c8f2d..42c253e710 100644 --- a/docs/source/en/package_reference/mixins.md +++ b/docs/source/en/package_reference/mixins.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/overview.md 
b/docs/source/en/package_reference/overview.md index 1265c1efef..1bb51ac910 100644 --- a/docs/source/en/package_reference/overview.md +++ b/docs/source/en/package_reference/overview.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/repository.md b/docs/source/en/package_reference/repository.md index 0a8ebf284c..de7851d6a9 100644 --- a/docs/source/en/package_reference/repository.md +++ b/docs/source/en/package_reference/repository.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 841d9d3011..0149855e02 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/space_runtime.md b/docs/source/en/package_reference/space_runtime.md index 75fac898fd..364c43be35 100644 --- a/docs/source/en/package_reference/space_runtime.md +++ b/docs/source/en/package_reference/space_runtime.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/tensorboard.md b/docs/source/en/package_reference/tensorboard.md index 20fa4a3885..14addbb5ba 100644 --- a/docs/source/en/package_reference/tensorboard.md +++ b/docs/source/en/package_reference/tensorboard.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/utilities.md b/docs/source/en/package_reference/utilities.md index 5ea9d56b46..80fe3148ff 100644 --- a/docs/source/en/package_reference/utilities.md +++ b/docs/source/en/package_reference/utilities.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/package_reference/webhooks_server.md b/docs/source/en/package_reference/webhooks_server.md index 3d6847a3e0..a9115cbb03 100644 --- a/docs/source/en/package_reference/webhooks_server.md +++ b/docs/source/en/package_reference/webhooks_server.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/en/quick-start.md b/docs/source/en/quick-start.md index dbbe0d2a91..b8a2b67005 100644 --- 
a/docs/source/en/quick-start.md +++ b/docs/source/en/quick-start.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/fr/concepts/git_vs_http.md b/docs/source/fr/concepts/git_vs_http.md index 8a4829b8ff..8ccc31b69c 100644 --- a/docs/source/fr/concepts/git_vs_http.md +++ b/docs/source/fr/concepts/git_vs_http.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/fr/guides/integrations.md b/docs/source/fr/guides/integrations.md index 5813707f90..3adcc8f9f7 100644 --- a/docs/source/fr/guides/integrations.md +++ b/docs/source/fr/guides/integrations.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/fr/index.md b/docs/source/fr/index.md index e0ddf64ae5..51a19a3ea3 100644 --- a/docs/source/fr/index.md +++ b/docs/source/fr/index.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/fr/installation.md b/docs/source/fr/installation.md index 4531cc6d67..15fb47a9f8 100644 --- a/docs/source/fr/installation.md +++ b/docs/source/fr/installation.md @@ -1,4 +1,4 @@ - @@ -59,7 +59,7 @@ pip install 'huggingface_hub[cli,torch]' Voici une liste des dépendances optionnelles dans `huggingface_hub`: - `cli` fournit une interface d'invite de commande plus pratique pour `huggingface_hub`. - `fastai`, `torch` et `tensorflow` sont des dépendances pour utiliser des fonctionnalités spécifiques à un framework. -- `dev` permet de contribuer à la librairie. Cette dépendance inclut `testing` (pour lancer des tests), `typing` (pour lancer le vérifieur de type) et `quality` (pour lancer des linters). +- `dev` permet de contribuer à la librairie. Cette dépendance inclut `testing` (pour lancer des tests), `typing` (pour lancer le vérifieur de type) et `quality` (pour lancer des linters). @@ -104,7 +104,7 @@ pip install -e . Python regardera maintenant à l'intérieur du dossier dans lequel vous avez cloné le dépôt en plus des chemins de librairie classiques. 
Par exemple, si vos packages Python sont installés dans -`./.venv/lib/python3.11/site-packages/`, Python regardera aussi dans le dossier que vous avez +`./.venv/lib/python3.13/site-packages/`, Python regardera aussi dans le dossier que vous avez cloné `./huggingface_hub/`. ## Installation avec conda @@ -158,4 +158,4 @@ pour que nous cherchions une solution. Une fois que `huggingface_hub` est installé correctement sur votre machine, vous aurez peut-être besoin de [configurer les variables d'environnement](package_reference/environment_variables) ou de [lire un de nos guides](guides/overview) -pour vous lancer. \ No newline at end of file +pour vous lancer. diff --git a/docs/source/fr/quick-start.md b/docs/source/fr/quick-start.md index f11d5412c3..1a99d830d7 100644 --- a/docs/source/fr/quick-start.md +++ b/docs/source/fr/quick-start.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/hi/index.md b/docs/source/hi/index.md index e2f83b7d56..99cd7aeccb 100644 --- a/docs/source/hi/index.md +++ b/docs/source/hi/index.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/hi/installation.md b/docs/source/hi/installation.md index cf145b1484..4d16d6624b 100644 --- a/docs/source/hi/installation.md +++ b/docs/source/hi/installation.md @@ -1,4 +1,4 @@ - @@ -103,7 +103,7 @@ pip install -e . 
ये कमांड उस फ़ोल्डर को लिंक करेंगे जिसे आपने रिपॉजिटरी में क्लोन किया है और आपके पायथन लाइब्रेरी पथ। पाइथॉन अब सामान्य लाइब्रेरी पथों के अलावा आपके द्वारा क्लोन किए गए फ़ोल्डर के अंदर भी देखेगा। -उदाहरण के लिए, यदि आपके पायथन पैकेज आमतौर पर `./.venv/lib/python3.11/site-packages/` में स्थापित हैं, +उदाहरण के लिए, यदि आपके पायथन पैकेज आमतौर पर `./.venv/lib/python3.13/site-packages/` में स्थापित हैं, पायथन आपके द्वारा क्लोन किए गए फ़ोल्डर `./huggingface_hub/` को भी खोजेगा। ## कोंडा के साथ स्थापित करें diff --git a/docs/source/hi/quick-start.md b/docs/source/hi/quick-start.md index 88cf26f02f..3ce8a78a7a 100644 --- a/docs/source/hi/quick-start.md +++ b/docs/source/hi/quick-start.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/concepts/git_vs_http.md b/docs/source/ko/concepts/git_vs_http.md index ac9761ec7c..7f2bd9933f 100644 --- a/docs/source/ko/concepts/git_vs_http.md +++ b/docs/source/ko/concepts/git_vs_http.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/cli.md b/docs/source/ko/guides/cli.md index 6dbd0fd3ef..13c4352235 100644 --- a/docs/source/ko/guides/cli.md +++ b/docs/source/ko/guides/cli.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/collections.md b/docs/source/ko/guides/collections.md index bb7db4bb36..69e75b92cf 100644 --- a/docs/source/ko/guides/collections.md +++ b/docs/source/ko/guides/collections.md @@ -1,4 +1,4 @@ - # Collections[[collections]] diff --git a/docs/source/ko/guides/community.md b/docs/source/ko/guides/community.md index 1affc7861e..76ecb23e25 100644 --- a/docs/source/ko/guides/community.md +++ b/docs/source/ko/guides/community.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/download.md b/docs/source/ko/guides/download.md index e4d52d5cca..ac658dde4e 100644 --- a/docs/source/ko/guides/download.md +++ b/docs/source/ko/guides/download.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/hf_file_system.md b/docs/source/ko/guides/hf_file_system.md index e3f9b9a521..74a02f71f5 100644 --- 
a/docs/source/ko/guides/hf_file_system.md +++ b/docs/source/ko/guides/hf_file_system.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/inference.md b/docs/source/ko/guides/inference.md index f59f2b52b8..04ac68ad41 100644 --- a/docs/source/ko/guides/inference.md +++ b/docs/source/ko/guides/inference.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/integrations.md b/docs/source/ko/guides/integrations.md index f251785a07..f0946bc298 100644 --- a/docs/source/ko/guides/integrations.md +++ b/docs/source/ko/guides/integrations.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/manage-cache.md b/docs/source/ko/guides/manage-cache.md index 9e5c1a674f..7b54c7eaab 100644 --- a/docs/source/ko/guides/manage-cache.md +++ b/docs/source/ko/guides/manage-cache.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/manage-spaces.md b/docs/source/ko/guides/manage-spaces.md index 7ee3ccde21..f629d121e0 100644 --- a/docs/source/ko/guides/manage-spaces.md +++ b/docs/source/ko/guides/manage-spaces.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/model-cards.md b/docs/source/ko/guides/model-cards.md index 15bdd320fc..41bc4f1317 100644 --- a/docs/source/ko/guides/model-cards.md +++ b/docs/source/ko/guides/model-cards.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/overview.md b/docs/source/ko/guides/overview.md index 318f933c4d..9e171c97f0 100644 --- a/docs/source/ko/guides/overview.md +++ b/docs/source/ko/guides/overview.md @@ -1,4 +1,4 @@ - @@ -118,4 +118,4 @@ rendered properly in your Markdown viewer. 
- + diff --git a/docs/source/ko/guides/repository.md b/docs/source/ko/guides/repository.md index 56328b419c..ab6e7bea34 100644 --- a/docs/source/ko/guides/repository.md +++ b/docs/source/ko/guides/repository.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/search.md b/docs/source/ko/guides/search.md index e5e37e7c68..ef739d4bde 100644 --- a/docs/source/ko/guides/search.md +++ b/docs/source/ko/guides/search.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/upload.md b/docs/source/ko/guides/upload.md index e097870b01..fd704d0ba6 100644 --- a/docs/source/ko/guides/upload.md +++ b/docs/source/ko/guides/upload.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/guides/webhooks_server.md b/docs/source/ko/guides/webhooks_server.md index adc94e1877..39049e160a 100644 --- a/docs/source/ko/guides/webhooks_server.md +++ b/docs/source/ko/guides/webhooks_server.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/index.md b/docs/source/ko/index.md index 53785d8fe3..b5428b6a7c 100644 --- a/docs/source/ko/index.md +++ b/docs/source/ko/index.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/installation.md b/docs/source/ko/installation.md index bc5329c2ce..720346b1a1 100644 --- a/docs/source/ko/installation.md +++ b/docs/source/ko/installation.md @@ -1,4 +1,4 @@ - @@ -94,7 +94,7 @@ pip install -e . 이렇게 클론한 레포지토리 폴더와 Python 경로를 연결합니다. 이제 Python은 일반적인 라이브러리 경로 외에도 복제된 폴더 내부를 찾습니다. -예를 들어 파이썬 패키지가 일반적으로 `./.venv/lib/python3.11/site-packages/`에 설치되어 있다면, Python은 복제된 폴더 `./huggingface_hub/`도 검색하게 됩니다. +예를 들어 파이썬 패키지가 일반적으로 `./.venv/lib/python3.13/site-packages/`에 설치되어 있다면, Python은 복제된 폴더 `./huggingface_hub/`도 검색하게 됩니다. 
## conda로 설치하기 [[install-with-conda]] diff --git a/docs/source/ko/package_reference/cache.md b/docs/source/ko/package_reference/cache.md index 2e155c7f63..53c6d358ee 100644 --- a/docs/source/ko/package_reference/cache.md +++ b/docs/source/ko/package_reference/cache.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/collections.md b/docs/source/ko/package_reference/collections.md index 3b7bec4826..d8276953b0 100644 --- a/docs/source/ko/package_reference/collections.md +++ b/docs/source/ko/package_reference/collections.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/community.md b/docs/source/ko/package_reference/community.md index 264af6fba5..9e598efc77 100644 --- a/docs/source/ko/package_reference/community.md +++ b/docs/source/ko/package_reference/community.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/environment_variables.md b/docs/source/ko/package_reference/environment_variables.md index 455e5e49ab..a4d26123e4 100644 --- a/docs/source/ko/package_reference/environment_variables.md +++ b/docs/source/ko/package_reference/environment_variables.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/file_download.md b/docs/source/ko/package_reference/file_download.md index 82e67a83c6..b3b36f5659 100644 --- a/docs/source/ko/package_reference/file_download.md +++ b/docs/source/ko/package_reference/file_download.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/hf_api.md b/docs/source/ko/package_reference/hf_api.md index d426d13923..f3363fce61 100644 --- a/docs/source/ko/package_reference/hf_api.md +++ b/docs/source/ko/package_reference/hf_api.md @@ -1,4 +1,4 @@ - @@ -31,8 +31,6 @@ models = hf_api.list_models() [[autodoc]] HfApi -[[autodoc]] plan_multi_commits - ## API Dataclasses[[api-dataclasses]] ### AccessRequest[[huggingface_hub.hf_api.AccessRequest]] diff --git a/docs/source/ko/package_reference/hf_file_system.md b/docs/source/ko/package_reference/hf_file_system.md index 
fb27660d8b..bebbeda1f1 100644 --- a/docs/source/ko/package_reference/hf_file_system.md +++ b/docs/source/ko/package_reference/hf_file_system.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/inference_client.md b/docs/source/ko/package_reference/inference_client.md index d3ecfcaa87..2ccab6366f 100644 --- a/docs/source/ko/package_reference/inference_client.md +++ b/docs/source/ko/package_reference/inference_client.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md index 393481e10f..6ae2736fba 100644 --- a/docs/source/ko/package_reference/inference_types.md +++ b/docs/source/ko/package_reference/inference_types.md @@ -1,5 +1,5 @@ - diff --git a/docs/source/ko/package_reference/login.md b/docs/source/ko/package_reference/login.md index 8ab39c2208..dbfa8a0650 100644 --- a/docs/source/ko/package_reference/login.md +++ b/docs/source/ko/package_reference/login.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/mixins.md b/docs/source/ko/package_reference/mixins.md index 53aaf9303a..4a4a84ad9e 100644 --- a/docs/source/ko/package_reference/mixins.md +++ b/docs/source/ko/package_reference/mixins.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/overview.md b/docs/source/ko/package_reference/overview.md index 87fcb99ea1..f55369302b 100644 --- a/docs/source/ko/package_reference/overview.md +++ b/docs/source/ko/package_reference/overview.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/repository.md b/docs/source/ko/package_reference/repository.md index b513137cc9..fc70e3e203 100644 --- a/docs/source/ko/package_reference/repository.md +++ b/docs/source/ko/package_reference/repository.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/serialization.md b/docs/source/ko/package_reference/serialization.md index d026052eda..25901237bf 100644 --- a/docs/source/ko/package_reference/serialization.md +++ 
b/docs/source/ko/package_reference/serialization.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/space_runtime.md b/docs/source/ko/package_reference/space_runtime.md index 8f22a5361f..03bc46cd13 100644 --- a/docs/source/ko/package_reference/space_runtime.md +++ b/docs/source/ko/package_reference/space_runtime.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/tensorboard.md b/docs/source/ko/package_reference/tensorboard.md index 395a733756..77f87e5ea0 100644 --- a/docs/source/ko/package_reference/tensorboard.md +++ b/docs/source/ko/package_reference/tensorboard.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/utilities.md b/docs/source/ko/package_reference/utilities.md index 592092f64a..a76e9d474b 100644 --- a/docs/source/ko/package_reference/utilities.md +++ b/docs/source/ko/package_reference/utilities.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/package_reference/webhooks_server.md b/docs/source/ko/package_reference/webhooks_server.md index 9712c6c0be..8764fb2f7f 100644 --- a/docs/source/ko/package_reference/webhooks_server.md +++ b/docs/source/ko/package_reference/webhooks_server.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/ko/quick-start.md b/docs/source/ko/quick-start.md index 9a6eb23b5a..30e07331e3 100644 --- a/docs/source/ko/quick-start.md +++ b/docs/source/ko/quick-start.md @@ -1,4 +1,4 @@ - diff --git a/docs/source/tm/_toctree.yml b/docs/source/tm/_toctree.yml new file mode 100644 index 0000000000..833cbfabdf --- /dev/null +++ b/docs/source/tm/_toctree.yml @@ -0,0 +1,6 @@ +- title: "Get started" + sections: + - local: index + title: குறியீட்டு + - local: installation + title: நிறுவல் \ No newline at end of file diff --git a/docs/source/tm/index.md b/docs/source/tm/index.md new file mode 100644 index 0000000000..a4fdc5e4b9 --- /dev/null +++ b/docs/source/tm/index.md @@ -0,0 +1,45 @@ + + +# 🤗 ஹப் கிளையன்ட் லைப்ரரி + +`Huggingface_hub` லைப்ரரி உங்களை [ஹக்கிங் ஃபேஸ் ஹப்](https://hf.co) உடன் தொடர்புகொள்ள 
அனுமதிக்கிறது, இது படைப்பாளர்கள் மற்றும் கூட்டுப்பணியாளர்களுக்கான இயந்திர கற்றல் தளமாகும். உங்கள் திட்டங்களுக்கான முன் பயிற்சி பெற்ற மாதிரிகள் மற்றும் தரவுத்தொகுப்புகளைக் கண்டறியவும் அல்லது ஹப்பில் ஹோஸ்ட் செய்யப்பட்ட நூற்றுக்கணக்கான இயந்திர கற்றல் பயன்பாடுகளுடன் விளையாடவும். உங்கள் சொந்த மாதிரிகள் மற்றும் தரவுத்தொகுப்புகளை உருவாக்கி சமூகத்துடன் பகிரலாம். `huggingface_hub` லைப்ரரி பைதான் மூலம் இவற்றைச் செய்வதற்கான எளிய வழியை வழங்குகிறது. + + +[இந்த துரிதத் தொடக்கக் கையேட்டை](quick-start) வாசித்தால், `huggingface_hub` நூலகத்துடன் வேலை செய்ய எவ்வாறு ஆரம்பிக்கலாம் என்பதை நீங்கள் கற்றுக்கொள்வீர்கள். இதில், 🤗 ஹப் (Hub) இலிருந்து கோப்புகளை எவ்வாறு பதிவிறக்குவது, ஒரு `repository` உருவாக்குவது மற்றும் கோப்புகளை ஹபுக்கு எவ்வாறு பதிவேற்றுவது என்பதை நீங்கள் கற்றுக்கொள்வீர்கள். மேலும், 🤗 ஹபில் உங்கள் repositoryகளை எவ்வாறு நிர்வகிக்க வேண்டும், விவாதங்களில் எவ்வாறு ஈடுபட வேண்டும், அல்லது `Inference API`யை எப்படி அணுகுவது என்பதையும் கற்றுக்கொள்ள இந்த வழிகாட்டியை தொடர்ந்து வாசியுங்கள். + + +
+ +
+ +## பங்களிப்பு + +`huggingface_hub`-க்கு அனைத்து பங்களிப்புகளும் வரவேற்கப்படுகின்றன மற்றும் சமமாக மதிக்கப்படுகின்றன! 🤗 கோடில் உள்ள உள்ளமைவுகளையும் அல்லது பிழைகளைச் சரிசெய்வதோடு, ஆவணங்களை சரியாகவும், தற்போதைய நிலையில் இருப்பதையும் உறுதிப்படுத்துவதன் மூலம் தங்களால் உதவலாம், மேலும் இஷ்யூக்களுக்கான கேள்விகளுக்கு பதிலளிக்கவும், நூலகத்தை மேம்படுத்துமாறு நீங்கள் நினைப்பதைத் தொடர்ந்து புதிய அம்சங்களை கோரலாம். பங்களிப்பு குறித்த [வழிகாட்டலை](https://github.com/huggingface/huggingface_hub/blob/main/CONTRIBUTING.md) பார்க்கவும், புதிய இஷ்யூவோ அல்லது அம்சக் கோரிக்கையோ எப்படி சமர்ப்பிக்க வேண்டும், புல் ரிக்வெஸ்ட்களை (Pull Request) சமர்ப்பிப்பது எப்படி, மேலும் உங்கள் பங்களிப்புகள் அனைத்தும் எதிர்பார்த்தது போல வேலை செய்கிறதா என்பதைச் சோதிப்பது எப்படி என்பதையும் கற்றுக்கொள்ளலாம். + +பங்களிப்பாளர்கள் அனைவருக்கும் உள்ளடக்கிய மற்றும் வரவேற்கக்கூடிய ஒத்துழைப்பு நிலையை உருவாக்க, நாங்கள் உருவாக்கிய [நடத்தை விதிகளை](https://github.com/huggingface/huggingface_hub/blob/main/CODE_OF_CONDUCT.md) மதிக்க வேண்டும். + + + + + + diff --git a/docs/source/tm/installation.md b/docs/source/tm/installation.md new file mode 100644 index 0000000000..f16ac74667 --- /dev/null +++ b/docs/source/tm/installation.md @@ -0,0 +1,131 @@ +# நிறுவல் + +நீங்கள் தொடங்குவதற்கு முன், தகுந்த தொகுப்புகளை நிறுவுவதன் மூலம் உங்கள் சூழலை அமைக்க வேண்டும். + +`huggingface_hub` **Python 3.8+** மின்பொருள்களில் சோதிக்கப்பட்டுள்ளது. + +### பிப் மூலம் நிறுவு + +**pip மூலம் நிறுவல்** + +`huggingface_hub`-ஐ ஒரு [மெய்நிகர் சூழலில்](https://docs.python.org/3/library/venv.html) (virtual environment) நிறுவுவது மிகவும் பரிந்துரைக்கப்படுகிறது. நீங்கள் பைதான் மெய்நிகர் சூழல்களைக் குறித்து அறியாதவராக இருந்தால், இந்த [வழிகாட்டலைப்](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) பார்க்கவும். ஒரு மெய்நிகர் சூழல் பல்வேறு திட்டங்களை எளிதில் நிர்வகிக்கவும், சார்புகளுக்கிடையிலான (dependencies) இணக்கமின்மை பிரச்சனைகளைத் தவிர்க்கவும் உதவுகிறது. 
+ +முதலில், உங்கள் திட்ட அடைவரிசையில் (project directory) ஒரு மெய்நிகர் சூழலை உருவாக்கத் தொடங்குங்கள்: + +```bash +python -m venv .env +``` +மெய்நிகர் சூழலை செயல்படுத்தவும். Linux மற்றும் macOS-இல்: + + +```bash +source .env/bin/activate +``` + +விண்டோஸ்-இல் மெய்நிகர் சூழலை செயல்படுத்த: + +```bash +.env/Scripts/activate +``` + +இப்போது நீங்கள் `huggingface_hub`-ஐ [PyPi பதிவகத்திலிருந்து](https://pypi.org/project/huggingface-hub/) நிறுவ தயாராக இருக்கிறீர்கள். + +```bash +pip install --upgrade huggingface_hub +``` + +முடித்த பிறகு, [நிறுவல் சரியாக வேலை](#check-installation) செய்கிறதா என்பதைச் சோதிக்கவும். + +### விருப்பத் தேவைப்படும் சார்புகளை நிறுவல் + +`huggingface_hub`-இன் சில சார்புகள் விருப்பமானவை, ஏனெனில் அவை `huggingface_hub`-இன் அடிப்படை அம்சங்களை இயக்க தேவையில்லை. எனினும், விருப்பச் சார்புகள் நிறுவப்படாதால், `huggingface_hub`-இன் சில அம்சங்கள் கிடைக்காது. + +நீங்கள் விருப்பத் தேவைப்படும் சார்புகளை `pip` மூலம் நிறுவலாம்: + +```bash +# டென்சர்‌ஃபிளோவுக்கான குறிப்பிட்ட அம்சங்களுக்கு சார்ந்த பொறுப்பு நிறுவவும் +# /!\ எச்சரிக்கை: இது `pip install tensorflow` க்கு சமமாகக் கருதப்படாது +pip install 'huggingface_hub[tensorflow]' + +# டார்ச்-குறிப்பிட்ட மற்றும் CLI-குறிப்பிட்ட அம்சங்களுக்கு தேவையான பொறுப்புகளை நிறுவவும். +pip install 'huggingface_hub[cli,torch]' +``` +`huggingface_hub`-இல் உள்ள விருப்பத் தேவைப்படும் சார்புகளின் பட்டியல்: + +- `cli`: `huggingface_hub`-க்கு மிகவும் வசதியான CLI இடைமுகத்தை வழங்குகிறது. +- `fastai`, `torch`, `tensorflow`: வடிவமைப்பு குறிப்பிட்ட அம்சங்களை இயக்க தேவையான சார்புகள். +- `dev`: நூலகத்திற்கு பங்களிக்க தேவையான சார்புகள். இதில் சோதனை (சோதனைகளை இயக்க), வகை சோதனை (வகை சரிபார்ப்பு ஐ இயக்க) மற்றும் தரம் (லிண்டர்கள் ஐ இயக்க) உள்ளன. + +### மூலத்திலிருந்து நிறுவல் + +சில சமயம், `huggingface_hub`-ஐ நேரடியாக மூலத்திலிருந்து நிறுவுவது சுவாரஸ்யமாக இருக்கலாம். இது, சமீபத்திய நிலையான பதிப்பு பதிலாக, புதியதாக இருக்கும் `முக்கிய` பதிப்பைப் பயன்படுத்த அனுமதிக்கிறது. 
`முக்கிய` பதிப்பு, சமீபத்திய முன்னேற்றங்களுடன் புதுப்பிக்க உதவுகிறது, உதாரணமாக, சமீபத்திய அதிகாரப்பூர்வ வெளியீட்டுக்குப் பிறகு பிழை சரிசெய்யப்பட்டிருந்தாலும் புதிய வெளியீடு வந்ததாக இல்லை. + +எனினும், இதன் பொருள் `முக்கிய` பதிப்பு எப்போதும் நிலையாக இருக்காது. `முக்கிய` பதிப்பை செயல்படுமாறு வைத்திருக்க நாங்கள் முயற்சிக்கிறோம், மேலும் பெரும்பாலான சிக்கல்களை சில மணி நேரங்கள் அல்லது ஒரு நாளுக்குள் தீர்க்கவேண்டியவை. நீங்கள் ஒரு பிரச்சினையை எதிர்கொண்டால், அதைக் கூட்டுங்கள், அதைக் கூட விரைவில் சரிசெய்ய நாங்கள் முயற்சிக்கிறோம்! + + +```bash +pip install git+https://github.com/huggingface/huggingface_hub +``` + +மூலத்திலிருந்து நிறுவும் போது, நீங்கள் குறிப்பிட்ட கிளையை (branch) குறிப்படலாம். இது, இன்னும் இணைக்கப்படாத புதிய அம்சம் அல்லது புதிய பிழை சரிசெய்வுகளை சோதிக்க விரும்பும்போது பயனுள்ளதாக இருக்கும்: + + +```bash +pip install git+https://github.com/huggingface/huggingface_hub@my-feature-branch +``` +முடித்த பிறகு, [நிறுவல் சரியாக வேலை செய்கிறதா](#check-installation) என்பதைச் சோதிக்கவும். + +### திருத்தக்கூடிய நிறுவல் + +மூலத்திலிருந்து நிறுவுதல் [எடிடேபிள் இன்ஸ்டால்](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs) அமைப்பதற்கு அனுமதிக்கிறது. இது, `huggingface_hub`-க்கு பங்களிக்க திட்டமிட்டு, கோடில் மாற்றங்களை சோதிக்க விரும்பும் போது மேலும் முற்றிலும் மேம்பட்ட நிறுவல் ஆகும். உங்கள் இயந்திரத்தில் `huggingface_hub`-இன் ஒரு உள்ளூர் நகலை கிளோன் செய்ய வேண்டும். + +```bash +# முதலில், கிடுகிடுக்கும் தொகுப்பை உள்ளூர் முறையில் கிளோன் செய்யவும். +git clone https://github.com/huggingface/huggingface_hub.git + +# அதன் பிறகு, -e கொள்கையைப் பயன்படுத்தி நிறுவவும். +cd huggingface_hub +pip install -e . +``` + +இந்த கட்டளைகள், நீங்கள் தரவுகளை கிளோன் செய்த அடைவை மற்றும் உங்கள் பைதான் நூலகப் பாதைகளை இணைக்கும். பைதான், தற்போது சாதாரண நூலகப் பாதைகளுக்கு கூட, நீங்கள் கிளோன் செய்த அடைவைப் பார்வையிடும். 
+ +உதாரணமாக, உங்கள் பைதான் தொகுப்புகள் பொதுவாக `./.venv/lib/python3.13/site-packages/` இல் நிறுவப்பட்டிருந்தால், பைதான் நீங்கள் கிளோன் செய்த `./huggingface_hub/` அடைவையும் தேடுவதாக இருக்கும். + +## கொண்டா மூலம் நிறுவல் + +**நீங்கள் அதனுடன் மேலும் பரிச்சயமாக இருந்தால்**, `huggingface_hub`-ஐ [conda-forge சேனல்](https://anaconda.org/conda-forge/huggingface_hub) பயன்படுத்தி நிறுவலாம்: + +```bash +conda install -c conda-forge huggingface_hub +``` + +முடித்த பிறகு, [நிறுவல் சரியாக வேலை செய்கிறதா என்பதைச் சோதிக்கவும்](#check-installation). + +## நிறுவலைச் சோதிக்கவும் + +நிறுவலுக்குப் பிறகு, `huggingface_hub` சரியாக வேலை செய்கிறதா என்பதைக் கீழ்காணும் கட்டளையை இயக்கி சோதிக்கவும்: + +```bash +python -c "from huggingface_hub import model_info; print(model_info('gpt2'))" +``` + +இந்த கட்டளை, Hub-இல் உள்ள [gpt2](https://huggingface.co/gpt2) மாடலுக்கான தகவல்களை பெறும். வெளியீடு கீழ்காணும் மாதிரியாக இருக்க வேண்டும்: + + +```text +Model Name: gpt2 +Tags: ['pytorch', 'tf', 'jax', 'tflite', 'rust', 'safetensors', 'gpt2', 'text-generation', 'en', 'doi:10.57967/hf/0039', 'transformers', 'exbert', 'license:mit', 'has_space'] +Task: text-generation +``` + +## Windows மரபுகள் + +எந்த இடத்திலும் சிறந்த ML-ஐ பொதுமக்களுக்கு வழங்கும் எங்கள் இலக்குடன், `huggingface_hub`-ஐ ஒரு குறைவில்லாத தளத்துடன் உருவாக்கினோம் மற்றும் குறிப்பாக Unix அடிப்படையிலான மற்றும் Windows அமைப்புகளில் சரியாக செயல்படவும். ஆனால், Windows-இல் இயங்கும் போது `huggingface_hub`-க்கு சில வரையறைகள் உள்ளன. இங்கே தெரிந்த சிக்கல்களின் முழு பட்டியல் உள்ளது. உங்கள் சந்தர்ப்பத்தில் ஆவணமிடாத சிக்கல் கண்டுபிடித்தால், [Github-ல் ஒரு பிரச்சனை திறக்க](https://github.com/huggingface/huggingface_hub/issues/new/choose) எங்களுக்கு தெரிவிக்கவும். + +- `huggingface_hub`-இன் காசே அமைப்பு, Hub-இல் இருந்து பதிவிறக்கம் செய்யப்பட்ட கோப்புகளைச் சரியாக காசே செய்ய சிம்லிங்குகளை நம்புகிறது. Windows-இல், சிம்லிங்குகளை இயக்குவதற்கு நீங்கள் டெவலப்பர் முறை அல்லது உங்கள் ஸ்கிரிப்டைப் ஆட்மின் ஆக இயக்க வேண்டும். 
சிம்லிங்குகள் இயக்கப்படாவிட்டால், காசே அமைப்பு இன்னும் வேலை செய்யும் ஆனால் சரியாக செயல்படாது. மேலும் விவரங்களுக்கு [காசே வரையறைகள்](./guides/manage-cache#limitations) பகுதியைப் படிக்கவும். +- Hub-இல் கோப்பு பாதைகள் சிறப்பு எழுத்துக்கள் கொண்டதாக இருக்கலாம் (எ.கா. `"path/to?/my/file"`). Windows, [சிறப்பு எழுத்துக்கள்](https://learn.microsoft.com/en-us/windows/win32/intl/character-sets-used-in-file-names) மீது அதிக கட்டுப்பாடுகளை கொண்டுள்ளது, இது Windows-இல் அந்த கோப்புகளை பதிவிறக்கம் செய்ய முடியாததாக உருவாக்குகிறது. இது நிச்சயமாக ஒரு புலவியல் சந்தர்ப்பமாக இருக்க வேண்டும். இது தவறு என்று நீங்கள் நினைத்தால், அதற்கான தீர்வைத் தேட எங்களை அணுகவும். + +## அடுத்த கட்டங்கள் + +`huggingface_hub` உங்கள் இயந்திரத்தில் முறையாக நிறுவப்பட்ட பிறகு, [சூழல் மாறிலிகளை](package_reference/environment_variables) கட்டமைக்க அல்லது [எங்கள் வழிகாட்டிகளில்](guides/overview) ஒன்றைப் பார்வையிட தேவையெனில், தொடங்குங்கள். \ No newline at end of file diff --git a/i18n/README_cn.md b/i18n/README_cn.md index 6aea789b74..24f4e73849 100644 --- a/i18n/README_cn.md +++ b/i18n/README_cn.md @@ -25,11 +25,12 @@ 中文(简体)

+ --- -**文档**: ``https://hf.co/docs/huggingface_hub `` +**文档**: https://hf.co/docs/huggingface_hub -**源代码**: ``https://github.com/huggingface/huggingface_hub `` +**源代码**: https://github.com/huggingface/huggingface_hub --- diff --git a/setup.py b/setup.py index 373e3119cf..9d69ad35c4 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,6 @@ def get_version() -> str: extras["inference"] = [ "aiohttp", # for AsyncInferenceClient - "minijinja>=1.0", # for chat-completion if not TGI-served ] extras["torch"] = [ @@ -74,7 +73,7 @@ def get_version() -> str: "urllib3<2.0", # VCR.py broken with urllib3 2.0 (see https://urllib3.readthedocs.io/en/stable/v2-migration-guide.html) "soundfile", "Pillow", - "gradio", # to test webhooks + "gradio>=4.0.0", # to test webhooks # pin to avoid issue on Python3.12 "numpy", # for embeddings "fastapi", # To build the documentation ] @@ -134,6 +133,8 @@ def get_version() -> str: "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], include_package_data=True, diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 8b9250a2fd..d6f719a997 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -46,7 +46,7 @@ from typing import TYPE_CHECKING -__version__ = "0.26.0.dev0" +__version__ = "0.27.0.dev0" # Alphabetical order of definitions is ensured in tests # WARNING: any comment added in this dictionary definition will be lost when @@ -70,10 +70,6 @@ "logout", "notebook_login", ], - "_multi_commits": [ - "MultiCommitException", - "plan_multi_commits", - ], "_snapshot_download": [ "snapshot_download", ], @@ -167,7 +163,6 @@ "create_branch", "create_collection", "create_commit", - "create_commits_on_pr", "create_discussion", "create_inference_endpoint", "create_pull_request", 
@@ -601,10 +596,6 @@ def __dir__(): logout, # noqa: F401 notebook_login, # noqa: F401 ) - from ._multi_commits import ( - MultiCommitException, # noqa: F401 - plan_multi_commits, # noqa: F401 - ) from ._snapshot_download import snapshot_download # noqa: F401 from ._space_api import ( SpaceHardware, # noqa: F401 @@ -694,7 +685,6 @@ def __dir__(): create_branch, # noqa: F401 create_collection, # noqa: F401 create_commit, # noqa: F401 - create_commits_on_pr, # noqa: F401 create_discussion, # noqa: F401 create_inference_endpoint, # noqa: F401 create_pull_request, # noqa: F401 diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index 049394af1d..6fd05f053a 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -53,7 +53,6 @@ import os import time from dataclasses import dataclass -from functools import lru_cache from pathlib import Path from typing import Optional @@ -179,7 +178,6 @@ def save(self, paths: LocalUploadFilePaths) -> None: self.timestamp = new_timestamp -@lru_cache(maxsize=128) # ensure singleton def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths: """Compute paths to the files related to a download process. @@ -220,7 +218,6 @@ def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFil return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path) -@lru_cache(maxsize=128) # ensure singleton def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths: """Compute paths to the files related to an upload process. 
@@ -404,7 +401,6 @@ def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, et f.write(f"{commit_hash}\n{etag}\n{time.time()}\n") -@lru_cache() def _huggingface_dir(local_dir: Path) -> Path: """Return the path to the `.cache/huggingface` directory in a local directory.""" # Wrap in lru_cache to avoid overwriting the .gitignore file if called multiple times diff --git a/src/huggingface_hub/_login.py b/src/huggingface_hub/_login.py index 24af592acf..b14702201d 100644 --- a/src/huggingface_hub/_login.py +++ b/src/huggingface_hub/_login.py @@ -15,8 +15,6 @@ import os import subprocess -import warnings -from functools import partial from getpass import getpass from pathlib import Path from typing import Optional @@ -43,6 +41,7 @@ _save_token, get_stored_tokens, ) +from .utils._deprecation import _deprecate_arguments, _deprecate_positional_args logger = logging.get_logger(__name__) @@ -56,8 +55,15 @@ """ +@_deprecate_arguments( + version="1.0", + deprecated_args="write_permission", + custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.", +) +@_deprecate_positional_args(version="1.0") def login( token: Optional[str] = None, + *, add_to_git_credential: bool = False, new_session: bool = True, write_permission: bool = False, @@ -98,8 +104,8 @@ def login( to the end user. new_session (`bool`, defaults to `True`): If `True`, will request a token even if one is already saved on the machine. - write_permission (`bool`, defaults to `False`): - If `True`, requires a token with write permission. + write_permission (`bool`): + Ignored and deprecated argument. Raises: [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) If an organization token is passed. 
Only personal account tokens are valid @@ -111,17 +117,17 @@ def login( """ if token is not None: if not add_to_git_credential: - print( + logger.info( "The token has not been saved to the git credentials helper. Pass " "`add_to_git_credential=True` in this function directly or " "`--add-to-git-credential` if using via `huggingface-cli` if " "you want to set the git credential as well." ) - _login(token, add_to_git_credential=add_to_git_credential, write_permission=write_permission) + _login(token, add_to_git_credential=add_to_git_credential) elif is_notebook(): - notebook_login(new_session=new_session, write_permission=write_permission) + notebook_login(new_session=new_session) else: - interpreter_login(new_session=new_session, write_permission=write_permission) + interpreter_login(new_session=new_session) def logout(token_name: Optional[str] = None) -> None: @@ -137,7 +143,7 @@ def logout(token_name: Optional[str] = None) -> None: If the access token name is not found. """ if get_token() is None and not get_stored_tokens(): # No active token and no saved access tokens - print("Not logged in!") + logger.warning("Not logged in!") return if not token_name: # Delete all saved access tokens and token @@ -146,10 +152,10 @@ def logout(token_name: Optional[str] = None) -> None: Path(file_path).unlink() except FileNotFoundError: pass - print("Successfully logged out from all access tokens.") + logger.info("Successfully logged out from all access tokens.") else: _logout_from_token(token_name) - print(f"Successfully logged out from access token: {token_name}.") + logger.info(f"Successfully logged out from access token: {token_name}.") unset_git_credential() @@ -187,10 +193,10 @@ def auth_switch(token_name: str, add_to_git_credential: bool = False) -> None: raise ValueError(f"Access token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}") # Write token to HF_TOKEN_PATH _set_active_token(token_name, add_to_git_credential) - print(f"The current active token is: 
{token_name}") + logger.info(f"The current active token is: {token_name}") token_from_environment = _get_token_from_environment() if token_from_environment is not None and token_from_environment != token: - warnings.warn( + logger.warning( "The environment variable `HF_TOKEN` is set and will override the access token you've just switched to." ) @@ -200,7 +206,7 @@ def auth_list() -> None: tokens = get_stored_tokens() if not tokens: - print("No access tokens found.") + logger.info("No access tokens found.") return # Find current token current_token = get_token() @@ -222,11 +228,11 @@ def auth_list() -> None: print(f"{is_current} {{:<{max_offset}}}| {{:<15}}".format(token_name, masked_token)) if _get_token_from_environment(): - print( + logger.warning( "\nNote: Environment variable `HF_TOKEN` is set and is the current active token independently from the stored tokens listed above." ) elif current_token_name is None: - print( + logger.warning( "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `huggingface-cli login` to log in." ) @@ -236,7 +242,13 @@ def auth_list() -> None: ### -def interpreter_login(new_session: bool = True, write_permission: bool = False) -> None: +@_deprecate_arguments( + version="1.0", + deprecated_args="write_permission", + custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.", +) +@_deprecate_positional_args(version="1.0") +def interpreter_login(*, new_session: bool = True, write_permission: bool = False) -> None: """ Displays a prompt to log in to the HF website and store the token. @@ -249,36 +261,33 @@ def interpreter_login(new_session: bool = True, write_permission: bool = False) Args: new_session (`bool`, defaults to `True`): If `True`, will request a token even if one is already saved on the machine. - write_permission (`bool`, defaults to `False`): - If `True`, requires a token with write permission. 
- + write_permission (`bool`): + Ignored and deprecated argument. """ - if not new_session and _current_token_okay(write_permission=write_permission): - print("User is already logged in.") + if not new_session and get_token() is not None: + logger.info("User is already logged in.") return from .commands.delete_cache import _ask_for_confirmation_no_tui print(_HF_LOGO_ASCII) if get_token() is not None: - print( + logger.info( " A token is already saved on your machine. Run `huggingface-cli" " whoami` to get more information or `huggingface-cli logout` if you want" " to log out." ) - print(" Setting a new token will erase the existing one.") + logger.info(" Setting a new token will erase the existing one.") - print(" To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .") + logger.info( + " To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens ." + ) if os.name == "nt": - print("Token can be pasted using 'Right-Click'.") + logger.info("Token can be pasted using 'Right-Click'.") token = getpass("Enter your token (input will not be visible): ") add_to_git_credential = _ask_for_confirmation_no_tui("Add token as git credential?") - _login( - token=token, - add_to_git_credential=add_to_git_credential, - write_permission=write_permission, - ) + _login(token=token, add_to_git_credential=add_to_git_credential) ### @@ -305,7 +314,13 @@ def interpreter_login(new_session: bool = True, write_permission: bool = False) notebooks. 
""" -def notebook_login(new_session: bool = True, write_permission: bool = False) -> None: +@_deprecate_arguments( + version="1.0", + deprecated_args="write_permission", + custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.", +) +@_deprecate_positional_args(version="1.0") +def notebook_login(*, new_session: bool = True, write_permission: bool = False) -> None: """ Displays a widget to log in to the HF website and store the token. @@ -318,8 +333,8 @@ def notebook_login(new_session: bool = True, write_permission: bool = False) -> Args: new_session (`bool`, defaults to `True`): If `True`, will request a token even if one is already saved on the machine. - write_permission (`bool`, defaults to `False`): - If `True`, requires a token with write permission. + write_permission (`bool`): + Ignored and deprecated argument. """ try: import ipywidgets.widgets as widgets # type: ignore @@ -329,8 +344,8 @@ def notebook_login(new_session: bool = True, write_permission: bool = False) -> "The `notebook_login` function can only be used in a notebook (Jupyter or" " Colab) and you need the `ipywidgets` module: `pip install ipywidgets`." ) - if not new_session and _current_token_okay(write_permission=write_permission): - print("User is already logged in.") + if not new_session and get_token() is not None: + logger.info("User is already logged in.") return box_layout = widgets.Layout(display="flex", flex_flow="column", align_items="center", width="50%") @@ -352,14 +367,8 @@ def notebook_login(new_session: bool = True, write_permission: bool = False) -> display(login_token_widget) # On click events - def login_token_event(t, write_permission: bool = False): - """ - Event handler for the login button. - - Args: - write_permission (`bool`, defaults to `False`): - If `True`, requires a token with write permission. 
- """ + def login_token_event(t): + """Event handler for the login button.""" token = token_widget.value add_to_git_credential = git_checkbox_widget.value # Erase token and clear value to make sure it's not saved in the notebook. @@ -368,14 +377,14 @@ def login_token_event(t, write_permission: bool = False): login_token_widget.children = [widgets.Label("Connecting...")] try: with capture_output() as captured: - _login(token, add_to_git_credential=add_to_git_credential, write_permission=write_permission) + _login(token, add_to_git_credential=add_to_git_credential) message = captured.getvalue() except Exception as error: message = str(error) # Print result (success message or error) login_token_widget.children = [widgets.Label(line) for line in message.split("\n") if line.strip()] - token_finish_button.on_click(partial(login_token_event, write_permission=write_permission)) + token_finish_button.on_click(login_token_event) ### @@ -386,7 +395,6 @@ def login_token_event(t, write_permission: bool = False): def _login( token: str, add_to_git_credential: bool, - write_permission: bool = False, ) -> None: from .hf_api import whoami # avoid circular import @@ -395,25 +403,20 @@ def _login( token_info = whoami(token) permission = token_info["auth"]["accessToken"]["role"] - if write_permission and permission != "write": - raise ValueError( - "Token is valid but is 'read-only' and a 'write' token is required.\nPlease provide a new token with" - " correct permission." 
- ) - print(f"Token is valid (permission: {permission}).") + logger.info(f"Token is valid (permission: {permission}).") token_name = token_info["auth"]["accessToken"]["displayName"] # Store token locally _save_token(token=token, token_name=token_name) # Set active token _set_active_token(token_name=token_name, add_to_git_credential=add_to_git_credential) - print("Login successful.") + logger.info("Login successful.") if _get_token_from_environment(): - print( + logger.warning( "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured." ) else: - print(f"The current active token is: `{token_name}`") + logger.info(f"The current active token is: `{token_name}`") def _logout_from_token(token_name: str) -> None: @@ -435,7 +438,7 @@ def _logout_from_token(token_name: str) -> None: _save_stored_tokens(stored_tokens) if token == _get_token_from_file(): - warnings.warn(f"Active token '{token_name}' has been deleted.") + logger.warning(f"Active token '{token_name}' has been deleted.") Path(constants.HF_TOKEN_PATH).unlink(missing_ok=True) @@ -455,35 +458,17 @@ def _set_active_token( if add_to_git_credential: if _is_git_credential_helper_configured(): set_git_credential(token) - print( + logger.info( "Your token has been saved in your configured git credential helpers" + f" ({','.join(list_credential_helpers())})." ) else: - print("Token has not been saved to git credential helper.") + logger.warning("Token has not been saved to git credential helper.") # Write token to HF_TOKEN_PATH path = Path(constants.HF_TOKEN_PATH) path.parent.mkdir(parents=True, exist_ok=True) path.write_text(token) - print(f"Your token has been saved to {constants.HF_TOKEN_PATH}") - - -def _current_token_okay(write_permission: bool = False): - """Check if the current token is valid. - - Args: - write_permission (`bool`, defaults to `False`): - If `True`, requires a token with write permission. 
- - Returns: - `bool`: `True` if the current token is valid, `False` otherwise. - """ - from .hf_api import get_token_permission # avoid circular import - - permission = get_token_permission() - if permission is None or (write_permission and permission != "write"): - return False - return True + logger.info(f"Your token has been saved to {constants.HF_TOKEN_PATH}") def _is_git_credential_helper_configured() -> bool: diff --git a/src/huggingface_hub/_multi_commits.py b/src/huggingface_hub/_multi_commits.py deleted file mode 100644 index c79377b092..0000000000 --- a/src/huggingface_hub/_multi_commits.py +++ /dev/null @@ -1,306 +0,0 @@ -# coding=utf-8 -# Copyright 2023-present, the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Contains utilities to multi-commits (i.e. 
push changes iteratively on a PR).""" - -import re -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Iterable, List, Optional, Set, Tuple, Union - -from ._commit_api import CommitOperationAdd, CommitOperationDelete -from .community import DiscussionWithDetails -from .utils import experimental -from .utils._cache_manager import _format_size -from .utils.insecure_hashlib import sha256 - - -if TYPE_CHECKING: - from .hf_api import HfApi - - -class MultiCommitException(Exception): - """Base exception for any exception happening while doing a multi-commit.""" - - -MULTI_COMMIT_PR_DESCRIPTION_TEMPLATE = """ -## {commit_message} - -{commit_description} - -**Multi commit ID:** {multi_commit_id} - -Scheduled commits: - -{multi_commit_strategy} - -_This is a PR opened using the `huggingface_hub` library in the context of a multi-commit. PR can be commented as a usual PR. However, please be aware that manually updating the PR description, changing the PR status, or pushing new commits, is not recommended as it might corrupt the commit process. Learn more about multi-commits [in this guide](https://huggingface.co/docs/huggingface_hub/main/guides/upload)._ -""" - -MULTI_COMMIT_PR_COMPLETION_COMMENT_TEMPLATE = """ -Multi-commit is now completed! You can ping the repo owner to review the changes. This PR can now be commented or modified without risking to corrupt it. - -_This is a comment posted using the `huggingface_hub` library in the context of a multi-commit. Learn more about multi-commits [in this guide](https://huggingface.co/docs/huggingface_hub/main/guides/upload)._ -""" - -MULTI_COMMIT_PR_CLOSING_COMMENT_TEMPLATE = """ -`create_pr=False` has been passed so PR is automatically merged. - -_This is a comment posted using the `huggingface_hub` library in the context of a multi-commit. 
Learn more about multi-commits [in this guide](https://huggingface.co/docs/huggingface_hub/main/guides/upload)._ -""" - -MULTI_COMMIT_PR_CLOSE_COMMENT_FAILURE_NO_CHANGES_TEMPLATE = """ -Cannot merge Pull Requests as no changes are associated. This PR will be closed automatically. - -_This is a comment posted using the `huggingface_hub` library in the context of a multi-commit. Learn more about multi-commits [in this guide](https://huggingface.co/docs/huggingface_hub/main/guides/upload)._ -""" - -MULTI_COMMIT_PR_CLOSE_COMMENT_FAILURE_BAD_REQUEST_TEMPLATE = """ -An error occurred while trying to merge the Pull Request: `{error_message}`. - -_This is a comment posted using the `huggingface_hub` library in the context of a multi-commit. Learn more about multi-commits [in this guide](https://huggingface.co/docs/huggingface_hub/main/guides/upload)._ -""" - - -STEP_ID_REGEX = re.compile(r"- \[(?P[ |x])\].*(?P[a-fA-F0-9]{64})", flags=re.MULTILINE) - - -@experimental -def plan_multi_commits( - operations: Iterable[Union[CommitOperationAdd, CommitOperationDelete]], - max_operations_per_commit: int = 50, - max_upload_size_per_commit: int = 2 * 1024 * 1024 * 1024, -) -> Tuple[List[List[CommitOperationAdd]], List[List[CommitOperationDelete]]]: - """Split a list of operations in a list of commits to perform. - - Implementation follows a sub-optimal (yet simple) algorithm: - 1. Delete operations are grouped together by commits of maximum `max_operations_per_commits` operations. - 2. All additions exceeding `max_upload_size_per_commit` are committed 1 by 1. - 3. All remaining additions are grouped together and split each time the `max_operations_per_commit` or the - `max_upload_size_per_commit` limit is reached. - - We do not try to optimize the splitting to get the lowest number of commits as this is a NP-hard problem (see - [bin packing problem](https://en.wikipedia.org/wiki/Bin_packing_problem)). 
For our use case, it is not problematic - to use a sub-optimal solution so we favored an easy-to-explain implementation. - - Args: - operations (`List` of [`~hf_api.CommitOperation`]): - The list of operations to split into commits. - max_operations_per_commit (`int`): - Maximum number of operations in a single commit. Defaults to 50. - max_upload_size_per_commit (`int`): - Maximum size to upload (in bytes) in a single commit. Defaults to 2GB. Files bigger than this limit are - uploaded, 1 per commit. - - Returns: - `Tuple[List[List[CommitOperationAdd]], List[List[CommitOperationDelete]]]`: a tuple. First item is a list of - lists of [`CommitOperationAdd`] representing the addition commits to push. The second item is a list of lists - of [`CommitOperationDelete`] representing the deletion commits. - - - - `plan_multi_commits` is experimental. Its API and behavior is subject to change in the future without prior notice. - - - - Example: - ```python - >>> from huggingface_hub import HfApi, plan_multi_commits - >>> addition_commits, deletion_commits = plan_multi_commits( - ... operations=[ - ... CommitOperationAdd(...), - ... CommitOperationAdd(...), - ... CommitOperationDelete(...), - ... CommitOperationDelete(...), - ... CommitOperationAdd(...), - ... ], - ... ) - >>> HfApi().create_commits_on_pr( - ... repo_id="my-cool-model", - ... addition_commits=addition_commits, - ... deletion_commits=deletion_commits, - ... (...) - ... verbose=True, - ... ) - ``` - - - - The initial order of the operations is not guaranteed! All deletions will be performed before additions. If you are - not updating multiple times the same file, you are fine. 
- - - """ - addition_commits: List[List[CommitOperationAdd]] = [] - deletion_commits: List[List[CommitOperationDelete]] = [] - - additions: List[CommitOperationAdd] = [] - additions_size = 0 - deletions: List[CommitOperationDelete] = [] - for op in operations: - if isinstance(op, CommitOperationDelete): - # Group delete operations together - deletions.append(op) - if len(deletions) >= max_operations_per_commit: - deletion_commits.append(deletions) - deletions = [] - - elif op.upload_info.size >= max_upload_size_per_commit: - # Upload huge files 1 by 1 - addition_commits.append([op]) - - elif additions_size + op.upload_info.size < max_upload_size_per_commit: - # Group other additions and split if size limit is reached (either max_nb_files or max_upload_size) - additions.append(op) - additions_size += op.upload_info.size - - else: - addition_commits.append(additions) - additions = [op] - additions_size = op.upload_info.size - - if len(additions) >= max_operations_per_commit: - addition_commits.append(additions) - additions = [] - additions_size = 0 - - if len(additions) > 0: - addition_commits.append(additions) - if len(deletions) > 0: - deletion_commits.append(deletions) - - return addition_commits, deletion_commits - - -@dataclass -class MultiCommitStep: - """Dataclass containing a list of CommitOperation to commit at once. - - A [`MultiCommitStep`] is one atomic part of a [`MultiCommitStrategy`]. Each step is identified by its own - deterministic ID based on the list of commit operations (hexadecimal sha256). ID is persistent between re-runs if - the list of commits is kept the same. 
- """ - - operations: List[Union[CommitOperationAdd, CommitOperationDelete]] - - id: str = field(init=False) - completed: bool = False - - def __post_init__(self) -> None: - if len(self.operations) == 0: - raise ValueError("A MultiCommitStep must have at least 1 commit operation, got 0.") - - # Generate commit id - sha = sha256() - for op in self.operations: - if isinstance(op, CommitOperationAdd): - sha.update(b"ADD") - sha.update(op.path_in_repo.encode()) - sha.update(op.upload_info.sha256) - elif isinstance(op, CommitOperationDelete): - sha.update(b"DELETE") - sha.update(op.path_in_repo.encode()) - sha.update(str(op.is_folder).encode()) - else: - NotImplementedError() - self.id = sha.hexdigest() - - def __str__(self) -> str: - """Format a step for PR description. - - Formatting can be changed in the future as long as it is single line, starts with `- [ ]`/`- [x]` and contains - `self.id`. Must be able to match `STEP_ID_REGEX`. - """ - additions = [op for op in self.operations if isinstance(op, CommitOperationAdd)] - file_deletions = [op for op in self.operations if isinstance(op, CommitOperationDelete) and not op.is_folder] - folder_deletions = [op for op in self.operations if isinstance(op, CommitOperationDelete) and op.is_folder] - if len(additions) > 0: - return ( - f"- [{'x' if self.completed else ' '}] Upload {len(additions)} file(s) " - f"totalling {_format_size(sum(add.upload_info.size for add in additions))}" - f" ({self.id})" - ) - else: - return ( - f"- [{'x' if self.completed else ' '}] Delete {len(file_deletions)} file(s) and" - f" {len(folder_deletions)} folder(s) ({self.id})" - ) - - -@dataclass -class MultiCommitStrategy: - """Dataclass containing a list of [`MultiCommitStep`] to commit iteratively. - - A strategy is identified by its own deterministic ID based on the list of its steps (hexadecimal sha256). ID is - persistent between re-runs if the list of commits is kept the same. 
- """ - - addition_commits: List[MultiCommitStep] - deletion_commits: List[MultiCommitStep] - - id: str = field(init=False) - all_steps: Set[str] = field(init=False) - - def __post_init__(self) -> None: - self.all_steps = {step.id for step in self.addition_commits + self.deletion_commits} - if len(self.all_steps) < len(self.addition_commits) + len(self.deletion_commits): - raise ValueError("Got duplicate commits in MultiCommitStrategy. All commits must be unique.") - - if len(self.all_steps) == 0: - raise ValueError("A MultiCommitStrategy must have at least 1 commit, got 0.") - - # Generate strategy id - sha = sha256() - for step in self.addition_commits + self.deletion_commits: - sha.update("new step".encode()) - sha.update(step.id.encode()) - self.id = sha.hexdigest() - - -def multi_commit_create_pull_request( - api: "HfApi", - repo_id: str, - commit_message: str, - commit_description: Optional[str], - strategy: MultiCommitStrategy, - repo_type: Optional[str], - token: Union[str, bool, None] = None, -) -> DiscussionWithDetails: - return api.create_pull_request( - repo_id=repo_id, - title=f"[WIP] {commit_message} (multi-commit {strategy.id})", - description=multi_commit_generate_comment( - commit_message=commit_message, commit_description=commit_description, strategy=strategy - ), - token=token, - repo_type=repo_type, - ) - - -def multi_commit_generate_comment( - commit_message: str, - commit_description: Optional[str], - strategy: MultiCommitStrategy, -) -> str: - return MULTI_COMMIT_PR_DESCRIPTION_TEMPLATE.format( - commit_message=commit_message, - commit_description=commit_description or "", - multi_commit_id=strategy.id, - multi_commit_strategy="\n".join( - str(commit) for commit in strategy.deletion_commits + strategy.addition_commits - ), - ) - - -def multi_commit_parse_pr_description(description: str) -> Set[str]: - return {match[1] for match in STEP_ID_REGEX.findall(description)} diff --git a/src/huggingface_hub/_snapshot_download.py 
b/src/huggingface_hub/_snapshot_download.py index 90b9246e53..b928dd3466 100644 --- a/src/huggingface_hub/_snapshot_download.py +++ b/src/huggingface_hub/_snapshot_download.py @@ -256,9 +256,12 @@ def snapshot_download( # In that case store a ref. if revision != commit_hash: ref_path = os.path.join(storage_folder, "refs", revision) - os.makedirs(os.path.dirname(ref_path), exist_ok=True) - with open(ref_path, "w") as f: - f.write(commit_hash) + try: + os.makedirs(os.path.dirname(ref_path), exist_ok=True) + with open(ref_path, "w") as f: + f.write(commit_hash) + except OSError as e: + logger.warning(f"Ignored error while writing commit hash to {ref_path}: {e}.") # we pass the commit_hash to hf_hub_download # so no network call happens if we already diff --git a/src/huggingface_hub/commands/user.py b/src/huggingface_hub/commands/user.py index c16bbefdb2..9741a219f1 100644 --- a/src/huggingface_hub/commands/user.py +++ b/src/huggingface_hub/commands/user.py @@ -11,9 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Contains commands to authenticate to the Hugging Face Hub and interact with your repositories. + +Usage: + # login and save token locally. + huggingface-cli login --token=hf_*** --add-to-git-credential + + # switch between tokens + huggingface-cli auth switch + + # list all tokens + huggingface-cli auth list + + # logout from a specific token, if no token-name is provided, all tokens will be deleted from your machine. 
+ huggingface-cli logout --token-name=your_token_name + + # find out which huggingface.co account you are logged in as + huggingface-cli whoami + + # create a new dataset repo on the Hub + huggingface-cli repo create mydataset --type=dataset + +""" + import subprocess from argparse import _SubParsersAction -from typing import Optional +from typing import List, Optional from requests.exceptions import HTTPError @@ -126,6 +149,7 @@ def __init__(self, args): class LoginCommand(BaseUserCommand): def run(self): + logging.set_verbosity_info() login( token=self.args.token, add_to_git_credential=self.args.add_to_git_credential, @@ -134,11 +158,13 @@ def run(self): class LogoutCommand(BaseUserCommand): def run(self): + logging.set_verbosity_info() logout(token_name=self.args.token_name) class AuthSwitchCommand(BaseUserCommand): def run(self): + logging.set_verbosity_info() token_name = self.args.token_name if token_name is None: token_name = self._select_token_name() @@ -174,7 +200,7 @@ def _select_token_name(self) -> Optional[str]: except ValueError: print("Invalid input. 
Please enter a number or 'q' to quit.") - def _select_token_name_tui(self, token_names: list[str]) -> Optional[str]: + def _select_token_name_tui(self, token_names: List[str]) -> Optional[str]: choices = [Choice(token_name, name=token_name) for token_name in token_names] try: return inquirer.select( @@ -189,6 +215,7 @@ def _select_token_name_tui(self, token_names: list[str]) -> Optional[str]: class AuthListCommand(BaseUserCommand): def run(self): + logging.set_verbosity_info() auth_list() diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index 5252319de2..1dae6ddf97 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -120,13 +120,6 @@ class NotASafetensorsRepoError(Exception): """ -# TEMPLATING ERRORS - - -class TemplateError(Exception): - """Any error raised while trying to fetch or render a chat template.""" - - # TEXT GENERATION ERRORS diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index def44b3a34..47bd055871 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -390,9 +390,8 @@ def http_get( consistency_error_message = ( f"Consistency check failed: file should be of size {expected_size} but has size" - f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry" - " with `force_download=True`.\nIf the issue persists, please let us know by opening an issue " - "on https://github.com/huggingface/huggingface_hub." + f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file." + " Please retry with `force_download=True`." ) # Stream file to buffer @@ -821,7 +820,7 @@ def hf_hub_download( if repo_type not in constants.REPO_TYPES: raise ValueError(f"Invalid repo type: {repo_type}. 
Accepted repo types are: {str(constants.REPO_TYPES)}") - headers = build_hf_headers( + hf_headers = build_hf_headers( token=token, library_name=library_name, library_version=library_version, @@ -850,7 +849,7 @@ def hf_hub_download( # HTTP info endpoint=endpoint, etag_timeout=etag_timeout, - headers=headers, + headers=hf_headers, proxies=proxies, token=token, # Additional options @@ -870,7 +869,7 @@ def hf_hub_download( # HTTP info endpoint=endpoint, etag_timeout=etag_timeout, - headers=headers, + headers=hf_headers, proxies=proxies, token=token, # Additional options @@ -1283,20 +1282,20 @@ def get_hf_file_metadata( A [`HfFileMetadata`] object containing metadata such as location, etag, size and commit_hash. """ - headers = build_hf_headers( + hf_headers = build_hf_headers( token=token, library_name=library_name, library_version=library_version, user_agent=user_agent, headers=headers, ) - headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file + hf_headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file # Retrieve metadata r = _request_wrapper( method="HEAD", url=url, - headers=headers, + headers=hf_headers, allow_redirects=False, follow_relative_redirects=True, proxies=proxies, diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index f0dee92a3a..963c8fdf88 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -61,19 +61,6 @@ _warn_on_overwriting_operations, ) from ._inference_endpoints import InferenceEndpoint, InferenceEndpointType -from ._multi_commits import ( - MULTI_COMMIT_PR_CLOSE_COMMENT_FAILURE_BAD_REQUEST_TEMPLATE, - MULTI_COMMIT_PR_CLOSE_COMMENT_FAILURE_NO_CHANGES_TEMPLATE, - MULTI_COMMIT_PR_CLOSING_COMMENT_TEMPLATE, - MULTI_COMMIT_PR_COMPLETION_COMMENT_TEMPLATE, - MultiCommitException, - MultiCommitStep, - MultiCommitStrategy, - multi_commit_create_pull_request, - multi_commit_generate_comment, 
- multi_commit_parse_pr_description, - plan_multi_commits, -) from ._space_api import SpaceHardware, SpaceRuntime, SpaceStorage, SpaceVariable from ._upload_large_folder import upload_large_folder_internal from .community import ( @@ -125,7 +112,6 @@ SafetensorsRepoMetadata, TensorInfo, build_hf_headers, - experimental, filter_repo_objects, fix_hf_endpoint_in_url, get_session, @@ -336,7 +322,8 @@ def __post_init__(self): # hack to make BlobLfsInfo backward compatible @dataclass class BlobSecurityInfo(dict): - safe: bool + safe: bool # duplicate information with "status" field, keeping it for backward compatibility + status: str av_scan: Optional[Dict] pickle_import_scan: Optional[Dict] @@ -656,10 +643,14 @@ def __init__(self, **kwargs): oid=last_commit["id"], title=last_commit["title"], date=parse_datetime(last_commit["date"]) ) self.last_commit = last_commit - security = kwargs.pop("security", None) + security = kwargs.pop("securityFileStatus", None) if security is not None: + safe = security["status"] == "safe" security = BlobSecurityInfo( - safe=security["safe"], av_scan=security["avScan"], pickle_import_scan=security["pickleImportScan"] + safe=safe, + status=security["status"], + av_scan=security["avScan"], + pickle_import_scan=security["pickleImportScan"], ) self.security = security @@ -769,6 +760,8 @@ class ModelInfo: List of spaces using the model. safetensors (`SafeTensorsInfo`, *optional*): Model's safetensors information. + security_repo_status (`Dict`, *optional*): + Model's security scan status.
""" id: str @@ -797,6 +790,7 @@ class ModelInfo: siblings: Optional[List[RepoSibling]] spaces: Optional[List[str]] safetensors: Optional[SafeTensorsInfo] + security_repo_status: Optional[Dict] def __init__(self, **kwargs): self.id = kwargs.pop("id") @@ -862,7 +856,7 @@ def __init__(self, **kwargs): if safetensors else None ) - + self.security_repo_status = kwargs.pop("securityRepoStatus", None) # backwards compatibility self.lastModified = self.last_modified self.cardData = self.card_data @@ -1555,6 +1549,36 @@ def _inner(self, *args, **kwargs): class HfApi: + """ + Client to interact with the Hugging Face Hub via HTTP. + + The client is initialized with some high-level settings used in all requests + made to the Hub (HF endpoint, authentication, user agents...). Using the `HfApi` + client is preferred but not mandatory as all of its public methods are exposed + directly at the root of `huggingface_hub`. + + Args: + endpoint (`str`, *optional*): + Endpoint of the Hub. Defaults to https://huggingface.co. + token (Union[bool, str, None], optional): + A valid user access token (string). Defaults to the locally saved + token, which is the recommended method for authentication (see + https://huggingface.co/docs/huggingface_hub/quick-start#authentication). + To disable authentication, pass `False`. + library_name (`str`, *optional*): + The name of the library that is making the HTTP request. Will be added to + the user-agent header. Example: `"transformers"`. + library_version (`str`, *optional*): + The version of the library that is making the HTTP request. Will be added + to the user-agent header. Example: `"4.24.0"`. + user_agent (`str`, `dict`, *optional*): + The user agent info in the form of a dictionary or a single string. It will + be completed with information about the installed packages. + headers (`dict`, *optional*): + Additional headers to be sent with each request. Example: `{"X-My-Header": "value"}`. + Headers passed here are taking precedence over the default headers.
+ """ + def __init__( self, endpoint: Optional[str] = None, @@ -1564,32 +1588,6 @@ def __init__( user_agent: Union[Dict, str, None] = None, headers: Optional[Dict[str, str]] = None, ) -> None: - """Create a HF client to interact with the Hub via HTTP. - - The client is initialized with some high-level settings used in all requests - made to the Hub (HF endpoint, authentication, user agents...). Using the `HfApi` - client is preferred but not mandatory as all of its public methods are exposed - directly at the root of `huggingface_hub`. - - Args: - token (Union[bool, str, None], optional): - A valid user access token (string). Defaults to the locally saved - token, which is the recommended method for authentication (see - https://huggingface.co/docs/huggingface_hub/quick-start#authentication). - To disable authentication, pass `False`. - library_name (`str`, *optional*): - The name of the library that is making the HTTP request. Will be added to - the user-agent header. Example: `"transformers"`. - library_version (`str`, *optional*): - The version of the library that is making the HTTP request. Will be added - to the user-agent header. Example: `"4.24.0"`. - user_agent (`str`, `dict`, *optional*): - The user agent info in the form of a dictionary or a single string. It will - be completed with information about the installed packages. - headers (`dict`, *optional*): - Additional headers to be sent with each request. Example: `{"X-My-Header": "value"}`. - Headers passed here are taking precedence over the default headers. 
- """ self.endpoint = endpoint if endpoint is not None else constants.ENDPOINT self.token = token self.library_name = library_name @@ -1670,10 +1668,28 @@ def whoami(self, token: Union[bool, str, None] = None) -> Dict: ) from e return r.json() - def get_token_permission(self, token: Union[bool, str, None] = None) -> Literal["read", "write", None]: + @_deprecate_method( + version="1.0", + message=( + "Permissions are more complex than when `get_token_permission` was first introduced. " + "OAuth and fine-grain tokens allows for more detailed permissions. " + "If you need to know the permissions associated with a token, please use `whoami` and check the `'auth'` key." + ), + ) + def get_token_permission( + self, token: Union[bool, str, None] = None + ) -> Literal["read", "write", "fineGrained", None]: """ Check if a given `token` is valid and return its permissions. + + + This method is deprecated and will be removed in version 1.0. Permissions are more complex than when + `get_token_permission` was first introduced. OAuth and fine-grained tokens allow for more detailed permissions. + If you need to know the permissions associated with a token, please use `whoami` and check the `'auth'` key. + + + For more details about tokens, please refer to https://huggingface.co/docs/hub/security-tokens#what-are-user-access-tokens. Args: @@ -1684,12 +1700,12 @@ def get_token_permission(self, token: Union[bool, str, None] = None) -> Literal[ To disable authentication, pass `False`. Returns: - `Literal["read", "write", None]`: Permission granted by the token ("read" or "write"). Returns `None` if no - token passed or token is invalid. + `Literal["read", "write", "fineGrained", None]`: Permission granted by the token ("read", "write" or "fineGrained"). Returns `None` if no + token passed, if token is invalid or if role is not returned by the server. This typically happens when the token is an OAuth token.
""" try: return self.whoami(token=token)["auth"]["accessToken"]["role"] - except (LocalTokenNotFoundError, HTTPError): + except (LocalTokenNotFoundError, HTTPError, KeyError): return None def get_model_tags(self) -> Dict: @@ -1782,8 +1798,8 @@ def list_models( A tuple of two ints or floats representing a minimum and maximum carbon footprint to filter the resulting models with in grams. sort (`Literal["last_modified"]` or `str`, *optional*): - The key with which to sort the resulting models. Possible values - are the properties of the [`huggingface_hub.hf_api.ModelInfo`] class. + The key with which to sort the resulting models. Possible values are "last_modified", "trending_score", + "created_at", "downloads" and "likes". direction (`Literal[-1]` or `int`, *optional*): Direction in which to sort. The value `-1` sorts by descending order while all other values sort by ascending order. @@ -1895,7 +1911,15 @@ def list_models( if len(search_list) > 0: params["search"] = search_list if sort is not None: - params["sort"] = "lastModified" if sort == "last_modified" else sort + params["sort"] = ( + "lastModified" + if sort == "last_modified" + else "trendingScore" + if sort == "trending_score" + else "createdAt" + if sort == "created_at" + else sort + ) if direction is not None: params["direction"] = direction if limit is not None: @@ -1994,8 +2018,8 @@ def list_datasets( search (`str`, *optional*): A string that will be contained in the returned datasets. sort (`Literal["last_modified"]` or `str`, *optional*): - The key with which to sort the resulting datasets. Possible - values are the properties of the [`huggingface_hub.hf_api.DatasetInfo`] class. + The key with which to sort the resulting datasets. Possible values are "last_modified", "trending_score", + "created_at", "downloads" and "likes". direction (`Literal[-1]` or `int`, *optional*): Direction in which to sort. The value `-1` sorts by descending order while all other values sort by ascending order.
@@ -2105,7 +2129,15 @@ def list_datasets( if len(search_list) > 0: params["search"] = search_list if sort is not None: - params["sort"] = "lastModified" if sort == "last_modified" else sort + params["sort"] = ( + "lastModified" + if sort == "last_modified" + else "trendingScore" + if sort == "trending_score" + else "createdAt" + if sort == "created_at" + else sort + ) if direction is not None: params["direction"] = direction if limit is not None: @@ -2177,8 +2209,8 @@ def list_spaces( linked (`bool`, *optional*): Whether to return Spaces that make use of either a model or a dataset. sort (`Literal["last_modified"]` or `str`, *optional*): - The key with which to sort the resulting Spaces. Possible - values are the properties of the [`huggingface_hub.hf_api.SpaceInfo`]` class. + The key with which to sort the resulting Spaces. Possible values are "last_modified", "trending_score", + "created_at" and "likes". direction (`Literal[-1]` or `int`, *optional*): Direction in which to sort. The value `-1` sorts by descending order while all other values sort by ascending order. @@ -2214,7 +2246,15 @@ def list_spaces( if search is not None: params["search"] = search if sort is not None: - params["sort"] = "lastModified" if sort == "last_modified" else sort + params["sort"] = ( + "lastModified" + if sort == "last_modified" + else "trendingScore" + if sort == "trending_score" + else "createdAt" + if sort == "created_at" + else sort + ) if direction is not None: params["direction"] = direction if limit is not None: @@ -2484,7 +2524,7 @@ def model_info( Whether to set a timeout for the request to the Hub. securityStatus (`bool`, *optional*): Whether to retrieve the security status from the model - repository as well. + repository as well. The security status will be returned in the `security_repo_status` field. files_metadata (`bool`, *optional*): Whether or not to retrieve metadata for files in the repository (size, LFS metadata, etc). Defaults to `False`.
@@ -4107,312 +4147,6 @@ def _payload_as_ndjson() -> Iterable[bytes]: pr_url=commit_data["pullRequestUrl"] if create_pr else None, ) - @experimental - @validate_hf_hub_args - @_deprecate_method( - version="0.27", message="This is an experimental feature. Please use `upload_large_folder` instead." - ) - def create_commits_on_pr( - self, - *, - repo_id: str, - addition_commits: List[List[CommitOperationAdd]], - deletion_commits: List[List[CommitOperationDelete]], - commit_message: str, - commit_description: Optional[str] = None, - token: Union[str, bool, None] = None, - repo_type: Optional[str] = None, - merge_pr: bool = True, - num_threads: int = 5, # TODO: use to multithread uploads - verbose: bool = False, - ) -> str: - """Push changes to the Hub in multiple commits. - - Commits are pushed to a draft PR branch. If the upload fails or gets interrupted, it can be resumed. Progress - is tracked in the PR description. At the end of the process, the PR is set as open and the title is updated to - match the initial commit message. If `merge_pr=True` is passed, the PR is merged automatically. - - All deletion commits are pushed first, followed by the addition commits. The order of the commits is not - guaranteed as we might implement parallel commits in the future. Be sure that your are not updating several - times the same file. - - - - `create_commits_on_pr` is experimental. Its API and behavior is subject to change in the future without prior notice. - - - - - - `create_commits_on_pr` assumes that the repo already exists on the Hub. If you get a Client error 404, please - make sure you are authenticated and that `repo_id` and `repo_type` are set correctly. If repo does not exist, - create it first using [`~hf_api.create_repo`]. - - - - Args: - repo_id (`str`): - The repository in which the commits will be pushed. Example: `"username/my-cool-model"`. 
- - addition_commits (`List` of `List` of [`~hf_api.CommitOperationAdd`]): - A list containing lists of [`~hf_api.CommitOperationAdd`]. Each sublist will result in a commit on the - PR. - - deletion_commits - A list containing lists of [`~hf_api.CommitOperationDelete`]. Each sublist will result in a commit on - the PR. Deletion commits are pushed before addition commits. - - commit_message (`str`): - The summary (first line) of the commit that will be created. Will also be the title of the PR. - - commit_description (`str`, *optional*): - The description of the commit that will be created. The description will be added to the PR. - - token (Union[bool, str, None], optional): - A valid user access token (string). Defaults to the locally saved - token, which is the recommended method for authentication (see - https://huggingface.co/docs/huggingface_hub/quick-start#authentication). - To disable authentication, pass `False`. - - repo_type (`str`, *optional*): - Set to `"dataset"` or `"space"` if uploading to a dataset or space, `None` or `"model"` if uploading to - a model. Default is `None`. - - merge_pr (`bool`): - If set to `True`, the Pull Request is merged at the end of the process. Defaults to `True`. - - num_threads (`int`, *optional*): - Number of concurrent threads for uploading files. Defaults to 5. - - verbose (`bool`): - If set to `True`, process will run on verbose mode i.e. print information about the ongoing tasks. - Defaults to `False`. - - Returns: - `str`: URL to the created PR. - - Example: - ```python - >>> from huggingface_hub import HfApi, plan_multi_commits - >>> addition_commits, deletion_commits = plan_multi_commits( - ... operations=[ - ... CommitOperationAdd(...), - ... CommitOperationAdd(...), - ... CommitOperationDelete(...), - ... CommitOperationDelete(...), - ... CommitOperationAdd(...), - ... ], - ... ) - >>> HfApi().create_commits_on_pr( - ... repo_id="my-cool-model", - ... addition_commits=addition_commits, - ... 
deletion_commits=deletion_commits, - ... (...) - ... verbose=True, - ... ) - ``` - - Raises: - [`MultiCommitException`]: - If an unexpected issue occur in the process: empty commits, unexpected commits in a PR, unexpected PR - description, etc. - """ - logger = logging.get_logger(__name__ + ".create_commits_on_pr") - if verbose: - logger.setLevel("INFO") - - # 1. Get strategy ID - logger.info( - f"Will create {len(deletion_commits)} deletion commit(s) and {len(addition_commits)} addition commit(s)," - f" totalling {sum(len(ops) for ops in addition_commits+deletion_commits)} atomic operations." - ) - strategy = MultiCommitStrategy( - addition_commits=[MultiCommitStep(operations=operations) for operations in addition_commits], # type: ignore - deletion_commits=[MultiCommitStep(operations=operations) for operations in deletion_commits], # type: ignore - ) - logger.info(f"Multi-commits strategy with ID {strategy.id}.") - - # 2. Get or create a PR with this strategy ID - for discussion in self.get_repo_discussions(repo_id=repo_id, repo_type=repo_type, token=token): - # search for a draft PR with strategy ID - if discussion.is_pull_request and discussion.status == "draft" and strategy.id in discussion.title: - pr = self.get_discussion_details( - repo_id=repo_id, discussion_num=discussion.num, repo_type=repo_type, token=token - ) - logger.info(f"PR already exists: {pr.url}. Will resume process where it stopped.") - break - else: - # did not find a PR matching the strategy ID - pr = multi_commit_create_pull_request( - self, - repo_id=repo_id, - commit_message=commit_message, - commit_description=commit_description, - strategy=strategy, - token=token, - repo_type=repo_type, - ) - logger.info(f"New PR created: {pr.url}") - - # 3. Parse PR description to check consistency with strategy (e.g. 
same commits are scheduled) - for event in pr.events: - if isinstance(event, DiscussionComment): - pr_comment = event - break - else: - raise MultiCommitException(f"PR #{pr.num} must have at least 1 comment") - - description_commits = multi_commit_parse_pr_description(pr_comment.content) - if len(description_commits) != len(strategy.all_steps): - raise MultiCommitException( - f"Corrupted multi-commit PR #{pr.num}: got {len(description_commits)} steps in" - f" description but {len(strategy.all_steps)} in strategy." - ) - for step_id in strategy.all_steps: - if step_id not in description_commits: - raise MultiCommitException( - f"Corrupted multi-commit PR #{pr.num}: expected step {step_id} but didn't find" - f" it (have {', '.join(description_commits)})." - ) - - # 4. Retrieve commit history (and check consistency) - commits_on_main_branch = { - commit.commit_id - for commit in self.list_repo_commits( - repo_id=repo_id, repo_type=repo_type, token=token, revision=constants.DEFAULT_REVISION - ) - } - pr_commits = [ - commit - for commit in self.list_repo_commits( - repo_id=repo_id, repo_type=repo_type, token=token, revision=pr.git_reference - ) - if commit.commit_id not in commits_on_main_branch - ] - if len(pr_commits) > 0: - logger.info(f"Found {len(pr_commits)} existing commits on the PR.") - - # At this point `pr_commits` is a list of commits pushed to the PR. We expect all of these commits (if any) to have - # a step_id as title. We raise exception if an unexpected commit has been pushed. - if len(pr_commits) > len(strategy.all_steps): - raise MultiCommitException( - f"Corrupted multi-commit PR #{pr.num}: scheduled {len(strategy.all_steps)} steps but" - f" {len(pr_commits)} commits have already been pushed to the PR." 
- ) - - # Check which steps are already completed - remaining_additions = {step.id: step for step in strategy.addition_commits} - remaining_deletions = {step.id: step for step in strategy.deletion_commits} - for commit in pr_commits: - if commit.title in remaining_additions: - step = remaining_additions.pop(commit.title) - step.completed = True - elif commit.title in remaining_deletions: - step = remaining_deletions.pop(commit.title) - step.completed = True - - if len(remaining_deletions) > 0 and len(remaining_additions) < len(strategy.addition_commits): - raise MultiCommitException( - f"Corrupted multi-commit PR #{pr.num}: some addition commits have already been pushed to the PR but" - " deletion commits are not all completed yet." - ) - nb_remaining = len(remaining_deletions) + len(remaining_additions) - if len(pr_commits) > 0: - logger.info( - f"{nb_remaining} commits remaining ({len(remaining_deletions)} deletion commits and" - f" {len(remaining_additions)} addition commits)" - ) - - # 5. Push remaining commits to the PR + update description - # TODO: multi-thread this - for step in list(remaining_deletions.values()) + list(remaining_additions.values()): - # Push new commit - self.create_commit( - repo_id=repo_id, - repo_type=repo_type, - token=token, - commit_message=step.id, - revision=pr.git_reference, - num_threads=num_threads, - operations=step.operations, - create_pr=False, - ) - step.completed = True - nb_remaining -= 1 - logger.info(f" step {step.id} completed (still {nb_remaining} to go).") - - # Update PR description - self.edit_discussion_comment( - repo_id=repo_id, - repo_type=repo_type, - token=token, - discussion_num=pr.num, - comment_id=pr_comment.id, - new_content=multi_commit_generate_comment( - commit_message=commit_message, commit_description=commit_description, strategy=strategy - ), - ) - logger.info("All commits have been pushed.") - - # 6. 
Update PR (and merge) - self.rename_discussion( - repo_id=repo_id, - repo_type=repo_type, - token=token, - discussion_num=pr.num, - new_title=commit_message, - ) - self.change_discussion_status( - repo_id=repo_id, - repo_type=repo_type, - token=token, - discussion_num=pr.num, - new_status="open", - comment=MULTI_COMMIT_PR_COMPLETION_COMMENT_TEMPLATE, - ) - logger.info("PR is now open for reviews.") - - if merge_pr: # User don't want a PR => merge it - try: - self.merge_pull_request( - repo_id=repo_id, - repo_type=repo_type, - token=token, - discussion_num=pr.num, - comment=MULTI_COMMIT_PR_CLOSING_COMMENT_TEMPLATE, - ) - logger.info("PR has been automatically merged (`merge_pr=True` was passed).") - except BadRequestError as error: - if error.server_message is not None and "no associated changes" in error.server_message: - # PR cannot be merged as no changes are associated. We close the PR without merging with a comment to - # explain. - self.change_discussion_status( - repo_id=repo_id, - repo_type=repo_type, - token=token, - discussion_num=pr.num, - comment=MULTI_COMMIT_PR_CLOSE_COMMENT_FAILURE_NO_CHANGES_TEMPLATE, - new_status="closed", - ) - logger.warning("Couldn't merge the PR: no associated changes.") - else: - # PR cannot be merged for another reason (conflicting files for example). We comment the PR to explain - # and re-raise the exception. 
- self.comment_discussion( - repo_id=repo_id, - repo_type=repo_type, - token=token, - discussion_num=pr.num, - comment=MULTI_COMMIT_PR_CLOSE_COMMENT_FAILURE_BAD_REQUEST_TEMPLATE.format( - error_message=error.server_message - ), - ) - raise MultiCommitException( - f"Couldn't merge Pull Request in multi-commit: {error.server_message}" - ) from error - - return pr.url - def preupload_lfs_files( self, repo_id: str, @@ -4786,8 +4520,6 @@ def upload_folder( # type: ignore allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None, delete_patterns: Optional[Union[List[str], str]] = None, - multi_commits: Literal[False] = ..., - multi_commits_verbose: bool = False, run_as_future: Literal[False] = ..., ) -> CommitInfo: ... @@ -4808,57 +4540,9 @@ def upload_folder( # type: ignore allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None, delete_patterns: Optional[Union[List[str], str]] = None, - multi_commits: Literal[True] = ..., - multi_commits_verbose: bool = False, - run_as_future: Literal[False] = ..., - ) -> str: # Only the PR url in multi-commits mode - ... - - @overload - def upload_folder( # type: ignore - self, - *, - repo_id: str, - folder_path: Union[str, Path], - path_in_repo: Optional[str] = None, - commit_message: Optional[str] = None, - commit_description: Optional[str] = None, - token: Union[str, bool, None] = None, - repo_type: Optional[str] = None, - revision: Optional[str] = None, - create_pr: Optional[bool] = None, - parent_commit: Optional[str] = None, - allow_patterns: Optional[Union[List[str], str]] = None, - ignore_patterns: Optional[Union[List[str], str]] = None, - delete_patterns: Optional[Union[List[str], str]] = None, - multi_commits: Literal[False] = ..., - multi_commits_verbose: bool = False, run_as_future: Literal[True] = ..., ) -> Future[CommitInfo]: ... 
- @overload - def upload_folder( - self, - *, - repo_id: str, - folder_path: Union[str, Path], - path_in_repo: Optional[str] = None, - commit_message: Optional[str] = None, - commit_description: Optional[str] = None, - token: Union[str, bool, None] = None, - repo_type: Optional[str] = None, - revision: Optional[str] = None, - create_pr: Optional[bool] = None, - parent_commit: Optional[str] = None, - allow_patterns: Optional[Union[List[str], str]] = None, - ignore_patterns: Optional[Union[List[str], str]] = None, - delete_patterns: Optional[Union[List[str], str]] = None, - multi_commits: Literal[True] = ..., - multi_commits_verbose: bool = False, - run_as_future: Literal[True] = ..., - ) -> Future[str]: # Only the PR url in multi-commits mode - ... - @validate_hf_hub_args @future_compatible def upload_folder( @@ -4877,10 +4561,8 @@ def upload_folder( allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None, delete_patterns: Optional[Union[List[str], str]] = None, - multi_commits: bool = False, - multi_commits_verbose: bool = False, run_as_future: bool = False, - ) -> Union[CommitInfo, str, Future[CommitInfo], Future[str]]: + ) -> Union[CommitInfo, Future[CommitInfo]]: """ Upload a local folder to the given repo. The upload is done through a HTTP requests, and doesn't require git or git-lfs to be installed. @@ -4933,8 +4615,7 @@ def upload_folder( Whether or not to create a Pull Request with that commit. Defaults to `False`. If `revision` is not set, PR is opened against the `"main"` branch. If `revision` is set and is a branch, PR is opened against this branch. If `revision` is set and is not a branch name (example: a commit oid), an - `RevisionNotFoundError` is returned by the server. If both `multi_commits` and `create_pr` are True, - the PR created in the multi-commit process is kept opened. + `RevisionNotFoundError` is returned by the server. 
parent_commit (`str`, *optional*): The OID / SHA of the parent commit, as a hexadecimal string. Shorthands (7 first characters) are also supported. If specified and `create_pr` is `False`, the commit will fail if `revision` does not point to `parent_commit`. @@ -4949,12 +4630,6 @@ def upload_folder( If provided, remote files matching any of the patterns will be deleted from the repo while committing new files. This is useful if you don't know which files have already been uploaded. Note: to avoid discrepancies the `.gitattributes` file is not deleted even if it matches the pattern. - multi_commits (`bool`): - Deprecated. For large uploads, use `upload_large_folder` instead. - If True, changes are pushed to a PR using a multi-commit process. Defaults to `False`. - multi_commits_verbose (`bool`): - Deprecated. For large uploads, use `upload_large_folder` instead. - If True and `multi_commits` is used, more information will be displayed to the user. run_as_future (`bool`, *optional*): Whether or not to run this method in the background. Background jobs are run sequentially without blocking the main thread. Passing `run_as_future=True` will return a [Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) @@ -4965,9 +4640,6 @@ def upload_folder( Instance of [`CommitInfo`] containing information about the newly created commit (commit hash, commit url, pr url, commit message,...). If `run_as_future=True` is passed, returns a Future object which will contain the result when executed. - [`str`] or `Future`: - If `multi_commits=True`, returns the url of the PR created to push the changes. If `run_as_future=True` - is passed, returns a Future object which will contain the result when executed. @@ -4988,9 +4660,9 @@ def upload_folder( - + - `multi_commits` is experimental. Its API and behavior is subject to change in the future without prior notice. 
+ When dealing with a large folder (thousands of files or hundreds of GB), we recommend using [`~hf_api.upload_large_folder`] instead. @@ -5036,10 +4708,6 @@ def upload_folder( if repo_type not in constants.REPO_TYPES: raise ValueError(f"Invalid repo type, must be one of {constants.REPO_TYPES}") - if multi_commits: - if revision is not None and revision != constants.DEFAULT_REVISION: - raise ValueError("Cannot use `multi_commit` to commit changes other than the main branch.") - # By default, upload folder to the root directory in repo. if path_in_repo is None: path_in_repo = "" @@ -5077,22 +4745,6 @@ def upload_folder( commit_operations = delete_operations + add_operations commit_message = commit_message or "Upload folder using huggingface_hub" - if multi_commits: - addition_commits, deletion_commits = plan_multi_commits(operations=commit_operations) - pr_url = self.create_commits_on_pr( - repo_id=repo_id, - repo_type=repo_type, - addition_commits=addition_commits, - deletion_commits=deletion_commits, - commit_message=commit_message, - commit_description=commit_description, - token=token, - merge_pr=not create_pr, - verbose=multi_commits_verbose, - ) - # Defining a CommitInfo object is not really relevant in this case - # Let's return early with pr_url only (as string). - return pr_url commit_info = self.create_commit( repo_type=repo_type, @@ -8614,7 +8266,7 @@ def delete_collection_item( Slug of the collection to update. Example: `"TheBloke/recent-models-64f9a55bb3115b4f513ec026"`. item_object_id (`str`): ID of the item in the collection. This is not the id of the item on the Hub (repo_id or paper id). - It must be retrieved from a [`CollectionItem`] object. Example: `collection.items[0]._id`. + It must be retrieved from a [`CollectionItem`] object. Example: `collection.items[0].item_object_id`. missing_ok (`bool`, *optional*): If `True`, do not raise an error if item doesn't exists. 
token (Union[bool, str, None], optional): @@ -9461,7 +9113,6 @@ def delete_webhook(self, webhook_id: str, *, token: Union[bool, str, None] = Non def _build_hf_headers( self, token: Union[bool, str, None] = None, - is_write_action: bool = False, library_name: Optional[str] = None, library_version: Optional[str] = None, user_agent: Union[Dict, str, None] = None, @@ -9475,7 +9126,6 @@ def _build_hf_headers( token = self.token return build_hf_headers( token=token, - is_write_action=is_write_action, library_name=library_name or self.library_name, library_version=library_version or self.library_version, user_agent=user_agent or self.user_agent, @@ -9562,12 +9212,13 @@ def _prepare_upload_folder_additions( # It's better to fail early than to fail after all the files have been hashed. if "README.md" in filtered_repo_objects: self._validate_yaml( - content=relpath_to_abspath["README.md"].read_text(), + content=relpath_to_abspath["README.md"].read_text(encoding="utf8"), repo_type=repo_type, token=token, ) if len(filtered_repo_objects) > 30: - logger.info( + log = logger.warning if len(filtered_repo_objects) > 200 else logger.info + log( "It seems you are trying to upload a large folder at once. This might take some time and then fail if " "the folder is too large. For such cases, it is recommended to upload in smaller batches or to use " "`HfApi().upload_large_folder(...)`/`huggingface-cli upload-large-folder` instead. 
For more details, " @@ -9927,7 +9578,6 @@ def _parse_revision_from_pr_url(pr_url: str) -> str: delete_file = api.delete_file delete_folder = api.delete_folder delete_files = api.delete_files -create_commits_on_pr = api.create_commits_on_pr upload_large_folder = api.upload_large_folder preupload_lfs_files = api.preupload_lfs_files create_branch = api.create_branch diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index a9fb009570..2e70a66a90 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -1,4 +1,3 @@ -import inspect import os import re import tempfile @@ -7,7 +6,7 @@ from datetime import datetime from itertools import chain from pathlib import Path -from typing import Any, Dict, List, NoReturn, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, NoReturn, Optional, Tuple, Union from urllib.parse import quote, unquote import fsspec @@ -20,11 +19,7 @@ from .errors import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from .file_download import hf_hub_url, http_get from .hf_api import HfApi, LastCommitInfo, RepoFile -from .utils import ( - HFValidationError, - hf_raise_for_status, - http_backoff, -) +from .utils import HFValidationError, hf_raise_for_status, http_backoff # Regex used to match special revisions with "/" in them (see #1710) @@ -64,13 +59,22 @@ class HfFileSystem(fsspec.AbstractFileSystem): """ Access a remote Hugging Face Hub repository as if were a local file system. + + + [`HfFileSystem`] provides fsspec compatibility, which is useful for libraries that require it (e.g., reading + Hugging Face datasets directly with `pandas`). However, it introduces additional overhead due to this compatibility + layer. For better performance and reliability, it's recommended to use `HfApi` methods when possible. + + + Args: token (`str` or `bool`, *optional*): A valid user access token (string). 
Defaults to the locally saved token, which is the recommended method for authentication (see https://huggingface.co/docs/huggingface_hub/quick-start#authentication). To disable authentication, pass `False`. - + endpoint (`str`, *optional*): + Endpoint of the Hub. Defaults to https://huggingface.co. Usage: ```python @@ -133,6 +137,25 @@ def _repo_and_revision_exist( return self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] def resolve_path(self, path: str, revision: Optional[str] = None) -> HfFileSystemResolvedPath: + """ + Resolve a Hugging Face file system path into its components. + + Args: + path (`str`): + Path to resolve. + revision (`str`, *optional*): + The revision of the repo to resolve. Defaults to the revision specified in the path. + + Returns: + [`HfFileSystemResolvedPath`]: Resolved path information containing `repo_type`, `repo_id`, `revision` and `path_in_repo`. + + Raises: + `ValueError`: + If path contains conflicting revision information. + `NotImplementedError`: + If trying to list repositories. + """ + def _align_revision_in_path_with_revision( revision_in_path: Optional[str], revision: Optional[str] ) -> Optional[str]: @@ -209,15 +232,33 @@ def _align_revision_in_path_with_revision( return HfFileSystemResolvedPath(repo_type, repo_id, revision, path_in_repo, _raw_revision=revision_in_path) def invalidate_cache(self, path: Optional[str] = None) -> None: + """ + Clear the cache for a given path. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.invalidate_cache). + + Args: + path (`str`, *optional*): + Path to clear from cache. If not provided, clear the entire cache.
+ + """ if not path: self.dircache.clear() self._repo_and_revision_exists_cache.clear() else: - path = self.resolve_path(path).unresolve() + resolved_path = self.resolve_path(path) + path = resolved_path.unresolve() while path: self.dircache.pop(path, None) path = self._parent(path) + # Only clear repo cache if path is to repo root + if not resolved_path.path_in_repo: + self._repo_and_revision_exists_cache.pop((resolved_path.repo_type, resolved_path.repo_id, None), None) + self._repo_and_revision_exists_cache.pop( + (resolved_path.repo_type, resolved_path.repo_id, resolved_path.revision), None + ) + def _open( self, path: str, @@ -254,6 +295,28 @@ def rm( revision: Optional[str] = None, **kwargs, ) -> None: + """ + Delete files from a repository. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.rm). + + + + Note: When possible, use `HfApi.delete_file()` for better performance. + + + + Args: + path (`str`): + Path to delete. + recursive (`bool`, *optional*): + If True, delete directory and all its contents. Defaults to False. + maxdepth (`int`, *optional*): + Maximum number of subdirectories to visit when deleting recursively. + revision (`str`, *optional*): + The git revision to delete from. + + """ resolved_path = self.resolve_path(path, revision=revision) paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth, revision=revision) paths_in_repo = [self.resolve_path(path).path_in_repo for path in paths if not self.isdir(path)] @@ -276,7 +339,32 @@ def rm( def ls( self, path: str, detail: bool = True, refresh: bool = False, revision: Optional[str] = None, **kwargs ) -> List[Union[str, Dict[str, Any]]]: - """List the contents of a directory.""" + """ + List the contents of a directory. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.ls). 
+ + + + Note: When possible, use `HfApi.list_repo_tree()` for better performance. + + + + Args: + path (`str`): + Path to the directory. + detail (`bool`, *optional*): + If True, returns a list of dictionaries containing file information. If False, + returns a list of file paths. Defaults to True. + refresh (`bool`, *optional*): + If True, bypass the cache and fetch the latest data. Defaults to False. + revision (`str`, *optional*): + The git revision to list from. + + Returns: + `List[Union[str, Dict[str, Any]]]`: List of file paths (if detail=False) or list of file information + dictionaries (if detail=True). + """ resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() kwargs = {"expand_info": detail, **kwargs} @@ -396,13 +484,37 @@ def _ls_tree( out.append(cache_path_info) return out - def walk(self, path, *args, **kwargs): + def walk(self, path: str, *args, **kwargs) -> Iterator[Tuple[str, List[str], List[str]]]: + """ + Return all files below the given path. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.walk). + + Args: + path (`str`): + Root path to list files from. + + Returns: + `Iterator[Tuple[str, List[str], List[str]]]`: An iterator of (path, list of directory names, list of file names) tuples. + """ # Set expand_info=False by default to get a x10 speed boost kwargs = {"expand_info": kwargs.get("detail", False), **kwargs} path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() yield from super().walk(path, *args, **kwargs) - def glob(self, path, **kwargs): + def glob(self, path: str, **kwargs) -> List[str]: + """ + Find files by glob-matching. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.glob). + + Args: + path (`str`): + Path pattern to match. + + Returns: + `List[str]`: List of paths matching the pattern. 
+ """ # Set expand_info=False by default to get a x10 speed boost kwargs = {"expand_info": kwargs.get("detail", False), **kwargs} path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() @@ -418,6 +530,28 @@ def find( revision: Optional[str] = None, **kwargs, ) -> Union[List[str], Dict[str, Dict[str, Any]]]: + """ + List all files below path. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.find). + + Args: + path (`str`): + Root path to list files from. + maxdepth (`int`, *optional*): + Maximum depth to descend into subdirectories. + withdirs (`bool`, *optional*): + Include directory paths in the output. Defaults to False. + detail (`bool`, *optional*): + If True, returns a dict mapping paths to file information. Defaults to False. + refresh (`bool`, *optional*): + If True, bypass the cache and fetch the latest data. Defaults to False. + revision (`str`, *optional*): + The git revision to list from. + + Returns: + `Union[List[str], Dict[str, Dict[str, Any]]]`: List of paths or dict of file information. + """ if maxdepth: return super().find( path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, refresh=refresh, revision=revision, **kwargs @@ -448,6 +582,24 @@ def find( return {name: out[name] for name in names} def cp_file(self, path1: str, path2: str, revision: Optional[str] = None, **kwargs) -> None: + """ + Copy a file within or between repositories. + + + + Note: When possible, use `HfApi.upload_file()` for better performance. + + + + Args: + path1 (`str`): + Source path to copy from. + path2 (`str`): + Destination path to copy to. + revision (`str`, *optional*): + The git revision to copy from. 
+ + """ resolved_path1 = self.resolve_path(path1, revision=revision) resolved_path2 = self.resolve_path(path2, revision=revision) @@ -489,10 +641,45 @@ def cp_file(self, path1: str, path2: str, revision: Optional[str] = None, **kwar self.invalidate_cache(path=resolved_path2.unresolve()) def modified(self, path: str, **kwargs) -> datetime: + """ + Get the last modified time of a file. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.modified). + + Args: + path (`str`): + Path to the file. + + Returns: + `datetime`: Last commit date of the file. + """ info = self.info(path, **kwargs) return info["last_commit"]["date"] def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> Dict[str, Any]: + """ + Get information about a file or directory. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.info). + + + + Note: When possible, use `HfApi.get_paths_info()` or `HfApi.repo_info()` for better performance. + + + + Args: + path (`str`): + Path to get info for. + refresh (`bool`, *optional*): + If True, bypass the cache and fetch the latest data. Defaults to False. + revision (`str`, *optional*): + The git revision to get info from. + + Returns: + `Dict[str, Any]`: Dictionary containing file information (type, size, commit info, etc.). + + """ resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() expand_info = kwargs.get( @@ -570,30 +757,80 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, return out def exists(self, path, **kwargs): - """Is there a file at the given path""" + """ + Check if a file exists. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists). 
+ + + + Note: When possible, use `HfApi.file_exists()` for better performance. + + + + Args: + path (`str`): + Path to check. + + Returns: + `bool`: True if file exists, False otherwise. + """ try: + if kwargs.get("refresh", False): + self.invalidate_cache(path) + self.info(path, **{**kwargs, "expand_info": False}) return True except: # noqa: E722 - # any exception allowed bar FileNotFoundError? return False def isdir(self, path): - """Is this entry directory-like?""" + """ + Check if a path is a directory. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.isdir). + + Args: + path (`str`): + Path to check. + + Returns: + `bool`: True if path is a directory, False otherwise. + """ try: return self.info(path, expand_info=False)["type"] == "directory" except OSError: return False def isfile(self, path): - """Is this entry file-like?""" + """ + Check if a path is a file. + + For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.isfile). + + Args: + path (`str`): + Path to check. + + Returns: + `bool`: True if path is a file, False otherwise. + """ try: return self.info(path, expand_info=False)["type"] == "file" except: # noqa: E722 return False def url(self, path: str) -> str: - """Get the HTTP URL of the given path""" + """ + Get the HTTP URL of the given path. + + Args: + path (`str`): + Path to get URL for. + + Returns: + `str`: HTTP URL to access the file or directory on the Hub. + """ resolved_path = self.resolve_path(path) url = hf_hub_url( resolved_path.repo_id, @@ -607,7 +844,26 @@ def url(self, path: str) -> str: return url def get_file(self, rpath, lpath, callback=_DEFAULT_CALLBACK, outfile=None, **kwargs) -> None: - """Copy single remote file to local.""" + """ + Copy single remote file to local. + + + + Note: When possible, use `HfApi.hf_hub_download()` for better performance. 
+ + + + Args: + rpath (`str`): + Remote path to download from. + lpath (`str`): + Local path to download to. + callback (`Callback`, *optional*): + Optional callback to track download progress. Defaults to no callback. + outfile (`IO`, *optional*): + Optional file-like object to write to. If provided, `lpath` is ignored. + + """ revision = kwargs.get("revision") unhandled_kwargs = set(kwargs.keys()) - {"revision"} if not isinstance(callback, (NoOpCallback, TqdmCallback)) or len(unhandled_kwargs) > 0: @@ -882,20 +1138,3 @@ def _raise_file_not_found(path: str, err: Optional[Exception]) -> NoReturn: def reopen(fs: HfFileSystem, path: str, mode: str, block_size: int, cache_type: str): return fs.open(path, mode=mode, block_size=block_size, cache_type=cache_type) - - -# Add docstrings to the methods of HfFileSystem from fsspec.AbstractFileSystem -for name, function in inspect.getmembers(HfFileSystem, predicate=inspect.isfunction): - parent = getattr(fsspec.AbstractFileSystem, name, None) - if parent is not None and parent.__doc__ is not None: - parent_doc = parent.__doc__ - parent_doc = parent_doc.replace("Parameters\n ----------\n", "Args:\n") - parent_doc = parent_doc.replace("Returns\n -------\n", "Return:\n") - function.__doc__ = ( - ( - "\n_Docstring taken from " - f"[fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.{name})._" - ) - + "\n\n" - + parent_doc - ) diff --git a/src/huggingface_hub/hub_mixin.py b/src/huggingface_hub/hub_mixin.py index 1e77fffb6b..b23ef2ae41 100644 --- a/src/huggingface_hub/hub_mixin.py +++ b/src/huggingface_hub/hub_mixin.py @@ -1,21 +1,11 @@ import inspect import json import os -import warnings from dataclasses import asdict, dataclass, is_dataclass from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Optional, - Tuple, - Type, - TypeVar, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, 
Type, TypeVar, Union + +import packaging.version from . import constants from .errors import EntryNotFoundError, HfHubHTTPError @@ -41,7 +31,6 @@ import torch # type: ignore if is_safetensors_available(): - import packaging.version import safetensors from safetensors.torch import load_model as load_model_as_safetensor from safetensors.torch import save_model as save_model_as_safetensor @@ -220,8 +209,6 @@ def __init_subclass__( # Value is a tuple (encoder, decoder). # Example: {MyCustomType: (lambda x: x.value, lambda data: MyCustomType(data))} ] = None, - # Deprecated arguments - languages: Optional[List[str]] = None, ) -> None: """Inspect __init__ signature only once when subclassing + handle modelcard.""" super().__init_subclass__() @@ -247,13 +234,6 @@ def __init_subclass__( info.repo_url = cls._hub_mixin_info.repo_url cls._hub_mixin_info = info - if languages is not None: - warnings.warn( - "The `languages` argument is deprecated. Use `language` instead. This will be removed in `huggingface_hub>=0.27.0`.", - DeprecationWarning, - ) - language = languages - # Update MixinInfo with metadata if model_card_template is not None and model_card_template != DEFAULT_MODEL_CARD: info.model_card_template = model_card_template diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py index 38b37b71e3..d3b38ce750 100644 --- a/src/huggingface_hub/inference/_client.py +++ b/src/huggingface_hub/inference/_client.py @@ -57,6 +57,7 @@ _get_unsupported_text_generation_kwargs, _import_numpy, _open_as_binary, + _prepare_payload, _set_unsupported_text_generation_kwargs, _stream_chat_completion_response, _stream_text_generation_response, @@ -177,7 +178,9 @@ def __init__( self.model: Optional[str] = model self.token: Union[str, bool, None] = token if token is not None else api_key - self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' + self.headers: CaseInsensitiveDict[str] = 
CaseInsensitiveDict( + build_hf_headers(token=self.token) # 'authorization' + 'user-agent' + ) if headers is not None: self.headers.update(headers) self.cookies = cookies @@ -364,18 +367,8 @@ def audio_classification( ``` """ parameters = {"function_to_apply": function_to_apply, "top_k": top_k} - if all(parameter is None for parameter in parameters.values()): - # if no parameters are provided, send audio as raw data - data = audio - payload: Optional[Dict[str, Any]] = None - else: - # Or some parameters are provided -> send audio as base64 encoded string - data = None - payload = {"inputs": _b64_encode(audio)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, data=data, model=model, task="audio-classification") + payload = _prepare_payload(audio, parameters=parameters, expect_binary=True) + response = self.post(**payload, model=model, task="audio-classification") return AudioClassificationOutputElement.parse_obj_as_list(response) def audio_to_audio( @@ -591,7 +584,7 @@ def chat_completion( Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message. max_tokens (`int`, *optional*): - Maximum number of tokens allowed in the response. Defaults to 20. + Maximum number of tokens allowed in the response. Defaults to 100. n (`int`, *optional*): UNUSED. 
presence_penalty (`float`, *optional*): @@ -988,7 +981,7 @@ def document_question_answering( [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)] ``` """ - payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)} + inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)} parameters = { "doc_stride": doc_stride, "handle_impossible_answer": handle_impossible_answer, @@ -999,10 +992,8 @@ def document_question_answering( "top_k": top_k, "word_boxes": word_boxes, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, model=model, task="document-question-answering") + payload = _prepare_payload(inputs, parameters=parameters) + response = self.post(**payload, model=model, task="document-question-answering") return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response) def feature_extraction( @@ -1060,17 +1051,14 @@ def feature_extraction( [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32) ``` """ - payload: Dict = {"inputs": text} parameters = { "normalize": normalize, "prompt_name": prompt_name, "truncate": truncate, "truncation_direction": truncation_direction, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, model=model, task="feature-extraction") + payload = _prepare_payload(text, parameters=parameters) + response = self.post(**payload, model=model, task="feature-extraction") np = _import_numpy() return np.array(_bytes_to_dict(response), dtype="float32") @@ -1119,12 +1107,9 @@ def fill_mask( ] ``` """ - payload: Dict = {"inputs": text} parameters = {"targets": targets, "top_k": top_k} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - 
response = self.post(json=payload, model=model, task="fill-mask") + payload = _prepare_payload(text, parameters=parameters) + response = self.post(**payload, model=model, task="fill-mask") return FillMaskOutputElement.parse_obj_as_list(response) def image_classification( @@ -1166,19 +1151,8 @@ def image_classification( ``` """ parameters = {"function_to_apply": function_to_apply, "top_k": top_k} - - if all(parameter is None for parameter in parameters.values()): - data = image - payload: Optional[Dict[str, Any]] = None - - else: - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - - response = self.post(json=payload, data=data, model=model, task="image-classification") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = self.post(**payload, model=model, task="image-classification") return ImageClassificationOutputElement.parse_obj_as_list(response) def image_segmentation( @@ -1237,18 +1211,8 @@ def image_segmentation( "subtask": subtask, "threshold": threshold, } - if all(parameter is None for parameter in parameters.values()): - # if no parameters are provided, the image can be raw bytes, an image file, or URL to an online image - data = image - payload: Optional[Dict[str, Any]] = None - else: - # if parameters are provided, the image needs to be a base64-encoded string - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, data=data, model=model, task="image-segmentation") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = self.post(**payload, model=model, task="image-segmentation") output = ImageSegmentationOutputElement.parse_obj_as_list(response) for item in output: item.mask = _b64_to_image(item.mask) # 
type: ignore [assignment] @@ -1323,19 +1287,8 @@ def image_to_image( "guidance_scale": guidance_scale, **kwargs, } - if all(parameter is None for parameter in parameters.values()): - # Either only an image to send => send as raw bytes - data = image - payload: Optional[Dict[str, Any]] = None - else: - # if parameters are provided, the image needs to be a base64-encoded string - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - - response = self.post(json=payload, data=data, model=model, task="image-to-image") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = self.post(**payload, model=model, task="image-to-image") return _bytes_to_image(response) def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput: @@ -1493,25 +1446,15 @@ def object_detection( ```py >>> from huggingface_hub import InferenceClient >>> client = InferenceClient() - >>> client.object_detection("people.jpg"): + >>> client.object_detection("people.jpg") [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...] 
``` """ parameters = { "threshold": threshold, } - if all(parameter is None for parameter in parameters.values()): - # if no parameters are provided, the image can be raw bytes, an image file, or URL to an online image - data = image - payload: Optional[Dict[str, Any]] = None - else: - # if parameters are provided, the image needs to be a base64-encoded string - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, data=data, model=model, task="object-detection") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = self.post(**payload, model=model, task="object-detection") return ObjectDetectionOutputElement.parse_obj_as_list(response) def question_answering( @@ -1587,12 +1530,10 @@ def question_answering( "max_seq_len": max_seq_len, "top_k": top_k, } - payload: Dict[str, Any] = {"question": question, "context": context} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value + inputs: Dict[str, Any] = {"question": question, "context": context} + payload = _prepare_payload(inputs, parameters=parameters) response = self.post( - json=payload, + **payload, model=model, task="question-answering", ) @@ -1700,19 +1641,14 @@ def summarization( SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....") ``` """ - payload: Dict[str, Any] = {"inputs": text} - if parameters is not None: - payload["parameters"] = parameters - else: + if parameters is None: parameters = { "clean_up_tokenization_spaces": clean_up_tokenization_spaces, "generate_parameters": generate_parameters, "truncation": truncation, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, model=model, 
task="summarization") + payload = _prepare_payload(text, parameters=parameters) + response = self.post(**payload, model=model, task="summarization") return SummarizationOutput.parse_obj_as_list(response)[0] def table_question_answering( @@ -1757,15 +1693,13 @@ def table_question_answering( TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE') ``` """ - payload: Dict[str, Any] = { + inputs = { "query": query, "table": table, } - - if parameters is not None: - payload["parameters"] = parameters + payload = _prepare_payload(inputs, parameters=parameters) response = self.post( - json=payload, + **payload, model=model, task="table-question-answering", ) @@ -1813,7 +1747,11 @@ def tabular_classification(self, table: Dict[str, Any], *, model: Optional[str] ["5", "5", "5"] ``` """ - response = self.post(json={"table": table}, model=model, task="tabular-classification") + response = self.post( + json={"table": table}, + model=model, + task="tabular-classification", + ) return _bytes_to_list(response) def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]: @@ -1899,15 +1837,16 @@ def text_classification( ] ``` """ - payload: Dict[str, Any] = {"inputs": text} parameters = { "function_to_apply": function_to_apply, "top_k": top_k, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, model=model, task="text-classification") + payload = _prepare_payload(text, parameters=parameters) + response = self.post( + **payload, + model=model, + task="text-classification", + ) return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value] @overload @@ -2136,7 +2075,7 @@ def text_generation( grammar ([`TextGenerationInputGrammarType`], *optional*): Grammar constraints. Can be either a JSONSchema or a regex. 
max_new_tokens (`int`, *optional*): - Maximum number of generated tokens + Maximum number of generated tokens. Defaults to 100. repetition_penalty (`float`, *optional*): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. @@ -2481,7 +2420,7 @@ def text_to_image( >>> image.save("better_astronaut.png") ``` """ - payload = {"inputs": prompt} + parameters = { "negative_prompt": negative_prompt, "height": height, @@ -2493,10 +2432,8 @@ def text_to_image( "seed": seed, **kwargs, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value # type: ignore - response = self.post(json=payload, model=model, task="text-to-image") + payload = _prepare_payload(prompt, parameters=parameters) + response = self.post(**payload, model=model, task="text-to-image") return _bytes_to_image(response) def text_to_speech( @@ -2599,7 +2536,6 @@ def text_to_speech( >>> Path("hello_world.flac").write_bytes(audio) ``` """ - payload: Dict[str, Any] = {"inputs": text} parameters = { "do_sample": do_sample, "early_stopping": early_stopping, @@ -2618,10 +2554,8 @@ def text_to_speech( "typical_p": typical_p, "use_cache": use_cache, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, model=model, task="text-to-speech") + payload = _prepare_payload(text, parameters=parameters) + response = self.post(**payload, model=model, task="text-to-speech") return response def token_classification( @@ -2683,17 +2617,15 @@ def token_classification( ] ``` """ - payload: Dict[str, Any] = {"inputs": text} + parameters = { "aggregation_strategy": aggregation_strategy, "ignore_labels": ignore_labels, "stride": stride, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value + payload = _prepare_payload(text, 
parameters=parameters) response = self.post( - json=payload, + **payload, model=model, task="token-classification", ) @@ -2769,7 +2701,6 @@ def translation( if src_lang is None and tgt_lang is not None: raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.") - payload: Dict[str, Any] = {"inputs": text} parameters = { "src_lang": src_lang, "tgt_lang": tgt_lang, @@ -2777,10 +2708,8 @@ def translation( "truncation": truncation, "generate_parameters": generate_parameters, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = self.post(json=payload, model=model, task="translation") + payload = _prepare_payload(text, parameters=parameters) + response = self.post(**payload, model=model, task="translation") return TranslationOutput.parse_obj_as_list(response)[0] def visual_question_answering( @@ -2921,15 +2850,14 @@ def zero_shot_classification( ``` """ - parameters = {"candidate_labels": labels, "multi_label": multi_label} - if hypothesis_template is not None: - parameters["hypothesis_template"] = hypothesis_template - + parameters = { + "candidate_labels": labels, + "multi_label": multi_label, + "hypothesis_template": hypothesis_template, + } + payload = _prepare_payload(text, parameters=parameters) response = self.post( - json={ - "inputs": text, - "parameters": parameters, - }, + **payload, task="zero-shot-classification", model=model, ) @@ -2986,13 +2914,11 @@ def zero_shot_image_classification( if len(labels) < 2: raise ValueError("You must specify at least 2 classes to compare.") - payload = { - "inputs": {"image": _b64_encode(image), "candidateLabels": ",".join(labels)}, - } - if hypothesis_template is not None: - payload.setdefault("parameters", {})["hypothesis_template"] = hypothesis_template + inputs = {"image": _b64_encode(image), "candidateLabels": ",".join(labels)} + parameters = {"hypothesis_template": hypothesis_template} + payload = 
_prepare_payload(inputs, parameters=parameters) response = self.post( - json=payload, + **payload, model=model, task="zero-shot-image-classification", ) diff --git a/src/huggingface_hub/inference/_common.py b/src/huggingface_hub/inference/_common.py index a92d8fad4a..a5d80f282a 100644 --- a/src/huggingface_hub/inference/_common.py +++ b/src/huggingface_hub/inference/_common.py @@ -58,10 +58,7 @@ is_numpy_available, is_pillow_available, ) -from ._generated.types import ( - ChatCompletionStreamOutput, - TextGenerationStreamOutput, -) +from ._generated.types import ChatCompletionStreamOutput, TextGenerationStreamOutput if TYPE_CHECKING: @@ -84,7 +81,7 @@ @dataclass class ModelStatus: """ - This Dataclass represents the the model status in the Hugging Face Inference API. + This Dataclass represents the model status in the Hugging Face Inference API. Args: loaded (`bool`): @@ -216,7 +213,7 @@ def _open_as_binary(content: Optional[ContentT]) -> Generator[Optional[BinaryT], def _b64_encode(content: ContentT) -> str: - """Encode a raw file (image, audio) into base64. Can be byes, an opened file, a path or a URL.""" + """Encode a raw file (image, audio) into base64. Can be bytes, an opened file, a path or a URL.""" with _open_as_binary(content) as data: data_as_bytes = data if isinstance(data, bytes) else data.read() return base64.b64encode(data_as_bytes).decode() @@ -259,6 +256,47 @@ def _bytes_to_image(content: bytes) -> "Image": return Image.open(io.BytesIO(content)) +## PAYLOAD UTILS + + +def _prepare_payload( + inputs: Union[str, Dict[str, Any], ContentT], + parameters: Optional[Dict[str, Any]], + expect_binary: bool = False, +) -> Dict[str, Any]: + """ + Used in `InferenceClient` and `AsyncInferenceClient` to prepare the payload for an API request, handling various input types and parameters. + `expect_binary` is set to `True` when the inputs are a binary object or a local path or URL. This is the case for image and audio inputs. 
+ """ + if parameters is None: + parameters = {} + parameters = {k: v for k, v in parameters.items() if v is not None} + has_parameters = len(parameters) > 0 + + is_binary = isinstance(inputs, (bytes, Path)) + # If expect_binary is True, inputs must be a binary object or a local path or a URL. + if expect_binary and not is_binary and not isinstance(inputs, str): + raise ValueError(f"Expected binary inputs or a local path or a URL. Got {inputs}") # type: ignore + # Send inputs as raw content when no parameters are provided + if expect_binary and not has_parameters: + return {"data": inputs} + # If expect_binary is False, inputs must not be a binary object. + if not expect_binary and is_binary: + raise ValueError(f"Unexpected binary inputs. Got {inputs}") # type: ignore + + json: Dict[str, Any] = {} + # If inputs is a bytes-like object, encode it to base64 + if expect_binary: + json["inputs"] = _b64_encode(inputs) # type: ignore + # Otherwise (string, dict, list) send it as is + else: + json["inputs"] = inputs + # Add parameters to the json payload if any + if has_parameters: + json["parameters"] = parameters + return {"json": json} + + ## STREAMING UTILS diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index 8a1384a671..0dd671c9be 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -45,6 +45,7 @@ _get_unsupported_text_generation_kwargs, _import_numpy, _open_as_binary, + _prepare_payload, _set_unsupported_text_generation_kwargs, raise_text_generation_error, ) @@ -169,7 +170,9 @@ def __init__( self.model: Optional[str] = model self.token: Union[str, bool, None] = token if token is not None else api_key - self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' + self.headers: CaseInsensitiveDict[str] = CaseInsensitiveDict( + 
build_hf_headers(token=self.token) # 'authorization' + 'user-agent' + ) if headers is not None: self.headers.update(headers) self.cookies = cookies @@ -398,18 +401,8 @@ async def audio_classification( ``` """ parameters = {"function_to_apply": function_to_apply, "top_k": top_k} - if all(parameter is None for parameter in parameters.values()): - # if no parameters are provided, send audio as raw data - data = audio - payload: Optional[Dict[str, Any]] = None - else: - # Or some parameters are provided -> send audio as base64 encoded string - data = None - payload = {"inputs": _b64_encode(audio)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, data=data, model=model, task="audio-classification") + payload = _prepare_payload(audio, parameters=parameters, expect_binary=True) + response = await self.post(**payload, model=model, task="audio-classification") return AudioClassificationOutputElement.parse_obj_as_list(response) async def audio_to_audio( @@ -627,7 +620,7 @@ async def chat_completion( Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message. max_tokens (`int`, *optional*): - Maximum number of tokens allowed in the response. Defaults to 20. + Maximum number of tokens allowed in the response. Defaults to 100. n (`int`, *optional*): UNUSED. 
presence_penalty (`float`, *optional*): @@ -1031,7 +1024,7 @@ async def document_question_answering( [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)] ``` """ - payload: Dict[str, Any] = {"question": question, "image": _b64_encode(image)} + inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)} parameters = { "doc_stride": doc_stride, "handle_impossible_answer": handle_impossible_answer, @@ -1042,10 +1035,8 @@ async def document_question_answering( "top_k": top_k, "word_boxes": word_boxes, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, model=model, task="document-question-answering") + payload = _prepare_payload(inputs, parameters=parameters) + response = await self.post(**payload, model=model, task="document-question-answering") return DocumentQuestionAnsweringOutputElement.parse_obj_as_list(response) async def feature_extraction( @@ -1104,17 +1095,14 @@ async def feature_extraction( [ 0.28552425, -0.928395 , -1.2077185 , ..., 0.76810825, -2.1069427 , 0.6236161 ]], dtype=float32) ``` """ - payload: Dict = {"inputs": text} parameters = { "normalize": normalize, "prompt_name": prompt_name, "truncate": truncate, "truncation_direction": truncation_direction, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, model=model, task="feature-extraction") + payload = _prepare_payload(text, parameters=parameters) + response = await self.post(**payload, model=model, task="feature-extraction") np = _import_numpy() return np.array(_bytes_to_dict(response), dtype="float32") @@ -1164,12 +1152,9 @@ async def fill_mask( ] ``` """ - payload: Dict = {"inputs": text} parameters = {"targets": targets, "top_k": top_k} - for key, value in parameters.items(): - if value is not None: - 
payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, model=model, task="fill-mask") + payload = _prepare_payload(text, parameters=parameters) + response = await self.post(**payload, model=model, task="fill-mask") return FillMaskOutputElement.parse_obj_as_list(response) async def image_classification( @@ -1212,19 +1197,8 @@ async def image_classification( ``` """ parameters = {"function_to_apply": function_to_apply, "top_k": top_k} - - if all(parameter is None for parameter in parameters.values()): - data = image - payload: Optional[Dict[str, Any]] = None - - else: - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - - response = await self.post(json=payload, data=data, model=model, task="image-classification") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = await self.post(**payload, model=model, task="image-classification") return ImageClassificationOutputElement.parse_obj_as_list(response) async def image_segmentation( @@ -1284,18 +1258,8 @@ async def image_segmentation( "subtask": subtask, "threshold": threshold, } - if all(parameter is None for parameter in parameters.values()): - # if no parameters are provided, the image can be raw bytes, an image file, or URL to an online image - data = image - payload: Optional[Dict[str, Any]] = None - else: - # if parameters are provided, the image needs to be a base64-encoded string - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, data=data, model=model, task="image-segmentation") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = await self.post(**payload, model=model, task="image-segmentation") output = 
ImageSegmentationOutputElement.parse_obj_as_list(response) for item in output: item.mask = _b64_to_image(item.mask) # type: ignore [assignment] @@ -1371,19 +1335,8 @@ async def image_to_image( "guidance_scale": guidance_scale, **kwargs, } - if all(parameter is None for parameter in parameters.values()): - # Either only an image to send => send as raw bytes - data = image - payload: Optional[Dict[str, Any]] = None - else: - # if parameters are provided, the image needs to be a base64-encoded string - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - - response = await self.post(json=payload, data=data, model=model, task="image-to-image") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = await self.post(**payload, model=model, task="image-to-image") return _bytes_to_image(response) async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput: @@ -1549,25 +1502,15 @@ async def object_detection( # Must be run in an async context >>> from huggingface_hub import AsyncInferenceClient >>> client = AsyncInferenceClient() - >>> await client.object_detection("people.jpg"): + >>> await client.object_detection("people.jpg") [ObjectDetectionOutputElement(score=0.9486683011054993, label='person', box=ObjectDetectionBoundingBox(xmin=59, ymin=39, xmax=420, ymax=510)), ...] 
``` """ parameters = { "threshold": threshold, } - if all(parameter is None for parameter in parameters.values()): - # if no parameters are provided, the image can be raw bytes, an image file, or URL to an online image - data = image - payload: Optional[Dict[str, Any]] = None - else: - # if parameters are provided, the image needs to be a base64-encoded string - data = None - payload = {"inputs": _b64_encode(image)} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, data=data, model=model, task="object-detection") + payload = _prepare_payload(image, parameters=parameters, expect_binary=True) + response = await self.post(**payload, model=model, task="object-detection") return ObjectDetectionOutputElement.parse_obj_as_list(response) async def question_answering( @@ -1644,12 +1587,10 @@ async def question_answering( "max_seq_len": max_seq_len, "top_k": top_k, } - payload: Dict[str, Any] = {"question": question, "context": context} - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value + inputs: Dict[str, Any] = {"question": question, "context": context} + payload = _prepare_payload(inputs, parameters=parameters) response = await self.post( - json=payload, + **payload, model=model, task="question-answering", ) @@ -1759,19 +1700,14 @@ async def summarization( SummarizationOutput(generated_text="The Eiffel tower is one of the most famous landmarks in the world....") ``` """ - payload: Dict[str, Any] = {"inputs": text} - if parameters is not None: - payload["parameters"] = parameters - else: + if parameters is None: parameters = { "clean_up_tokenization_spaces": clean_up_tokenization_spaces, "generate_parameters": generate_parameters, "truncation": truncation, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await 
self.post(json=payload, model=model, task="summarization") + payload = _prepare_payload(text, parameters=parameters) + response = await self.post(**payload, model=model, task="summarization") return SummarizationOutput.parse_obj_as_list(response)[0] async def table_question_answering( @@ -1817,15 +1753,13 @@ async def table_question_answering( TableQuestionAnsweringOutputElement(answer='36542', coordinates=[[0, 1]], cells=['36542'], aggregator='AVERAGE') ``` """ - payload: Dict[str, Any] = { + inputs = { "query": query, "table": table, } - - if parameters is not None: - payload["parameters"] = parameters + payload = _prepare_payload(inputs, parameters=parameters) response = await self.post( - json=payload, + **payload, model=model, task="table-question-answering", ) @@ -1874,7 +1808,11 @@ async def tabular_classification(self, table: Dict[str, Any], *, model: Optional ["5", "5", "5"] ``` """ - response = await self.post(json={"table": table}, model=model, task="tabular-classification") + response = await self.post( + json={"table": table}, + model=model, + task="tabular-classification", + ) return _bytes_to_list(response) async def tabular_regression(self, table: Dict[str, Any], *, model: Optional[str] = None) -> List[float]: @@ -1962,15 +1900,16 @@ async def text_classification( ] ``` """ - payload: Dict[str, Any] = {"inputs": text} parameters = { "function_to_apply": function_to_apply, "top_k": top_k, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, model=model, task="text-classification") + payload = _prepare_payload(text, parameters=parameters) + response = await self.post( + **payload, + model=model, + task="text-classification", + ) return TextClassificationOutputElement.parse_obj_as_list(response)[0] # type: ignore [return-value] @overload @@ -2199,7 +2138,7 @@ async def text_generation( grammar ([`TextGenerationInputGrammarType`], *optional*): 
Grammar constraints. Can be either a JSONSchema or a regex. max_new_tokens (`int`, *optional*): - Maximum number of generated tokens + Maximum number of generated tokens. Defaults to 100. repetition_penalty (`float`, *optional*): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. @@ -2546,7 +2485,7 @@ async def text_to_image( >>> image.save("better_astronaut.png") ``` """ - payload = {"inputs": prompt} + parameters = { "negative_prompt": negative_prompt, "height": height, @@ -2558,10 +2497,8 @@ async def text_to_image( "seed": seed, **kwargs, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value # type: ignore - response = await self.post(json=payload, model=model, task="text-to-image") + payload = _prepare_payload(prompt, parameters=parameters) + response = await self.post(**payload, model=model, task="text-to-image") return _bytes_to_image(response) async def text_to_speech( @@ -2665,7 +2602,6 @@ async def text_to_speech( >>> Path("hello_world.flac").write_bytes(audio) ``` """ - payload: Dict[str, Any] = {"inputs": text} parameters = { "do_sample": do_sample, "early_stopping": early_stopping, @@ -2684,10 +2620,8 @@ async def text_to_speech( "typical_p": typical_p, "use_cache": use_cache, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, model=model, task="text-to-speech") + payload = _prepare_payload(text, parameters=parameters) + response = await self.post(**payload, model=model, task="text-to-speech") return response async def token_classification( @@ -2750,17 +2684,15 @@ async def token_classification( ] ``` """ - payload: Dict[str, Any] = {"inputs": text} + parameters = { "aggregation_strategy": aggregation_strategy, "ignore_labels": ignore_labels, "stride": stride, } - for key, value in parameters.items(): 
- if value is not None: - payload.setdefault("parameters", {})[key] = value + payload = _prepare_payload(text, parameters=parameters) response = await self.post( - json=payload, + **payload, model=model, task="token-classification", ) @@ -2837,7 +2769,6 @@ async def translation( if src_lang is None and tgt_lang is not None: raise ValueError("You cannot specify `tgt_lang` without specifying `src_lang`.") - payload: Dict[str, Any] = {"inputs": text} parameters = { "src_lang": src_lang, "tgt_lang": tgt_lang, @@ -2845,10 +2776,8 @@ async def translation( "truncation": truncation, "generate_parameters": generate_parameters, } - for key, value in parameters.items(): - if value is not None: - payload.setdefault("parameters", {})[key] = value - response = await self.post(json=payload, model=model, task="translation") + payload = _prepare_payload(text, parameters=parameters) + response = await self.post(**payload, model=model, task="translation") return TranslationOutput.parse_obj_as_list(response)[0] async def visual_question_answering( @@ -2992,15 +2921,14 @@ async def zero_shot_classification( ``` """ - parameters = {"candidate_labels": labels, "multi_label": multi_label} - if hypothesis_template is not None: - parameters["hypothesis_template"] = hypothesis_template - + parameters = { + "candidate_labels": labels, + "multi_label": multi_label, + "hypothesis_template": hypothesis_template, + } + payload = _prepare_payload(text, parameters=parameters) response = await self.post( - json={ - "inputs": text, - "parameters": parameters, - }, + **payload, task="zero-shot-classification", model=model, ) @@ -3058,13 +2986,11 @@ async def zero_shot_image_classification( if len(labels) < 2: raise ValueError("You must specify at least 2 classes to compare.") - payload = { - "inputs": {"image": _b64_encode(image), "candidateLabels": ",".join(labels)}, - } - if hypothesis_template is not None: - payload.setdefault("parameters", {})["hypothesis_template"] = hypothesis_template + inputs 
= {"image": _b64_encode(image), "candidateLabels": ",".join(labels)} + parameters = {"hypothesis_template": hypothesis_template} + payload = _prepare_payload(inputs, parameters=parameters) response = await self.post( - json=payload, + **payload, model=model, task="zero-shot-image-classification", ) diff --git a/src/huggingface_hub/inference/_templating.py b/src/huggingface_hub/inference/_templating.py deleted file mode 100644 index 954b203908..0000000000 --- a/src/huggingface_hub/inference/_templating.py +++ /dev/null @@ -1,102 +0,0 @@ -from functools import lru_cache -from typing import Callable, Dict, List, Optional, Union - -from ..errors import HfHubHTTPError, RepositoryNotFoundError, TemplateError -from ..utils import is_minijinja_available - - -def _import_minijinja(): - if not is_minijinja_available(): - raise ImportError("Cannot render template. Please install minijinja using `pip install minijinja`.") - import minijinja # noqa: F401 - - return minijinja - - -def render_chat_prompt( - *, - model_id: str, - messages: List[Dict[str, str]], - token: Union[str, bool, None] = None, - add_generation_prompt: bool = True, - **kwargs, -) -> str: - """Render a chat prompt using a model's chat template. - - Args: - model_id (`str`): - The model id. - messages (`List[Dict[str, str]]`): - The list of messages to render. - token (`str` or `bool`, *optional*): - Hugging Face token. Will default to the locally saved token if not provided. - - Returns: - `str`: The rendered chat prompt. - - Raises: - `TemplateError`: If there's any issue while fetching, compiling or rendering the chat template. 
- """ - minijinja = _import_minijinja() - template = _fetch_and_compile_template(model_id=model_id, token=token) - - try: - return template(messages=messages, add_generation_prompt=add_generation_prompt, **kwargs) - except minijinja.TemplateError as e: - raise TemplateError(f"Error while trying to render chat prompt for model '{model_id}': {e}") from e - - -@lru_cache # TODO: lru_cache for raised exceptions -def _fetch_and_compile_template(*, model_id: str, token: Union[str, None]) -> Callable: - """Fetch and compile a model's chat template. - - Method is cached to avoid fetching the same model's config multiple times. - - Args: - model_id (`str`): - The model id. - token (`str` or `bool`, *optional*): - Hugging Face token. Will default to the locally saved token if not provided. - - Returns: - `Callable`: A callable that takes a list of messages and returns the rendered chat prompt. - """ - from huggingface_hub.hf_api import HfApi - - minijinja = _import_minijinja() - - # 1. fetch config from API - try: - config = HfApi(token=token).model_info(model_id).config - except RepositoryNotFoundError as e: - raise TemplateError(f"Cannot render chat template: model '{model_id}' not found.") from e - except HfHubHTTPError as e: - raise TemplateError(f"Error while trying to fetch chat template for model '{model_id}': {e}") from e - - # 2. 
check config validity - if config is None: - raise TemplateError(f"Config not found for model '{model_id}'.") - tokenizer_config = config.get("tokenizer_config") - if tokenizer_config is None: - raise TemplateError(f"Tokenizer config not found for model '{model_id}'.") - if tokenizer_config.get("chat_template") is None: - raise TemplateError(f"Chat template not found in tokenizer_config for model '{model_id}'.") - chat_template = tokenizer_config["chat_template"] - if not isinstance(chat_template, str): - raise TemplateError(f"Chat template must be a string, not '{type(chat_template)}' (model: {model_id}).") - - special_tokens: Dict[str, Optional[str]] = {} - for key, value in tokenizer_config.items(): - if "token" in key: - if isinstance(value, str): - special_tokens[key] = value - elif isinstance(value, dict) and value.get("__type") == "AddedToken": - special_tokens[key] = value.get("content") - - # 3. compile template and return - env = minijinja.Environment() - try: - env.add_template("chat_template", chat_template) - except minijinja.TemplateError as e: - raise TemplateError(f"Error while trying to compile chat template for model '{model_id}': {e}") from e - return lambda **kwargs: env.render_template("chat_template", **kwargs, **special_tokens) diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py index 855d3a1f13..9a07a8f29f 100644 --- a/src/huggingface_hub/repocard_data.py +++ b/src/huggingface_hub/repocard_data.py @@ -185,7 +185,7 @@ def to_dict(self): data_dict = copy.deepcopy(self.__dict__) self._to_dict(data_dict) - return _remove_none(data_dict) + return {key: value for key, value in data_dict.items() if value is not None} def _to_dict(self, data_dict): """Use this method in child classes to alter the dict representation of the data. Alter the dict in-place. @@ -252,8 +252,8 @@ class ModelCardData(CardData): The identifier of the base model from which the model derives. 
This is applicable for example if your model is a fine-tune or adapter of an existing model. The value must be the ID of a model on the Hub (or a list of IDs if your model derives from multiple models). Defaults to None. - datasets (`List[str]`, *optional*): - List of datasets that were used to train this model. Should be a dataset ID + datasets (`Union[str, List[str]]`, *optional*): + Dataset or list of datasets that were used to train this model. Should be a dataset ID found on https://hf.co/datasets. Defaults to None. eval_results (`Union[List[EvalResult], EvalResult]`, *optional*): List of `huggingface_hub.EvalResult` that define evaluation results of the model. If provided, @@ -312,7 +312,7 @@ def __init__( self, *, base_model: Optional[Union[str, List[str]]] = None, - datasets: Optional[List[str]] = None, + datasets: Optional[Union[str, List[str]]] = None, eval_results: Optional[List[EvalResult]] = None, language: Optional[Union[str, List[str]]] = None, library_name: Optional[str] = None, diff --git a/src/huggingface_hub/serialization/_torch.py b/src/huggingface_hub/serialization/_torch.py index e87e728d23..58777d9947 100644 --- a/src/huggingface_hub/serialization/_torch.py +++ b/src/huggingface_hub/serialization/_torch.py @@ -41,6 +41,7 @@ def save_torch_model( max_shard_size: Union[int, str] = MAX_SHARD_SIZE, metadata: Optional[Dict[str, str]] = None, safe_serialization: bool = True, + is_main_process: bool = True, ): """ Saves a given torch model to disk, handling sharding and shared tensors issues. @@ -88,6 +89,10 @@ def save_torch_model( Whether to save as safetensors, which is the default behavior. If `False`, the shards are saved as pickle. Safe serialization is recommended for security reasons. Saving as pickle is deprecated and will be removed in a future version. + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. 
Useful when in distributed training like + TPUs and need to call this function from all processes. In this case, set `is_main_process=True` only on + the main process to avoid race conditions. Defaults to True. Example: @@ -112,6 +117,7 @@ def save_torch_model( metadata=metadata, safe_serialization=safe_serialization, save_directory=save_directory, + is_main_process=is_main_process, ) @@ -124,6 +130,7 @@ def save_torch_state_dict( max_shard_size: Union[int, str] = MAX_SHARD_SIZE, metadata: Optional[Dict[str, str]] = None, safe_serialization: bool = True, + is_main_process: bool = True, ) -> None: """ Save a model state dictionary to the disk, handling sharding and shared tensors issues. @@ -171,7 +178,10 @@ def save_torch_state_dict( Whether to save as safetensors, which is the default behavior. If `False`, the shards are saved as pickle. Safe serialization is recommended for security reasons. Saving as pickle is deprecated and will be removed in a future version. - + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. Useful when in distributed training like + TPUs and need to call this function from all processes. In this case, set `is_main_process=True` only on + the main process to avoid race conditions. Defaults to True. Example: ```py @@ -222,15 +232,18 @@ def save_torch_state_dict( state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size ) - # Clean the folder from previous save - existing_files_regex = re.compile(filename_pattern.format(suffix=r"(-\d{5}-of-\d{5})?") + r"(\.index\.json)?") - for filename in os.listdir(save_directory): - if existing_files_regex.match(filename): - try: - logger.debug(f"Removing existing file '{filename}' from folder.") - os.remove(os.path.join(save_directory, filename)) - except Exception as e: - logger.warning(f"Error when trying to remove existing '{filename}' from folder: {e}. 
Continuing...") + # Only main process should clean up existing files to avoid race conditions in distributed environment + if is_main_process: + existing_files_regex = re.compile(filename_pattern.format(suffix=r"(-\d{5}-of-\d{5})?") + r"(\.index\.json)?") + for filename in os.listdir(save_directory): + if existing_files_regex.match(filename): + try: + logger.debug(f"Removing existing file '{filename}' from folder.") + os.remove(os.path.join(save_directory, filename)) + except Exception as e: + logger.warning( + f"Error when trying to remove existing '{filename}' from folder: {e}. Continuing..." + ) # Save each shard per_file_metadata = {"format": "pt"} @@ -442,7 +455,7 @@ def storage_ptr(tensor: "torch.Tensor") -> Union[int, Tuple[Any, ...]]: from torch.utils._python_dispatch import is_traceable_wrapper_subclass if is_traceable_wrapper_subclass(tensor): - return _get_unique_id(tensor) + return _get_unique_id(tensor) # type: ignore except ImportError: # for torch version less than 2.1, we can fallback to original implementation pass diff --git a/src/huggingface_hub/utils/__init__.py b/src/huggingface_hub/utils/__init__.py index 8ad45734f1..b9715dc0ad 100644 --- a/src/huggingface_hub/utils/__init__.py +++ b/src/huggingface_hub/utils/__init__.py @@ -73,7 +73,6 @@ get_hf_hub_version, get_hf_transfer_version, get_jinja_version, - get_minijinja_version, get_numpy_version, get_pillow_version, get_pydantic_version, @@ -92,7 +91,6 @@ is_graphviz_available, is_hf_transfer_available, is_jinja_available, - is_minijinja_available, is_notebook, is_numpy_available, is_package_available, diff --git a/src/huggingface_hub/utils/_auth.py b/src/huggingface_hub/utils/_auth.py index efdbd5c837..c70280aec4 100644 --- a/src/huggingface_hub/utils/_auth.py +++ b/src/huggingface_hub/utils/_auth.py @@ -201,7 +201,7 @@ def _save_token(token: str, token_name: str) -> None: stored_tokens = get_stored_tokens() stored_tokens[token_name] = token _save_stored_tokens(stored_tokens) - print(f"The 
token `{token_name}` has been saved to {tokens_path}") + logger.info(f"The token `{token_name}` has been saved to {tokens_path}") def _clean_token(token: Optional[str]) -> Optional[str]: diff --git a/src/huggingface_hub/utils/_cache_manager.py b/src/huggingface_hub/utils/_cache_manager.py index 3f3c2a9c88..21469c97af 100644 --- a/src/huggingface_hub/utils/_cache_manager.py +++ b/src/huggingface_hub/utils/_cache_manager.py @@ -742,7 +742,7 @@ def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo: for ref_path in refs_path.glob("**/*"): # glob("**/*") iterates over all files and directories -> skip directories - if ref_path.is_dir(): + if ref_path.is_dir() or ref_path.name in FILES_TO_IGNORE: continue ref_name = str(ref_path.relative_to(refs_path)) diff --git a/src/huggingface_hub/utils/_headers.py b/src/huggingface_hub/utils/_headers.py index 8b05e939db..300e6b4e9c 100644 --- a/src/huggingface_hub/utils/_headers.py +++ b/src/huggingface_hub/utils/_headers.py @@ -20,6 +20,7 @@ from .. import constants from ._auth import get_token +from ._deprecation import _deprecate_arguments from ._runtime import ( get_fastai_version, get_fastcore_version, @@ -35,15 +36,20 @@ from ._validators import validate_hf_hub_args +@_deprecate_arguments( + version="1.0", + deprecated_args="is_write_action", + custom_message="This argument is ignored and we let the server handle the permission error instead (if any).", +) @validate_hf_hub_args def build_hf_headers( *, token: Optional[Union[bool, str]] = None, - is_write_action: bool = False, library_name: Optional[str] = None, library_version: Optional[str] = None, user_agent: Union[Dict, str, None] = None, headers: Optional[Dict[str, str]] = None, + is_write_action: bool = False, ) -> Dict[str, str]: """ Build headers dictionary to send in a HF Hub call. 
@@ -68,9 +74,6 @@ def build_hf_headers( - if `False`, authorization header is not set - if `None`, the token is read from the machine only except if `HF_HUB_DISABLE_IMPLICIT_TOKEN` env variable is set. - is_write_action (`bool`, default to `False`): - Set to True if the API call requires a write access. If `True`, the token - will be validated (cannot be `None`, cannot start by `"api_org***"`). library_name (`str`, *optional*): The name of the library that is making the HTTP request. Will be added to the user-agent header. @@ -83,6 +86,8 @@ def build_hf_headers( headers (`dict`, *optional*): Additional headers to include in the request. Those headers take precedence over the ones generated by this function. + is_write_action (`bool`): + Ignored and deprecated argument. Returns: A `Dict` of headers to pass in your API call. @@ -105,9 +110,6 @@ def build_hf_headers( >>> build_hf_headers() # token is not sent {"user-agent": ...} - >>> build_hf_headers(token="api_org_***", is_write_action=True) - ValueError: You must use your personal account token for write-access methods. - >>> build_hf_headers(library_name="transformers", library_version="1.2.3") {"authorization": ..., "user-agent": "transformers/1.2.3; hf_hub/0.10.2; python/3.10.4; tensorflow/1.55"} ``` @@ -122,7 +124,6 @@ def build_hf_headers( """ # Get auth token to send token_to_send = get_token_to_send(token) - _validate_token_to_send(token_to_send, is_write_action=is_write_action) # Combine headers hf_headers = { @@ -171,23 +172,6 @@ def get_token_to_send(token: Optional[Union[bool, str]]) -> Optional[str]: return cached_token -def _validate_token_to_send(token: Optional[str], is_write_action: bool) -> None: - if is_write_action: - if token is None: - raise ValueError( - "Token is required (write-access action) but no token found. You need" - " to provide a token or be logged in to Hugging Face with" - " `huggingface-cli login` or `huggingface_hub.login`. See" - " https://huggingface.co/settings/tokens." 
- ) - if token.startswith("api_org"): - raise ValueError( - "You must use your personal account token for write-access methods. To" - " generate a write-access token, go to" - " https://huggingface.co/settings/tokens" - ) - - def _http_user_agent( *, library_name: Optional[str] = None, diff --git a/src/huggingface_hub/utils/_runtime.py b/src/huggingface_hub/utils/_runtime.py index 72b4dd4a2b..c8d82d4129 100644 --- a/src/huggingface_hub/utils/_runtime.py +++ b/src/huggingface_hub/utils/_runtime.py @@ -38,7 +38,6 @@ "hf_transfer": {"hf_transfer"}, "jinja": {"Jinja2"}, "keras": {"keras"}, - "minijinja": {"minijinja"}, "numpy": {"numpy"}, "pillow": {"Pillow"}, "pydantic": {"pydantic"}, @@ -161,15 +160,6 @@ def get_keras_version() -> str: return _get_version("keras") -# Minijinja -def is_minijinja_available() -> bool: - return is_package_available("minijinja") - - -def get_minijinja_version() -> str: - return _get_version("minijinja") - - # Numpy def is_numpy_available() -> bool: return is_package_available("numpy") diff --git a/tests/test_auth.py b/tests/test_auth.py index fd1a18f641..457b9342ef 100644 --- a/tests/test_auth.py +++ b/tests/test_auth.py @@ -88,22 +88,6 @@ def test_login_success(self, mock_whoami): assert _get_token_by_name("test_token") == TOKEN assert _get_token_from_file() == TOKEN - @patch( - "huggingface_hub.hf_api.whoami", - return_value={ - "auth": { - "accessToken": { - "displayName": "test_token", - "role": "read", - "createdAt": "2024-01-01T00:00:00.000Z", - } - } - }, - ) - def test_login_errors(self, mock_whoami): - with pytest.raises(ValueError, match=r"Token is valid but is 'read-only' and a 'write' token is required.*"): - _login(TOKEN, add_to_git_credential=False, write_permission=True) - class TestLogout: def test_logout_deletes_files(self): diff --git a/tests/test_auth_cli.py b/tests/test_auth_cli.py new file mode 100644 index 0000000000..d9013b44ff --- /dev/null +++ b/tests/test_auth_cli.py @@ -0,0 +1,165 @@ +import logging +import os 
+import tempfile +from unittest.mock import patch + +import pytest +from pytest import CaptureFixture, LogCaptureFixture + +from huggingface_hub import constants +from huggingface_hub.commands.user import AuthListCommand, AuthSwitchCommand, LoginCommand, LogoutCommand + +from .testing_constants import ENDPOINT_STAGING + + +# fixtures & constants + +MOCK_TOKEN = "hf_1234" + + +@pytest.fixture(autouse=True) +def use_tmp_file_paths(): + """ + Fixture to temporarily override HF_TOKEN_PATH, HF_STORED_TOKENS_PATH, and ENDPOINT. + """ + with tempfile.TemporaryDirectory() as tmp_hf_home: + hf_token_path = os.path.join(tmp_hf_home, "token") + hf_stored_tokens_path = os.path.join(tmp_hf_home, "stored_tokens") + with patch.multiple( + constants, + HF_TOKEN_PATH=hf_token_path, + HF_STORED_TOKENS_PATH=hf_stored_tokens_path, + ENDPOINT=ENDPOINT_STAGING, + ): + yield + + +@pytest.fixture +def mock_whoami_api_call(): + MOCK_WHOAMI_RESPONSE = { + "auth": { + "accessToken": { + "displayName": "test_token", + "role": "write", + "createdAt": "2024-01-01T00:00:00.000Z", + } + } + } + with patch("huggingface_hub.hf_api.whoami", return_value=MOCK_WHOAMI_RESPONSE): + yield + + +@pytest.fixture +def mock_stored_tokens(): + """Mock stored tokens.""" + stored_tokens = { + "token1": "hf_1234", + "token2": "hf_5678", + "active_token": "hf_9012", + } + with patch("huggingface_hub._login.get_stored_tokens", return_value=stored_tokens), patch( + "huggingface_hub.utils._auth.get_stored_tokens", return_value=stored_tokens + ): + yield stored_tokens + + +def assert_in_logs(caplog: LogCaptureFixture, expected_output): + """Helper to check if a message appears in logs.""" + log_text = "\n".join(record.message for record in caplog.records) + assert expected_output in log_text, f"Expected '{expected_output}' not found in logs" + + +def test_login_command_basic(mock_whoami_api_call, caplog: LogCaptureFixture): + """Test basic login command execution.""" + caplog.set_level(logging.INFO) + + args = 
type("Args", (), {"token": MOCK_TOKEN, "add_to_git_credential": False})() + cmd = LoginCommand(args) + cmd.run() + + assert_in_logs(caplog, "Login successful") + assert_in_logs(caplog, "Token is valid") + assert_in_logs(caplog, "The current active token is: `test_token`") + + +def test_login_command_with_git(mock_whoami_api_call, caplog: LogCaptureFixture): + """Test login command with git credential option.""" + caplog.set_level(logging.INFO) + + args = type("Args", (), {"token": MOCK_TOKEN, "add_to_git_credential": True})() + cmd = LoginCommand(args) + + with patch("huggingface_hub._login._is_git_credential_helper_configured", return_value=True), patch( + "huggingface_hub.utils.set_git_credential" + ): + cmd.run() + + assert_in_logs(caplog, "Login successful") + assert_in_logs(caplog, "Your token has been saved in your configured git credential helpers") + + +def test_logout_specific_token(mock_stored_tokens, caplog: LogCaptureFixture): + """Test logout command for a specific token.""" + caplog.set_level(logging.INFO) + + args = type("Args", (), {"token_name": "token1"})() + cmd = LogoutCommand(args) + cmd.run() + + assert_in_logs(caplog, "Successfully logged out from access token: token1") + + +def test_logout_active_token(mock_stored_tokens, caplog: LogCaptureFixture): + """Test logout command for active token.""" + caplog.set_level(logging.INFO) + + with patch("huggingface_hub._login._get_token_from_file", return_value="hf_9012"): + args = type("Args", (), {"token_name": "active_token"})() + cmd = LogoutCommand(args) + cmd.run() + + assert_in_logs(caplog, "Successfully logged out from access token: active_token") + assert_in_logs(caplog, "Active token 'active_token' has been deleted") + + +def test_logout_all_tokens(mock_stored_tokens, caplog: LogCaptureFixture): + """Test logout command for all tokens.""" + caplog.set_level(logging.INFO) + + args = type("Args", (), {"token_name": None})() + cmd = LogoutCommand(args) + cmd.run() + + assert_in_logs(caplog, 
"Successfully logged out from all access tokens") + + +def test_switch_token(mock_stored_tokens, caplog: LogCaptureFixture): + """Test switching between tokens.""" + caplog.set_level(logging.INFO) + + args = type("Args", (), {"token_name": "token1", "add_to_git_credential": False})() + cmd = AuthSwitchCommand(args) + cmd.run() + + assert_in_logs(caplog, "The current active token is: token1") + + +def test_switch_nonexistent_token(mock_stored_tokens): + """Test switching to a non-existent token.""" + args = type("Args", (), {"token_name": "nonexistent", "add_to_git_credential": False})() + cmd = AuthSwitchCommand(args) + + with pytest.raises(ValueError, match="Access token nonexistent not found"): + cmd.run() + + +def test_list_tokens(mock_stored_tokens, capsys: CaptureFixture): + """Test listing tokens command.""" + args = type("Args", (), {})() + cmd = AuthListCommand(args) + cmd.run() + + captured = capsys.readouterr() + assert "token1" in captured.out + assert "hf_****1234" in captured.out + assert "token2" in captured.out diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index 185aa48cbd..158a32ca30 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -298,6 +298,17 @@ def test_update_dataset_repo_settings(self, repo_url: RepoUrl): assert info.gated == gated_value assert info.private == private_value + @expect_deprecation("get_token_permission") + def test_get_token_permission_on_oauth_token(self): + whoami = { + "type": "user", + "auth": {"type": "oauth", "expiresAt": "2024-10-24T19:43:43.000Z"}, + # ... 
+ # other values are ignored as we only need to check the "auth" value + } + with patch.object(self._api, "whoami", return_value=whoami): + assert self._api.get_token_permission() is None + class CommitApiTest(HfApiCommonTest): def setUp(self) -> None: @@ -1753,6 +1764,30 @@ def test_list_models_complex_query(self): assert isinstance(model, ModelInfo) assert all(tag in model.tags for tag in ["bert", "jax"]) + def test_list_models_sort_trending_score(self): + models = list(self._api.list_models(sort="trending_score", limit=10)) + assert len(models) == 10 + assert isinstance(models[0], ModelInfo) + assert all(model.trending_score is not None for model in models) + + def test_list_models_sort_created_at(self): + models = list(self._api.list_models(sort="created_at", limit=10)) + assert len(models) == 10 + assert isinstance(models[0], ModelInfo) + assert all(model.created_at is not None for model in models) + + def test_list_models_sort_downloads(self): + models = list(self._api.list_models(sort="downloads", limit=10)) + assert len(models) == 10 + assert isinstance(models[0], ModelInfo) + assert all(model.downloads is not None for model in models) + + def test_list_models_sort_likes(self): + models = list(self._api.list_models(sort="likes", limit=10)) + assert len(models) == 10 + assert isinstance(models[0], ModelInfo) + assert all(model.likes is not None for model in models) + def test_list_models_with_config(self): for model in self._api.list_models(filter=("adapter-transformers", "bert"), fetch_config=True, limit=20): self.assertIsNotNone(model.config) @@ -1821,21 +1856,16 @@ def test_model_info(self): self.assertIsInstance(model, ModelInfo) self.assertEqual(model.sha, DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT) - # TODO; un-skip this test once it's fixed. - @unittest.skip( - "Security status is currently unreliable on the server endpoint, so this" - " test occasionally fails. 
Issue is tracked in" - " https://github.com/huggingface/huggingface_hub/issues/1002 and" - " https://github.com/huggingface/moon-landing/issues/3695. TODO: un-skip" - " this test once it's fixed." - ) def test_model_info_with_security(self): + # Note: this test might break in the future if `security_repo_status` object structure gets updated server-side + # (not yet fully stable) model = self._api.model_info( repo_id=DUMMY_MODEL_ID, revision=DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT, securityStatus=True, ) - self.assertEqual(model.securityStatus, {"containsInfected": False}) + self.assertIsNotNone(model.security_repo_status) + self.assertEqual(model.security_repo_status, {"scansDone": True, "filesWithIssues": []}) def test_model_info_with_file_metadata(self): model = self._api.model_info( diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index ec4ffd5ba1..34094a0265 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -586,3 +586,20 @@ def test_access_repositories_lists(not_supported_path): fs.ls(not_supported_path) with pytest.raises(NotImplementedError): fs.open(not_supported_path) + + +def test_exists_after_repo_deletion(): + """Test that exists() correctly reflects repository deletion.""" + # Initialize with staging endpoint and skip cache + hffs = HfFileSystem(endpoint=ENDPOINT_STAGING, token=TOKEN, skip_instance_cache=True) + api = hffs._api + + # Create a new repo + temp_repo_id = repo_name() + repo_url = api.create_repo(temp_repo_id) + repo_id = repo_url.repo_id + assert hffs.exists(repo_id, refresh=True) + # Delete the repo + api.delete_repo(repo_id=repo_id, repo_type="model") + # Verify that the repo no longer exists. 
+ assert not hffs.exists(repo_id, refresh=True) diff --git a/tests/test_hub_mixin_pytorch.py b/tests/test_hub_mixin_pytorch.py index aaabc6b610..5011a9838b 100644 --- a/tests/test_hub_mixin_pytorch.py +++ b/tests/test_hub_mixin_pytorch.py @@ -60,7 +60,7 @@ class DummyModelWithModelCard( nn.Module, PyTorchModelHubMixin, model_card_template=DUMMY_MODEL_CARD_TEMPLATE, - languages=["en", "zh"], + language=["en", "zh"], library_name="my-dummy-lib", license="apache-2.0", tags=["tag1", "tag2"], diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index a4fa971a2b..b97c62d165 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -49,7 +49,11 @@ from huggingface_hub.constants import ALL_INFERENCE_API_FRAMEWORKS, MAIN_INFERENCE_API_FRAMEWORKS from huggingface_hub.errors import HfHubHTTPError, ValidationError from huggingface_hub.inference._client import _open_as_binary -from huggingface_hub.inference._common import _stream_chat_completion_response, _stream_text_generation_response +from huggingface_hub.inference._common import ( + _prepare_payload, + _stream_chat_completion_response, + _stream_text_generation_response, +) from huggingface_hub.utils import build_hf_headers from .testing_utils import with_production_testing @@ -1080,3 +1084,142 @@ def test_resolve_chat_completion_url( client = InferenceClient(model=client_model, base_url=client_base_url) url = client._resolve_chat_completion_url(model) assert url == expected_url + + +@pytest.mark.parametrize( + "inputs, parameters, expect_binary, expected_json, expected_data", + [ + # Case 1: inputs is a simple string without parameters + ( + "simple text", + None, + False, + {"inputs": "simple text"}, + None, + ), + # Case 2: inputs is a simple string with parameters + ( + "simple text", + {"param1": "value1"}, + False, + { + "inputs": "simple text", + "parameters": {"param1": "value1"}, + }, + None, + ), + # Case 3: inputs is a dict without parameters + ( + {"input_key": 
"input_value"}, + None, + False, + {"inputs": {"input_key": "input_value"}}, + None, + ), + # Case 4: inputs is a dict with parameters + ( + {"input_key": "input_value", "input_key2": "input_value2"}, + {"param1": "value1"}, + False, + { + "inputs": {"input_key": "input_value", "input_key2": "input_value2"}, + "parameters": {"param1": "value1"}, + }, + None, + ), + # Case 5: inputs is bytes without parameters + ( + b"binary data", + None, + True, + None, + b"binary data", + ), + # Case 6: inputs is bytes with parameters + ( + b"binary data", + {"param1": "value1"}, + True, + { + "inputs": "encoded_data", + "parameters": {"param1": "value1"}, + }, + None, + ), + # Case 7: inputs is a Path object without parameters + ( + Path("test_file.txt"), + None, + True, + None, + Path("test_file.txt"), + ), + # Case 8: inputs is a Path object with parameters + ( + Path("test_file.txt"), + {"param1": "value1"}, + True, + { + "inputs": "encoded_data", + "parameters": {"param1": "value1"}, + }, + None, + ), + # Case 9: inputs is a URL string without parameters + ( + "http://example.com", + None, + True, + None, + "http://example.com", + ), + # Case 10: inputs is a URL string without parameters but expect_binary is False + ( + "http://example.com", + None, + False, + { + "inputs": "http://example.com", + }, + None, + ), + # Case 11: inputs is a URL string with parameters + ( + "http://example.com", + {"param1": "value1"}, + True, + { + "inputs": "encoded_data", + "parameters": {"param1": "value1"}, + }, + None, + ), + # Case 12: inputs is a URL string with parameters but expect_binary is False + ( + "http://example.com", + {"param1": "value1"}, + False, + { + "inputs": "http://example.com", + "parameters": {"param1": "value1"}, + }, + None, + ), + # Case 13: parameters contain None values + ( + "simple text", + {"param1": None, "param2": "value2"}, + False, + { + "inputs": "simple text", + "parameters": {"param2": "value2"}, + }, + None, + ), + ], +) +def 
test_prepare_payload(inputs, parameters, expect_binary, expected_json, expected_data): + with patch("huggingface_hub.inference._common._b64_encode", return_value="encoded_data"): + payload = _prepare_payload(inputs, parameters, expect_binary=expect_binary) + assert payload.get("json") == expected_json + assert payload.get("data") == expected_data diff --git a/tests/test_init_lazy_loading.py b/tests/test_init_lazy_loading.py index 9312543128..cddf59e6ad 100644 --- a/tests/test_init_lazy_loading.py +++ b/tests/test_init_lazy_loading.py @@ -4,6 +4,9 @@ class TestHuggingfaceHubInit(unittest.TestCase): + @unittest.skip( + reason="`jedi.Completion.get_signatures()` output differs between Python 3.12 and earlier versions, affecting test consistency" + ) def test_autocomplete_on_root_imports(self) -> None: """Test autocomplete with `huggingface_hub` works with Jedi. diff --git a/tests/test_local_folder.py b/tests/test_local_folder.py index 76a0c444db..9d19f2f4db 100644 --- a/tests/test_local_folder.py +++ b/tests/test_local_folder.py @@ -74,14 +74,17 @@ def test_local_download_paths(tmp_path: Path): assert paths.incomplete_path("etag123").parent.is_dir() -def test_local_download_paths_are_cached(tmp_path: Path): - """Test local download paths are cached.""" - # No need for an exact singleton here. - # We just want to avoid recreating the dataclass on consecutive calls (happens often - # in the process). 
+def test_local_download_paths_are_recreated_each_time(tmp_path: Path): paths1 = get_local_download_paths(tmp_path, "path/in/repo.txt") + assert paths1.file_path.parent.is_dir() + assert paths1.metadata_path.parent.is_dir() + + paths1.file_path.parent.rmdir() + paths1.metadata_path.parent.rmdir() + paths2 = get_local_download_paths(tmp_path, "path/in/repo.txt") - assert paths1 is paths2 + assert paths2.file_path.parent.is_dir() + assert paths2.metadata_path.parent.is_dir() @pytest.mark.skipif(os.name != "nt", reason="Windows-specific test.") @@ -198,14 +201,17 @@ def test_local_upload_paths(tmp_path: Path): assert paths.lock_path.parent.is_dir() -def test_local_upload_paths_are_cached(tmp_path: Path): - """Test local upload paths are cached.""" - # No need for an exact singleton here. - # We just want to avoid recreating the dataclass on consecutive calls (happens often - # in the process). - paths1 = get_local_download_paths(tmp_path, "path/in/repo.txt") - paths2 = get_local_download_paths(tmp_path, "path/in/repo.txt") - assert paths1 is paths2 +def test_local_upload_paths_are_recreated_each_time(tmp_path: Path): + paths1 = get_local_upload_paths(tmp_path, "path/in/repo.txt") + assert paths1.file_path.parent.is_dir() + assert paths1.metadata_path.parent.is_dir() + + paths1.file_path.parent.rmdir() + paths1.metadata_path.parent.rmdir() + + paths2 = get_local_upload_paths(tmp_path, "path/in/repo.txt") + assert paths2.file_path.parent.is_dir() + assert paths2.metadata_path.parent.is_dir() @pytest.mark.skipif(os.name != "nt", reason="Windows-specific test.") diff --git a/tests/test_repocard_data.py b/tests/test_repocard_data.py index 51b1601239..5d7052fc6b 100644 --- a/tests/test_repocard_data.py +++ b/tests/test_repocard_data.py @@ -237,6 +237,19 @@ def test_model_card_unique_tags(self): data = ModelCardData(tags=["tag2", "tag1", "tag2", "tag3"]) assert data.tags == ["tag2", "tag1", "tag3"] + def test_remove_top_level_none_values(self): + as_obj = 
ModelCardData(tags=["tag1", None], foo={"bar": 3, "baz": None}, pipeline_tag=None) + as_dict = as_obj.to_dict() + + assert as_obj.tags == ["tag1", None] + assert as_dict["tags"] == ["tag1", None] # none value inside list should be kept + + assert as_obj.foo == {"bar": 3, "baz": None} + assert as_dict["foo"] == {"bar": 3, "baz": None} # none value inside dict should be kept + + assert as_obj.pipeline_tag is None + assert "pipeline_tag" not in as_dict # top level none value should be removed + class DatasetCardDataTest(unittest.TestCase): def test_train_eval_index_keys_updated(self): diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 019ec26f2d..d966bd478a 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -264,6 +264,7 @@ def test_save_torch_model(mocker: MockerFixture, tmp_path: Path) -> None: max_shard_size="3GB", metadata={"foo": "bar"}, safe_serialization=True, + is_main_process=True, ) safe_state_dict_mock.assert_called_once_with( state_dict=model_mock.state_dict.return_value, @@ -273,6 +274,7 @@ def test_save_torch_model(mocker: MockerFixture, tmp_path: Path) -> None: max_shard_size="3GB", metadata={"foo": "bar"}, safe_serialization=True, + is_main_process=True, ) @@ -472,3 +474,27 @@ def test_save_torch_state_dict_delete_existing_files( assert (tmp_path / "pytorch_model-00001-of-00003.bin").is_file() assert (tmp_path / "pytorch_model-00002-of-00003.bin").is_file() assert (tmp_path / "pytorch_model-00003-of-00003.bin").is_file() + + +def test_save_torch_state_dict_not_main_process( + tmp_path: Path, + torch_state_dict: Dict[str, "torch.Tensor"], +) -> None: + """ + Test that previous files in the directory are not deleted when is_main_process=False. + When is_main_process=True, previous files should be deleted, + this is already tested in `test_save_torch_state_dict_delete_existing_files`. + """ + # Create some .safetensors files before saving a new state dict. 
+ (tmp_path / "model.safetensors").touch() + (tmp_path / "model-00001-of-00002.safetensors").touch() + (tmp_path / "model-00002-of-00002.safetensors").touch() + (tmp_path / "model.safetensors.index.json").touch() + # Save with is_main_process=False + save_torch_state_dict(torch_state_dict, tmp_path, is_main_process=False) + + # Previous files should still exist (not deleted) + assert (tmp_path / "model.safetensors").is_file() + assert (tmp_path / "model-00001-of-00002.safetensors").is_file() + assert (tmp_path / "model-00002-of-00002.safetensors").is_file() + assert (tmp_path / "model.safetensors.index.json").is_file() diff --git a/tests/test_snapshot_download.py b/tests/test_snapshot_download.py index 4edca4d60e..727dfd8aab 100644 --- a/tests/test_snapshot_download.py +++ b/tests/test_snapshot_download.py @@ -150,7 +150,7 @@ def test_download_model_local_only(self): snapshot_download(self.repo_id, local_dir=tmpdir) # now load from local_dir storage_folder = snapshot_download(self.repo_id, local_dir=tmpdir, local_files_only=True) - self.assertEquals(str(tmpdir), storage_folder) + self.assertEqual(str(tmpdir), storage_folder) def test_download_model_to_local_dir_with_offline_mode(self): """Test that an already downloaded folder is returned when there is a connection error""" @@ -161,7 +161,7 @@ def test_download_model_to_local_dir_with_offline_mode(self): for offline_mode in OfflineSimulationMode: with offline(mode=offline_mode): storage_folder = snapshot_download(self.repo_id, local_dir=tmpdir) - self.assertEquals(str(tmpdir), storage_folder) + self.assertEqual(str(tmpdir), storage_folder) def test_download_model_offline_mode_not_in_local_dir(self): """Test when connection error but local_dir is empty.""" diff --git a/tests/test_utils_headers.py b/tests/test_utils_headers.py index 89cce741c3..202f4283b0 100644 --- a/tests/test_utils_headers.py +++ b/tests/test_utils_headers.py @@ -46,19 +46,6 @@ def test_use_auth_token_none_no_cached_token(self, mock_get_token: 
Mock) -> None def test_use_auth_token_none_has_cached_token(self, mock_get_token: Mock) -> None: self.assertEqual(build_hf_headers(), FAKE_TOKEN_HEADER) - def test_write_action_org_token(self) -> None: - with self.assertRaises(ValueError): - build_hf_headers(use_auth_token=FAKE_TOKEN_ORG, is_write_action=True) - - @patch("huggingface_hub.utils._headers.get_token", return_value=None) - def test_write_action_none_token(self, mock_get_token: Mock) -> None: - with self.assertRaises(ValueError): - build_hf_headers(is_write_action=True) - - def test_write_action_use_auth_token_false(self) -> None: - with self.assertRaises(ValueError): - build_hf_headers(use_auth_token=False, is_write_action=True) - @patch("huggingface_hub.utils._headers.get_token", return_value=FAKE_TOKEN) def test_implicit_use_disabled(self, mock_get_token: Mock) -> None: with patch( # not as decorator to avoid friction with @handle_injection diff --git a/utils/generate_async_inference_client.py b/utils/generate_async_inference_client.py index 832049ad5d..a4b92f1d64 100644 --- a/utils/generate_async_inference_client.py +++ b/utils/generate_async_inference_client.py @@ -102,7 +102,7 @@ def check_async_client(update: bool) -> NoReturn: else: print( "❌ Expected content mismatch in `./src/huggingface_hub/inference/_generated/_async_client.py`.\n It" - " is most likely that you modified some InferenceClient code and did not update the the" + " is most likely that you modified some InferenceClient code and did not update the" " AsyncInferenceClient one.\n Please run `make style` or `python" " utils/generate_async_inference_client.py --update`." ) diff --git a/utils/generate_inference_types.py b/utils/generate_inference_types.py index 23ab7c7b9c..5f9675d60a 100644 --- a/utils/generate_inference_types.py +++ b/utils/generate_inference_types.py @@ -98,7 +98,7 @@ ] REFERENCE_PACKAGE_EN_CONTENT = """ - @@ -119,7 +119,7 @@ """ REFERENCE_PACKAGE_KO_CONTENT = """ -