From 8c122d4d6939a924e345a6a4cb3a0cc68e6d72c7 Mon Sep 17 00:00:00 2001 From: Sara Han <127759186+sdiazlor@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:37:49 +0200 Subject: [PATCH] docs: textcat tutorial and small doc fixes (#5055) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. List any dependencies that are required for this change. Closes #5035 **Type of change** (Remember to title the PR according to the type of change) - [x] Documentation update **How Has This Been Tested** (Please describe the tests that you ran to verify your changes.) `mkdocs serve` --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: David Berenstein --- .pre-commit-config.yaml | 1 + argilla/docs/community/contributor.md | 2 +- argilla/docs/community/index.md | 2 +- argilla/docs/getting_started/faq.md | 2 +- argilla/docs/getting_started/installation.md | 2 +- argilla/docs/how_to_guides/dataset.md | 4 +- argilla/docs/how_to_guides/index.md | 4 +- argilla/docs/how_to_guides/query_export.md | 4 +- argilla/docs/how_to_guides/record.md | 10 +- argilla/docs/how_to_guides/user.md | 4 +- argilla/docs/how_to_guides/workspace.md | 4 +- argilla/docs/index.md | 2 +- argilla/docs/reference/argilla/client.md | 2 +- .../argilla/datasets/dataset_records.md | 2 +- .../reference/argilla/datasets/datasets.md | 2 +- .../docs/reference/argilla/records/records.md | 2 +- .../reference/argilla/records/responses.md | 2 +- .../reference/argilla/records/suggestions.md | 2 +- .../docs/reference/argilla/records/vectors.md | 2 +- argilla/docs/reference/argilla/search.md | 4 +- .../docs/reference/argilla/settings/fields.md | 2 +- .../argilla/settings/metadata_property.md | 6 +- .../reference/argilla/settings/questions.md | 12 +- .../reference/argilla/settings/settings.md | 2 +- 
.../reference/argilla/settings/vectors.md | 2 +- argilla/docs/reference/argilla/users.md | 2 +- argilla/docs/reference/argilla/workspaces.md | 2 +- argilla/docs/scripts/gen_popular_issues.py | 3 + argilla/docs/tutorials/index.md | 20 + .../docs/tutorials/text_classification.ipynb | 529 ++++++++++++++++++ argilla/mkdocs.yml | 13 +- 31 files changed, 604 insertions(+), 48 deletions(-) create mode 100644 argilla/docs/tutorials/index.md create mode 100644 argilla/docs/tutorials/text_classification.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 10b5b71c89..05ce4a7f10 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,7 @@ repos: rev: v4.6.0 hooks: - id: check-yaml + exclude: argilla/mkdocs.yml - id: end-of-file-fixer exclude_types: [text, jupyter] - id: trailing-whitespace diff --git a/argilla/docs/community/contributor.md b/argilla/docs/community/contributor.md index 4956c2f398..5612da8662 100644 --- a/argilla/docs/community/contributor.md +++ b/argilla/docs/community/contributor.md @@ -1,5 +1,5 @@ --- -description: the Argilla Python SDK is the reference Argilla Python server SDK. +description: This is a step-by-step guide to help you contribute to the Argilla project. We are excited to have you on board! 🚀 hide: - footer --- diff --git a/argilla/docs/community/index.md b/argilla/docs/community/index.md index d6c5ba11b5..77d14b235e 100644 --- a/argilla/docs/community/index.md +++ b/argilla/docs/community/index.md @@ -1,5 +1,5 @@ --- -description: the Argilla Python SDK is the reference Argilla Python server. +description: These are the tools and resources to be up-to-date with the Argilla development and contribute to the project. 
hide: - toc - footer diff --git a/argilla/docs/getting_started/faq.md b/argilla/docs/getting_started/faq.md index 410b90ff86..5c0722b497 100644 --- a/argilla/docs/getting_started/faq.md +++ b/argilla/docs/getting_started/faq.md @@ -1,5 +1,5 @@ --- -description: the Argilla Python SDK is the reference Argilla Python server. +description: These are the Frequently Asked Questions regarding Argilla. hide: toc --- diff --git a/argilla/docs/getting_started/installation.md b/argilla/docs/getting_started/installation.md index a8589a892e..8873a1b910 100644 --- a/argilla/docs/getting_started/installation.md +++ b/argilla/docs/getting_started/installation.md @@ -1,5 +1,5 @@ --- -description: Installation of the Argilla Python SDK. +description: Installation of the Argilla SDK. --- # Installation diff --git a/argilla/docs/how_to_guides/dataset.md b/argilla/docs/how_to_guides/dataset.md index 5a755ffab1..1d9d853951 100644 --- a/argilla/docs/how_to_guides/dataset.md +++ b/argilla/docs/how_to_guides/dataset.md @@ -25,7 +25,7 @@ A **dataset** is a collection of records that you can configure for labelers to client=client ) ``` - > Check the [Dataset - Python Reference](../../reference/argilla/datasets/datasets.md) to see the attributes, arguments, and methods of the `Dataset` class in detail. + > Check the [Dataset - Python Reference](../reference/argilla/datasets/datasets.md) to see the attributes, arguments, and methods of the `Dataset` class in detail. === "`rg.Settings`" @@ -45,7 +45,7 @@ A **dataset** is a collection of records that you can configure for labelers to ) ``` - > Check the [Settings - Python Reference](../../reference/argilla/settings/settings.md) to see the attributes, arguments, and methods of the `Settings` class in detail. + > Check the [Settings - Python Reference](../reference/argilla/settings/settings.md) to see the attributes, arguments, and methods of the `Settings` class in detail. 
## Create a dataset diff --git a/argilla/docs/how_to_guides/index.md b/argilla/docs/how_to_guides/index.md index ff089fd943..30ddcd262b 100644 --- a/argilla/docs/how_to_guides/index.md +++ b/argilla/docs/how_to_guides/index.md @@ -1,11 +1,11 @@ --- -description: These are the how-to guides for the Argilla Python SDK. They provide step-by-step instructions for common scenarios, including detailed explanations and code samples. +description: These are the how-to guides for the Argilla SDK. They provide step-by-step instructions for common scenarios, including detailed explanations and code samples. hide: toc --- # How-to guides -These are the how-to guides for *the Argilla Python SDK*. They provide step-by-step instructions for common scenarios, including detailed explanations and code samples. +These are the how-to guides for *the Argilla SDK*. They provide step-by-step instructions for common scenarios, including detailed explanations and code samples.
diff --git a/argilla/docs/how_to_guides/query_export.md b/argilla/docs/how_to_guides/query_export.md index a9a2f89d09..9be482c797 100644 --- a/argilla/docs/how_to_guides/query_export.md +++ b/argilla/docs/how_to_guides/query_export.md @@ -18,7 +18,7 @@ You can search for records in your dataset by **querying** or **filtering**. The filter=filter ) ``` - > Check the [Query - Python Reference](../../reference/argilla/search.md) to see the attributes, arguments, and methods of the `Query` class in detail. + > Check the [Query - Python Reference](../reference/argilla/search.md) to see the attributes, arguments, and methods of the `Query` class in detail. === "`rg.Filter`" @@ -29,7 +29,7 @@ You can search for records in your dataset by **querying** or **filtering**. The ] ) ``` - > Check the [Filter - Python Reference](../../reference/argilla/search.md) to see the attributes, arguments, and methods of the `Filter` class in detail. + > Check the [Filter - Python Reference](../reference/argilla/search.md) to see the attributes, arguments, and methods of the `Filter` class in detail. ## Query with search terms diff --git a/argilla/docs/how_to_guides/record.md b/argilla/docs/how_to_guides/record.md index 2819199368..32841c0526 100644 --- a/argilla/docs/how_to_guides/record.md +++ b/argilla/docs/how_to_guides/record.md @@ -33,7 +33,7 @@ A **record** in Argilla is a data item that requires annotation, consisting of o ], ) ``` - > Check the [Record - Python Reference](../../reference/argilla/records/records.md) to see the attributes, arguments, and methods of the `Record` class in detail. + > Check the [Record - Python Reference](../reference/argilla/records/records.md) to see the attributes, arguments, and methods of the `Record` class in detail. ## Add records @@ -222,7 +222,7 @@ You can associate vectors, like text embeddings, to your records. They can be us You can also add vectors to a record in an initialized `Record` object. 
- > Check the [Vector - Python Reference](../../reference/argilla/records/vectors.md) to see the attributes, arguments, and methods of the `Vector` class in detail. + > Check the [Vector - Python Reference](../reference/argilla/records/vectors.md) to see the attributes, arguments, and methods of the `Vector` class in detail. ```python # Add records to the dataset with the vector 'my_vector' and dimension=3 @@ -274,10 +274,10 @@ You can associate vectors, like text embeddings, to your records. They can be us Suggestions refer to suggested responses (e.g. model predictions) that you can add to your records to make the annotation process faster. These can be added during the creation of the record or at a later stage. Only one suggestion can be provided for each question, and suggestion values must be compliant with the pre-defined questions e.g. if we have a `RatingQuestion` between 1 and 5, the suggestion should have a valid value within that range. -=== "As `Record objects" +=== "As `Record` objects" You can also add suggestions to a record in an initialized `Record` object. - > Check the [Suggestions - Python Reference](../../reference/argilla/records/suggestions.md) to see the attributes, arguments, and methods of the `Suggestion` class in detail. + > Check the [Suggestions - Python Reference](../reference/argilla/records/suggestions.md) to see the attributes, arguments, and methods of the `Suggestion` class in detail. ```python # Add records to the dataset with the label 'my_label' @@ -348,7 +348,7 @@ If your dataset includes some annotations, you can add those to the records as y === "As `Record` objects" You can also add suggestions to a record in an initialized `Record` object. - > Check the [Responses - Python Reference](../../reference/argilla/records/responses.md) to see the attributes, arguments, and methods of the `Suggestion` class in detail. 
+ > Check the [Responses - Python Reference](../reference/argilla/records/responses.md) to see the attributes, arguments, and methods of the `Suggestion` class in detail. ```python # Add records to the dataset with the label 'my_label' diff --git a/argilla/docs/how_to_guides/user.md b/argilla/docs/how_to_guides/user.md index 72d6f66b0c..430be96e67 100644 --- a/argilla/docs/how_to_guides/user.md +++ b/argilla/docs/how_to_guides/user.md @@ -2,7 +2,7 @@ description: In this section, we will provide a step-by-step guide to show how to manage users and their credentials. --- -# User Management +# User management This guide provides an overview of user roles and credentials, explaining how to set up and manage users in Argilla. @@ -69,7 +69,7 @@ Argilla provides a default user with the `owner` role to help you get started in client=client ) ``` - > Check the [User - Python Reference](../../reference/argilla/users.md) to see the attributes, arguments, and methods of the `User` class in detail. + > Check the [User - Python Reference](../reference/argilla/users.md) to see the attributes, arguments, and methods of the `User` class in detail. ## Get current user diff --git a/argilla/docs/how_to_guides/workspace.md b/argilla/docs/how_to_guides/workspace.md index 912f17edc8..71bbb2f9c9 100644 --- a/argilla/docs/how_to_guides/workspace.md +++ b/argilla/docs/how_to_guides/workspace.md @@ -2,7 +2,7 @@ description: In this section, we will provide a step-by-step guide to show how to manage workspaces. --- -# Workspace Management +# Workspace management This guide provides an overview of workspaces, explaining how to set up and manage workspaces in Argilla. @@ -31,7 +31,7 @@ Argilla provides a default workspace to help you get started in Python and the U client=client ) ``` - > Check the [Workspace - Python Reference](../../reference/argilla/workspaces.md) to see the attributes, arguments, and methods of the `Workspace` class in detail. 
+ > Check the [Workspace - Python Reference](../reference/argilla/workspaces.md) to see the attributes, arguments, and methods of the `Workspace` class in detail. ## Create a new workspace diff --git a/argilla/docs/index.md b/argilla/docs/index.md index ee5521d746..f9187704e4 100644 --- a/argilla/docs/index.md +++ b/argilla/docs/index.md @@ -8,7 +8,7 @@ hide: navigation Argilla is a **collaboration platform for AI engineers and domain experts** that require **high-quality outputs, full data ownership, and overall efficiency**. !!! SUCCESS "Welcome to Argilla 2.x!" - To skip the introductions and go directly to installing and creating your first dataset, see [Quickstart](getting_started/quickstart/). + To skip the introductions and go directly to installing and creating your first dataset, see [Quickstart](getting_started/quickstart.md). !!! DANGER "Looking for Argilla 1.x?" Looking for documentation for Argilla 1.x? Visit the latest release [here](https://docs.argilla.io/en/latest/). diff --git a/argilla/docs/reference/argilla/client.md b/argilla/docs/reference/argilla/client.md index 194bf61f5f..e4be355557 100644 --- a/argilla/docs/reference/argilla/client.md +++ b/argilla/docs/reference/argilla/client.md @@ -48,6 +48,6 @@ for dataset in my_workspace.datasets: ### `rg.Argilla` -::: argilla.client.Argilla +::: src.argilla.client.Argilla options: heading_level: 3 diff --git a/argilla/docs/reference/argilla/datasets/dataset_records.md b/argilla/docs/reference/argilla/datasets/dataset_records.md index 9ade4d146a..5ae78d5d4d 100644 --- a/argilla/docs/reference/argilla/datasets/dataset_records.md +++ b/argilla/docs/reference/argilla/datasets/dataset_records.md @@ -219,6 +219,6 @@ Check out the [`rg.Record`](../records/records.md) class reference for more info ### `rg.Dataset.records` -::: argilla.records.DatasetRecords +::: src.argilla.records._dataset_records.DatasetRecords options: heading_level: 3 \ No newline at end of file diff --git 
a/argilla/docs/reference/argilla/datasets/datasets.md b/argilla/docs/reference/argilla/datasets/datasets.md index d9cf78a024..013e774b44 100644 --- a/argilla/docs/reference/argilla/datasets/datasets.md +++ b/argilla/docs/reference/argilla/datasets/datasets.md @@ -43,6 +43,6 @@ dataset = client.datasets("my_dataset") ### `rg.Dataset` -::: argilla.datasets.Dataset +::: src.argilla.datasets._resource.Dataset options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/records/records.md b/argilla/docs/reference/argilla/records/records.md index 24bd474083..81ebc65a4e 100644 --- a/argilla/docs/reference/argilla/records/records.md +++ b/argilla/docs/reference/argilla/records/records.md @@ -55,6 +55,6 @@ For changes to take effect, the user must call the `update` method on the `Datas ### `rg.Record` -::: argilla.records.Record +::: src.argilla.records._resource.Record options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/records/responses.md b/argilla/docs/reference/argilla/records/responses.md index 028fcd8db8..f6b79a7b0b 100644 --- a/argilla/docs/reference/argilla/records/responses.md +++ b/argilla/docs/reference/argilla/records/responses.md @@ -64,6 +64,6 @@ for record in dataset.records: ### `rg.Response` -::: argilla.responses.Response +::: src.argilla.responses.Response options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/records/suggestions.md b/argilla/docs/reference/argilla/records/suggestions.md index 3a8508721e..38d3c6f6a5 100644 --- a/argilla/docs/reference/argilla/records/suggestions.md +++ b/argilla/docs/reference/argilla/records/suggestions.md @@ -73,6 +73,6 @@ for record in dataset.records(with_suggestions=True): ### `rg.Suggestion` -::: argilla.suggestions.Suggestion +::: src.argilla.suggestions.Suggestion options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/records/vectors.md 
b/argilla/docs/reference/argilla/records/vectors.md index 6a78908f13..5d8e8384f5 100644 --- a/argilla/docs/reference/argilla/records/vectors.md +++ b/argilla/docs/reference/argilla/records/vectors.md @@ -71,6 +71,6 @@ dataset.records.log( ### `rg.Vector` -::: argilla.vectors.Vector +::: src.argilla.vectors.Vector options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/search.md b/argilla/docs/reference/argilla/search.md index 305953d5a2..7ed9df23a0 100644 --- a/argilla/docs/reference/argilla/search.md +++ b/argilla/docs/reference/argilla/search.md @@ -46,12 +46,12 @@ for record in dataset.records(query=query): ### `rg.Query` -::: argilla.records._search.Query +::: src.argilla.records._search.Query options: heading_level: 3 ### `rg.Filter` -::: argilla.records._search.Filter +::: src.argilla.records._search.Filter options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/settings/fields.md b/argilla/docs/reference/argilla/settings/fields.md index e482ce3836..5c30e222a5 100644 --- a/argilla/docs/reference/argilla/settings/fields.md +++ b/argilla/docs/reference/argilla/settings/fields.md @@ -39,6 +39,6 @@ data = rg.Dataset( ### `rg.TextField` -::: argilla.settings.TextField +::: src.argilla.settings._field.TextField options: heading_level: 3 diff --git a/argilla/docs/reference/argilla/settings/metadata_property.md b/argilla/docs/reference/argilla/settings/metadata_property.md index a33f0075cf..6f1cd4a7c0 100644 --- a/argilla/docs/reference/argilla/settings/metadata_property.md +++ b/argilla/docs/reference/argilla/settings/metadata_property.md @@ -69,18 +69,18 @@ dataset = rg.Dataset( ### `rg.FloatMetadataProperty` -::: argilla.settings.FloatMetadataProperty +::: src.argilla.settings._metadata.FloatMetadataProperty options: heading_level: 3 ### `rg.IntegerMetadataProperty` -::: argilla.settings.IntegerMetadataProperty +::: src.argilla.settings._metadata.IntegerMetadataProperty options: 
heading_level: 3 ### `rg.TermsMetadataProperty` -::: argilla.settings.TermsMetadataProperty +::: src.argilla.settings._metadata.TermsMetadataProperty options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/settings/questions.md b/argilla/docs/reference/argilla/settings/questions.md index 08b9b33a5c..a4f45b5b91 100644 --- a/argilla/docs/reference/argilla/settings/questions.md +++ b/argilla/docs/reference/argilla/settings/questions.md @@ -57,36 +57,36 @@ dataset = rg.Dataset( ### `rg.LabelQuestion` -::: argilla.settings.LabelQuestion +::: src.argilla.settings._question.LabelQuestion options: heading_level: 3 ### `rg.MultiLabelQuestion` -::: argilla.settings.MultiLabelQuestion +::: src.argilla.settings._question.MultiLabelQuestion options: heading_level: 3 ### `rg.RankingQuestion` -::: argilla.settings.RankingQuestion +::: src.argilla.settings._question.RankingQuestion options: heading_level: 3 ### `rg.TextQuestion` -::: argilla.settings.TextQuestion +::: src.argilla.settings._question.TextQuestion options: heading_level: 3 ### `rg.RatingQuestion` -::: argilla.settings.RatingQuestion +::: src.argilla.settings._question.RatingQuestion options: heading_level: 3 ### `rg.SpanQuestion` -::: argilla.settings.SpanQuestion +::: src.argilla.settings._question.SpanQuestion options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/settings/settings.md b/argilla/docs/reference/argilla/settings/settings.md index 677bc16b59..9510a6e18f 100644 --- a/argilla/docs/reference/argilla/settings/settings.md +++ b/argilla/docs/reference/argilla/settings/settings.md @@ -38,6 +38,6 @@ dataset.create() ### `rg.Settings` -::: argilla.settings.Settings +::: src.argilla.settings._resource.Settings options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/settings/vectors.md b/argilla/docs/reference/argilla/settings/vectors.md index 7d00f802b6..91e0be3f05 100644 --- 
a/argilla/docs/reference/argilla/settings/vectors.md +++ b/argilla/docs/reference/argilla/settings/vectors.md @@ -32,6 +32,6 @@ settings = rg.Settings( ### `rg.VectorField` -::: argilla.settings.VectorField +::: src.argilla.settings._vector.VectorField options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/users.md b/argilla/docs/reference/argilla/users.md index 0c79dd8644..e91c58a10d 100644 --- a/argilla/docs/reference/argilla/users.md +++ b/argilla/docs/reference/argilla/users.md @@ -32,6 +32,6 @@ client.me ### `rg.User` -::: argilla.users.User +::: src.argilla.users._resource.User options: heading_level: 3 \ No newline at end of file diff --git a/argilla/docs/reference/argilla/workspaces.md b/argilla/docs/reference/argilla/workspaces.md index f56ee1596f..dcd1b7a425 100644 --- a/argilla/docs/reference/argilla/workspaces.md +++ b/argilla/docs/reference/argilla/workspaces.md @@ -27,6 +27,6 @@ workspace = client.workspaces("my_workspace") ### `rg.Workspace` -::: argilla.workspaces.Workspace +::: src.argilla.workspaces._resource.Workspace options: heading_level: 4 \ No newline at end of file diff --git a/argilla/docs/scripts/gen_popular_issues.py b/argilla/docs/scripts/gen_popular_issues.py index 46e6bfdf2c..45767751b0 100644 --- a/argilla/docs/scripts/gen_popular_issues.py +++ b/argilla/docs/scripts/gen_popular_issues.py @@ -86,6 +86,9 @@ def get_org_members(auth_token): members_url = "https://api.github.com/orgs/argilla-io/members" + if auth_token is None: + return [] + while members_url: response = requests.get(members_url, headers=headers) members = response.json() diff --git a/argilla/docs/tutorials/index.md b/argilla/docs/tutorials/index.md new file mode 100644 index 0000000000..0b3e569aa6 --- /dev/null +++ b/argilla/docs/tutorials/index.md @@ -0,0 +1,20 @@ +--- +description: These are the tutorials for the Argilla SDK. They provide step-by-step instructions for common tasks. 
+hide: toc +--- + + +# Tutorials + +These are the tutorials for *the Argilla SDK*. They provide step-by-step instructions for common tasks. + +
+ +- __Text classification task__ + + --- + + Learn about a standard workflow to improve data quality for a text classification task. + [:octicons-arrow-right-24: Tutorial](text_classification.ipynb) + +
\ No newline at end of file diff --git a/argilla/docs/tutorials/text_classification.ipynb b/argilla/docs/tutorials/text_classification.ipynb new file mode 100644 index 0000000000..7f9d606c89 --- /dev/null +++ b/argilla/docs/tutorials/text_classification.ipynb @@ -0,0 +1,529 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text classification task" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will show a standard workflow for a text classification task, in this case, using SetFit and Argilla.\n", + "\n", + "We will follow these steps:\n", + "\n", + "* Configure the Argilla dataset\n", + "* Add initial model suggestions\n", + "* Evaluate with Argilla\n", + "* Train your model\n", + "* Update the suggestions with the new model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting started" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the Argilla server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you have already deployed Argilla Server, you can skip this step. Otherwise, you can quickly deploy it in two different ways:\n", + "\n", + "* Remotely using a [HF Space](https://huggingface.co/new-space?template=argilla/argilla-template-space). ⚠️ If persistent storage is not enabled, you will lose your data when the server is stopped.\n", + "* Locally using Docker: `docker run -d --name quickstart -p 6900:6900 argilla/argilla-quickstart:latest`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up the environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To complete this tutorial, you need to install the Argilla SDK and a few third-party libraries via `pip`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install argilla setfit==1.0.3 transformers==4.40.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's make the required imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "import argilla as rg\n", + "\n", + "from datasets import load_dataset, Dataset\n", + "from setfit import SetFitModel, Trainer, get_templated_dataset, sample_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You also need to connect to the Argilla server using the `api_url` and `api_key`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace api_url with your url if using Docker\n", + "# Replace api_key if you configured a custom API key\n", + "# Uncomment the last line and set your HF_TOKEN if your space is private\n", + "client = rg.Argilla(\n", + " api_url=\"https://[your-owner-name]-[your_space_name].hf.space\",\n", + " api_key=\"owner.apikey\"\n", + " # extra_headers={\"Authorization\": f\"Bearer {HF_TOKEN}\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure and create the Argilla dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we will need to configure the dataset. In the settings, we can specify the guidelines, fields, and questions. If needed, you can also add metadata and vectors. However, for our use case, we just need a text field and a label question.\n", + "\n", + "!!! note\n", + " Check this [how-to guide](../how_to_guides/dataset.md) to know more about configuring and creating a dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "labels = [\"positive\", \"negative\"]\n", + "\n", + "settings = rg.Settings(\n", + " guidelines=\"Classify the reviews as positive or negative.\",\n", + " fields=[\n", + " rg.TextField(\n", + " name=\"review\",\n", + " title=\"Text from the review\",\n", + " use_markdown=False,\n", + " ),\n", + " ],\n", + " questions=[\n", + " rg.LabelQuestion(\n", + " name=\"sentiment_label\",\n", + " title=\"In which category does this article fit?\",\n", + " labels=labels,\n", + " )\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create the dataset with the name and the defined settings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = rg.Dataset(\n", + " name=\"text_classification_dataset1\",\n", + " settings=settings,\n", + ")\n", + "dataset.create()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add records" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even if we have created the dataset, it still lacks the information to be annotated (you can check it in the UI). We will use the `imdb` dataset from the [Hugging Face Hub](https://huggingface.co/datasets/stanfordnlp/imdb). Specifically, we will use 100 samples from the `train` split." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "hf_dataset = load_dataset(\"imdb\", split=\"train[:100]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will easily add them to the dataset using `log` and the mapping, where we indicate that the column `text` is the data that should be added to the field `review`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset.records.log(records=hf_dataset, mapping={\"text\": \"review\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add initial model suggestions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to add suggestions to the dataset. In our case, we will generate them using a zero-shot SetFit model. However, you can use a framework or technique of your choice." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will start by defining an example training set with the required labels: `positive` and `negative`. Using `get_templated_dataset` will create sentences from the default template: \"This sentence is {label}.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "zero_ds = get_templated_dataset(\n", + " candidate_labels=labels,\n", + " sample_size=8,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we will prepare a function to train the SetFit model.\n", + "\n", + "!!! note\n", + " For further customization, you can check the [SetFit documentation](https://huggingface.co/docs/setfit/reference/main)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def train_model(model_name, dataset):\n", + " \n", + " model = SetFitModel.from_pretrained(model_name)\n", + "\n", + " trainer = Trainer(\n", + " model=model,\n", + " train_dataset=dataset,\n", + " )\n", + "\n", + " trainer.train()\n", + " \n", + " return model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's train the model. We will use `TaylorAI/bge-micro-v2`, available in the [Hugging Face Hub](https://huggingface.co/TaylorAI/bge-micro-v2)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = train_model(model_name=\"TaylorAI/bge-micro-v2\", dataset=zero_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can save it locally or push it to the Hub. And then, load it from there." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Save and load locally\n", + "# model.save_pretrained(\"text_classification_model\")\n", + "# model = SetFitModel.from_pretrained(\"text_classification_model\")\n", + "\n", + "# Push and load in HF\n", + "# model.push_to_hub(\"[username]/text_classification_model\")\n", + "# model = SetFitModel.from_pretrained(\"[username]/text_classification_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's time to make the predictions! We will set a function that uses the `predict` method to get the suggested label. The model will infer the label based on the text." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "def predict(model, input, labels):\n", + " \n", + " model.labels = labels\n", + " \n", + " prediction = model.predict([input])\n", + " \n", + " return prediction[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To update the records, we will need to retrieve them from the server and update them with the new suggestions. The `id` will always need to be provided as it is the records' identifier to update a record and avoid creating a new one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = dataset.records.to_list(flatten=True)\n", + "updated_data = [\n", + " {\n", + " \"sentiment_label\": predict(model, sample[\"review\"], labels),\n", + " \"id\": sample[\"id\"],\n", + " }\n", + " for sample in data\n", + "]\n", + "dataset.records.log(records=updated_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voilà! We have added the suggestions to the dataset, and they will appear in the UI marked with a ✨. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate with Argilla" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can start the annotation process. Just open the dataset in the Argilla UI and start annotating the records. If the suggestions are correct, you can just click on `Submit`. Otherwise, you can select the correct label." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "!!! note\n", + " Check this [how-to guide](../how_to_guides/annotate.md) to know more about annotating in the UI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train your model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the annotation, we will have a robust dataset to train the main model. In our case, we will fine-tune using SetFit. However, you can select the one that best fits your requirements. So, let's start by retrieving the annotated records.\n", + "\n", + "!!! note\n", + " Check this [how-to guide](../how_to_guides/query_export.md) to know more about filtering and querying in Argilla." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = client.datasets(\"text_classification_dataset\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "status_filter = rg.Query(filter = rg.Filter((\"status\", \"==\", \"submitted\")))\n", + "submitted = list(dataset.records(status_filter))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we have a single response per record, we can retrieve the selected label straightforwardly and create the training set with 8 samples per label. We selected 8 samples per label to have a balanced dataset for few-shot learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_records = [{\n", + " \"text\" : r.fields[\"review\"],\n", + " \"label\" : r.responses.sentiment_label[0].value,\n", + " } for r in submitted\n", + "]\n", + "train_dataset = Dataset.from_list(train_records)\n", + "train_dataset = sample_dataset(train_dataset, label_column=\"label\", num_samples=8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can train the model using our previous function, but this time with a high-quality human-annotated training set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = train_model(model_name=\"TaylorAI/bge-micro-v2\", dataset=train_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the training data is of better quality, we can expect a better model. So, we can update the remaining non-annotated records with the new model's suggestions."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = dataset.records.to_list(flatten=True)\n", + "updated_data = [\n", + " {\n", + " \"sentiment_label\": predict(model, sample[\"review\"], labels),\n", + " \"id\": sample[\"id\"],\n", + " }\n", + " for sample in data\n", + "]\n", + "dataset.records.log(records=updated_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we presented an end-to-end example of a text classification task. This serves as a base workflow that can be performed iteratively and seamlessly integrated into your own workflow to ensure high-quality curation of your data and improved results.\n", + "\n", + "We started by configuring the dataset, adding records, and training a zero-shot SetFit model, as an example, to add suggestions. After the annotation process, we trained a new model with the annotated data and updated the remaining records with the new suggestions."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/argilla/mkdocs.yml b/argilla/mkdocs.yml index 3a891141e9..e9a9e8bc83 100644 --- a/argilla/mkdocs.yml +++ b/argilla/mkdocs.yml @@ -6,9 +6,9 @@ site_description: The Argilla python server SDK copyright: Copyright © 2017 - 2024 Argilla # Repository -repo_name: argilla-io/argilla-python -repo_url: https://github.com/argilla-io/argilla-python -edit_uri: edit/main/docs/ +repo_name: argilla-io/argilla +repo_url: https://github.com/argilla-io/argilla +edit_uri: edit/main/argilla/docs/ extra: version: @@ -91,8 +91,8 @@ markdown_extensions: - footnotes - tables - pymdownx.emoji: - emoji_index: "!!python/name:material.extensions.emoji.twemoji" - emoji_generator: "!!python/name:material.extensions.emoji.to_svg" + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg # activating permalink: true makes the anchor link works in the notebooks - toc: permalink: true @@ -140,6 +140,9 @@ nav: - Add, update, and delete records: how_to_guides/record.md - Query, filter, and export records: how_to_guides/query_export.md - Migrate your legacy datasets to Argilla V2: how_to_guides/migrate_from_legacy_datasets.md + - Tutorials: + - tutorials/index.md + - Text classification task: tutorials/text_classification.ipynb - API Reference: - Python SDK: reference/argilla/ - Community: