# Migration of a legacy (v1) text classification dataset to the new Argilla SDK.
# Cells are separated by `# %%` markers and are meant to run top to bottom.

# %% Install the dependencies
# Notebook magic, kept as a comment so this listing stays valid Python; run it as
# `%pip install datasets .` in the cell. `%pip` (instead of `!pip`) installs into
# the environment of the running kernel.
# %pip install datasets .

# %% Imports (all in one place so a fresh kernel fails fast on a missing package)
import os

import argilla as rg
import argilla.v1 as rg_v1

# %% Connection settings for the legacy server
# Read from the environment so no credential has to be committed with the notebook.
# The defaults are the values that were previously hard-coded.
url = os.environ.get("LEGACY_ARGILLA_API_URL", "https://demo.argilla.io")
api_key = os.environ.get("LEGACY_ARGILLA_API_KEY", "")

# %%
rg_v1.init(url, api_key)

# %% Dataset to migrate
dataset_name = "news-programmatic-labeling"
workspace = "demo"

# %% [markdown]
# Read the current dataset labels

# %%
settings_v1 = rg_v1.load_dataset_settings(dataset_name, workspace)
settings_v1

# %% [markdown]
# Read the dataset records

# %%
# Only records matching the `_exists_:annotated_by` query are loaded, capped at 100.
hf_records = rg_v1.load(dataset_name, workspace, limit=100, query="_exists_:annotated_by").to_datasets()
hf_records

# %%
hf_records[0]

# %%
client = rg.Argilla()  # create the new dataset into a different argilla server instance

# %%
settings = rg.Settings(
    fields=[
        # The default name for text classification is `text`,
        # but we should provide all names included in `record.inputs`
        rg.TextField(name="text"),
    ],
    questions=[
        # The basic question for text classification is a LabelQuestion for single-label
        # or MultiLabelQuestion for multi-label classification
        rg.LabelQuestion(name="label", labels=settings_v1.label_schema),
    ],
    metadata=[
        # Here, we need to provide all relevant metadata fields.
        rg.TermsMetadataProperty(name="split"),
    ],
    vectors=[
        # The vector fields available in the dataset
        rg.VectorField(name="mini-lm-sentence-transformers", dimensions=384),
    ],
)
settings

# %%
# WARNING: a dataset with the same name that already exists on the target server
# is deleted before the new one is created.
ds = client.datasets(name=dataset_name)
if ds.exists():
    ds.delete()

dataset = rg.Dataset(name=dataset_name, settings=settings)
dataset.create()

# %%
# To assign responses to users, we need to load the existing users
users_by_name = {user.username: user for user in client.users}
current_user = client.me

# %% [markdown]
# Now, we can upload the records to the new dataset.
def map_to_record_for_single_label(
    data: dict,
    users_by_name: dict,
    current_user: rg.User,
    question_name: str = "label",
) -> rg.Record:
    """Map a legacy single-label text classification record to a new Argilla record.

    Args:
        data: The legacy record as a dictionary. The keys read are ``id``, ``inputs``
            and ``metadata`` (required) and ``vectors``, ``prediction``,
            ``prediction_agent``, ``annotation`` and ``annotation_agent`` (optional).
        users_by_name: Users of the target server, keyed by username.
        current_user: Owner of the response when ``annotation_agent`` does not match
            any key of ``users_by_name``.
        question_name: Name of the question that receives the suggestion and the
            response. It must match a question defined in the dataset settings.

    Returns:
        A record carrying at most one suggestion (from the prediction) and at most
        one response (from the annotation).
    """
    suggestions = []
    responses = []

    prediction = data.get("prediction")
    if prediction:
        # Legacy predictions are a list of candidates, not a single pair, so unpacking
        # the list directly fails. Accept the shapes that can show up here:
        # a bare (label, score) pair, a list of pairs, or a list of
        # {"label": ..., "score": ...} dicts.
        # NOTE(review): the dict form is what the datasets export is expected to
        # produce -- confirm against `hf_records[0]`.
        if isinstance(prediction[0], str):
            candidates = [(prediction[0], prediction[1])]
        else:
            candidates = [
                (item["label"], item["score"]) if isinstance(item, dict) else (item[0], item[1])
                for item in prediction
            ]
        # A single-label question takes one value: keep the best-scored candidate.
        label, score = max(candidates, key=lambda candidate: candidate[1])
        agent = data.get("prediction_agent")
        suggestions.append(rg.Suggestion(question_name=question_name, value=label, score=score, agent=agent))

    annotation = data.get("annotation")
    if annotation:
        # Attribute the response to the annotating user when that user exists on the
        # target server; otherwise fall back to the current user.
        owner = users_by_name.get(data.get("annotation_agent"), current_user)
        responses.append(rg.Response(question_name=question_name, value=annotation, user_id=owner.id))

    # `vectors` may be missing or None, and individual entries may be None; skip
    # those instead of failing on them.
    vectors = [
        rg.Vector(name=name, values=values)
        for name, values in (data.get("vectors") or {}).items()
        if values is not None
    ]

    return rg.Record(
        id=data["id"],
        # The inputs should be a dictionary with the same keys as the `fields` in the settings
        fields=data["inputs"],
        # The metadata should be a dictionary with the same keys as the `metadata` in the settings
        metadata=data["metadata"],
        # One rg.Vector per entry, named like the `vectors` in the settings
        vectors=vectors,
        suggestions=suggestions,
        responses=responses,
    )
def map_to_record_for_multi_label(
    data: dict,
    users_by_name: dict,
    current_user: rg.User,
    question_name: str = "labels",
) -> rg.Record:
    """Map a legacy multi-label text classification record to a new Argilla record.

    Args:
        data: The legacy record as a dictionary. The keys read are ``id``, ``inputs``
            and ``metadata`` (required) and ``vectors``, ``prediction``,
            ``prediction_agent``, ``annotation`` and ``annotation_agent`` (optional).
        users_by_name: Users of the target server, keyed by username.
        current_user: Owner of the response when ``annotation_agent`` does not match
            any key of ``users_by_name``.
        question_name: Name of the question that receives BOTH the suggestion and the
            response. It must match a question defined in the dataset settings.

    Returns:
        A record carrying at most one suggestion (all predicted labels with their
        scores) and at most one response (the annotated labels).
    """
    suggestions = []
    responses = []

    prediction = data.get("prediction")
    if prediction:
        # Each prediction is either a (label, score) pair or a
        # {"label": ..., "score": ...} dict; normalise before splitting into columns.
        # NOTE(review): the dict form is what the datasets export is expected to
        # produce -- confirm against `hf_records[0]`.
        pairs = [
            (item["label"], item["score"]) if isinstance(item, dict) else (item[0], item[1])
            for item in prediction
        ]
        labels = [label for label, _ in pairs]
        scores = [score for _, score in pairs]
        agent = data.get("prediction_agent")
        suggestions.append(rg.Suggestion(question_name=question_name, value=labels, score=scores, agent=agent))

    annotation = data.get("annotation")
    if annotation:
        # Attribute the response to the annotating user when that user exists on the
        # target server; otherwise fall back to the current user.
        owner = users_by_name.get(data.get("annotation_agent"), current_user)
        # The response must target the same question as the suggestion above.
        responses.append(rg.Response(question_name=question_name, value=annotation, user_id=owner.id))

    # `vectors` may be missing or None, and individual entries may be None; skip
    # those instead of failing on them.
    vectors = [
        rg.Vector(name=name, values=values)
        for name, values in (data.get("vectors") or {}).items()
        if values is not None
    ]

    return rg.Record(
        id=data["id"],
        # The inputs should be a dictionary with the same keys as the `fields` in the settings
        fields=data["inputs"],
        # The metadata should be a dictionary with the same keys as the `metadata` in the settings
        metadata=data["metadata"],
        # One rg.Vector per entry, named like the `vectors` in the settings
        vectors=vectors,
        suggestions=suggestions,
        responses=responses,
    )
Upload the records to the new dataset\n", + "dataset.records.log(records)" + ] + }, + { + "cell_type": "markdown", + "id": "21de114bdc890641", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## Notes about the migration workflow\n", + "\n", + "1. We only need a subset of functions from the v1 SDK:\n", + " - `rg.init`\n", + " - `rg.load_dataset_settings`\n", + " - `rg.load`\n", + " Even so, it would be nice to provide the whole SDK, so users can migrate their code to the v2 version iteratively.\n", + "2. Users should have a clear notion of the dataset structure (extra inputs, vectors, or metadata).\n", + "3. Created responses have a Draft status by default. There is no way to preset the status for a response, which can be blocking, even more so if bulk operations in the UI cannot be applied to the Draft queue.\n", + "4. When creating an `rg.Record`, we need to prepare all suggestions and responses before creating the record itself. It would help if we could add responses or suggestions to an existing record (`record.suggestions.add(...)`, `record.responses.add(...)`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1381e42fa4034769", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}