From d026061c9519d43300cf5133dca988486d4ae2e5 Mon Sep 17 00:00:00 2001 From: FelixFehse Date: Fri, 14 Jun 2024 13:35:55 +0200 Subject: [PATCH] refactor: added custom multilabel classify example data --- .../data/classify_examples_multilabel.json | 157 ++++++++++++++++++ src/documentation/evaluation.ipynb | 35 ++-- 2 files changed, 179 insertions(+), 13 deletions(-) create mode 100644 src/documentation/data/classify_examples_multilabel.json diff --git a/src/documentation/data/classify_examples_multilabel.json b/src/documentation/data/classify_examples_multilabel.json new file mode 100644 index 000000000..e81920fe3 --- /dev/null +++ b/src/documentation/data/classify_examples_multilabel.json @@ -0,0 +1,157 @@ +[ + { + "label": [ + "Finance", + "Sales" + ], + "message": "I just traveled to Paris for a conference, where can I get the train ride refunded?" + }, + { + "label": [ + "Sales", + "Communications", + "Customer" + ], + "message": "Hello, we would like to get in contact with your sales team, because we are interested in your solution." + }, + { + "label": [ + "Communications", + "Research" + ], + "message": "We are working on a documentation on AI and would like to film a piece about you. Would you be interested?" + }, + { + "label": [ + "Research" + ], + "message": "I am working with Stanford and was hoping to win you over for a research collaboration." + }, + { + "label": [ + "IT Support" + ], + "message": "My laptop is broken" + }, + { + "label": [ + "Communications" + ], + "message": "Can you send your models via email?" + }, + { + "label": [ + "Research" + ], + "message": "We should do a research collaboration." + }, + { + "label": [ + "Research", + "Sales" + ], + "message": "My company has been working on time series and signal processing for a long time. It would make sense to define a joint go to market and research strategy." + }, + { + "label": [ + "Human Resources" + ], + "message": "Full stack developer in your area available now." + }, + { + "label": [ + "Product", + "IT Support" + ], + "message": "Hi,\n\nI recently bought your offering. I am having trouble running your docker container in my environment. It fails to start. Can you help?" + }, + { + "label": [ + "Product" + ], + "message": "Hello,\n\nI am getting strange errors from your API. It is saying the queue is full, but I am only sending one task at a time. Why is this happening?" + }, + { + "label": [ + "Product", + "Sales", + "Customer" + ], + "message": "Can you show me a demo of different use cases your offering can solve?" + }, + { + "label": [ + "Human Resources", + "Communications" + ], + "message": "Hey, I did not get a t-shirt in the onboarding. Could I still get one?" + }, + { + "label": [ + "Customer" + ], + "message": "Hi, can you name me a couple of timeslots for a first call? Would be really interested in learning more about the product?" + }, + { + "label": [ + "Product" + ], + "message": "Hi Jan, is your product ISO 37301 compliant?" + }, + { + "label": [ + "IT Support" + ], + "message": "I can\u2019t login to Mattermost or Sharepoint, how can I gain access?" + }, + { + "label": [ + "Finance", + "Human Resources" + ], + "message": "I did not get paid last month, when do I get paid? What is going on?" + }, + { + "label": [ + "Security" + ], + "message": "Hi, I want to get a new badge, the photo of me looks ugly and I just got new glasses so it does not look like me. " + }, + { + "label": [ + "Marketing" + ], + "message": "I have a question concerning your marketing strategy, would you have time to hop on a call?" + }, + { + "label": [ + "CEO Office" + ], + "message": "Dear Jonas Andrulis,\n\nWe have met each other at the event in N\u00fcrnberg, can we meet for a follow up in your Office in Heidelberg?" + }, + { + "label": [ + "Security", + "IT Support" + ], + "message": "Your hTTPs Certificate is not valid on your www.aleph-alpha.de" + }, + { + "label": [ + "Human Resources" + ], + "message": "I want to take a week off immediately" + }, + { + "label": [ + "Human Resources" + ], + "message": "I want to take a sabbatical" + }, + { + "label": [ + "Human Resources" + ], + "message": "How can I work more, I want to work weekends, can I get paid overtime?" + } +] diff --git a/src/documentation/evaluation.ipynb b/src/documentation/evaluation.ipynb index 1baf85991..94a434c90 100644 --- a/src/documentation/evaluation.ipynb +++ b/src/documentation/evaluation.ipynb @@ -6,11 +6,12 @@ "metadata": {}, "outputs": [], "source": [ + "import json\n", "from collections import defaultdict\n", "from collections.abc import Mapping, Sequence\n", + "from pathlib import Path\n", "from typing import Any\n", "\n", - "from datasets import load_dataset\n", "from dotenv import load_dotenv\n", "\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", @@ -152,9 +153,8 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = load_dataset(\"cardiffnlp/tweet_topic_multi\", split=\"validation_random\")\n", - "all_data = list(dataset)\n", - "data = all_data[:25] # this has 573 datapoints, let's take a look at 25 for now" + "with Path(\"data/classify_examples_multilabel.json\").open() as json_data:\n", + " data = json.load(json_data)" ] }, { @@ -192,7 +192,7 @@ "\n", "We want the `input` in each `Example` to mimic the input of an actual task. Therefore, we have to always include the text (chunk) and all possible labels.\n", "The `expected_output` shall correspond to anything we wish to compare our generated output to.\n", - "In this case, that means the correct class(es), i.e., the label name(s)." + "In this case, that means the correct class(es), i.e., the label(s)." ] }, { @@ -201,12 +201,12 @@ "metadata": {}, "outputs": [], "source": [ - "all_labels = list(set(label_name for item in data for label_name in item[\"label_name\"]))\n", + "all_labels = list(set(item[\"label\"][0] for item in data))\n", "dataset = dataset_repository.create_dataset(\n", " examples=[\n", " Example(\n", - " input=ClassifyInput(chunk=TextChunk(item[\"text\"]), labels=all_labels),\n", - " expected_output=item[\"label_name\"][0],\n", + " input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n", + " expected_output=item[\"label\"][0],\n", " )\n", " for item in data\n", " ],\n", @@ -214,6 +214,15 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_labels" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -314,9 +323,9 @@ "dataset = dataset_repository.create_dataset(\n", " examples=[\n", " Example(\n", - " input=ClassifyInput(chunk=TextChunk(item[\"text\"]), labels=all_labels),\n", + " input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n", " expected_output=item[\n", - " \"label_name\"\n", + " \"label\"\n", " ], # <- difference here, we take all labels instead of a single one\n", " )\n", " for item in data\n", @@ -341,10 +350,10 @@ "def build_labels_and_examples(hf_data: Any) -> Mapping[str, Sequence[str]]:\n", " examples = defaultdict(list)\n", " for item in hf_data:\n", - " labels = item[\"label_name\"]\n", + " labels = item[\"label\"]\n", " for label in labels:\n", " if len(examples[label]) < 20:\n", - " examples[label].append(item[\"text\"])\n", + " examples[label].append(item[\"message\"])\n", " return examples\n", "\n", "\n", @@ -353,7 +362,7 @@ " client=client,\n", " labels_with_examples=[\n", " LabelWithExamples(name=name, examples=examples)\n", - " for name, examples in build_labels_and_examples(all_data[25:]).items()\n", + " for name, examples in build_labels_and_examples(data).items()\n", " ],\n", ")" ]