From d026061c9519d43300cf5133dca988486d4ae2e5 Mon Sep 17 00:00:00 2001
From: FelixFehse <felix.fehse@ext.aleph-alpha.com>
Date: Fri, 14 Jun 2024 13:35:55 +0200
Subject: [PATCH] refactor: added custom multilabel classify example data

---
 .../data/classify_examples_multilabel.json    | 157 ++++++++++++++++++
 src/documentation/evaluation.ipynb            |  35 ++--
 2 files changed, 179 insertions(+), 13 deletions(-)
 create mode 100644 src/documentation/data/classify_examples_multilabel.json

diff --git a/src/documentation/data/classify_examples_multilabel.json b/src/documentation/data/classify_examples_multilabel.json
new file mode 100644
index 000000000..e81920fe3
--- /dev/null
+++ b/src/documentation/data/classify_examples_multilabel.json
@@ -0,0 +1,157 @@
+[
+  {
+    "label": [
+      "Finance",
+      "Sales"
+    ],
+    "message": "I just traveled to Paris for a conference, where can I get the train ride refunded?"
+  },
+  {
+    "label": [
+      "Sales",
+      "Communications",
+      "Customer"
+    ],
+    "message": "Hello, we would like to get in contact with your sales team, because we are interested in your solution."
+  },
+  {
+    "label": [
+      "Communications",
+      "Research"
+    ],
+    "message": "We are working on a documentary on AI and would like to film a piece about you. Would you be interested?"
+  },
+  {
+    "label": [
+      "Research"
+    ],
+    "message": "I am working with Stanford and was hoping to win you over for a research collaboration."
+  },
+  {
+    "label": [
+      "IT Support"
+    ],
+    "message": "My laptop is broken"
+  },
+  {
+    "label": [
+      "Communications"
+    ],
+    "message": "Can you send your models via email?"
+  },
+  {
+    "label": [
+      "Research"
+    ],
+    "message": "We should do a research collaboration."
+  },
+  {
+    "label": [
+      "Research",
+      "Sales"
+    ],
+    "message": "My company has been working on time series and signal processing for a long time. It would make sense to define a joint go to market and research strategy."
+  },
+  {
+    "label": [
+      "Human Resources"
+    ],
+    "message": "Full stack developer in your area available now."
+  },
+  {
+    "label": [
+      "Product",
+      "IT Support"
+    ],
+    "message": "Hi,\n\nI recently bought your offering. I am having trouble running your docker container in my environment. It fails to start. Can you help?"
+  },
+  {
+    "label": [
+      "Product"
+    ],
+    "message": "Hello,\n\nI am getting strange errors from your API. It is saying the queue is full, but I am only sending one task at a time. Why is this happening?"
+  },
+  {
+    "label": [
+      "Product",
+      "Sales",
+      "Customer"
+    ],
+    "message": "Can you show me a demo of different use cases your offering can solve?"
+  },
+  {
+    "label": [
+      "Human Resources",
+      "Communications"
+    ],
+    "message": "Hey, I did not get a t-shirt in the onboarding. Could I still get one?"
+  },
+  {
+    "label": [
+      "Customer"
+    ],
+    "message": "Hi, can you name me a couple of timeslots for a first call? Would be really interested in learning more about the product?"
+  },
+  {
+    "label": [
+      "Product"
+    ],
+    "message": "Hi Jan, is your product ISO 37301 compliant?"
+  },
+  {
+    "label": [
+      "IT Support"
+    ],
+    "message": "I can\u2019t login to Mattermost or Sharepoint, how can I gain access?"
+  },
+  {
+    "label": [
+      "Finance",
+      "Human Resources"
+    ],
+    "message": "I did not get paid last month, when do I get paid? What is going on?"
+  },
+  {
+    "label": [
+      "Security"
+    ],
+    "message": "Hi, I want to get a new badge, the photo of me looks ugly and I just got new glasses so it does not look like me. "
+  },
+  {
+    "label": [
+      "Marketing"
+    ],
+    "message": "I have a question concerning your marketing strategy, would you have time to hop on a call?"
+  },
+  {
+    "label": [
+      "CEO Office"
+    ],
+    "message": "Dear Jonas Andrulis,\n\nWe have met each other at the event in N\u00fcrnberg, can we meet for a follow up in your Office in Heidelberg?"
+  },
+  {
+    "label": [
+      "Security",
+      "IT Support"
+    ],
+    "message": "Your hTTPs Certificate is not valid on your www.aleph-alpha.de"
+  },
+  {
+    "label": [
+      "Human Resources"
+    ],
+    "message": "I want to take a week off immediately"
+  },
+  {
+    "label": [
+      "Human Resources"
+    ],
+    "message": "I want to take a sabbatical"
+  },
+  {
+    "label": [
+      "Human Resources"
+    ],
+    "message": "How can I work more, I want to work weekends, can I get paid overtime?"
+  }
+]
diff --git a/src/documentation/evaluation.ipynb b/src/documentation/evaluation.ipynb
index 1baf85991..94a434c90 100644
--- a/src/documentation/evaluation.ipynb
+++ b/src/documentation/evaluation.ipynb
@@ -6,11 +6,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import json\n",
     "from collections import defaultdict\n",
     "from collections.abc import Mapping, Sequence\n",
+    "from pathlib import Path\n",
     "from typing import Any\n",
     "\n",
-    "from datasets import load_dataset\n",
     "from dotenv import load_dotenv\n",
     "\n",
     "from intelligence_layer.connectors import LimitedConcurrencyClient\n",
@@ -152,9 +153,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset = load_dataset(\"cardiffnlp/tweet_topic_multi\", split=\"validation_random\")\n",
-    "all_data = list(dataset)\n",
-    "data = all_data[:25]  # this has 573 datapoints, let's take a look at 25 for now"
+    "with Path(\"data/classify_examples_multilabel.json\").open(encoding=\"utf-8\") as json_data:\n",
+    "    data = json.load(json_data)"
    ]
   },
   {
@@ -192,7 +192,7 @@
     "\n",
     "We want the `input` in each `Example` to mimic the input of an actual task. Therefore, we have to always include the text (chunk) and all possible labels.\n",
     "The `expected_output` shall correspond to anything we wish to compare our generated output to.\n",
-    "In this case, that means the correct class(es), i.e., the label name(s)."
+    "In this case, that means the correct class(es), i.e., the label(s)."
    ]
   },
   {
@@ -201,12 +201,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_labels = list(set(label_name for item in data for label_name in item[\"label_name\"]))\n",
+    "all_labels = sorted({label for item in data for label in item[\"label\"]})\n",
     "dataset = dataset_repository.create_dataset(\n",
     "    examples=[\n",
     "        Example(\n",
-    "            input=ClassifyInput(chunk=TextChunk(item[\"text\"]), labels=all_labels),\n",
-    "            expected_output=item[\"label_name\"][0],\n",
+    "            input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n",
+    "            expected_output=item[\"label\"][0],\n",
     "        )\n",
     "        for item in data\n",
     "    ],\n",
@@ -214,6 +214,15 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_labels"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -314,9 +323,9 @@
     "dataset = dataset_repository.create_dataset(\n",
     "    examples=[\n",
     "        Example(\n",
-    "            input=ClassifyInput(chunk=TextChunk(item[\"text\"]), labels=all_labels),\n",
+    "            input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n",
     "            expected_output=item[\n",
-    "                \"label_name\"\n",
+    "                \"label\"\n",
     "            ],  # <- difference here, we take all labels instead of a single one\n",
     "        )\n",
     "        for item in data\n",
@@ -341,10 +350,10 @@
     "def build_labels_and_examples(hf_data: Any) -> Mapping[str, Sequence[str]]:\n",
     "    examples = defaultdict(list)\n",
     "    for item in hf_data:\n",
-    "        labels = item[\"label_name\"]\n",
+    "        labels = item[\"label\"]\n",
     "        for label in labels:\n",
     "            if len(examples[label]) < 20:\n",
-    "                examples[label].append(item[\"text\"])\n",
+    "                examples[label].append(item[\"message\"])\n",
     "    return examples\n",
     "\n",
     "\n",
@@ -353,7 +362,7 @@
     "    client=client,\n",
     "    labels_with_examples=[\n",
     "        LabelWithExamples(name=name, examples=examples)\n",
-    "        for name, examples in build_labels_and_examples(all_data[25:]).items()\n",
+    "        for name, examples in build_labels_and_examples(data).items()\n",
     "    ],\n",
     ")"
    ]