refactor: added custom multilabel classify example data
FelixFehse committed Jun 14, 2024
1 parent c1723eb commit d026061
Showing 2 changed files with 179 additions and 13 deletions.
157 changes: 157 additions & 0 deletions src/documentation/data/classify_examples_multilabel.json
@@ -0,0 +1,157 @@
[
{
"label": [
"Finance",
"Sales"
],
"message": "I just traveled to Paris for a conference, where can I get the train ride refunded?"
},
{
"label": [
"Sales",
"Communications",
"Customer"
],
"message": "Hello, we would like to get in contact with your sales team, because we are interested in your solution."
},
{
"label": [
"Communications",
"Research"
],
"message": "We are working on a documentation on AI and would like to film a piece about you. Would you be interested?"
},
{
"label": [
"Research"
],
"message": "I am working with Stanford and was hoping to win you over for a research collaboration."
},
{
"label": [
"IT Support"
],
"message": "My laptop is broken"
},
{
"label": [
"Communications"
],
"message": "Can you send your models via email?"
},
{
"label": [
"Research"
],
"message": "We should do a research collaboration."
},
{
"label": [
"Research",
"Sales"
],
"message": "My company has been working on time series and signal processing for a long time. It would make sense to define a joint go to market and research strategy."
},
{
"label": [
"Human Resources"
],
"message": "Full stack developer in your area available now."
},
{
"label": [
"Product",
"IT Support"
],
"message": "Hi,\n\nI recently bought your offering. I am having trouble running your docker container in my environment. It fails to start. Can you help?"
},
{
"label": [
"Product"
],
"message": "Hello,\n\nI am getting strange errors from your API. It is saying the queue is full, but I am only sending one task at a time. Why is this happening?"
},
{
"label": [
"Product",
"Sales",
"Customer"
],
"message": "Can you show me a demo of different use cases your offering can solve?"
},
{
"label": [
"Human Resources",
"Communications"
],
"message": "Hey, I did not get a t-shirt in the onboarding. Could I still get one?"
},
{
"label": [
"Customer"
],
"message": "Hi, can you name me a couple of timeslots for a first call? Would be really interested in learning more about the product?"
},
{
"label": [
"Product"
],
"message": "Hi Jan, is your product ISO 37301 compliant?"
},
{
"label": [
"IT Support"
],
"message": "I can\u2019t login to Mattermost or Sharepoint, how can I gain access?"
},
{
"label": [
"Finance",
"Human Resources"
],
"message": "I did not get paid last month, when do I get paid? What is going on?"
},
{
"label": [
"Security"
],
"message": "Hi, I want to get a new badge, the photo of me looks ugly and I just got new glasses so it does not look like me. "
},
{
"label": [
"Marketing"
],
"message": "I have a question concerning your marketing strategy, would you have time to hop on a call?"
},
{
"label": [
"CEO Office"
],
"message": "Dear Jonas Andrulis,\n\nWe have met each other at the event in N\u00fcrnberg, can we meet for a follow up in your Office in Heidelberg?"
},
{
"label": [
"Security",
"IT Support"
],
"message": "Your hTTPs Certificate is not valid on your www.aleph-alpha.de"
},
{
"label": [
"Human Resources"
],
"message": "I want to take a week off immediately"
},
{
"label": [
"Human Resources"
],
"message": "I want to take a sabbatical"
},
{
"label": [
"Human Resources"
],
"message": "How can I work more, I want to work weekends, can I get paid overtime?"
}
]
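
For reference, a minimal sketch of loading this file and tallying how often each label occurs. The relative path assumes the code runs from src/documentation, as the notebook below does; adjust it for other working directories.

import json
from collections import Counter
from pathlib import Path

# Each record holds a "message" string and a "label" list with one or more labels.
with Path("data/classify_examples_multilabel.json").open() as json_data:
    data = json.load(json_data)

label_counts = Counter(label for item in data for label in item["label"])
print(label_counts.most_common())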
35 changes: 22 additions & 13 deletions src/documentation/evaluation.ipynb
@@ -6,11 +6,12 @@
"metadata": {},
"outputs": [],
"source": [
+ "import json\n",
"from collections import defaultdict\n",
"from collections.abc import Mapping, Sequence\n",
"from pathlib import Path\n",
"from typing import Any\n",
"\n",
- "from datasets import load_dataset\n",
"from dotenv import load_dotenv\n",
"\n",
"from intelligence_layer.connectors import LimitedConcurrencyClient\n",
@@ -152,9 +153,8 @@
"metadata": {},
"outputs": [],
"source": [
- "dataset = load_dataset(\"cardiffnlp/tweet_topic_multi\", split=\"validation_random\")\n",
- "all_data = list(dataset)\n",
- "data = all_data[:25] # this has 573 datapoints, let's take a look at 25 for now"
+ "with Path(\"data/classify_examples_multilabel.json\").open() as json_data:\n",
+ "    data = json.load(json_data)"
]
},
{
@@ -192,7 +192,7 @@
"\n",
"We want the `input` in each `Example` to mimic the input of an actual task. Therefore, we have to always include the text (chunk) and all possible labels.\n",
"The `expected_output` shall correspond to anything we wish to compare our generated output to.\n",
- "In this case, that means the correct class(es), i.e., the label name(s)."
+ "In this case, that means the correct class(es), i.e., the label(s)."
]
},
{
@@ -201,19 +201,28 @@
"metadata": {},
"outputs": [],
"source": [
- "all_labels = list(set(label_name for item in data for label_name in item[\"label_name\"]))\n",
+ "all_labels = list(set(item[\"label\"][0] for item in data))\n",
"dataset = dataset_repository.create_dataset(\n",
" examples=[\n",
" Example(\n",
- " input=ClassifyInput(chunk=TextChunk(item[\"text\"]), labels=all_labels),\n",
- " expected_output=item[\"label_name\"][0],\n",
+ " input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n",
+ " expected_output=item[\"label\"][0],\n",
" )\n",
" for item in data\n",
" ],\n",
" dataset_name=\"tweet_topic_single\",\n",
")"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "all_labels"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -314,9 +323,9 @@
"dataset = dataset_repository.create_dataset(\n",
" examples=[\n",
" Example(\n",
- " input=ClassifyInput(chunk=TextChunk(item[\"text\"]), labels=all_labels),\n",
+ " input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n",
" expected_output=item[\n",
- " \"label_name\"\n",
+ " \"label\"\n",
" ], # <- difference here, we take all labels instead of a single one\n",
" )\n",
" for item in data\n",
@@ -341,10 +350,10 @@
"def build_labels_and_examples(hf_data: Any) -> Mapping[str, Sequence[str]]:\n",
" examples = defaultdict(list)\n",
" for item in hf_data:\n",
- " labels = item[\"label_name\"]\n",
+ " labels = item[\"label\"]\n",
" for label in labels:\n",
" if len(examples[label]) < 20:\n",
- " examples[label].append(item[\"text\"])\n",
+ " examples[label].append(item[\"message\"])\n",
" return examples\n",
"\n",
"\n",
Expand All @@ -353,7 +362,7 @@
" client=client,\n",
" labels_with_examples=[\n",
" LabelWithExamples(name=name, examples=examples)\n",
" for name, examples in build_labels_and_examples(all_data[25:]).items()\n",
" for name, examples in build_labels_and_examples(data).items()\n",
" ],\n",
")"
]
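
Note that the single-label cell above derives all_labels from only the first entry of each "label" list, which happens to cover every label present in this file. As a hedged sketch (not part of the commit), a variant that gathers every label would protect against future data where some label never appears in first position:

# Collect every label from every example rather than only the first one per record;
# `data` is the list loaded from classify_examples_multilabel.json in the notebook.
all_labels = sorted({label for item in data for label in item["label"]})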