fix(moderation prompt): lower false positives in toxic category (#102)

oaknational · Sep 10, 2024 · 507eedc · 507eedc
1 parent ce3e62b
commit 507eedc
Show file tree

Hide file tree

Showing 10 changed files with 73 additions and 58 deletions.
diff --git a/apps/nextjs/package.json b/apps/nextjs/package.json
@@ -80,7 +80,7 @@
     "languagedetect": "^2.0.0",
     "next": "14.2.5",
     "object-hash": "^3.0.0",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "p-limit": "^6.1.0",
     "partial-json-parser": "^1.2.2",
     "posthog-js": "^1.139.1",

diff --git a/packages/aila/package.json b/packages/aila/package.json
@@ -27,7 +27,7 @@
     "cloudinary": "^1.41.1",
     "dotenv-cli": "^6.0.0",
     "jsonrepair": "^3.8.0",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "remeda": "^1.29.0",
     "superjson": "^1.9.1",
     "tiny-invariant": "^1.3.1",

diff --git a/packages/aila/src/constants.ts b/packages/aila/src/constants.ts
@@ -1,6 +1,9 @@
-export const DEFAULT_MODEL = "gpt-4o";
-export const DEFAULT_MODERATION_MODEL = "gpt-4o";
-export const DEFAULT_CATEGORISE_MODEL = "gpt-4o";
+import OpenAI from "openai";
+
+export const DEFAULT_MODEL: OpenAI.Chat.ChatModel = "gpt-4o";
+export const DEFAULT_MODERATION_MODEL: OpenAI.Chat.ChatModel =
+  "gpt-4o-2024-08-06";
+export const DEFAULT_CATEGORISE_MODEL: OpenAI.Chat.ChatModel = "gpt-4o";
 export const DEFAULT_TEMPERATURE = 0.7;
 export const DEFAULT_MODERATION_TEMPERATURE = 0.7;
 export const DEFAULT_RAG_LESSON_PLANS = 5;

diff --git a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts
@@ -5,6 +5,7 @@ import {
   moderationResponseSchema,
 } from "@oakai/core/src/utils/ailaModeration/moderationSchema";
 import OpenAI from "openai";
+import zodToJsonSchema from "zod-to-json-schema";
 
 import { AilaModerator, AilaModerationError } from ".";
 import {
@@ -56,6 +57,8 @@ export class OpenAiModerator extends AilaModerator {
       throw new AilaModerationError("Failed to moderate after 3 attempts");
     }
 
+    const schema = zodToJsonSchema(moderationResponseSchema);
+
     const moderationResponse = await this._openAIClient.chat.completions.create(
       {
         model: this._model,
@@ -67,7 +70,19 @@ export class OpenAiModerator extends AilaModerator {
           { role: "user", content: input },
         ],
         temperature: this._temperature,
-        response_format: { type: "json_object" },
+        response_format: {
+          type: "json_schema",
+          json_schema: {
+            name: "moderationResponse",
+            /**
+             * Currently `strict` mode does not support minimum/maximum integer types, which
+             * we use for the likert scale in the moderation schema.
+             * @see https://community.openai.com/t/new-function-calling-with-strict-has-a-problem-with-minimum-integer-type/903258
+             */
+            // strict: true,
+            schema,
+          },
+        },
       },
       {
         headers: {
@@ -102,17 +117,25 @@ export class OpenAiModerator extends AilaModerator {
       throw new AilaModerationError(`No moderation response`);
     }
 
-    // FIX: Sometimes the LLM incorrectly flags all available categories.
-    // The dummy smoke test shouldn't be triggered in normal use, and indicates this bug
-    if (response.data.categories.includes("t/dummy-smoke-test")) {
-      console.log(
-        "Moderation: dummy-smoke-test detected, retrying. Attempts: ",
-        attempts + 1,
-      );
-      return await this._moderate(input, attempts + 1);
-    }
+    const { categories, justification, scores } = response.data;
 
-    return response.data;
+    return {
+      justification,
+      categories: categories.filter((category) => {
+        /**
+         * Keep a category only if its parent group (identified by the first character of the
+         * category code) scored below 5. This seems to improve the accuracy of the moderation.
+         * In future we may want to adjust this threshold based on subject and key-stage, and the
+         * category itself.
+         */
+        const parentKey = category[0];
+        for (const [key, score] of Object.entries(scores)) {
+          if (key === parentKey && score < 5) {
+            return true;
+          }
+        }
+      }),
+    };
   }
 
   async moderate(input: string): Promise<ModerationResult> {

diff --git a/packages/core/package.json b/packages/core/package.json
@@ -49,7 +49,7 @@
     "inngest": "^3.16.1",
     "langchain": "^0.0.184",
     "obscenity": "^0.1.4",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "tiny-invariant": "^1.3.1",
     "ts-dedent": "^2.2.0",
     "ts-md5": "^1.3.1",

diff --git a/packages/core/src/utils/ailaModeration/moderationCategories.json b/packages/core/src/utils/ailaModeration/moderationCategories.json
@@ -145,7 +145,7 @@
     "codePrefix": "t",
     "criteria5": "(Fully compliant): The lesson plan does not contain any guides, instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or encourage harmful behavior, illegal activity, or violence.",
     "criteria1": "(Severely non-compliant): The lesson plan contains explicit guides, instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or directly encourages harmful behavior, illegal activity, or violence.",
-    "note": "",
+    "note": "The Toxic group is the only one which suggests the content is not legitimate, and has no place on our platform. It is not just a 'Content guidance' warning as the others are. Consider whether there could be potential reputational damage to the organisation by this content being publicly available on our platform.",
     "categories": [
       {
         "code": "t/guides-self-harm",
@@ -200,12 +200,6 @@
         "title": "Encouragement of Violence",
         "userDescription": "encouragement of violence",
         "llmDescription": "Encouragement of violence"
-      },
-      {
-        "code": "t/dummy-smoke-test",
-        "title": "",
-        "userDescription": "",
-        "llmDescription": "no content matches this category"
       }
     ]
   }

diff --git a/packages/core/src/utils/ailaModeration/moderationPrompt.ts b/packages/core/src/utils/ailaModeration/moderationPrompt.ts
@@ -1,7 +1,4 @@
-import zodToJsonSchema from "zod-to-json-schema";
-
 import moderationCategories from "./moderationCategories.json";
-import { moderationResponseSchema } from "./moderationSchema";
 
 export const moderationPrompt = `
 OBJECTIVE:
@@ -38,12 +35,12 @@ Rating Criteria:
   )
   .join("")}
 
-NOTE: Your objective is to provide content guidance to ensure that certain content can be flagged with a 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner.
+NOTE:
 
-INSTRUCTION: Use the above  to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. A detailed justification is crucial, even for a score of 5. For category groups with a score less than 5, you must list the categories on which the lesson plan failed to meet the criteria.
+Your objective is to provide content guidance to ensure that certain content can be flagged with a 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner.
 
-OUTPUT: Ensure that your response is a valid JSON document that conforms to the following JSON schema: ${JSON.stringify(
-  zodToJsonSchema(moderationResponseSchema),
-)}
+INSTRUCTION:
 
-Do not include backticks in your response -- I should be able JSON parse your response.`;
+Use the above to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. A detailed justification is crucial, even for a score of 5.
+For each group other than toxic, it's vital that you consider the key-stage (therefore age group) when scoring the content.
+`;
diff --git a/packages/core/src/utils/ailaModeration/moderationSchema.ts b/packages/core/src/utils/ailaModeration/moderationSchema.ts
@@ -36,10 +36,9 @@ export const moderationCategoriesSchema = z.array(
       z.literal("t/encouragement-illegal-activity"),
       z.literal("t/encouragement-violence"),
       z.literal("t/encouragement-violence"),
-      z.literal("t/dummy-smoke-test"),
     ])
     .describe(
-      `If the content scores less then 5 for any group, specify the categories on which it failed.`,
+      `If the content scores less than 5 for any group, specify the categories on which it failed.`,
     ),
 );
 
@@ -58,10 +57,7 @@ export const moderationResponseSchema = z.object({
     p: likertScale.describe("Physical activity and safety score"),
     t: likertScale.describe("Toxic score"),
   }),
-  justification: z
-    .string()
-    .optional()
-    .describe(`Add justification for your scores.`),
+  justification: z.string().describe(`Add justification for your scores.`),
   categories: moderationCategoriesSchema,
 });
 

diff --git a/packages/db/package.json b/packages/db/package.json
@@ -59,7 +59,7 @@
     "cheerio": "1.0.0-rc.12",
     "chunk-text": "^2.0.1",
     "graphql-request": "^6.1.0",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "p-queue": "^7.4.1",
     "p-queue-compat": "^1.0.225",
     "ts-node": "^10.9.2",

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml