From 507eedc1f7574aa25478059a130e599945224cf8 Mon Sep 17 00:00:00 2001
From: MG
Date: Tue, 10 Sep 2024 11:08:03 +0100
Subject: [PATCH] fix(moderation prompt): lower false positives in toxic
 category (#102)

---
 apps/nextjs/package.json                      |  2 +-
 packages/aila/package.json                    |  2 +-
 packages/aila/src/constants.ts                |  9 ++--
 .../moderation/moderators/OpenAiModerator.ts  | 45 ++++++++++++++-----
 packages/core/package.json                    |  2 +-
 .../ailaModeration/moderationCategories.json  |  8 +---
 .../utils/ailaModeration/moderationPrompt.ts  | 15 +++----
 .../utils/ailaModeration/moderationSchema.ts  |  8 +---
 packages/db/package.json                      |  2 +-
 pnpm-lock.yaml                                | 38 ++++++++--------
 10 files changed, 73 insertions(+), 58 deletions(-)

diff --git a/apps/nextjs/package.json b/apps/nextjs/package.json
index effbde15a..99e9ba726 100644
--- a/apps/nextjs/package.json
+++ b/apps/nextjs/package.json
@@ -80,7 +80,7 @@
     "languagedetect": "^2.0.0",
     "next": "14.2.5",
     "object-hash": "^3.0.0",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "p-limit": "^6.1.0",
     "partial-json-parser": "^1.2.2",
     "posthog-js": "^1.139.1",
diff --git a/packages/aila/package.json b/packages/aila/package.json
index 9dc09c676..63bd6e69a 100644
--- a/packages/aila/package.json
+++ b/packages/aila/package.json
@@ -27,7 +27,7 @@
     "cloudinary": "^1.41.1",
     "dotenv-cli": "^6.0.0",
     "jsonrepair": "^3.8.0",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "remeda": "^1.29.0",
     "superjson": "^1.9.1",
     "tiny-invariant": "^1.3.1",
diff --git a/packages/aila/src/constants.ts b/packages/aila/src/constants.ts
index 4d8d71edc..9ae7858dc 100644
--- a/packages/aila/src/constants.ts
+++ b/packages/aila/src/constants.ts
@@ -1,6 +1,9 @@
-export const DEFAULT_MODEL = "gpt-4o";
-export const DEFAULT_MODERATION_MODEL = "gpt-4o";
-export const DEFAULT_CATEGORISE_MODEL = "gpt-4o";
+import OpenAI from "openai";
+
+export const DEFAULT_MODEL: OpenAI.Chat.ChatModel = "gpt-4o";
+export const DEFAULT_MODERATION_MODEL: OpenAI.Chat.ChatModel =
+  "gpt-4o-2024-08-06";
+export const DEFAULT_CATEGORISE_MODEL: OpenAI.Chat.ChatModel = "gpt-4o";
 export const DEFAULT_TEMPERATURE = 0.7;
 export const DEFAULT_MODERATION_TEMPERATURE = 0.7;
 export const DEFAULT_RAG_LESSON_PLANS = 5;
diff --git a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts
index 5611739ba..86b2ecf7c 100644
--- a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts
+++ b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts
@@ -5,6 +5,7 @@ import {
   moderationResponseSchema,
 } from "@oakai/core/src/utils/ailaModeration/moderationSchema";
 import OpenAI from "openai";
+import zodToJsonSchema from "zod-to-json-schema";
 
 import { AilaModerator, AilaModerationError } from ".";
 import {
@@ -56,6 +57,8 @@ export class OpenAiModerator extends AilaModerator {
       throw new AilaModerationError("Failed to moderate after 3 attempts");
     }
 
+    const schema = zodToJsonSchema(moderationResponseSchema);
+
     const moderationResponse = await this._openAIClient.chat.completions.create(
       {
         model: this._model,
@@ -67,7 +70,19 @@ export class OpenAiModerator extends AilaModerator {
           { role: "user", content: input },
         ],
         temperature: this._temperature,
-        response_format: { type: "json_object" },
+        response_format: {
+          type: "json_schema",
+          json_schema: {
+            name: "moderationResponse",
+            /**
+             * Currently `strict` mode does not support minimum/maximum integer types, which
+             * we use for the likert scale in the moderation schema.
+             * @see https://community.openai.com/t/new-function-calling-with-strict-has-a-problem-with-minimum-integer-type/903258
+             */
+            // strict: true,
+            schema,
+          },
+        },
       },
       {
         headers: {
@@ -102,17 +117,25 @@ export class OpenAiModerator extends AilaModerator {
       throw new AilaModerationError(`No moderation response`);
     }
 
-    // FIX: Sometimes the LLM incorrectly flags all available categories.
-    // The dummy smoke test shouldn't be triggered in normal use, and indicates this bug
-    if (response.data.categories.includes("t/dummy-smoke-test")) {
-      console.log(
-        "Moderation: dummy-smoke-test detected, retrying. Attempts: ",
-        attempts + 1,
-      );
-      return await this._moderate(input, attempts + 1);
-    }
+    const { categories, justification, scores } = response.data;
 
-    return response.data;
+    return {
+      justification,
+      categories: categories.filter((category) => {
+        /**
+         * We only want to include the category if the parent category scores below a certain threshold.
+         * This seems to improve the accuracy of the moderation.
+         * In future we may want to adjust this threshold based on subject and key-stage, and the
+         * category itself.
+         */
+        const parentKey = category[0];
+        for (const [key, score] of Object.entries(scores)) {
+          if (key === parentKey && score < 5) {
+            return true;
+          }
+        }
+      }),
+    };
   }
 
   async moderate(input: string): Promise {
diff --git a/packages/core/package.json b/packages/core/package.json
index f5b089128..821ef6e34 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -49,7 +49,7 @@
     "inngest": "^3.16.1",
     "langchain": "^0.0.184",
     "obscenity": "^0.1.4",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "tiny-invariant": "^1.3.1",
     "ts-dedent": "^2.2.0",
     "ts-md5": "^1.3.1",
diff --git a/packages/core/src/utils/ailaModeration/moderationCategories.json b/packages/core/src/utils/ailaModeration/moderationCategories.json
index 6946f7f4d..de0da0ae4 100644
--- a/packages/core/src/utils/ailaModeration/moderationCategories.json
+++ b/packages/core/src/utils/ailaModeration/moderationCategories.json
@@ -145,7 +145,7 @@
     "codePrefix": "t",
     "criteria5": "(Fully compliant): The lesson plan does not contain any guides, instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or encourage harmful behavior, illegal activity, or violence.",
     "criteria1": "(Severely non-compliant): The lesson plan contains explicit guides, instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or directly encourages harmful behavior, illegal activity, or violence.",
-    "note": "",
+    "note": "The Toxic group is the only one which suggests the content is not legitimate, and has no place on our platform. It is not just a 'Content guidance' warning as the others are. Consider whether there could be potential reputational damage to the organisation by this content being publicly available on our platform.",
     "categories": [
       {
         "code": "t/guides-self-harm",
@@ -200,12 +200,6 @@
         "title": "Encouragement of Violence",
         "userDescription": "encouragement of violence",
         "llmDescription": "Encouragement of violence"
-      },
-      {
-        "code": "t/dummy-smoke-test",
-        "title": "",
-        "userDescription": "",
-        "llmDescription": "no content matches this category"
       }
     ]
   }
diff --git a/packages/core/src/utils/ailaModeration/moderationPrompt.ts b/packages/core/src/utils/ailaModeration/moderationPrompt.ts
index a4d14ae41..48a3dffea 100644
--- a/packages/core/src/utils/ailaModeration/moderationPrompt.ts
+++ b/packages/core/src/utils/ailaModeration/moderationPrompt.ts
@@ -1,7 +1,4 @@
-import zodToJsonSchema from "zod-to-json-schema";
-
 import moderationCategories from "./moderationCategories.json";
-import { moderationResponseSchema } from "./moderationSchema";
 
 export const moderationPrompt = `
 OBJECTIVE:
@@ -38,12 +35,12 @@ Rating Criteria:
   )
   .join("")}
 
-NOTE: Your objective is to provide content guidance to ensure that certain content can be flagged with a 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner.
+NOTE:
 
-INSTRUCTION: Use the above to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. A detailed justification is crucial, even for a score of 5. For category groups with a score less than 5, you must list the categories on which the lesson plan failed to meet the criteria.
+Your objective is to provide content guidance to ensure that certain content can be flagged with an 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner.
 
-OUTPUT: Ensure that your response is a valid JSON document that conforms to the following JSON schema: ${JSON.stringify(
-  zodToJsonSchema(moderationResponseSchema),
-)}
+INSTRUCTION:
 
-Do not include backticks in your response -- I should be able JSON parse your response.`;
+Use the above to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. A detailed justification is crucial, even for a score of 5.
+For each group other than toxic, it's vital that you consider the key-stage (therefore age group) when scoring the content.
+`;
diff --git a/packages/core/src/utils/ailaModeration/moderationSchema.ts b/packages/core/src/utils/ailaModeration/moderationSchema.ts
index 1a5d92cf0..fafe5b7e6 100644
--- a/packages/core/src/utils/ailaModeration/moderationSchema.ts
+++ b/packages/core/src/utils/ailaModeration/moderationSchema.ts
@@ -36,10 +36,9 @@ export const moderationCategoriesSchema = z.array(
       z.literal("t/encouragement-illegal-activity"),
       z.literal("t/encouragement-violence"),
      z.literal("t/encouragement-violence"),
-      z.literal("t/dummy-smoke-test"),
     ])
     .describe(
-      `If the content scores less then 5 for any group, specify the categories on which it failed.`,
+      `If the content scores less than 5 for any group, specify the categories on which it failed.`,
     ),
 );
 
@@ -58,10 +57,7 @@ export const moderationResponseSchema = z.object({
     p: likertScale.describe("Physical activity and safety score"),
     t: likertScale.describe("Toxic score"),
   }),
-  justification: z
-    .string()
-    .optional()
-    .describe(`Add justification for your scores.`),
+  justification: z.string().describe(`Add justification for your scores.`),
   categories: moderationCategoriesSchema,
 });
 
diff --git a/packages/db/package.json b/packages/db/package.json
index c526cae34..7d30204c7 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -59,7 +59,7 @@
     "cheerio": "1.0.0-rc.12",
     "chunk-text": "^2.0.1",
     "graphql-request": "^6.1.0",
-    "openai": "^4.52.0",
+    "openai": "^4.58.1",
     "p-queue": "^7.4.1",
     "p-queue-compat": "^1.0.225",
     "ts-node": "^10.9.2",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 944a40032..699ea600b 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -251,8 +251,8 @@ importers:
         specifier: ^3.0.0
         version: 3.0.0
       openai:
-        specifier: ^4.52.0
-        version: 4.52.0
+        specifier: ^4.58.1
+        version: 4.58.1(zod@3.23.5)
       p-limit:
         specifier: ^6.1.0
         version: 6.1.0
@@ -468,8 +468,8 @@ importers:
         specifier: ^3.8.0
         version: 3.8.0
       openai:
-        specifier: ^4.52.0
-        version: 4.52.0
+        specifier: ^4.58.1
+        version: 4.58.1(zod@3.23.5)
       remeda:
         specifier: ^1.29.0
         version: 1.29.0
@@ -623,8 +623,8 @@ importers:
         specifier: ^0.1.4
         version: 0.1.4
       openai:
-        specifier: ^4.52.0
-        version: 4.52.0
+        specifier: ^4.58.1
+        version: 4.58.1(zod@3.23.5)
       tiny-invariant:
         specifier: ^1.3.1
         version: 1.3.1
@@ -675,8 +675,8 @@ importers:
         specifier: ^6.1.0
         version: 6.1.0(graphql@16.8.1)
       openai:
-        specifier: ^4.52.0
-        version: 4.52.0
+        specifier: ^4.58.1
+        version: 4.58.1(zod@3.23.5)
       p-queue:
         specifier: ^7.4.1
         version: 7.4.1
@@ -3913,7 +3913,7 @@
     dependencies:
       '@langchain/core': 0.1.30
       js-tiktoken: 1.0.7
-      openai: 4.52.0
+      openai: 4.58.1(zod@3.23.5)
       zod: 3.23.5
       zod-to-json-schema: 3.23.0(zod@3.23.5)
     transitivePeerDependencies:
@@ -15291,7 +15291,7 @@
       langsmith: 0.0.48
       lodash: 4.17.21
       ml-distance: 4.0.1
-      openai: 4.52.0
+      openai: 4.58.1(zod@3.23.5)
       openapi-types: 12.1.3
       p-queue: 6.6.2
       p-retry: 4.6.2
@@ -17098,18 +17098,25 @@
       is-wsl: 2.2.0
     dev: true
 
-  /openai@4.52.0:
-    resolution: {integrity: sha512-xmiNcdA9QJ5wffHpZDpIsge6AsPTETJ6h5iqDNuFQ7qGSNtonHn8Qe0VHy4UwLE8rBWiSqh4j+iSvuYZSeKkPg==}
+  /openai@4.58.1(zod@3.23.5):
+    resolution: {integrity: sha512-n9fN4RIjbj4PbZU6IN/FOBBbxHbHEcW18rDZ4nW2cDNfZP2+upm/FM20UCmRNMQTvhOvw/2Tw4vgioQyQb5nlA==}
     hasBin: true
+    peerDependencies:
+      zod: ^3.23.8
+    peerDependenciesMeta:
+      zod:
+        optional: true
     dependencies:
       '@types/node': 18.18.5
       '@types/node-fetch': 2.6.4
+      '@types/qs': 6.9.15
       abort-controller: 3.0.0
       agentkeepalive: 4.5.0
       form-data-encoder: 1.7.2
       formdata-node: 4.4.1
       node-fetch: 2.7.0
-      web-streams-polyfill: 3.2.1
+      qs: 6.13.0
+      zod: 3.23.5
     transitivePeerDependencies:
       - encoding
     dev: false
@@ -21174,11 +21181,6 @@
     dependencies:
       defaults: 1.0.4
 
-  /web-streams-polyfill@3.2.1:
-    resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==}
-    engines: {node: '>= 8'}
-    dev: false
-
   /web-streams-polyfill@4.0.0-beta.3:
     resolution: {integrity: sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==}
     engines: {node: '>= 14'}
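
For illustration only (not part of the patch): a minimal TypeScript sketch of the two behaviours this change introduces — requesting the moderation response via the `json_schema` response format built with `zod-to-json-schema`, and keeping a flagged category only when its parent group scores below 5. The `moderate()` helper and the abbreviated schema are hypothetical stand-ins for the real wiring in `OpenAiModerator.ts` and `moderationSchema.ts`; only the OpenAI SDK call shape, the model/temperature defaults, and the threshold logic mirror the diff above.

```ts
import OpenAI from "openai";
import { z } from "zod";
import zodToJsonSchema from "zod-to-json-schema";

// Abbreviated stand-in for moderationResponseSchema (the real schema covers
// groups l, v, u, s, p and t, plus the full category code union).
const likertScale = z.number().int().min(1).max(5);
const moderationResponseSchema = z.object({
  scores: z.object({
    l: likertScale, // language and discrimination
    t: likertScale, // toxic
  }),
  justification: z.string(),
  categories: z.array(z.string()),
});

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

async function moderate(lessonPlan: string) {
  const completion = await openai.chat.completions.create({
    model: "gpt-4o-2024-08-06", // the new DEFAULT_MODERATION_MODEL
    temperature: 0.7,
    messages: [
      { role: "system", content: "moderation prompt goes here" },
      { role: "user", content: lessonPlan },
    ],
    response_format: {
      type: "json_schema",
      json_schema: {
        name: "moderationResponse",
        // `strict: true` is left off because strict mode rejects the
        // minimum/maximum constraints used by the likert scale.
        schema: zodToJsonSchema(moderationResponseSchema) as Record<string, unknown>,
      },
    },
  });

  const parsed = moderationResponseSchema.parse(
    JSON.parse(completion.choices[0]?.message?.content ?? "{}"),
  );

  // Keep a flagged category only when its parent group ("t" in "t/...")
  // scored below 5, mirroring the filter added to OpenAiModerator.ts.
  const categories = parsed.categories.filter((category) => {
    const parentKey = category[0];
    const score = parsed.scores[parentKey as keyof typeof parsed.scores];
    return typeof score === "number" && score < 5;
  });

  return { justification: parsed.justification, categories };
}
```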