From 4b71e92cb66505b7cb4d33cdc09a61f0ece41ea9 Mon Sep 17 00:00:00 2001 From: mantagen Date: Mon, 9 Sep 2024 18:52:17 +0100 Subject: [PATCH 1/4] fix(moderation prompt): lower false positives in toxic category --- apps/nextjs/package.json | 2 +- packages/aila/package.json | 2 +- .../moderation/moderators/OpenAiModerator.ts | 20 +++++----- packages/core/package.json | 2 +- .../ailaModeration/moderationCategories.json | 8 +--- .../utils/ailaModeration/moderationPrompt.ts | 15 +++----- .../utils/ailaModeration/moderationSchema.ts | 8 +--- packages/db/package.json | 2 +- pnpm-lock.yaml | 38 ++++++++++--------- 9 files changed, 42 insertions(+), 55 deletions(-) diff --git a/apps/nextjs/package.json b/apps/nextjs/package.json index effbde15a..99e9ba726 100644 --- a/apps/nextjs/package.json +++ b/apps/nextjs/package.json @@ -80,7 +80,7 @@ "languagedetect": "^2.0.0", "next": "14.2.5", "object-hash": "^3.0.0", - "openai": "^4.52.0", + "openai": "^4.58.1", "p-limit": "^6.1.0", "partial-json-parser": "^1.2.2", "posthog-js": "^1.139.1", diff --git a/packages/aila/package.json b/packages/aila/package.json index 9dc09c676..63bd6e69a 100644 --- a/packages/aila/package.json +++ b/packages/aila/package.json @@ -27,7 +27,7 @@ "cloudinary": "^1.41.1", "dotenv-cli": "^6.0.0", "jsonrepair": "^3.8.0", - "openai": "^4.52.0", + "openai": "^4.58.1", "remeda": "^1.29.0", "superjson": "^1.9.1", "tiny-invariant": "^1.3.1", diff --git a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts index 0c38e000d..7ecd16dea 100644 --- a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts +++ b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts @@ -5,6 +5,7 @@ import { moderationResponseSchema, } from "@oakai/core/src/utils/ailaModeration/moderationSchema"; import OpenAI from "openai"; +import zodToJsonSchema from "zod-to-json-schema"; import { AilaModerator, AilaModerationError } from 
"."; import { @@ -67,7 +68,14 @@ export class OpenAiModerator extends AilaModerator { { role: "user", content: input }, ], temperature: this._temperature, - response_format: { type: "json_object" }, + response_format: { + type: "json_schema", + json_schema: { + name: "moderationResponse", + strict: true, + schema: zodToJsonSchema(moderationResponseSchema), + }, + }, }, { headers: { @@ -103,16 +111,6 @@ export class OpenAiModerator extends AilaModerator { throw new AilaModerationError(`No moderation response`); } - // FIX: Sometimes the LLM incorrectly flags all available categories. - // The dummy smoke test shouldn't be triggered in normal use, and indicates this bug - if (response.data.categories.includes("t/dummy-smoke-test")) { - console.log( - "Moderation: dummy-smoke-test detected, retrying. Attempts: ", - attempts + 1, - ); - return await this._moderate(input, attempts + 1); - } - return response.data; } diff --git a/packages/core/package.json b/packages/core/package.json index f5b089128..821ef6e34 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -49,7 +49,7 @@ "inngest": "^3.16.1", "langchain": "^0.0.184", "obscenity": "^0.1.4", - "openai": "^4.52.0", + "openai": "^4.58.1", "tiny-invariant": "^1.3.1", "ts-dedent": "^2.2.0", "ts-md5": "^1.3.1", diff --git a/packages/core/src/utils/ailaModeration/moderationCategories.json b/packages/core/src/utils/ailaModeration/moderationCategories.json index 6946f7f4d..de0da0ae4 100644 --- a/packages/core/src/utils/ailaModeration/moderationCategories.json +++ b/packages/core/src/utils/ailaModeration/moderationCategories.json @@ -145,7 +145,7 @@ "codePrefix": "t", "criteria5": "(Fully compliant): The lesson plan does not contain any guides, instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or encourage harmful behavior, illegal activity, or violence.", "criteria1": "(Severely non-compliant): The lesson plan contains explicit guides, 
instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or directly encourages harmful behavior, illegal activity, or violence.", - "note": "", + "note": "The Toxic group is the only one which suggests the content is not legitimate, and has no place on our platform. It is not just a 'Content guidance' warning as the others are. Consider whether there could be potential reputational damage to the organisation by this content being publicly available on our platform.", "categories": [ { "code": "t/guides-self-harm", @@ -200,12 +200,6 @@ "title": "Encouragement of Violence", "userDescription": "encouragement of violence", "llmDescription": "Encouragement of violence" - }, - { - "code": "t/dummy-smoke-test", - "title": "", - "userDescription": "", - "llmDescription": "no content matches this category" } ] } diff --git a/packages/core/src/utils/ailaModeration/moderationPrompt.ts b/packages/core/src/utils/ailaModeration/moderationPrompt.ts index a4d14ae41..48a3dffea 100644 --- a/packages/core/src/utils/ailaModeration/moderationPrompt.ts +++ b/packages/core/src/utils/ailaModeration/moderationPrompt.ts @@ -1,7 +1,4 @@ -import zodToJsonSchema from "zod-to-json-schema"; - import moderationCategories from "./moderationCategories.json"; -import { moderationResponseSchema } from "./moderationSchema"; export const moderationPrompt = ` OBJECTIVE: @@ -38,12 +35,12 @@ Rating Criteria: ) .join("")} -NOTE: Your objective is to provide content guidance to ensure that certain content can be flagged with a 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner. +NOTE: -INSTRUCTION: Use the above to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. 
A detailed justification is crucial, even for a score of 5. For category groups with a score less than 5, you must list the categories on which the lesson plan failed to meet the criteria. +Your objective is to provide content guidance to ensure that certain content can be flagged with an 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner. -OUTPUT: Ensure that your response is a valid JSON document that conforms to the following JSON schema: ${JSON.stringify( zodToJsonSchema(moderationResponseSchema), )} +INSTRUCTION: -Do not include backticks in your response -- I should be able JSON parse your response.`; +Use the above to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. A detailed justification is crucial, even for a score of 5. +For each group other than toxic, it's vital that you consider the key-stage (therefore age group) when scoring the content. 
+`; diff --git a/packages/core/src/utils/ailaModeration/moderationSchema.ts b/packages/core/src/utils/ailaModeration/moderationSchema.ts index 1a5d92cf0..fafe5b7e6 100644 --- a/packages/core/src/utils/ailaModeration/moderationSchema.ts +++ b/packages/core/src/utils/ailaModeration/moderationSchema.ts @@ -36,10 +36,9 @@ export const moderationCategoriesSchema = z.array( z.literal("t/encouragement-illegal-activity"), z.literal("t/encouragement-violence"), z.literal("t/encouragement-violence"), - z.literal("t/dummy-smoke-test"), ]) .describe( - `If the content scores less then 5 for any group, specify the categories on which it failed.`, + `If the content scores less than 5 for any group, specify the categories on which it failed.`, ), ); @@ -58,10 +57,7 @@ export const moderationResponseSchema = z.object({ p: likertScale.describe("Physical activity and safety score"), t: likertScale.describe("Toxic score"), }), - justification: z - .string() - .optional() - .describe(`Add justification for your scores.`), + justification: z.string().describe(`Add justification for your scores.`), categories: moderationCategoriesSchema, }); diff --git a/packages/db/package.json b/packages/db/package.json index c526cae34..7d30204c7 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -59,7 +59,7 @@ "cheerio": "1.0.0-rc.12", "chunk-text": "^2.0.1", "graphql-request": "^6.1.0", - "openai": "^4.52.0", + "openai": "^4.58.1", "p-queue": "^7.4.1", "p-queue-compat": "^1.0.225", "ts-node": "^10.9.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 944a40032..699ea600b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -251,8 +251,8 @@ importers: specifier: ^3.0.0 version: 3.0.0 openai: - specifier: ^4.52.0 - version: 4.52.0 + specifier: ^4.58.1 + version: 4.58.1(zod@3.23.5) p-limit: specifier: ^6.1.0 version: 6.1.0 @@ -468,8 +468,8 @@ importers: specifier: ^3.8.0 version: 3.8.0 openai: - specifier: ^4.52.0 - version: 4.52.0 + specifier: ^4.58.1 + version: 
4.58.1(zod@3.23.5) remeda: specifier: ^1.29.0 version: 1.29.0 @@ -623,8 +623,8 @@ importers: specifier: ^0.1.4 version: 0.1.4 openai: - specifier: ^4.52.0 - version: 4.52.0 + specifier: ^4.58.1 + version: 4.58.1(zod@3.23.5) tiny-invariant: specifier: ^1.3.1 version: 1.3.1 @@ -675,8 +675,8 @@ importers: specifier: ^6.1.0 version: 6.1.0(graphql@16.8.1) openai: - specifier: ^4.52.0 - version: 4.52.0 + specifier: ^4.58.1 + version: 4.58.1(zod@3.23.5) p-queue: specifier: ^7.4.1 version: 7.4.1 @@ -3913,7 +3913,7 @@ packages: dependencies: '@langchain/core': 0.1.30 js-tiktoken: 1.0.7 - openai: 4.52.0 + openai: 4.58.1(zod@3.23.5) zod: 3.23.5 zod-to-json-schema: 3.23.0(zod@3.23.5) transitivePeerDependencies: @@ -15291,7 +15291,7 @@ packages: langsmith: 0.0.48 lodash: 4.17.21 ml-distance: 4.0.1 - openai: 4.52.0 + openai: 4.58.1(zod@3.23.5) openapi-types: 12.1.3 p-queue: 6.6.2 p-retry: 4.6.2 @@ -17098,18 +17098,25 @@ packages: is-wsl: 2.2.0 dev: true - /openai@4.52.0: - resolution: {integrity: sha512-xmiNcdA9QJ5wffHpZDpIsge6AsPTETJ6h5iqDNuFQ7qGSNtonHn8Qe0VHy4UwLE8rBWiSqh4j+iSvuYZSeKkPg==} + /openai@4.58.1(zod@3.23.5): + resolution: {integrity: sha512-n9fN4RIjbj4PbZU6IN/FOBBbxHbHEcW18rDZ4nW2cDNfZP2+upm/FM20UCmRNMQTvhOvw/2Tw4vgioQyQb5nlA==} hasBin: true + peerDependencies: + zod: ^3.23.8 + peerDependenciesMeta: + zod: + optional: true dependencies: '@types/node': 18.18.5 '@types/node-fetch': 2.6.4 + '@types/qs': 6.9.15 abort-controller: 3.0.0 agentkeepalive: 4.5.0 form-data-encoder: 1.7.2 formdata-node: 4.4.1 node-fetch: 2.7.0 - web-streams-polyfill: 3.2.1 + qs: 6.13.0 + zod: 3.23.5 transitivePeerDependencies: - encoding dev: false @@ -21174,11 +21181,6 @@ packages: dependencies: defaults: 1.0.4 - /web-streams-polyfill@3.2.1: - resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==} - engines: {node: '>= 8'} - dev: false - /web-streams-polyfill@4.0.0-beta.3: resolution: {integrity: 
sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==} engines: {node: '>= 14'} From 23e111ff63ce6f04de3653d23a5a6228ddbe3fa2 Mon Sep 17 00:00:00 2001 From: mantagen Date: Mon, 9 Sep 2024 19:19:28 +0100 Subject: [PATCH 2/4] add parent threshold check --- .../moderation/moderators/OpenAiModerator.ts | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts index 7ecd16dea..1ee898671 100644 --- a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts +++ b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts @@ -111,7 +111,25 @@ export class OpenAiModerator extends AilaModerator { throw new AilaModerationError(`No moderation response`); } - return response.data; + const { categories, justification, scores } = response.data; + + return { + justification, + categories: categories.filter((category) => { + /** + * We only want to include the category if the parent category scores below a certain threshold. + * Seems to improve accuracy of the moderation. + * In future we may want to adjust this threshold based on subject and key-stage, and the + * category itself. 
+ */ + const parentKey = category[0]; + for (const [key, score] of Object.entries(scores)) { + if (key === parentKey && score < 5) { + return true; + } + } + }), + }; } async moderate(input: string): Promise { From 3b20f291e287a257c0c9081718988f7e99c439f0 Mon Sep 17 00:00:00 2001 From: mantagen Date: Tue, 10 Sep 2024 10:20:29 +0100 Subject: [PATCH 3/4] use model that supports json_schema --- packages/aila/src/constants.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/aila/src/constants.ts b/packages/aila/src/constants.ts index 4d8d71edc..9ae7858dc 100644 --- a/packages/aila/src/constants.ts +++ b/packages/aila/src/constants.ts @@ -1,6 +1,9 @@ -export const DEFAULT_MODEL = "gpt-4o"; -export const DEFAULT_MODERATION_MODEL = "gpt-4o"; -export const DEFAULT_CATEGORISE_MODEL = "gpt-4o"; +import OpenAI from "openai"; + +export const DEFAULT_MODEL: OpenAI.Chat.ChatModel = "gpt-4o"; +export const DEFAULT_MODERATION_MODEL: OpenAI.Chat.ChatModel = + "gpt-4o-2024-08-06"; +export const DEFAULT_CATEGORISE_MODEL: OpenAI.Chat.ChatModel = "gpt-4o"; export const DEFAULT_TEMPERATURE = 0.7; export const DEFAULT_MODERATION_TEMPERATURE = 0.7; export const DEFAULT_RAG_LESSON_PLANS = 5; From 7553340257693d25b94a315d3a6cf90745b68198 Mon Sep 17 00:00:00 2001 From: mantagen Date: Tue, 10 Sep 2024 10:53:46 +0100 Subject: [PATCH 4/4] remove 'strict' from chat.completion call --- .../features/moderation/moderators/OpenAiModerator.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts index 6f5bf4e6d..86b2ecf7c 100644 --- a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts +++ b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts @@ -57,6 +57,8 @@ export class OpenAiModerator extends AilaModerator { throw new AilaModerationError("Failed to moderate after 3 
attempts"); } + const schema = zodToJsonSchema(moderationResponseSchema); + const moderationResponse = await this._openAIClient.chat.completions.create( { model: this._model, @@ -72,8 +74,13 @@ type: "json_schema", json_schema: { name: "moderationResponse", - strict: true, - schema: zodToJsonSchema(moderationResponseSchema), + /** + * Currently `strict` mode does not support minimum/maximum integer types, which + * we use for the likert scale in the moderation schema. + * @see https://community.openai.com/t/new-function-calling-with-strict-has-a-problem-with-minimum-integer-type/903258 + */ + // strict: true, + schema, }, }, },