Skip to content

Commit

Permalink
fix(moderation prompt): lower false positives in toxic category (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
mantagen authored Sep 10, 2024
1 parent ce3e62b commit 507eedc
Show file tree
Hide file tree
Showing 10 changed files with 73 additions and 58 deletions.
2 changes: 1 addition & 1 deletion apps/nextjs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"languagedetect": "^2.0.0",
"next": "14.2.5",
"object-hash": "^3.0.0",
"openai": "^4.52.0",
"openai": "^4.58.1",
"p-limit": "^6.1.0",
"partial-json-parser": "^1.2.2",
"posthog-js": "^1.139.1",
Expand Down
2 changes: 1 addition & 1 deletion packages/aila/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"cloudinary": "^1.41.1",
"dotenv-cli": "^6.0.0",
"jsonrepair": "^3.8.0",
"openai": "^4.52.0",
"openai": "^4.58.1",
"remeda": "^1.29.0",
"superjson": "^1.9.1",
"tiny-invariant": "^1.3.1",
Expand Down
9 changes: 6 additions & 3 deletions packages/aila/src/constants.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
export const DEFAULT_MODEL = "gpt-4o";
export const DEFAULT_MODERATION_MODEL = "gpt-4o";
export const DEFAULT_CATEGORISE_MODEL = "gpt-4o";
import OpenAI from "openai";

export const DEFAULT_MODEL: OpenAI.Chat.ChatModel = "gpt-4o";
export const DEFAULT_MODERATION_MODEL: OpenAI.Chat.ChatModel =
"gpt-4o-2024-08-06";
export const DEFAULT_CATEGORISE_MODEL: OpenAI.Chat.ChatModel = "gpt-4o";
export const DEFAULT_TEMPERATURE = 0.7;
export const DEFAULT_MODERATION_TEMPERATURE = 0.7;
export const DEFAULT_RAG_LESSON_PLANS = 5;
Expand Down
45 changes: 34 additions & 11 deletions packages/aila/src/features/moderation/moderators/OpenAiModerator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
moderationResponseSchema,
} from "@oakai/core/src/utils/ailaModeration/moderationSchema";
import OpenAI from "openai";
import zodToJsonSchema from "zod-to-json-schema";

import { AilaModerator, AilaModerationError } from ".";
import {
Expand Down Expand Up @@ -56,6 +57,8 @@ export class OpenAiModerator extends AilaModerator {
throw new AilaModerationError("Failed to moderate after 3 attempts");
}

const schema = zodToJsonSchema(moderationResponseSchema);

const moderationResponse = await this._openAIClient.chat.completions.create(
{
model: this._model,
Expand All @@ -67,7 +70,19 @@ export class OpenAiModerator extends AilaModerator {
{ role: "user", content: input },
],
temperature: this._temperature,
response_format: { type: "json_object" },
response_format: {
type: "json_schema",
json_schema: {
name: "moderationResponse",
/**
* Currently `strict` mode does not support minimum/maximum constraints on integer types, which
* we use for the likert scale in the moderation schema.
* @see https://community.openai.com/t/new-function-calling-with-strict-has-a-problem-with-minimum-integer-type/903258
*/
// strict: true,
schema,
},
},
},
{
headers: {
Expand Down Expand Up @@ -102,17 +117,25 @@ export class OpenAiModerator extends AilaModerator {
throw new AilaModerationError(`No moderation response`);
}

// FIX: Sometimes the LLM incorrectly flags all available categories.
// The dummy smoke test shouldn't be triggered in normal use, and indicates this bug
if (response.data.categories.includes("t/dummy-smoke-test")) {
console.log(
"Moderation: dummy-smoke-test detected, retrying. Attempts: ",
attempts + 1,
);
return await this._moderate(input, attempts + 1);
}
const { categories, justification, scores } = response.data;

return response.data;
return {
justification,
categories: categories.filter((category) => {
/**
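* Only keep a category when its parent group scored below the threshold (currently 5).
* The parent group key is the first character of the category code (e.g. "t" for "t/...").
* Dropping categories whose group did not score below the threshold seems to improve the
* accuracy of the moderation.
* In future we may want to adjust this threshold based on subject and key-stage, and the
* category itself.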
*/
const parentKey = category[0];
for (const [key, score] of Object.entries(scores)) {
if (key === parentKey && score < 5) {
return true;
}
}
}),
};
}

async moderate(input: string): Promise<ModerationResult> {
Expand Down
2 changes: 1 addition & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"inngest": "^3.16.1",
"langchain": "^0.0.184",
"obscenity": "^0.1.4",
"openai": "^4.52.0",
"openai": "^4.58.1",
"tiny-invariant": "^1.3.1",
"ts-dedent": "^2.2.0",
"ts-md5": "^1.3.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@
"codePrefix": "t",
"criteria5": "(Fully compliant): The lesson plan does not contain any guides, instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or encourage harmful behavior, illegal activity, or violence.",
"criteria1": "(Severely non-compliant): The lesson plan contains explicit guides, instructions, or learning materials on how to harm oneself or others, create weapons or harmful substances, or directly encourages harmful behavior, illegal activity, or violence.",
"note": "",
"note": "The Toxic group is the only one which suggests the content is not legitimate, and has no place on our platform. It is not a just a 'Content guidance' warning as the others are. Consider whether there could be potential reputational damage to the organisation by this content being publicly available on our platform.",
"categories": [
{
"code": "t/guides-self-harm",
Expand Down Expand Up @@ -200,12 +200,6 @@
"title": "Encouragement of Violence",
"userDescription": "encouragement of violence",
"llmDescription": "Encouragement of violence"
},
{
"code": "t/dummy-smoke-test",
"title": "",
"userDescription": "",
"llmDescription": "no content matches this category"
}
]
}
Expand Down
15 changes: 6 additions & 9 deletions packages/core/src/utils/ailaModeration/moderationPrompt.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import zodToJsonSchema from "zod-to-json-schema";

import moderationCategories from "./moderationCategories.json";
import { moderationResponseSchema } from "./moderationSchema";

export const moderationPrompt = `
OBJECTIVE:
Expand Down Expand Up @@ -38,12 +35,12 @@ Rating Criteria:
)
.join("")}
NOTE: Your objective is to provide content guidance to ensure that certain content can be flagged with a 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner.
NOTE:
INSTRUCTION: Use the above to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. A detailed justification is crucial, even for a score of 5. For category groups with a score less than 5, you must list the categories on which the lesson plan failed to meet the criteria.
Your objective is to provide content guidance to ensure that certain content can be flagged with a 'adult supervision recommended/required' label. Regardless of the educational context, the score must be adjusted to reflect the presence of such content, even if the content is presented in a careful, considerate manner.
OUTPUT: Ensure that your response is a valid JSON document that conforms to the following JSON schema: ${JSON.stringify(
zodToJsonSchema(moderationResponseSchema),
)}
INSTRUCTION:
Do not include backticks in your response -- I should be able JSON parse your response.`;
Use the above to classify the lesson provided, providing a justification for your scores. Your justification should be concise, precise and directly support your rating. A detailed justification is crucial, even for a score of 5.
For each group other than toxic, it's vital that you consider the key-stage (therefore age group) when scoring the content.
`;
8 changes: 2 additions & 6 deletions packages/core/src/utils/ailaModeration/moderationSchema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,9 @@ export const moderationCategoriesSchema = z.array(
z.literal("t/encouragement-illegal-activity"),
z.literal("t/encouragement-violence"),
z.literal("t/encouragement-violence"),
z.literal("t/dummy-smoke-test"),
])
.describe(
`If the content scores less then 5 for any group, specify the categories on which it failed.`,
`If the content scores less than 5 for any group, specify the categories on which it failed.`,
),
);

Expand All @@ -58,10 +57,7 @@ export const moderationResponseSchema = z.object({
p: likertScale.describe("Physical activity and safety score"),
t: likertScale.describe("Toxic score"),
}),
justification: z
.string()
.optional()
.describe(`Add justification for your scores.`),
justification: z.string().describe(`Add justification for your scores.`),
categories: moderationCategoriesSchema,
});

Expand Down
2 changes: 1 addition & 1 deletion packages/db/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
"cheerio": "1.0.0-rc.12",
"chunk-text": "^2.0.1",
"graphql-request": "^6.1.0",
"openai": "^4.52.0",
"openai": "^4.58.1",
"p-queue": "^7.4.1",
"p-queue-compat": "^1.0.225",
"ts-node": "^10.9.2",
Expand Down
38 changes: 20 additions & 18 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 507eedc

Please sign in to comment.