diff --git a/packages/aila/src/features/moderation/AilaModeration.ts b/packages/aila/src/features/moderation/AilaModeration.ts index b7d3c8a3e..ee0564143 100644 --- a/packages/aila/src/features/moderation/AilaModeration.ts +++ b/packages/aila/src/features/moderation/AilaModeration.ts @@ -71,6 +71,7 @@ export class AilaModeration implements AilaModerationFeature { appSessionId: chatId, messageId: lastAssistantMessage.id, categories: moderationResult.categories, + scores: moderationResult.scores, justification: moderationResult.justification, lesson: lessonPlan, }); @@ -195,10 +196,11 @@ export class AilaModeration implements AilaModerationFeature { messages: Message[]; lessonPlan: LooseLessonPlan; retries: number; - }) { + }): Promise<ModerationResult> { if (retries < 1) { return { categories: [], + scores: undefined, justification: "Failed to parse moderation response", }; } diff --git a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts index d4cea0f55..10ed5b29c 100644 --- a/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts +++ b/packages/aila/src/features/moderation/moderators/OpenAiModerator.ts @@ -123,10 +123,7 @@ export class OpenAiModerator extends AilaModerator { ); const log = aiLogger("aila:moderation:response"); - log.info( - "Moderation response: ", - JSON.stringify(moderationResponse, null, 2), - ); + log.info(JSON.stringify(moderationResponse)); const response = moderationResponseSchema.safeParse( JSON.parse(moderationResponse.choices[0]?.message.content ?? "null"), @@ -152,6 +149,7 @@ export class OpenAiModerator extends AilaModerator { return { justification, + scores, categories: categories.filter((category) => { /** * We only want to include the category if the parent category scores below a certain threshold. 
diff --git a/packages/core/src/models/moderations.ts b/packages/core/src/models/moderations.ts index 7f7acb858..c73087306 100644 --- a/packages/core/src/models/moderations.ts +++ b/packages/core/src/models/moderations.ts @@ -34,6 +34,7 @@ export class Moderations { appSessionId, messageId, categories, + scores, justification, lesson, }: { @@ -41,6 +42,7 @@ export class Moderations { appSessionId: string; messageId: string; categories: ModerationResult["categories"]; + scores: ModerationResult["scores"]; justification?: string; lesson: Snapshot; }): Promise { @@ -58,6 +60,7 @@ export class Moderations { userId, categories, justification, + scores, appSessionId, messageId, lessonSnapshotId, diff --git a/packages/core/src/utils/ailaModeration/moderationSchema.ts b/packages/core/src/utils/ailaModeration/moderationSchema.ts index 8c1bc6612..c649a5483 100644 --- a/packages/core/src/utils/ailaModeration/moderationSchema.ts +++ b/packages/core/src/utils/ailaModeration/moderationSchema.ts @@ -44,19 +44,21 @@ export const moderationCategoriesSchema = z.array( const likertScale = z.number().int().min(1).max(5); +const moderationScoresSchema = z.object({ + l: likertScale.describe("Language and discrimination score"), + v: likertScale.describe("Violence and crime score"), + u: likertScale.describe("Upsetting, disturbing and sensitive score"), + s: likertScale.describe("Nudity and sex score"), + p: likertScale.describe("Physical activity and safety score"), + t: likertScale.describe("Toxic score"), +}); + /** * Schema for the moderation response from the LLM. 
 * Note: it's important that 'categories' is the last field in the schema */ export const moderationResponseSchema = z.object({ - scores: z.object({ - l: likertScale.describe("Language and discrimination score"), - v: likertScale.describe("Violence and crime score"), - u: likertScale.describe("Upsetting, disturbing and sensitive score"), - s: likertScale.describe("Nudity and sex score"), - p: likertScale.describe("Physical activity and safety score"), - t: likertScale.describe("Toxic score"), - }), + scores: moderationScoresSchema, justification: z.string().describe("Add justification for your scores."), categories: moderationCategoriesSchema, }); @@ -65,8 +67,9 @@ export const moderationResponseSchema = z.object({ * Schema for the moderation result, once parsed from the moderation response */ export const moderationResultSchema = z.object({ - categories: moderationCategoriesSchema, justification: z.string().optional(), + scores: moderationScoresSchema.optional(), + categories: moderationCategoriesSchema, }); export type ModerationResult = z.infer<typeof moderationResultSchema>; diff --git a/packages/db/prisma/migrations/20241202151820_moderation_scores/migration.sql b/packages/db/prisma/migrations/20241202151820_moderation_scores/migration.sql new file mode 100644 index 000000000..e84b992a3 --- /dev/null +++ b/packages/db/prisma/migrations/20241202151820_moderation_scores/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "public"."moderations" ADD COLUMN "scores" JSONB; diff --git a/packages/db/prisma/schema.prisma b/packages/db/prisma/schema.prisma index aa9b59a59..4cabd577c 100644 --- a/packages/db/prisma/schema.prisma +++ b/packages/db/prisma/schema.prisma @@ -789,6 +789,7 @@ model Moderation { appSessionId String @map("app_session_id") messageId String @map("message_id") categories Json[] + scores Json? @map("scores") @db.JsonB justification String? lessonSnapshotId String? @map("lesson_snapshot_id") // A user's comment in relation to the moderation. Likely they are contesting it.