diff --git a/docs/core_docs/docs/modules/experimental/index.mdx b/docs/core_docs/docs/modules/experimental/index.mdx new file mode 100644 index 000000000000..c438bd88faba --- /dev/null +++ b/docs/core_docs/docs/modules/experimental/index.mdx @@ -0,0 +1,5 @@ +--- +sidebar_position: 6 +--- + +# Experimental diff --git a/docs/core_docs/docs/modules/experimental/mask/mask.mdx b/docs/core_docs/docs/modules/experimental/mask/mask.mdx new file mode 100644 index 000000000000..d1ffd31da0d8 --- /dev/null +++ b/docs/core_docs/docs/modules/experimental/mask/mask.mdx @@ -0,0 +1,64 @@ +# Masking + +The experimental masking parser and transformer is an extendable module for masking and rehydrating strings. One of the primary use cases for this module is to redact PII (Personally Identifiable Information) from a string before making a call to an LLM. + +### Real world scenario + +A customer support system receives messages containing sensitive customer information. The system must parse these messages, mask any PII (like names, email addresses, and phone numbers), and log them for analysis while complying with privacy regulations. Before logging the transcript, a summary is generated using an LLM. + +## Example + +Use the RegexMaskingTransformer to create a simple mask for email and phone. + +```typescript +import { + MaskingParser, + RegexMaskingTransformer, +} from "langchain/experimental/masking"; + +// Define masking strategy +const emailMask = () => `[email-${Math.random().toString(16).slice(2)}]`; +const phoneMask = () => `[phone-${Math.random().toString(16).slice(2)}]`; + +// Configure PII transformer +const piiMaskingTransformer = new RegexMaskingTransformer({ + email: { regex: /\S+@\S+\.\S+/g, mask: emailMask }, + phone: { regex: /\d{3}-\d{3}-\d{4}/g, mask: phoneMask }, +}); + +const maskingParser = new MaskingParser(); +maskingParser.addTransformer(piiMaskingTransformer); + +const input = + "Contact me at jane.doe@email.com or 555-123-4567. 
Also reach me at john.smith@email.com"; +const masked = await maskingParser.parse(input); + +console.log(masked); +// Contact me at [email-a31e486e324f6] or [phone-da8fc1584f224]. Also reach me at [email-d5b6237633d95] + +const rehydrated = await maskingParser.rehydrate(masked); +console.log(rehydrated); +// Contact me at jane.doe@email.com or 555-123-4567. Also reach me at john.smith@email.com +``` + +:::note +If you plan on storing the masking state to rehydrate the original values asynchronously, ensure you are following best security practices. In most cases you will want to define a custom hashing and salting strategy. +::: + +## Get started + +import CodeBlock from "@theme/CodeBlock"; +import ExampleKitchenSink from "@examples/experimental/masking/kitchen_sink.ts"; +import ExampleNext from "@examples/experimental/masking/next.ts"; +import ExampleStream from "@examples/chains/llm_chain_stream.ts"; +import ExampleCancellation from "@examples/chains/llm_chain_cancellation.ts"; + +### Next.js stream + +Example Next.js chat endpoint leveraging the RegexMaskingTransformer. The current chat message and chat message history are masked every time the API is called with a chat payload. 
/**
 * Demo-only string hash. Folds every UTF-16 code unit of `input` into a
 * 32-bit signed accumulator (`acc * 31 + unit`, wrapped to 32 bits) and
 * renders the result in hexadecimal. Negative accumulators keep their
 * leading `-`. Not suitable for security purposes.
 *
 * @param input - The text to hash.
 * @returns The hexadecimal form of the 32-bit hash.
 */
function simpleHash(input: string): string {
  const folded = input
    .split("")
    .reduce((acc, unit) => (Math.imul(acc, 31) + unit.charCodeAt(0)) | 0, 0);
  return folded.toString(16);
}

// One mask builder per PII category. Each tags the hash of the matched text
// with its category so the masked token stays recognisable.
const emailMask = (match: string): string => {
  return `[email-${simpleHash(match)}]`;
};
const phoneMask = (match: string): string => {
  return `[phone-${simpleHash(match)}]`;
};
const nameMask = (match: string): string => {
  return `[name-${simpleHash(match)}]`;
};
const ssnMask = (match: string): string => {
  return `[ssn-${simpleHash(match)}]`;
};
const creditCardMask = (match: string): string => {
  return `[creditcard-${simpleHash(match)}]`;
};
const passportMask = (match: string): string => {
  return `[passport-${simpleHash(match)}]`;
};
const licenseMask = (match: string): string => {
  return `[license-${simpleHash(match)}]`;
};
const addressMask = (match: string): string => {
  return `[address-${simpleHash(match)}]`;
};
const dobMask = (match: string): string => {
  return `[dob-${simpleHash(match)}]`;
};
const bankAccountMask = (match: string): string => {
  return `[bankaccount-${simpleHash(match)}]`;
};
// Regular expressions for different types of PII.
// Every regex carries the `g` flag so that all occurrences in a message are
// masked, not only the first one.
const patterns = {
  email: { regex: /\S+@\S+\.\S+/g, mask: emailMask },
  phone: { regex: /\b\d{3}-\d{3}-\d{4}\b/g, mask: phoneMask },
  name: { regex: /\b[A-Z][a-z]+ [A-Z][a-z]+\b/g, mask: nameMask },
  ssn: { regex: /\b\d{3}-\d{2}-\d{4}\b/g, mask: ssnMask },
  creditCard: { regex: /\b(?:\d{4}[ -]?){3}\d{4}\b/g, mask: creditCardMask },
  // JavaScript regex literals have no inline `(?i)` modifier (it is a
  // SyntaxError); case-insensitivity is requested with the `i` flag instead.
  passport: { regex: /\b[A-Z]{1,2}\d{6,9}\b/gi, mask: passportMask },
  license: { regex: /\b[A-Z]{1,2}\d{6,8}\b/gi, mask: licenseMask },
  address: {
    // The trailing `*` is a quantifier on the optional extra words; it must
    // not be escaped, otherwise the pattern demands a literal asterisk.
    regex: /\b\d{1,5}\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b/g,
    mask: addressMask,
  },
  dob: { regex: /\b\d{4}-\d{2}-\d{2}\b/g, mask: dobMask },
  bankAccount: { regex: /\b\d{8,17}\b/g, mask: bankAccountMask },
};

// Create a RegexMaskingTransformer with multiple patterns
const piiMaskingTransformer = new RegexMaskingTransformer(patterns);

// Hooks for different stages of masking and rehydrating
const onMaskingStart = (message: string) =>
  console.log(`Starting to mask message: ${message}`);
const onMaskingEnd = (maskedMessage: string) =>
  console.log(`Masked message: ${maskedMessage}`);
const onRehydratingStart = (message: string) =>
  console.log(`Starting to rehydrate message: ${message}`);
const onRehydratingEnd = (rehydratedMessage: string) =>
  console.log(`Rehydrated message: ${rehydratedMessage}`);

// Initialize MaskingParser with the transformer and hooks
const maskingParser = new MaskingParser({
  transformers: [piiMaskingTransformer],
  onMaskingStart,
  onMaskingEnd,
  onRehydratingStart,
  onRehydratingEnd,
});

// Example message containing multiple types of PII
const message =
  "Contact Jane Doe at jane.doe@email.com or 555-123-4567. Her SSN is 123-45-6789 and her credit card number is 1234-5678-9012-3456. Passport number: AB1234567, Driver's License: X1234567, Address: 123 Main St, Date of Birth: 1990-01-01, Bank Account: 12345678901234567.";

// Mask and rehydrate the message
maskingParser
  .parse(message)
  .then((maskedMessage: string) => {
    console.log(`Masked message: ${maskedMessage}`);
    return maskingParser.rehydrate(maskedMessage);
  })
  .then((rehydratedMessage: string) => {
    console.log(`Final rehydrated message: ${rehydratedMessage}`);
  })
  // Surface failures instead of leaving an unhandled promise rejection.
  .catch((error: unknown) => {
    console.error("Masking example failed:", error);
  });
export const runtime = "edge";

/** Minimal shape of one chat message as read from the request payload. */
type ChatMessage = { role: string; content: string };

// Function to format chat messages for consistency
const formatMessage = (message: ChatMessage): string =>
  `${message.role}: ${message.content}`;

const CUSTOMER_SUPPORT = `You are a customer support summarizer agent. Always include masked PII in your response.
  Current conversation:
  {chat_history}
  User: {input}
  AI:`;

// Define transformations for masking emails and phone numbers using regular expressions.
// The transformer is shared across requests; the masking state lives in the
// MaskingParser, which is created per request below.
const piiMaskingTransformer = new RegexMaskingTransformer({
  email: { regex: /\S+@\S+\.\S+/g }, // If a regex is provided without a mask we fallback to a simple default hashing function
  phone: { regex: /\d{3}-\d{3}-\d{4}/g },
});

/**
 * Chat endpoint: masks PII in the current message and in the chat history,
 * then streams a model response that was generated from the masked text only.
 *
 * @param req - Incoming request whose JSON body carries a `messages` array of
 *   `{ role, content }` objects; the last entry is the current user message.
 * @returns A streaming response on success, a 400 JSON error when no message
 *   was supplied, or a 500 JSON error when anything else fails.
 */
export async function POST(req: Request) {
  try {
    const body = await req.json();
    const messages: ChatMessage[] = Array.isArray(body?.messages)
      ? body.messages
      : [];

    // Without at least one message there is no "current message" to read.
    if (messages.length === 0) {
      return Response.json(
        { error: "Request body must contain at least one message." },
        { status: 400 }
      );
    }

    // Configure a Masking Parser for this request only, so the map of
    // masked -> original values (raw PII) is not retained across requests.
    const maskingParser = new MaskingParser();
    maskingParser.addTransformer(piiMaskingTransformer);

    const formattedPreviousMessages = messages.slice(0, -1).map(formatMessage);
    const currentMessageContent = messages[messages.length - 1].content; // Extract the content of the last message
    // Mask sensitive information in the current message
    const guardedMessageContent = await maskingParser.parse(
      currentMessageContent
    );
    // Mask sensitive information in the chat history
    const guardedHistory = await maskingParser.parse(
      formattedPreviousMessages.join("\n")
    );

    const prompt = PromptTemplate.fromTemplate(CUSTOMER_SUPPORT);
    const model = new ChatOpenAI({ temperature: 0.8 });
    // Initialize an output parser that handles serialization and byte-encoding for streaming
    const outputParser = new BytesOutputParser();
    const chain = prompt.pipe(model).pipe(outputParser); // Chain the prompt, model, and output parser together

    console.log("[GUARDED INPUT]", guardedMessageContent); // Contact me at -1157967895 or -1626926859.
    console.log("[GUARDED HISTORY]", guardedHistory); // user: Contact me at -1157967895 or -1626926859. assistant: Thank you for providing your contact information.
    // Demo only: the state holds the original (unmasked) values.
    console.log("[STATE]", maskingParser.getState()); // { '-1157967895' => 'jane.doe@email.com', '-1626926859' => '555-123-4567'}

    // Stream the AI response based on the masked chat history and current message
    const stream = await chain.stream({
      chat_history: guardedHistory,
      input: guardedMessageContent,
    });

    // A route handler must return a Response; hand the stream to the client.
    // NOTE(review): assumes chain.stream() resolves to a web ReadableStream of
    // bytes - confirm against the installed langchain version. Alternatively:
    // npm i ai
    // import { StreamingTextResponse } from "ai";
    // return new StreamingTextResponse(stream);
    return new Response(stream, {
      headers: { "Content-Type": "text/plain; charset=utf-8" },
    });
  } catch (e: unknown) {
    const errorMessage = e instanceof Error ? e.message : String(e);
    return Response.json({ error: errorMessage }, { status: 500 });
  }
}
/**
 * MaskingParser class for handling the masking and rehydrating of messages.
 *
 * The parser owns a state map (masked value -> original value) that every
 * transformer reads and updates while masking, and that rehydration uses to
 * restore the original text.
 */
export class MaskingParser {
  private transformers: MaskingTransformer[];
  private state: Map<string, string>;
  private config: MaskingParserConfig;

  /**
   * @param config - Optional configuration: initial transformers and the
   *   lifecycle hooks invoked around masking and rehydrating.
   */
  constructor(config: MaskingParserConfig = {}) {
    // Copy the list so addTransformer() never mutates the caller's array.
    this.transformers = [...(config.transformers ?? [])];
    this.state = new Map<string, string>();
    this.config = config;
  }

  /**
   * Adds a transformer to the parser.
   * @param transformer - An instance of a class extending MaskingTransformer.
   */
  addTransformer(transformer: MaskingTransformer) {
    this.transformers.push(transformer);
  }

  /**
   * Getter method for retrieving the current state.
   * @returns The current state map (masked value -> original value).
   */
  public getState(): Map<string, string> {
    return this.state;
  }

  /**
   * Masks the provided message using the added transformers.
   * Each transformer is applied in insertion order to the output of the
   * previous one, and each receives (and returns) the shared state map.
   *
   * The `onMaskingStart` hook fires before validation and `onMaskingEnd`
   * fires with the fully masked message.
   *
   * @param message - The message to be masked.
   * @returns A masked version of the message.
   * @throws {TypeError} If the message is not a string.
   * @throws {Error} If no transformers are added.
   */
  async parse(message: string): Promise<string> {
    this.config.onMaskingStart?.(message);

    // Masking requires at least one transformer to apply its logic.
    if (this.transformers.length === 0) {
      throw new Error(
        "MaskingParser.parse Error: No transformers have been added. Please add at least one transformer before parsing."
      );
    }

    if (typeof message !== "string") {
      throw new TypeError(
        "MaskingParser.parse Error: The 'message' argument must be a string."
      );
    }

    // Starts as the original message and is progressively masked.
    let processedMessage = message;

    for (const transformer of this.transformers) {
      // transform() returns the updated message together with the updated
      // state map; keep both so the next transformer builds on them.
      [processedMessage, this.state] = transformer.transform(
        processedMessage,
        this.state
      );
    }

    this.config.onMaskingEnd?.(processedMessage);
    return processedMessage;
  }

  /**
   * Rehydrates a masked message back to its original form.
   * The transformers' rehydration logic is applied in reverse insertion
   * order, i.e. the inverse of the order used by `parse`.
   *
   * The `onRehydratingStart` hook fires before validation and
   * `onRehydratingEnd` fires with the rehydrated message.
   *
   * @param message - The masked message to be rehydrated.
   * @param state - Optional state map to rehydrate from (for example one that
   *   was persisted earlier). Defaults to the parser's internal state.
   * @returns The original (rehydrated) version of the message.
   * @throws {TypeError} If the message is not a string, or if `state` is
   *   provided but is not a Map.
   * @throws {Error} If no transformers are added.
   */
  async rehydrate(
    message: string,
    state?: Map<string, string>
  ): Promise<string> {
    this.config.onRehydratingStart?.(message);

    if (typeof message !== "string") {
      throw new TypeError(
        "MaskingParser.rehydrate Error: The 'message' argument must be a string."
      );
    }
    // Rehydration requires at least one transformer.
    if (this.transformers.length === 0) {
      throw new Error(
        "MaskingParser.rehydrate Error: No transformers have been added. Please add at least one transformer before rehydrating."
      );
    }

    // Only null/undefined mean "not provided"; anything else must be a Map.
    if (state != null && !(state instanceof Map)) {
      throw new TypeError(
        "MaskingParser.rehydrate Error: The 'state' argument, if provided, must be an instance of Map."
      );
    }

    // Use the provided state or fall back to the internal state.
    const rehydrationState = state ?? this.state;

    let rehydratedMessage = message;
    // Walk the transformers backwards so the last mask applied is undone first.
    for (let i = this.transformers.length - 1; i >= 0; i -= 1) {
      rehydratedMessage = this.transformers[i].rehydrate(
        rehydratedMessage,
        rehydrationState
      );
    }

    this.config.onRehydratingEnd?.(rehydratedMessage);
    return rehydratedMessage;
  }
}
/**
 * RegexMaskingTransformer class for masking and rehydrating messages with Regex.
 *
 * Notes grounded in the implementation below:
 * - A pattern whose regex lacks the `g` flag only masks its first match,
 *   because masking relies on `String.prototype.replace`.
 * - The state map is keyed by the masked value, so a static `replacement`
 *   shared by several different matches keeps only the last original value.
 */
export class RegexMaskingTransformer extends MaskingTransformer {
  private patterns: { [key: string]: MaskingPattern };
  private hashFunction: HashFunction;

  /**
   * Constructs a RegexMaskingTransformer with given patterns and an optional hash function.
   * Validates the provided patterns to ensure they conform to the expected structure.
   *
   * @param patterns - An object containing masking patterns. Each pattern should include
   *                   a regular expression (`regex`) and optionally a `replacement` string
   *                   or a `mask` function.
   * @param hashFunction - An optional custom hash function used when a pattern
   *                       has neither a `mask` function nor a `replacement`.
   * @throws {Error} If any pattern is not an object with a RegExp `regex`.
   */
  constructor(
    patterns: { [key: string]: MaskingPattern },
    hashFunction?: HashFunction
  ) {
    super();
    // Reject malformed patterns up front rather than at masking time.
    this.validatePatterns(patterns);

    this.patterns = patterns;
    // Fall back to the built-in hash when no custom one is supplied.
    this.hashFunction = hashFunction ?? this.defaultHashFunction;
  }

  /**
   * Validates the given masking patterns to ensure each pattern has a valid regular expression.
   *
   * @param patterns - The patterns object to validate.
   * @throws {Error} If any pattern is missing, is not an object, or has a
   *   `regex` that is not a RegExp instance.
   */
  private validatePatterns(patterns: { [key: string]: MaskingPattern }) {
    for (const key in patterns) {
      const pattern = patterns[key];
      if (
        !pattern ||
        typeof pattern !== "object" ||
        !(pattern.regex instanceof RegExp)
      ) {
        throw new Error("Invalid pattern configuration.");
      }
    }
  }

  /**
   * Masks content in a message based on the defined patterns.
   * Patterns are applied one after another, each to the output of the
   * previous one. The supplied state map is updated in place and returned.
   *
   * @param message - The message to be masked.
   * @param state - The current state containing original values.
   * @returns A tuple of the masked message and the updated state.
   * @throws {TypeError} If `message` is not a string or `state` is not a Map.
   */
  transform(
    message: string,
    state: Map<string, string>
  ): [string, Map<string, string>] {
    if (typeof message !== "string") {
      throw new TypeError(
        "RegexMaskingTransformer.transform Error: The 'message' argument must be a string."
      );
    }

    if (!(state instanceof Map)) {
      throw new TypeError(
        "RegexMaskingTransformer.transform Error: The 'state' argument must be an instance of Map."
      );
    }

    // Holds the progressively masked message
    let processedMessage = message;

    // `state` was validated above, so it is used directly as the store.
    const originalValues = state;

    for (const key in this.patterns) {
      const pattern = this.patterns[key];

      processedMessage = processedMessage.replace(pattern.regex, (match) => {
        // Precedence: mask function, then static replacement, then hash.
        const maskedValue = pattern.mask
          ? pattern.mask(match)
          : pattern.replacement ?? this.hashFunction(match);

        // Remember which original text this masked value stands for.
        originalValues.set(maskedValue, match);

        return maskedValue;
      });
    }

    return [processedMessage, originalValues];
  }

  /**
   * Rehydrates a masked message back to its original form using the provided state.
   * Entries are applied in the map's iteration order; every occurrence of a
   * masked value is replaced by its original value.
   *
   * @param message - The masked message to be rehydrated.
   * @param state - The state map containing mappings of masked values to their original values.
   * @returns The rehydrated (original) message.
   * @throws {TypeError} If `message` is not a string or `state` is not a Map.
   */
  rehydrate(message: string, state: Map<string, string>): string {
    if (typeof message !== "string") {
      throw new TypeError(
        "RegexMaskingTransformer.rehydrate Error: The 'message' argument must be a string."
      );
    }

    if (!(state instanceof Map)) {
      throw new TypeError(
        "RegexMaskingTransformer.rehydrate Error: The 'state' argument must be an instance of Map."
      );
    }

    let rehydrated = message;
    for (const [masked, original] of state) {
      // An empty mask cannot be located in the text; matching it would
      // insert the original value between every character.
      if (masked === "") continue;

      // Literal split/join replaces all occurrences without building a
      // regex, and treats `original` verbatim: with String.replace a value
      // containing `$&`, `$1` or `$$` would be read as a replacement pattern
      // and corrupt the restored text.
      rehydrated = rehydrated.split(masked).join(original);
    }
    return rehydrated;
  }

  /**
   * Default hash function used when a pattern defines neither `mask` nor
   * `replacement`. Not cryptographically secure.
   *
   * @param input - The input string to hash.
   * @returns The 32-bit signed hash rendered as a decimal string.
   */
  private defaultHashFunction(input: string): string {
    let hash = 0;
    for (let i = 0; i < input.length; i++) {
      // UTF-16 code unit of the character at position i
      const char = input.charCodeAt(i);
      // Equivalent to hash * 31 + char
      hash = (hash << 5) - hash + char;
      // `| 0` wraps the running value back into the signed 32-bit range.
      hash |= 0;
    }

    return hash.toString();
  }
}
RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + parser = new MaskingParser(); + parser.addTransformer(transformer); + }); + + it("should mask, store state, and rehydrate with altered order", async () => { + const originalMessage = "Contact me at jane.doe@email.com or 555-123-4567."; + const maskedMessage = await parser.parse(originalMessage); + + // Serialize and store the state + const serializedState = serializeState(parser.getState()); + await mockDB.saveState("uniqueMessageId", serializedState); + + // Simulate retrieving and altering the masked message + // Here, we assume the AI processing reverses the order of masked content + // Simulate retrieving and altering the masked message + const alteredMaskedMessage = maskedMessage.split(" ").reverse().join(" "); + + // Retrieve and deserialize the state + const retrievedSerializedState = await mockDB.getState("uniqueMessageId"); + const retrievedState = deserializeState(retrievedSerializedState); + + // Rehydrate the altered message + const rehydratedMessage = await parser.rehydrate( + alteredMaskedMessage, + retrievedState + ); + + // The expectation depends on how the alteration affects the masked message. + // Here, we assume that the rehydrated message should match the original message + // even after the alteration since the masked content still aligns with the stored state. 
+ const expectedRehydratedMessage = originalMessage + .split(" ") + .reverse() + .join(" "); + expect(rehydratedMessage).toEqual(expectedRehydratedMessage); + }); +}); diff --git a/langchain/src/experimental/masking/tests/masking.test.ts b/langchain/src/experimental/masking/tests/masking.test.ts new file mode 100644 index 000000000000..9eec3f7cdb7a --- /dev/null +++ b/langchain/src/experimental/masking/tests/masking.test.ts @@ -0,0 +1,351 @@ +// yarn test:single src/experimental/masking/tests/masking.test.ts +import { MaskingParser, RegexMaskingTransformer } from "../index.js"; +import { jest } from "@jest/globals"; + +describe("MaskingParser and PIIMaskingTransformer", () => { + describe("Masking with Static Identifiers", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + const phonePattern = { regex: /\d{3}-\d{3}-\d{4}/, replacement: "[phone]" }; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + it("masks single occurrences of PII with static identifiers", async () => { + const message = "Contact me at jane.doe@email.com or 555-123-4567."; + const expectedMaskedMessage = "Contact me at [email] or [phone]."; + + const maskedMessage = await maskingParser.parse(message); + + expect(maskedMessage).toBe(expectedMaskedMessage); + }); + + it("rehydrates static masked data to its original form", async () => { + const maskedMessage = "Contact me at [email] or [phone]."; + const expectedOriginalMessage = + "Contact me at jane.doe@email.com or 555-123-4567."; + + await maskingParser.parse(expectedOriginalMessage); // Masking original message + const rehydratedMessage = await maskingParser.rehydrate(maskedMessage); + + 
expect(rehydratedMessage).toBe(expectedOriginalMessage); + }); + + function generateLargeMessage() { + let largeMessage = ""; + for (let i = 0; i < 10000; i++) { + // Adjust the number for desired message size + largeMessage += `User${i}: jane.doe${i}@email.com, 555-123-${i + .toString() + .padStart(4, "0")}. `; + } + return largeMessage; + } + + describe("Performance Testing", () => { + it("efficiently processes large data sets", async () => { + const largeMessage = generateLargeMessage(); + const startTime = performance.now(); + const maskedMessage = await maskingParser.parse(largeMessage); + const endTime = performance.now(); + + const someAcceptableDuration = 5000; // Set this to a duration you consider acceptable, e.g., 5000 milliseconds (5 seconds) + + expect(maskedMessage).toBeDefined(); + expect(endTime - startTime).toBeLessThan(someAcceptableDuration); + }); + }); + }); + + describe("Masking with Dynamic Identifiers", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailMask = () => `[email-${Math.random().toString(16).slice(2)}]`; + const phoneMask = () => `[phone-${Math.random().toString(16).slice(2)}]`; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: { regex: /\S+@\S+\.\S+/g, mask: emailMask }, + phone: { regex: /\d{3}-\d{3}-\d{4}/g, mask: phoneMask }, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + it("masks multiple occurrences of different PII with unique identifiers", async () => { + const message = + "Contact me at jane.doe@email.com or 555-123-4567. 
Also reach me at john.smith@email.com"; + const maskedMessage = await maskingParser.parse(message); + + expect(maskedMessage).toMatch(/\[email-[a-f0-9]+\]/g); + expect(maskedMessage).toMatch(/\[phone-[a-f0-9]+\]/g); + expect((maskedMessage.match(/\[email-[a-f0-9]+\]/g) || []).length).toBe( + 2 + ); + expect((maskedMessage.match(/\[phone-[a-f0-9]+\]/g) || []).length).toBe( + 1 + ); + }); + + it("rehydrates dynamic masked data to its original form", async () => { + const originalMessage = + "Contact me at jane.doe@email.com or 555-123-4567. Also reach me at john.smith@email.com"; + const maskedMessage = await maskingParser.parse(originalMessage); + const rehydratedMessage = await maskingParser.rehydrate(maskedMessage); + + expect(rehydratedMessage).toBe(originalMessage); + }); + + it("masks identical PII with consistent dynamic identifiers", async () => { + const message = + "Contact me at jane.doe@email.com or 555-123-4567. Also reach me at john.smith@email.com and 555-123-4567"; + const maskedMessage = await maskingParser.parse(message); + + expect(maskedMessage).toMatch(/\[email-[a-f0-9]+\]/g); + expect(maskedMessage).toMatch(/\[phone-[a-f0-9]+\]/g); + expect((maskedMessage.match(/\[email-[a-f0-9]+\]/g) || []).length).toBe( + 2 + ); + expect((maskedMessage.match(/\[phone-[a-f0-9]+\]/g) || []).length).toBe( + 2 + ); + }); + }); + + describe("PIIMaskingTransformer with Default Hash Function", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + const phonePattern = { regex: /\d{3}-\d{3}-\d{4}/, replacement: "[phone]" }; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + it("should mask email and phone using default hash function", async () => { + const 
piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + const maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + + const message = + "My email is jane.doe@email.com and phone is 555-123-4567."; + const maskedMessage = await maskingParser.parse(message); + + expect(maskedMessage).toContain("[email]"); + expect(maskedMessage).toContain("[phone]"); + }); + }); + + describe("PIIMaskingTransformer with Custom Hash Function", () => { + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + const phonePattern = { regex: /\d{3}-\d{3}-\d{4}/, replacement: "[phone]" }; + + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + const customHashFunction = (input: string) => { + // A simple hash function that creates a mock hash representation of the input. + // This is just for demonstration purposes and not a secure hashing method. 
+ return input + .split("") + .map(() => "*") + .join(""); + }; + it("should mask email and phone using custom hash function", async () => { + const piiMaskingTransformer = new RegexMaskingTransformer( + { + email: { + regex: /\S+@\S+\.\S+/, + mask: (match) => `custom-email-${customHashFunction(match)}`, + }, + phone: { + regex: /\d{3}-\d{3}-\d{4}/, + mask: (match) => `custom-phone-${customHashFunction(match)}`, + }, + }, + customHashFunction + ); + + const maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + + const message = "Contact me at jane.doe@email.com or 555-123-4567."; + const maskedMessage = await maskingParser.parse(message); + + // The lengths of the masked parts should be equal to the lengths of the original email and phone number. + const expectedEmailMask = + "custom-email-" + "*".repeat("jane.doe@email.com".length); + const expectedPhoneMask = + "custom-phone-" + "*".repeat("555-123-4567".length); + + expect(maskedMessage).toContain(expectedEmailMask); + expect(maskedMessage).toContain(expectedPhoneMask); + }); + + it("should rehydrate masked data correctly using custom hash function", async () => { + const piiMaskingTransformer = new RegexMaskingTransformer( + { + email: { + regex: /\S+@\S+\.\S+/, + mask: (match) => `custom-email-${customHashFunction(match)}`, + }, + phone: { + regex: /\d{3}-\d{3}-\d{4}/, + mask: (match) => `custom-phone-${customHashFunction(match)}`, + }, + }, + customHashFunction + ); + + maskingParser.addTransformer(piiMaskingTransformer); + + const originalMessage = + "Contact me at jane.doe@email.com or 555-123-4567."; + const maskedMessage = await maskingParser.parse(originalMessage); + const rehydratedMessage = await maskingParser.rehydrate(maskedMessage); + + expect(rehydratedMessage).toBe(originalMessage); + }); + }); + + describe("Error Handling in MaskingParser", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + + beforeEach(() => 
{ + piiMaskingTransformer = new RegexMaskingTransformer({}); + maskingParser = new MaskingParser(); + }); + + it("throws an error when no transformers are added and parse is called", async () => { + const message = "Some message"; + await expect(maskingParser.parse(message)).rejects.toThrow( + "MaskingParser.parse Error: No transformers have been added. Please add at least one transformer before parsing." + ); + }); + + it("throws an error when no transformers are added and rehydrate is called", async () => { + const message = "Some masked message"; + await expect(maskingParser.rehydrate(message)).rejects.toThrow( + "MaskingParser.rehydrate Error: No transformers have been added. Please add at least one transformer before rehydrating." + ); + }); + + it("throws an error for invalid message type in parse", async () => { + const invalidMessage: any = 123; // intentionally incorrect type + maskingParser.addTransformer(piiMaskingTransformer); // Add a transformer + await expect(maskingParser.parse(invalidMessage)).rejects.toThrow( + "The 'message' argument must be a string." + ); + }); + + it("throws an error for invalid message type in rehydrate", async () => { + const invalidMessage: any = 123; // intentionally incorrect type + await expect(maskingParser.rehydrate(invalidMessage)).rejects.toThrow( + "The 'message' argument must be a string." + ); + }); + }); + + describe("Error Handling in PIIMaskingTransformer", () => { + it("throws an error for invalid message type in transform", () => { + const transformer = new RegexMaskingTransformer({}); + const invalidMessage: any = 123; // intentionally incorrect type + const state = new Map(); + expect(() => transformer.transform(invalidMessage, state)).toThrow( + "The 'message' argument must be a string." 
+ ); + }); + + it("throws an error for invalid state type in transform", () => { + const transformer = new RegexMaskingTransformer({}); + const message = "Some message"; + const invalidState: any = {}; // intentionally incorrect type + expect(() => transformer.transform(message, invalidState)).toThrow( + "The 'state' argument must be an instance of Map." + ); + }); + + it("throws an error when initialized with invalid regex pattern", () => { + expect(() => { + // @ts-expect-error + new RegexMaskingTransformer({ invalid: { regex: null } }); + }).toThrow("Invalid pattern configuration."); + }); + }); + + describe("MaskingParser Hooks", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + }); + }); + + it("calls onMaskingStart and onMaskingEnd hooks during parse", async () => { + const onMaskingStart = jest.fn(); + const onMaskingEnd = jest.fn(); + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onMaskingStart, + onMaskingEnd, + }); + + const message = "Contact me at jane.doe@email.com"; + await maskingParser.parse(message); + + expect(onMaskingStart).toHaveBeenCalledWith(message); + expect(onMaskingEnd).toHaveBeenCalled(); + }); + + it("calls onRehydratingStart and onRehydratingEnd hooks during rehydrate", async () => { + const onRehydratingStart = jest.fn(); + const onRehydratingEnd = jest.fn(); + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onRehydratingStart, + onRehydratingEnd, + }); + + const message = "Contact me at [email]"; + await maskingParser.parse(message); // necessary to populate the state + await maskingParser.rehydrate(message); + + expect(onRehydratingStart).toHaveBeenCalledWith(message); + expect(onRehydratingEnd).toHaveBeenCalled(); + }); + }); +}); diff --git 
a/langchain/src/experimental/masking/transformer.ts b/langchain/src/experimental/masking/transformer.ts new file mode 100644 index 000000000000..2ae89cb856d3 --- /dev/null +++ b/langchain/src/experimental/masking/transformer.ts @@ -0,0 +1,10 @@ +/** + * Abstract base class for transformers used for masking and rehydrating messages. Subclasses implement transform(), which takes a message string and an optional state map and returns a [string, Map] tuple, and rehydrate(), which takes a message string and a state map and returns a string. NOTE(review): Map is written here without type arguments, which TypeScript rejects for the built-in Map<K, V>; presumably Map<string, string> was intended - confirm against the original file. + */ +export abstract class MaskingTransformer { + abstract transform( + message: string, + state?: Map + ): [string, Map]; + abstract rehydrate(message: string, state: Map): string; +} diff --git a/langchain/src/experimental/masking/types.ts b/langchain/src/experimental/masking/types.ts new file mode 100644 index 000000000000..4bc475f768c2 --- /dev/null +++ b/langchain/src/experimental/masking/types.ts @@ -0,0 +1,29 @@ +import { MaskingTransformer } from "./transformer.js"; +/** + * Configuration type for MaskingParser. Every field is optional: a list of transformers, a default hash function, and hook callbacks named for the start and end of masking and of rehydrating. + */ + +export type MaskingParserConfig = { + transformers?: MaskingTransformer[]; + defaultHashFunction?: HashFunction; + onMaskingStart?: HookFunction; + onMaskingEnd?: HookFunction; + onRehydratingStart?: HookFunction; + onRehydratingEnd?: HookFunction; +}; + +/** + * Regex masking pattern: a required regex, plus an optional fixed replacement string and an optional mask callback that receives a string and returns a string. The tests in this change pass objects of this shape to RegexMaskingTransformer. NOTE(review): this comment previously named PIIMaskingTransformer, which is not defined anywhere in the visible code - confirm it is an outdated name for RegexMaskingTransformer. + */ +export type MaskingPattern = { + regex: RegExp; + replacement?: string; + mask?: (match: string) => string; +}; + +export type HookFunction = (message: string) => void; + +/** + * Represents a function that can hash a string input: it takes a string and returns a string. + */ +export type HashFunction = (input: string) => string;