From 078c5e66b5f619b5fec65067051eb52bc0deb873 Mon Sep 17 00:00:00 2001 From: Nolansym Date: Thu, 7 Dec 2023 01:05:06 -0500 Subject: [PATCH] langchain[minor]: Experimental Masking Module (#3548) * [Feature] Implementation of experimental masking parser/transformer * test: add perf unit test * fix: rename piitransformer to regextransformer * added example Kitchen Sink for masking parser * docs: Add documentation, nextjs example and kitchen sink example * fix: wording * docs: add basic example * fix: remove comment and return stream * feat: async hooks, immutable parser state * fix: parse -> mask * fix: || -> ?? * Fix lint, style * Fix build * Update mask.mdx --------- Co-authored-by: Dzmitry Dubarau Co-authored-by: Dzmitry A Dubarau Co-authored-by: jacoblee93 --- .../docs/modules/experimental/index.mdx | 5 + .../docs/modules/experimental/mask/mask.mdx | 34 ++ docs/core_docs/docs/modules/index.mdx | 4 + examples/src/experimental/masking/basic.ts | 30 + .../src/experimental/masking/kitchen_sink.ts | 80 +++ examples/src/experimental/masking/next.ts | 69 +++ langchain/scripts/create-entrypoints.js | 1 + langchain/src/experimental/masking/index.ts | 8 + langchain/src/experimental/masking/parser.ts | 154 +++++ .../masking/regex_masking_transformer.ts | 174 ++++++ .../masking/tests/masking-extended.test.ts | 73 +++ .../masking/tests/masking.test.ts | 542 ++++++++++++++++++ .../src/experimental/masking/transformer.ts | 14 + langchain/src/experimental/masking/types.ts | 31 + langchain/src/load/import_map.ts | 1 + 15 files changed, 1220 insertions(+) create mode 100644 docs/core_docs/docs/modules/experimental/index.mdx create mode 100644 docs/core_docs/docs/modules/experimental/mask/mask.mdx create mode 100644 examples/src/experimental/masking/basic.ts create mode 100644 examples/src/experimental/masking/kitchen_sink.ts create mode 100644 examples/src/experimental/masking/next.ts create mode 100644 langchain/src/experimental/masking/index.ts create mode 100644 
langchain/src/experimental/masking/parser.ts create mode 100644 langchain/src/experimental/masking/regex_masking_transformer.ts create mode 100644 langchain/src/experimental/masking/tests/masking-extended.test.ts create mode 100644 langchain/src/experimental/masking/tests/masking.test.ts create mode 100644 langchain/src/experimental/masking/transformer.ts create mode 100644 langchain/src/experimental/masking/types.ts diff --git a/docs/core_docs/docs/modules/experimental/index.mdx b/docs/core_docs/docs/modules/experimental/index.mdx new file mode 100644 index 000000000000..c438bd88faba --- /dev/null +++ b/docs/core_docs/docs/modules/experimental/index.mdx @@ -0,0 +1,5 @@ +--- +sidebar_position: 6 +--- + +# Experimental diff --git a/docs/core_docs/docs/modules/experimental/mask/mask.mdx b/docs/core_docs/docs/modules/experimental/mask/mask.mdx new file mode 100644 index 000000000000..2827bb708f61 --- /dev/null +++ b/docs/core_docs/docs/modules/experimental/mask/mask.mdx @@ -0,0 +1,34 @@ +# Masking + +The experimental masking parser and transformer are an extensible module for masking and rehydrating strings. One of the primary use cases for this module is to redact PII (Personally Identifiable Information) from a string before making a call to an LLM. + +### Real-world scenario + +A customer support system receives messages containing sensitive customer information. The system must parse these messages, mask any PII (like names, email addresses, and phone numbers), and log them for analysis while complying with privacy regulations. Before logging the transcript, a summary is generated using an LLM. 
+ +## Get started + +import CodeBlock from "@theme/CodeBlock"; +import ExampleBasic from "@examples/experimental/masking/basic.ts"; +import ExampleNext from "@examples/experimental/masking/next.ts"; +import ExampleKitchenSink from "@examples/experimental/masking/kitchen_sink.ts"; + +### Basic Example + +Use the RegexMaskingTransformer to create a simple mask for email addresses and phone numbers. + +<CodeBlock language="typescript">{ExampleBasic}</CodeBlock> + +:::note +If you plan on storing the masking state to rehydrate the original values asynchronously, ensure you are following best security practices. In most cases you will want to define a custom hashing and salting strategy. +::: + +### Next.js stream + +Example Next.js chat endpoint leveraging the RegexMaskingTransformer. The current chat message and chat message history are masked every time the API is called with a chat payload. + +<CodeBlock language="typescript">{ExampleNext}</CodeBlock> + +### Kitchen sink + +<CodeBlock language="typescript">{ExampleKitchenSink}</CodeBlock> diff --git a/docs/core_docs/docs/modules/index.mdx b/docs/core_docs/docs/modules/index.mdx index dfae71f5175a..fe45f86187ec 100644 --- a/docs/core_docs/docs/modules/index.mdx +++ b/docs/core_docs/docs/modules/index.mdx @@ -29,3 +29,7 @@ Persist application state between runs of a chain #### [Callbacks](/docs/modules/callbacks/) Log and stream intermediate steps of any chain + +#### [Experimental](/docs/modules/experimental/) + +Experimental modules whose abstractions have not fully settled diff --git a/examples/src/experimental/masking/basic.ts b/examples/src/experimental/masking/basic.ts new file mode 100644 index 000000000000..77a5c78de2b2 --- /dev/null +++ b/examples/src/experimental/masking/basic.ts @@ -0,0 +1,30 @@ +import { + MaskingParser, + RegexMaskingTransformer, +} from "langchain/experimental/masking"; + +// Define masking strategy +const emailMask = () => `[email-${Math.random().toString(16).slice(2)}]`; +const phoneMask = () => `[phone-${Math.random().toString(16).slice(2)}]`; + +// Configure pii transformer +const piiMaskingTransformer = new RegexMaskingTransformer({ + 
email: { regex: /\S+@\S+\.\S+/g, mask: emailMask }, + phone: { regex: /\d{3}-\d{3}-\d{4}/g, mask: phoneMask }, +}); + +const maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], +}); +maskingParser.addTransformer(piiMaskingTransformer); + +const input = + "Contact me at jane.doe@email.com or 555-123-4567. Also reach me at john.smith@email.com"; +const masked = await maskingParser.parse(input); + +console.log(masked); +// Contact me at [email-a31e486e324f6] or [phone-da8fc1584f224]. Also reach me at [email-d5b6237633d95] + +const rehydrated = maskingParser.rehydrate(masked); +console.log(rehydrated); +// Contact me at jane.doe@email.com or 555-123-4567. Also reach me at john.smith@email.com diff --git a/examples/src/experimental/masking/kitchen_sink.ts b/examples/src/experimental/masking/kitchen_sink.ts new file mode 100644 index 000000000000..07e85e7fc50f --- /dev/null +++ b/examples/src/experimental/masking/kitchen_sink.ts @@ -0,0 +1,80 @@ +import { + MaskingParser, + RegexMaskingTransformer, +} from "langchain/experimental/masking"; + +// A simple hash function for demonstration purposes +function simpleHash(input: string): string { + let hash = 0; + for (let i = 0; i < input.length; i += 1) { + const char = input.charCodeAt(i); + hash = (hash << 5) - hash + char; + hash |= 0; // Convert to 32bit integer + } + return hash.toString(16); +} + +const emailMask = (match: string) => `[email-${simpleHash(match)}]`; +const phoneMask = (match: string) => `[phone-${simpleHash(match)}]`; +const nameMask = (match: string) => `[name-${simpleHash(match)}]`; +const ssnMask = (match: string) => `[ssn-${simpleHash(match)}]`; +const creditCardMask = (match: string) => `[creditcard-${simpleHash(match)}]`; +const passportMask = (match: string) => `[passport-${simpleHash(match)}]`; +const licenseMask = (match: string) => `[license-${simpleHash(match)}]`; +const addressMask = (match: string) => `[address-${simpleHash(match)}]`; +const dobMask = (match: string) 
=> `[dob-${simpleHash(match)}]`; +const bankAccountMask = (match: string) => `[bankaccount-${simpleHash(match)}]`; + +// Regular expressions for different types of PII +const patterns = { + email: { regex: /\S+@\S+\.\S+/g, mask: emailMask }, + phone: { regex: /\b\d{3}-\d{3}-\d{4}\b/g, mask: phoneMask }, + name: { regex: /\b[A-Z][a-z]+ [A-Z][a-z]+\b/g, mask: nameMask }, + ssn: { regex: /\b\d{3}-\d{2}-\d{4}\b/g, mask: ssnMask }, + creditCard: { regex: /\b(?:\d{4}[ -]?){3}\d{4}\b/g, mask: creditCardMask }, + passport: { regex: /(?i)\b[A-Z]{1,2}\d{6,9}\b/g, mask: passportMask }, + license: { regex: /(?i)\b[A-Z]{1,2}\d{6,8}\b/g, mask: licenseMask }, + address: { + regex: /\b\d{1,5}\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)\*\b/g, + mask: addressMask, + }, + dob: { regex: /\b\d{4}-\d{2}-\d{2}\b/g, mask: dobMask }, + bankAccount: { regex: /\b\d{8,17}\b/g, mask: bankAccountMask }, +}; + +// Create a RegexMaskingTransformer with multiple patterns +const piiMaskingTransformer = new RegexMaskingTransformer(patterns); + +// Hooks for different stages of masking and rehydrating +const onMaskingStart = (message: string) => + console.log(`Starting to mask message: ${message}`); +const onMaskingEnd = (maskedMessage: string) => + console.log(`Masked message: ${maskedMessage}`); +const onRehydratingStart = (message: string) => + console.log(`Starting to rehydrate message: ${message}`); +const onRehydratingEnd = (rehydratedMessage: string) => + console.log(`Rehydrated message: ${rehydratedMessage}`); + +// Initialize MaskingParser with the transformer and hooks +const maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onMaskingStart, + onMaskingEnd, + onRehydratingStart, + onRehydratingEnd, +}); + +// Example message containing multiple types of PII +const message = + "Contact Jane Doe at jane.doe@email.com or 555-123-4567. Her SSN is 123-45-6789 and her credit card number is 1234-5678-9012-3456. 
Passport number: AB1234567, Driver's License: X1234567, Address: 123 Main St, Date of Birth: 1990-01-01, Bank Account: 12345678901234567."; + +// Mask and rehydrate the message +maskingParser + .parse(message) + .then((maskedMessage: string) => { + console.log(`Masked message: ${maskedMessage}`); + return maskingParser.rehydrate(maskedMessage); + }) + .then((rehydratedMessage: string) => { + console.log(`Final rehydrated message: ${rehydratedMessage}`); + }); diff --git a/examples/src/experimental/masking/next.ts b/examples/src/experimental/masking/next.ts new file mode 100644 index 000000000000..85621a4b8dca --- /dev/null +++ b/examples/src/experimental/masking/next.ts @@ -0,0 +1,69 @@ +// app/api/chat + +import { + MaskingParser, + RegexMaskingTransformer, +} from "langchain/experimental/masking"; +import { PromptTemplate } from "langchain/prompts"; +import { ChatOpenAI } from "langchain/chat_models/openai"; +import { BytesOutputParser } from "langchain/schema/output_parser"; + +export const runtime = "edge"; + +// Function to format chat messages for consistency +const formatMessage = (message: any) => `${message.role}: ${message.content}`; + +const CUSTOMER_SUPPORT = `You are a customer support summarizer agent. Always include masked PII in your response. + Current conversation: + {chat_history} + User: {input} + AI:`; + +// Configure Masking Parser +const maskingParser = new MaskingParser(); +// Define transformations for masking emails and phone numbers using regular expressions +const piiMaskingTransformer = new RegexMaskingTransformer({ + email: { regex: /\S+@\S+\.\S+/g }, // If a regex is provided without a mask we fallback to a simple default hashing function + phone: { regex: /\d{3}-\d{3}-\d{4}/g }, +}); + +maskingParser.addTransformer(piiMaskingTransformer); + +export async function POST(req: Request) { + try { + const body = await req.json(); + const messages = body.messages ?? 
[]; + const formattedPreviousMessages = messages.slice(0, -1).map(formatMessage); + const currentMessageContent = messages[messages.length - 1].content; // Extract the content of the last message + // Mask sensitive information in the current message + const guardedMessageContent = await maskingParser.parse( + currentMessageContent + ); + // Mask sensitive information in the chat history + const guardedHistory = await maskingParser.parse( + formattedPreviousMessages.join("\n") + ); + + const prompt = PromptTemplate.fromTemplate(CUSTOMER_SUPPORT); + const model = new ChatOpenAI({ temperature: 0.8 }); + // Initialize an output parser that handles serialization and byte-encoding for streaming + const outputParser = new BytesOutputParser(); + const chain = prompt.pipe(model).pipe(outputParser); // Chain the prompt, model, and output parser together + + console.log("[GUARDED INPUT]", guardedMessageContent); // Contact me at -1157967895 or -1626926859. + console.log("[GUARDED HISTORY]", guardedHistory); // user: Contact me at -1157967895 or -1626926859. assistant: Thank you for providing your contact information. 
+ console.log("[STATE]", maskingParser.getState()); // { '-1157967895' => 'jane.doe@email.com', '-1626926859' => '555-123-4567'} + + // Stream the AI response based on the masked chat history and current message + const stream = await chain.stream({ + chat_history: guardedHistory, + input: guardedMessageContent, + }); + + return new Response(stream, { + headers: { "content-type": "text/plain; charset=utf-8" }, + }); + } catch (e: any) { + return Response.json({ error: e.message }, { status: 500 }); + } +} diff --git a/langchain/scripts/create-entrypoints.js b/langchain/scripts/create-entrypoints.js index 65cc773bdf0c..e5bb0b7c3b2e 100644 --- a/langchain/scripts/create-entrypoints.js +++ b/langchain/scripts/create-entrypoints.js @@ -325,6 +325,7 @@ const entrypoints = { "experimental/hubs/makersuite/googlemakersuitehub", "experimental/chains/violation_of_expectations": "experimental/chains/violation_of_expectations/index", + "experimental/masking": "experimental/masking/index", "experimental/tools/pyinterpreter": "experimental/tools/pyinterpreter", // evaluation evaluation: "evaluation/index", diff --git a/langchain/src/experimental/masking/index.ts b/langchain/src/experimental/masking/index.ts new file mode 100644 index 000000000000..6aca47bb0581 --- /dev/null +++ b/langchain/src/experimental/masking/index.ts @@ -0,0 +1,8 @@ +export { MaskingParser } from "./parser.js"; +export { RegexMaskingTransformer } from "./regex_masking_transformer.js"; +export { MaskingTransformer } from "./transformer.js"; +export { + type MaskingParserConfig, + type HashFunction, + type HookFunction, +} from "./types.js"; diff --git a/langchain/src/experimental/masking/parser.ts b/langchain/src/experimental/masking/parser.ts new file mode 100644 index 000000000000..7592775d3e78 --- /dev/null +++ b/langchain/src/experimental/masking/parser.ts @@ -0,0 +1,154 @@ +import { MaskingTransformer } from "./transformer.js"; +import type { MaskingParserConfig } from "./types.js"; + +/** + * 
MaskingParser class for handling the masking and rehydrating of messages. + */ +export class MaskingParser { + private transformers: MaskingTransformer[]; + + private state: Map; + + private config: MaskingParserConfig; + + constructor(config: MaskingParserConfig = {}) { + this.transformers = config.transformers ?? []; + this.state = new Map(); + this.config = config; + } + + /** + * Adds a transformer to the parser. + * @param transformer - An instance of a class extending MaskingTransformer. + */ + addTransformer(transformer: MaskingTransformer) { + this.transformers.push(transformer); + } + + /** + * Getter method for retrieving the current state. + * @returns The current state map. + */ + public getState(): Map { + return this.state; + } + + /** + * Masks the provided message using the added transformers. + * This method sequentially applies each transformer's masking logic to the message. + * It utilizes a state map to track original values corresponding to their masked versions. + * + * @param message - The message to be masked. + * @returns A masked version of the message. + * @throws {TypeError} If the message is not a string. + * @throws {Error} If no transformers are added. + */ + async mask(message: string): Promise { + // If onMaskingStart is a function, handle it accordingly + if (this.config.onMaskingStart) { + await this.config.onMaskingStart(message); + } + + // Check if there are any transformers added to the parser. If not, throw an error + // as masking requires at least one transformer to apply its logic. + if (this.transformers.length === 0) { + throw new Error( + "MaskingParser.mask Error: No transformers have been added. Please add at least one transformer before parsing." + ); + } + + if (typeof message !== "string") { + throw new TypeError( + "MaskingParser.mask Error: The 'message' argument must be a string." + ); + } + + // Initialize the variable to hold the progressively masked message. 
+ // It starts as the original message and gets transformed by each transformer. + let processedMessage = message; + + // Iterate through each transformer and apply their transform method. + for (const transformer of this.transformers) { + // Transform the message and get the transformer's state changes, ensuring no direct mutation of the shared state. + const [transformedMessage, transformerState] = + await transformer.transform(processedMessage, new Map(this.state)); + + // Update the processed message for subsequent transformers. + processedMessage = transformedMessage; + + // Merge state changes from the transformer into the parser's state. + // This accumulates all transformations' effects on the state. + transformerState.forEach((value, key) => this.state.set(key, value)); + } + + // Handle onMaskingEnd callback + if (this.config.onMaskingEnd) { + await this.config.onMaskingEnd(processedMessage); + } + // Return the fully masked message after all transformers have been applied. + return processedMessage; + } + + /** + * Rehydrates a masked message back to its original form. + * This method sequentially applies the rehydration logic of each added transformer in reverse order. + * It relies on the state map to correctly map the masked values back to their original values. + * + * The rehydration process is essential for restoring the original content of a message + * that has been transformed (masked) by the transformers. This process is the inverse of the masking process. + * + * @param message - The masked message to be rehydrated. + * @returns The original (rehydrated) version of the message. + */ + async rehydrate( + message: string, + state?: Map + ): Promise { + // Handle onRehydratingStart callback + if (this.config.onRehydratingStart) { + await this.config.onRehydratingStart(message); + } + + if (typeof message !== "string") { + throw new TypeError( + "MaskingParser.rehydrate Error: The 'message' argument must be a string." 
+ ); + } + // Check if any transformers have been added to the parser. + // If no transformers are present, throw an error as rehydration requires at least one transformer. + if (this.transformers.length === 0) { + throw new Error( + "MaskingParser.rehydrate Error: No transformers have been added. Please add at least one transformer before rehydrating." + ); + } + + // eslint-disable-next-line no-instanceof/no-instanceof + if (state && !(state instanceof Map)) { + throw new TypeError( + "MaskingParser.rehydrate Error: The 'state' argument, if provided, must be an instance of Map." + ); + } + + const rehydrationState = state || this.state; // Use provided state or fallback to internal state + // Initialize the rehydratedMessage with the input masked message. + // This variable will undergo rehydration by each transformer in reverse order. + let rehydratedMessage = message; + // Use a reverse for...of loop to accommodate asynchronous rehydrate methods + const reversedTransformers = this.transformers.slice().reverse(); + for (const transformer of reversedTransformers) { + // Check if the result is a Promise and use await, otherwise use it directly + rehydratedMessage = await transformer.rehydrate( + rehydratedMessage, + rehydrationState + ); + } + + // Handle onRehydratingEnd callback + if (this.config.onRehydratingEnd) { + await this.config.onRehydratingEnd(rehydratedMessage); + } + + // Return the fully rehydrated message after all transformers have been applied. 
+ return rehydratedMessage; + } +} diff --git a/langchain/src/experimental/masking/regex_masking_transformer.ts b/langchain/src/experimental/masking/regex_masking_transformer.ts new file mode 100644 index 000000000000..c52a8b6f83f4 --- /dev/null +++ b/langchain/src/experimental/masking/regex_masking_transformer.ts @@ -0,0 +1,174 @@ +import { MaskingTransformer } from "./transformer.js"; +import type { HashFunction, MaskingPattern } from "./types.js"; +/** + * RegexMaskingTransformer class for masking and rehydrating messages with Regex. + */ +export class RegexMaskingTransformer extends MaskingTransformer { + private patterns: { [key: string]: MaskingPattern }; + + private hashFunction: HashFunction; + + /** + * Constructs a RegexMaskingTransformer with given patterns and an optional hash function. + * Validates the provided patterns to ensure they conform to the expected structure. + * + * @param patterns - An object containing masking patterns. Each pattern should include + * a regular expression (`regex`) and optionally a `replacement` string + * or a `mask` function. + * @param hashFunction - An optional custom hash function to be used for masking. + */ + constructor( + patterns: { [key: string]: MaskingPattern }, + hashFunction?: HashFunction + ) { + super(); + // Validates the provided masking patterns before initializing the transformer. + // This ensures that each pattern has a valid regular expression. + this.validatePatterns(patterns); + + // Assigns the validated patterns and the hash function to the transformer. + // If no custom hash function is provided, the default hash function is used. + this.patterns = patterns; + this.hashFunction = hashFunction || this.defaultHashFunction; + } + + /** + * Validates the given masking patterns to ensure each pattern has a valid regular expression. + * Throws an error if any pattern is found to be invalid. + * + * @param patterns - The patterns object to validate. 
+ */ + private validatePatterns(patterns: { [key: string]: MaskingPattern }) { + for (const key of Object.keys(patterns)) { + const pattern = patterns[key]; + // Checks that each pattern is an object and has a regex property that is an instance of RegExp. + // Throws an error if these conditions are not met, indicating an invalid pattern configuration. + if ( + !pattern || + typeof pattern !== "object" || + // eslint-disable-next-line no-instanceof/no-instanceof + !(pattern.regex instanceof RegExp) + ) { + throw new Error("Invalid pattern configuration."); + } + } + } + + /** + * Masks content in a message based on the defined patterns. + * @param message - The message to be masked. + * @param state - The current state containing original values. + * @returns A tuple of the masked message and the updated state. + */ + async transform( + message: string, + state: Map + ): Promise<[string, Map]> { + if (typeof message !== "string") { + throw new TypeError( + "RegexMaskingTransformer.transform Error: The 'message' argument must be a string." + ); + } + + // eslint-disable-next-line no-instanceof/no-instanceof + if (!(state instanceof Map)) { + throw new TypeError( + "RegexMaskingTransformer.transform Error: The 'state' argument must be an instance of Map." + ); + } + + // Holds the progressively masked message + let processedMessage = message; + + // Initialize original values map with the current state or a new map + const originalValues = state || new Map(); + + // Iterate over each pattern defined in the transformer + for (const key of Object.keys(this.patterns)) { + const pattern = this.patterns[key]; + + // Apply the current pattern's regex to the message + processedMessage = processedMessage.replace(pattern.regex, (match) => { + // Determine the masked value: use the mask function if provided, else use the replacement string, + // else use the hash function. + const maskedValue = pattern.mask + ? pattern.mask(match) + : pattern.replacement ?? 
this.hashFunction(match); + + // Store the mapping of the masked value to the original value (match) + originalValues.set(maskedValue, match); + + // Return the masked value to replace the original value in the message + return maskedValue; + }); + } + + // Return the fully masked message and the state map with all original values + // Wrap the synchronous return values in Promise.resolve() to maintain compatibility + // with the MaskingParser's expectation of a Promise return type. + return [processedMessage, originalValues]; + } + + /** + * Rehydrates a masked message back to its original form using the provided state. + * @param message - The masked message to be rehydrated. + * @param state - The state map containing mappings of masked values to their original values. + * @returns The rehydrated (original) message. + */ + async rehydrate( + message: string, + state: Map + ): Promise { + if (typeof message !== "string") { + throw new TypeError( + "RegexMaskingTransformer.rehydrate Error: The 'message' argument must be a string." + ); + } + + // eslint-disable-next-line no-instanceof/no-instanceof + if (!(state instanceof Map)) { + throw new TypeError( + "RegexMaskingTransformer.rehydrate Error: The 'state' argument must be an instance of Map." + ); + } + + // Convert the state map to an array and use reduce to sequentially replace masked values with original values. + const rehydratedMessage = Array.from(state).reduce( + (msg, [masked, original]) => { + // Escape special characters in the masked string to ensure it can be used in a regular expression safely. + // This is necessary because masked values might contain characters that have special meanings in regex. + const escapedMasked = masked.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + + // Replace all instances of the escaped masked value in the message with the original value. + // The 'g' flag in the RegExp ensures that all occurrences of the masked value are replaced. 
+ return msg.replace(new RegExp(escapedMasked, "g"), original); + }, + message + ); + + return rehydratedMessage; + } + + /** + * Default hash function for creating unique hash values. + * @param input - The input string to hash. + * @returns The resulting hash as a string. + */ + private defaultHashFunction(input: string): string { + let hash = 0; + // Iterate over each character in the input string + for (let i = 0; i < input.length; i += 1) { + // Get ASCII value of the character + const char = input.charCodeAt(i); + // Combine the current hash with the new character and ensure it remains a 32-bit integer + hash = (hash << 5) - hash + char; + // Bitwise OR operation to convert to a 32-bit integer. + // This is a common technique to ensure the final hash value stays within the 32-bit limit, + // effectively wrapping the value when it becomes too large. + hash |= 0; + } + + // Convert the numerical hash value to a string and return + return hash.toString(); + } +} diff --git a/langchain/src/experimental/masking/tests/masking-extended.test.ts b/langchain/src/experimental/masking/tests/masking-extended.test.ts new file mode 100644 index 000000000000..8342063eae1c --- /dev/null +++ b/langchain/src/experimental/masking/tests/masking-extended.test.ts @@ -0,0 +1,73 @@ +// yarn test:single src/experimental/masking/tests/masking-extended.test.ts +import { MaskingParser, RegexMaskingTransformer } from "../index.js"; + +// Mock database for simulating state storage and retrieval +const mockDB = (() => { + const db = new Map(); + return { + async saveState(key: string, serializedState: string) { + db.set(key, serializedState); + }, + async getState(key: string): Promise { + return db.get(key) || ""; + }, + }; +})(); + +function serializeState(state: Map): string { + return JSON.stringify(Array.from(state.entries())); +} + +function deserializeState(serializedState: string): Map { + return new Map(JSON.parse(serializedState)); +} + +describe("MaskingParser Integration Test", 
() => { + let parser: MaskingParser; + let transformer: RegexMaskingTransformer; + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + const phonePattern = { regex: /\d{3}-\d{3}-\d{4}/, replacement: "[phone]" }; + + beforeEach(() => { + transformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + parser = new MaskingParser(); + parser.addTransformer(transformer); + }); + + it("should mask, store state, and rehydrate with altered order", async () => { + const originalMessage = "Contact me at jane.doe@email.com or 555-123-4567."; + const maskedMessage = await parser.mask(originalMessage); + + // Serialize and store the state + const serializedState = serializeState(parser.getState()); + await mockDB.saveState("uniqueMessageId", serializedState); + + // Simulate retrieving and altering the masked message + // Here, we assume the AI processing reverses the order of masked content + // Simulate retrieving and altering the masked message + const alteredMaskedMessage = maskedMessage.split(" ").reverse().join(" "); + + // Retrieve and deserialize the state + const retrievedSerializedState = await mockDB.getState("uniqueMessageId"); + const retrievedState = deserializeState(retrievedSerializedState); + + // Rehydrate the altered message + const rehydratedMessage = await parser.rehydrate( + alteredMaskedMessage, + retrievedState + ); + + // The expectation depends on how the alteration affects the masked message. + // Here, we assume that the rehydrated message should match the original message + // even after the alteration since the masked content still aligns with the stored state. 
+ const expectedRehydratedMessage = originalMessage + .split(" ") + .reverse() + .join(" "); + expect(rehydratedMessage).toEqual(expectedRehydratedMessage); + }); +}); diff --git a/langchain/src/experimental/masking/tests/masking.test.ts b/langchain/src/experimental/masking/tests/masking.test.ts new file mode 100644 index 000000000000..a829a75d4f92 --- /dev/null +++ b/langchain/src/experimental/masking/tests/masking.test.ts @@ -0,0 +1,542 @@ +/* eslint-disable no-promise-executor-return */ +/* eslint-disable @typescript-eslint/no-explicit-any */ +// yarn test:single src/experimental/masking/tests/masking.test.ts +import { jest } from "@jest/globals"; +import { + MaskingParser, + RegexMaskingTransformer, + MaskingTransformer, +} from "../index.js"; + +describe("MaskingParser and PIIMaskingTransformer", () => { + describe("Masking with Static Identifiers", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + const phonePattern = { regex: /\d{3}-\d{3}-\d{4}/, replacement: "[phone]" }; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + it("masks single occurrences of PII with static identifiers", async () => { + const message = "Contact me at jane.doe@email.com or 555-123-4567."; + const expectedMaskedMessage = "Contact me at [email] or [phone]."; + + const maskedMessage = await maskingParser.mask(message); + + expect(maskedMessage).toBe(expectedMaskedMessage); + }); + + it("rehydrates static masked data to its original form", async () => { + const maskedMessage = "Contact me at [email] or [phone]."; + const expectedOriginalMessage = + "Contact me at jane.doe@email.com or 555-123-4567."; + + await maskingParser.mask(expectedOriginalMessage); // Masking original 
message + const rehydratedMessage = await maskingParser.rehydrate(maskedMessage); + + expect(rehydratedMessage).toBe(expectedOriginalMessage); + }); + + function generateLargeMessage() { + let largeMessage = ""; + for (let i = 0; i < 10000; i += 1) { + // Adjust the number for desired message size + largeMessage += `User${i}: jane.doe${i}@email.com, 555-123-${i + .toString() + .padStart(4, "0")}. `; + } + return largeMessage; + } + + describe("Performance Testing", () => { + it("efficiently processes large data sets", async () => { + const largeMessage = generateLargeMessage(); + const startTime = performance.now(); + const maskedMessage = await maskingParser.mask(largeMessage); + const endTime = performance.now(); + + const someAcceptableDuration = 5000; // Set this to a duration you consider acceptable, e.g., 5000 milliseconds (5 seconds) + + expect(maskedMessage).toBeDefined(); + expect(endTime - startTime).toBeLessThan(someAcceptableDuration); + }); + }); + }); + + describe("Masking with Dynamic Identifiers", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailMask = () => `[email-${Math.random().toString(16).slice(2)}]`; + const phoneMask = () => `[phone-${Math.random().toString(16).slice(2)}]`; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: { regex: /\S+@\S+\.\S+/g, mask: emailMask }, + phone: { regex: /\d{3}-\d{3}-\d{4}/g, mask: phoneMask }, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + it("masks multiple occurrences of different PII with unique identifiers", async () => { + const message = + "Contact me at jane.doe@email.com or 555-123-4567. 
Also reach me at john.smith@email.com"; + const maskedMessage = await maskingParser.mask(message); + + expect(maskedMessage).toMatch(/\[email-[a-f0-9]+\]/g); + expect(maskedMessage).toMatch(/\[phone-[a-f0-9]+\]/g); + expect((maskedMessage.match(/\[email-[a-f0-9]+\]/g) || []).length).toBe( + 2 + ); + expect((maskedMessage.match(/\[phone-[a-f0-9]+\]/g) || []).length).toBe( + 1 + ); + }); + + it("rehydrates dynamic masked data to its original form", async () => { + const originalMessage = + "Contact me at jane.doe@email.com or 555-123-4567. Also reach me at john.smith@email.com"; + const maskedMessage = await maskingParser.mask(originalMessage); + const rehydratedMessage = await maskingParser.rehydrate(maskedMessage); + + expect(rehydratedMessage).toBe(originalMessage); + }); + + it("masks identical PII with consistent dynamic identifiers", async () => { + const message = + "Contact me at jane.doe@email.com or 555-123-4567. Also reach me at john.smith@email.com and 555-123-4567"; + const maskedMessage = await maskingParser.mask(message); + + expect(maskedMessage).toMatch(/\[email-[a-f0-9]+\]/g); + expect(maskedMessage).toMatch(/\[phone-[a-f0-9]+\]/g); + expect((maskedMessage.match(/\[email-[a-f0-9]+\]/g) || []).length).toBe( + 2 + ); + expect((maskedMessage.match(/\[phone-[a-f0-9]+\]/g) || []).length).toBe( + 2 + ); + }); + }); + + describe("PIIMaskingTransformer with Default Hash Function", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + const phonePattern = { regex: /\d{3}-\d{3}-\d{4}/, replacement: "[phone]" }; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + it("should mask email and phone using default hash function", async () => { + const 
piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + const maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + + const message = + "My email is jane.doe@email.com and phone is 555-123-4567."; + const maskedMessage = await maskingParser.mask(message); + + expect(maskedMessage).toContain("[email]"); + expect(maskedMessage).toContain("[phone]"); + }); + }); + + describe("PIIMaskingTransformer with Custom Hash Function", () => { + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + const phonePattern = { regex: /\d{3}-\d{3}-\d{4}/, replacement: "[phone]" }; + + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + phone: phonePattern, + }); + + maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + }); + + // A simple hash function that creates a mock hash representation of the input. + // This is just for demonstration purposes and not a secure hashing method. + const customHashFunction = (input: string) => + input + .split("") + .map(() => "*") + .join(""); + it("should mask email and phone using custom hash function", async () => { + const piiMaskingTransformer = new RegexMaskingTransformer( + { + email: { + regex: /\S+@\S+\.\S+/, + mask: (match) => `custom-email-${customHashFunction(match)}`, + }, + phone: { + regex: /\d{3}-\d{3}-\d{4}/, + mask: (match) => `custom-phone-${customHashFunction(match)}`, + }, + }, + customHashFunction + ); + + const maskingParser = new MaskingParser(); + maskingParser.addTransformer(piiMaskingTransformer); + + const message = "Contact me at jane.doe@email.com or 555-123-4567."; + const maskedMessage = await maskingParser.mask(message); + + // The lengths of the masked parts should be equal to the lengths of the original email and phone number. 
+ const expectedEmailMask = `custom-email-${"*".repeat( + "jane.doe@email.com".length + )}`; + const expectedPhoneMask = `custom-phone-${"*".repeat( + "555-123-4567".length + )}`; + + expect(maskedMessage).toContain(expectedEmailMask); + expect(maskedMessage).toContain(expectedPhoneMask); + }); + + it("should rehydrate masked data correctly using custom hash function", async () => { + const piiMaskingTransformer = new RegexMaskingTransformer( + { + email: { + regex: /\S+@\S+\.\S+/, + mask: (match) => `custom-email-${customHashFunction(match)}`, + }, + phone: { + regex: /\d{3}-\d{3}-\d{4}/, + mask: (match) => `custom-phone-${customHashFunction(match)}`, + }, + }, + customHashFunction + ); + + maskingParser.addTransformer(piiMaskingTransformer); + + const originalMessage = + "Contact me at jane.doe@email.com or 555-123-4567."; + const maskedMessage = await maskingParser.mask(originalMessage); + const rehydratedMessage = await maskingParser.rehydrate(maskedMessage); + + expect(rehydratedMessage).toBe(originalMessage); + }); + }); + + describe("Error Handling in MaskingParser", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + + beforeEach(() => { + piiMaskingTransformer = new RegexMaskingTransformer({}); + maskingParser = new MaskingParser(); + }); + + it("throws an error when no transformers are added and parse is called", async () => { + const message = "Some message"; + await expect(maskingParser.mask(message)).rejects.toThrow( + "MaskingParser.mask Error: No transformers have been added. Please add at least one transformer before parsing." + ); + }); + + it("throws an error when no transformers are added and rehydrate is called", async () => { + const message = "Some masked message"; + await expect(maskingParser.rehydrate(message)).rejects.toThrow( + "MaskingParser.rehydrate Error: No transformers have been added. Please add at least one transformer before rehydrating." 
+ ); + }); + + it("throws an error for invalid message type in parse", async () => { + const invalidMessage: any = 123; // intentionally incorrect type + maskingParser.addTransformer(piiMaskingTransformer); // Add a transformer + await expect(maskingParser.mask(invalidMessage)).rejects.toThrow( + "The 'message' argument must be a string." + ); + }); + + it("throws an error for invalid message type in rehydrate", async () => { + const invalidMessage: any = 123; // intentionally incorrect type + await expect(maskingParser.rehydrate(invalidMessage)).rejects.toThrow( + "The 'message' argument must be a string." + ); + }); + }); + + describe("Error Handling in PIIMaskingTransformer", () => { + it("throws an error for invalid message type in transform", async () => { + const transformer = new RegexMaskingTransformer({}); + const invalidMessage: any = 123; // intentionally incorrect type + const state = new Map(); + await expect( + transformer.transform(invalidMessage, state) + ).rejects.toThrow("The 'message' argument must be a string."); + }); + + it("throws an error for invalid state type in transform", async () => { + const transformer = new RegexMaskingTransformer({}); + const message = "Some message"; + const invalidState: any = {}; // intentionally incorrect type + await expect( + transformer.transform(message, invalidState) + ).rejects.toThrow("The 'state' argument must be an instance of Map."); + }); + + it("throws an error when initialized with invalid regex pattern", () => { + expect(() => { + const transformer = new RegexMaskingTransformer({ + // @ts-expect-error Should throw with invalid regex + invalid: { regex: null }, + }); + console.log(transformer); + }).toThrow("Invalid pattern configuration."); + }); + }); + + describe("MaskingParser Hooks", () => { + let maskingParser: MaskingParser; + let piiMaskingTransformer: RegexMaskingTransformer; + const emailPattern = { regex: /\S+@\S+\.\S+/, replacement: "[email]" }; + + beforeEach(() => { + 
piiMaskingTransformer = new RegexMaskingTransformer({ + email: emailPattern, + }); + }); + + // Masking hooks + it("handles synchronous onMaskingStart and onMaskingEnd hooks during parse", async () => { + const onMaskingStart = jest.fn(); // Synchronous mock + const onMaskingEnd = jest.fn(); // Synchronous mock + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onMaskingStart, + onMaskingEnd, + }); + + const message = "Contact me at jane.doe@email.com"; + await maskingParser.mask(message); + + expect(onMaskingStart).toHaveBeenCalledWith(message); + expect(onMaskingEnd).toHaveBeenCalled(); + }); + + it("handles asynchronous onMaskingStart and onMaskingEnd hooks during parse", async () => { + const onMaskingStart = jest.fn(() => Promise.resolve()); // Correctly mocked as an async function + const onMaskingEnd = jest.fn(() => Promise.resolve()); // Correctly mocked as an async function + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onMaskingStart, + onMaskingEnd, + }); + + const message = "Contact me at jane.doe@email.com"; + await maskingParser.mask(message); + + expect(onMaskingStart).toHaveBeenCalledWith(message); + expect(onMaskingEnd).toHaveBeenCalled(); + }); + + it("handles errors in synchronous onMaskingStart and onMaskingEnd hooks during parse", async () => { + const error = new Error("Test Error"); + const onMaskingStart = jest.fn(() => { + throw error; + }); // Synchronous mock that throws an error + const onMaskingEnd = jest.fn(() => { + throw error; + }); // Synchronous mock that throws an error + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onMaskingStart, + onMaskingEnd, + }); + + const message = "Contact me at jane.doe@email.com"; + await expect(maskingParser.mask(message)).rejects.toThrow(error); + + expect(onMaskingStart).toHaveBeenCalledWith(message); + // onMaskingEnd should not be called because an error is thrown in onMaskingStart + 
expect(onMaskingEnd).not.toHaveBeenCalled(); + }); + + it("handles errors in asynchronous onMaskingStart and onMaskingEnd hooks during parse", async () => { + const error = new Error("Test Error"); + const onMaskingStart = jest.fn(() => Promise.reject(error)); // Asynchronous mock that rejects with an error + const onMaskingEnd = jest.fn(() => Promise.reject(error)); // Asynchronous mock that rejects with an error + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onMaskingStart, + onMaskingEnd, + }); + + const message = "Contact me at jane.doe@email.com"; + await expect(maskingParser.mask(message)).rejects.toThrow(error); + + expect(onMaskingStart).toHaveBeenCalledWith(message); + // onMaskingEnd should not be called because an error is thrown in onMaskingStart + expect(onMaskingEnd).not.toHaveBeenCalled(); + }); + + // Rehydration hooks + it("handles synchronous onRehydratingStart and onRehydratingEnd hooks during rehydrate", async () => { + const onRehydratingStart = jest.fn(); // Synchronous mock + const onRehydratingEnd = jest.fn(); // Synchronous mock + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onRehydratingStart, + onRehydratingEnd, + }); + + const maskedMessage = await maskingParser.mask( + "Contact me at jane.doe@email.com" + ); + await maskingParser.rehydrate(maskedMessage); + + expect(onRehydratingStart).toHaveBeenCalledWith(maskedMessage); + expect(onRehydratingEnd).toHaveBeenCalled(); + }); + + it("handles asynchronous onRehydratingStart and onRehydratingEnd hooks during rehydrate", async () => { + const onRehydratingStart = jest.fn(() => Promise.resolve()); // Asynchronous mock + const onRehydratingEnd = jest.fn(() => Promise.resolve()); // Asynchronous mock + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onRehydratingStart, + onRehydratingEnd, + }); + + const maskedMessage = await maskingParser.mask( + "Contact me at jane.doe@email.com" + ); + 
await maskingParser.rehydrate(maskedMessage); + + expect(onRehydratingStart).toHaveBeenCalledWith(maskedMessage); + expect(onRehydratingEnd).toHaveBeenCalled(); + }); + + it("handles errors in synchronous onRehydratingStart and onRehydratingEnd hooks during rehydrate", async () => { + const error = new Error("Test Error"); + const onRehydratingStart = jest.fn(() => { + throw error; + }); // Synchronous mock that throws an error + const onRehydratingEnd = jest.fn(() => { + throw error; + }); // Synchronous mock that throws an error + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onRehydratingStart, + onRehydratingEnd, + }); + + const maskedMessage = await maskingParser.mask( + "Contact me at jane.doe@email.com" + ); + await expect(maskingParser.rehydrate(maskedMessage)).rejects.toThrow( + error + ); + + expect(onRehydratingStart).toHaveBeenCalledWith(maskedMessage); + // onRehydratingEnd should not be called because an error is thrown in onRehydratingStart + expect(onRehydratingEnd).not.toHaveBeenCalled(); + }); + + it("handles errors in asynchronous onRehydratingStart and onRehydratingEnd hooks during rehydrate", async () => { + const error = new Error("Test Error"); + const onRehydratingStart = jest.fn(() => Promise.reject(error)); // Asynchronous mock that rejects with an error + const onRehydratingEnd = jest.fn(() => Promise.reject(error)); // Asynchronous mock that rejects with an error + + maskingParser = new MaskingParser({ + transformers: [piiMaskingTransformer], + onRehydratingStart, + onRehydratingEnd, + }); + + const maskedMessage = await maskingParser.mask( + "Contact me at jane.doe@email.com" + ); + await expect(maskingParser.rehydrate(maskedMessage)).rejects.toThrow( + error + ); + + expect(onRehydratingStart).toHaveBeenCalledWith(maskedMessage); + // onRehydratingEnd should not be called because an error is thrown in onRehydratingStart + expect(onRehydratingEnd).not.toHaveBeenCalled(); + }); + }); + + 
describe("MaskingParser with Asynchronous Transformers", () => { + let maskingParser: MaskingParser; + let asyncTransformer: MaskingTransformer; + + beforeEach(() => { + // Mock an asynchronous transformer + asyncTransformer = { + async transform(message, state) { + // Simulate an asynchronous operation + await new Promise((resolve) => setTimeout(resolve, 100)); + // Return transformed message and updated state + const transformedMessage = message.replace( + /sensitiveData/g, + "[REDACTED]" + ); + const newState = new Map(state).set( + "redacted", + "sensitive string :(" + ); + return [transformedMessage, newState]; + }, + // Mock or placeholder rehydrate method + async rehydrate(message, _state) { + return message; + }, + }; + + maskingParser = new MaskingParser({ + transformers: [asyncTransformer], + // Add other configurations if necessary + }); + }); + + it("properly handles asynchronous transformations and state updates", async () => { + const originalMessage = + "This message contains sensitiveData that should be redacted."; + const transformedMessage = await maskingParser.mask(originalMessage); + + // Check if the message is transformed correctly + expect(transformedMessage).toBe( + "This message contains [REDACTED] that should be redacted." + ); + + // Check if the state is updated correctly + expect(maskingParser.getState().get("redacted")).toBe( + "sensitive string :(" + ); + }); + }); +}); diff --git a/langchain/src/experimental/masking/transformer.ts b/langchain/src/experimental/masking/transformer.ts new file mode 100644 index 000000000000..05582df2ea8a --- /dev/null +++ b/langchain/src/experimental/masking/transformer.ts @@ -0,0 +1,14 @@ +/** + * Abstract class representing a transformer used for masking and rehydrating messages. 
+ */ +export abstract class MaskingTransformer { + abstract transform( + message: string, + state?: Map<string, string> + ): Promise<[string, Map<string, string>]>; + + abstract rehydrate( + message: string, + state: Map<string, string> + ): Promise<string>; +} diff --git a/langchain/src/experimental/masking/types.ts b/langchain/src/experimental/masking/types.ts new file mode 100644 index 000000000000..70cac7e7b878 --- /dev/null +++ b/langchain/src/experimental/masking/types.ts @@ -0,0 +1,31 @@ +import { MaskingTransformer } from "./transformer.js"; +/** + * Configuration type for MaskingParser. + */ + +export type MaskingParserConfig = { + transformers?: MaskingTransformer[]; + defaultHashFunction?: HashFunction; + onMaskingStart?: HookFunction; + onMaskingEnd?: HookFunction; + onRehydratingStart?: HookFunction; + onRehydratingEnd?: HookFunction; +}; + +/** + * Regex Masking Pattern used for masking in PIIMaskingTransformer. + */ +export type MaskingPattern = { + regex: RegExp; + replacement?: string; + mask?: (match: string) => string; +}; + +export type HookFunction = + | ((message: string) => Promise<void>) + | ((message: string) => void); + +/** + * Represents a function that can hash a string input.
+ */ +export type HashFunction = (input: string) => string; diff --git a/langchain/src/load/import_map.ts b/langchain/src/load/import_map.ts index fa6ef233e228..d3ef2b3a96f1 100644 --- a/langchain/src/load/import_map.ts +++ b/langchain/src/load/import_map.ts @@ -103,6 +103,7 @@ export * as experimental__plan_and_execute from "../experimental/plan_and_execut export * as experimental__chat_models__bittensor from "../experimental/chat_models/bittensor.js"; export * as experimental__chat_models__ollama_functions from "../experimental/chat_models/ollama_functions.js"; export * as experimental__chains__violation_of_expectations from "../experimental/chains/violation_of_expectations/index.js"; +export * as experimental__masking from "../experimental/masking/index.js"; export * as evaluation from "../evaluation/index.js"; export * as runnables from "../runnables/index.js"; export * as runnables__remote from "../runnables/remote.js";