diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/chatgpt.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/chatgpt.mdx new file mode 100644 index 000000000000..120abd25aada --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/chatgpt.mdx @@ -0,0 +1,34 @@ +# ChatGPT files + +This example goes over how to load conversations.json from your ChatGPT data export folder. You can get your data export by email by going to: ChatGPT -> (Profile) -> Settings -> Export data -> Confirm export -> Check email. + +## Usage, extracting all logs + +Example code: + +```typescript +import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt"; + +const loader = new ChatGPTLoader("./example_data/example_conversations.json"); + +const docs = await loader.load(); + +console.log(docs); +``` + +## Usage, extracting a single log + +Example code: + +```typescript +import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt"; + +const loader = new ChatGPTLoader( + "./example_data/example_conversations.json", + 1 +); + +const docs = await loader.load(); + +console.log(docs); +``` diff --git a/langchain/.gitignore b/langchain/.gitignore index fb3ac300f10f..0d43e52d03f8 100644 --- a/langchain/.gitignore +++ b/langchain/.gitignore @@ -469,6 +469,9 @@ document_loaders/fs/openai_whisper_audio.d.ts document_loaders/fs/pptx.cjs document_loaders/fs/pptx.js document_loaders/fs/pptx.d.ts +document_loaders/fs/chatgpt.cjs +document_loaders/fs/chatgpt.js +document_loaders/fs/chatgpt.d.ts document_transformers/html_to_text.cjs document_transformers/html_to_text.js document_transformers/html_to_text.d.ts diff --git a/langchain/package.json b/langchain/package.json index 86c13e485637..d18549481124 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -481,6 +481,9 @@ "document_loaders/fs/pptx.cjs", "document_loaders/fs/pptx.js", "document_loaders/fs/pptx.d.ts", + "document_loaders/fs/chatgpt.cjs", + 
"document_loaders/fs/chatgpt.js", + "document_loaders/fs/chatgpt.d.ts", "document_transformers/html_to_text.cjs", "document_transformers/html_to_text.js", "document_transformers/html_to_text.d.ts", diff --git a/langchain/src/document_loaders/fs/chatgpt.ts b/langchain/src/document_loaders/fs/chatgpt.ts new file mode 100644 index 000000000000..8e8b6c749b86 --- /dev/null +++ b/langchain/src/document_loaders/fs/chatgpt.ts @@ -0,0 +1,103 @@ +import { TextLoader } from "./text.js"; +import { Document } from "../../document.js"; + +interface ChatGPTMessage { + author: { + role: string; + }; + content: { + parts: string[]; + }; + create_time: number; +} + +interface ChatGPTLog { + title: string; + mapping: Record; +} + +function concatenateRows(message: ChatGPTMessage, title: string): string { + /** + * Combine message information in a readable format ready to be used. + * @param {ChatGPTMessage} message - Message to be concatenated + * @param {string} title - Title of the conversation + * + * @returns {string} Concatenated message + */ + if (!message) { + return ""; + } + + const sender = message.author ? message.author.role : "unknown"; + const text = message.content.parts[0]; + const date = new Date(message.create_time * 1000) + .toISOString() + .slice(0, 19) + .replace("T", " "); + return `${title} - ${sender} on ${date}: ${text}\n\n`; +} + +export class ChatGPTLoader extends TextLoader { + public numLogs: number; + + constructor(filePathOrBlob: string | Blob, numLogs = 0) { + super(filePathOrBlob); + this.numLogs = numLogs; + } + + protected async parse(raw: string): Promise { + let data; + try { + data = JSON.parse(raw); + } catch (e) { + console.error(e); + throw new Error("Failed to parse JSON"); + } + + const truncatedData = this.numLogs > 0 ? 
data.slice(0, this.numLogs) : data; + + return truncatedData.map((d: ChatGPTLog) => + Object.values(d.mapping) + .filter( + (msg, idx) => !(idx === 0 && msg.message.author.role === "system") + ) + .map((msg) => concatenateRows(msg.message, d.title)) + .join("") + ); + } + + public async load(): Promise { + let text: string; + let metadata: Record; + if (typeof this.filePathOrBlob === "string") { + const { readFile } = await TextLoader.imports(); + try { + text = await readFile(this.filePathOrBlob, "utf8"); + } catch (e) { + console.error(e); + throw new Error("Failed to read file"); + } + metadata = { source: this.filePathOrBlob }; + } else { + try { + text = await this.filePathOrBlob.text(); + } catch (e) { + console.error(e); + throw new Error("Failed to read blob"); + } + metadata = { source: "blob", blobType: this.filePathOrBlob.type }; + } + + const parsed = await this.parse(text); + return parsed.map( + (pageContent, i) => + new Document({ + pageContent, + metadata: { + ...metadata, + logIndex: i + 1, + }, + }) + ); + } +} diff --git a/langchain/src/document_loaders/tests/chatgpt-blob.test.ts b/langchain/src/document_loaders/tests/chatgpt-blob.test.ts new file mode 100644 index 000000000000..b158f7807d3b --- /dev/null +++ b/langchain/src/document_loaders/tests/chatgpt-blob.test.ts @@ -0,0 +1,52 @@ +import * as url from "node:url"; +import * as path from "node:path"; +import * as fs from "node:fs/promises"; +import { test, expect } from "@jest/globals"; +import { Document } from "../../document.js"; +import { ChatGPTLoader } from "../fs/chatgpt.js"; + +test("Test ChatGPT loader from blob to load all documents", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/chatgpt/example_conversations.json" + ); + const loader = new ChatGPTLoader( + new Blob([await fs.readFile(filePath)], { type: "application/json" }) + ); + const docs = await loader.load(); + expect(docs.length).toBe(2); + 
expect(docs[0]).toEqual( + new Document({ + metadata: { source: "blob", blobType: "application/json", logIndex: 1 }, + pageContent: + "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n", + }) + ); + expect(docs[1]).toEqual( + new Document({ + metadata: { source: "blob", blobType: "application/json", logIndex: 2 }, + pageContent: + "Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n", + }) + ); +}); + +test("Test ChatGPT loader from blob to only load 1 document", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/chatgpt/example_conversations.json" + ); + const loader = new ChatGPTLoader( + new Blob([await fs.readFile(filePath)], { type: "application/json" }), + 1 + ); + const docs = await loader.load(); + expect(docs.length).toBe(1); + expect(docs[0]).toEqual( + new Document({ + metadata: { source: "blob", blobType: "application/json", logIndex: 1 }, + pageContent: + "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! 
I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n", + }) + ); +}); diff --git a/langchain/src/document_loaders/tests/chatgpt.test.ts b/langchain/src/document_loaders/tests/chatgpt.test.ts new file mode 100644 index 000000000000..7e1d68f189f6 --- /dev/null +++ b/langchain/src/document_loaders/tests/chatgpt.test.ts @@ -0,0 +1,46 @@ +import * as url from "node:url"; +import * as path from "node:path"; +import { test, expect } from "@jest/globals"; +import { Document } from "../../document.js"; +import { ChatGPTLoader } from "../fs/chatgpt.js"; + +test("Test ChatGPT loader to load all documents", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/chatgpt/example_conversations.json" + ); + const loader = new ChatGPTLoader(filePath); + const docs = await loader.load(); + expect(docs.length).toBe(2); + expect(docs[0]).toEqual( + new Document({ + metadata: { source: filePath, logIndex: 1 }, + pageContent: + "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n", + }) + ); + expect(docs[1]).toEqual( + new Document({ + metadata: { source: filePath, logIndex: 2 }, + pageContent: + "Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! 
Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n", + }) + ); +}); + +test("Test ChatGPT loader to only load 1 document", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/chatgpt/example_conversations.json" + ); + const loader = new ChatGPTLoader(filePath, 1); + const docs = await loader.load(); + expect(docs.length).toBe(1); + expect(docs[0]).toEqual( + new Document({ + metadata: { source: filePath, logIndex: 1 }, + pageContent: + "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. 
How can I assist you today?\n\n", + }) + ); +}); diff --git a/langchain/src/document_loaders/tests/example_data/chatgpt/example_conversations.json b/langchain/src/document_loaders/tests/example_data/chatgpt/example_conversations.json new file mode 100644 index 000000000000..0caf083f6de7 --- /dev/null +++ b/langchain/src/document_loaders/tests/example_data/chatgpt/example_conversations.json @@ -0,0 +1,338 @@ +[ + { + "title": "Example Usage", + "create_time": 1697499617.697575, + "update_time": 1697499624.514186, + "mapping": { + "91c9fd24-0a12-41b8-b860-8f2cc237e838": { + "id": "91c9fd24-0a12-41b8-b860-8f2cc237e838", + "message": { + "id": "91c9fd24-0a12-41b8-b860-8f2cc237e838", + "author": { + "role": "system", + "name": null, + "metadata": {} + }, + "create_time": null, + "update_time": null, + "content": { + "content_type": "text", + "parts": [""] + }, + "status": "finished_successfully", + "end_turn": true, + "weight": 0.0, + "metadata": {}, + "recipient": "all" + }, + "parent": "aaa1ab70-044a-4f2b-b4f7-74d4342f34da", + "children": ["aaa29ce8-0430-461e-9d42-74f750409cc7"] + }, + "aaa1ab70-044a-4f2b-b4f7-74d4342f34da": { + "id": "aaa1ab70-044a-4f2b-b4f7-74d4342f34da", + "message": null, + "parent": null, + "children": ["91c9fd24-0a12-41b8-b860-8f2cc237e838"] + }, + "aaa29ce8-0430-461e-9d42-74f750409cc7": { + "id": "aaa29ce8-0430-461e-9d42-74f750409cc7", + "message": { + "id": "aaa29ce8-0430-461e-9d42-74f750409cc7", + "author": { + "role": "user", + "name": null, + "metadata": {} + }, + "create_time": 1697499617.698544, + "update_time": null, + "content": { + "content_type": "text", + "parts": ["Hello, what is your name?"] + }, + "status": "finished_successfully", + "end_turn": null, + "weight": 1.0, + "metadata": { + "timestamp_": "absolute", + "message_type": null + }, + "recipient": "all" + }, + "parent": "91c9fd24-0a12-41b8-b860-8f2cc237e838", + "children": ["e47e5933-13f9-460b-b0f0-fc032e44ff64"] + }, + "e47e5933-13f9-460b-b0f0-fc032e44ff64": { + "id": 
"e47e5933-13f9-460b-b0f0-fc032e44ff64", + "message": { + "id": "e47e5933-13f9-460b-b0f0-fc032e44ff64", + "author": { + "role": "assistant", + "name": null, + "metadata": {} + }, + "create_time": 1697499623.489646, + "update_time": null, + "content": { + "content_type": "text", + "parts": [ + "Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?" + ] + }, + "status": "finished_successfully", + "end_turn": true, + "weight": 1.0, + "metadata": { + "finish_details": { + "type": "stop", + "stop_tokens": [100260] + }, + "is_complete": true, + "message_type": null, + "model_slug": "text-davinci-002-render-sha", + "parent_id": "aaa29ce8-0430-461e-9d42-74f750409cc7", + "timestamp_": "absolute" + }, + "recipient": "all" + }, + "parent": "aaa29ce8-0430-461e-9d42-74f750409cc7", + "children": [] + } + }, + "moderation_results": [], + "current_node": "e47e5933-13f9-460b-b0f0-fc032e44ff64", + "plugin_ids": null, + "conversation_id": "47867fd2-2a41-4102-80ef-101510becc08", + "conversation_template_id": null, + "id": "47867fd2-2a41-4102-80ef-101510becc08" + }, + { + "title": "Example Usage 2", + "create_time": 1697238139.540548, + "update_time": 1697238574.062126, + "mapping": { + "f3602996-5cb6-4050-87d9-0001149ba7ee": { + "id": "f3602996-5cb6-4050-87d9-0001149ba7ee", + "message": { + "id": "f3602996-5cb6-4050-87d9-0001149ba7ee", + "author": { + "role": "system", + "name": null, + "metadata": {} + }, + "create_time": null, + "update_time": null, + "content": { + "content_type": "text", + "parts": [""] + }, + "status": "finished_successfully", + "end_turn": true, + "weight": 0.0, + "metadata": {}, + "recipient": "all" + }, + "parent": "aaa1590a-2f7f-410a-b12c-9119a3264ef9", + "children": ["aaa2fff4-49fb-40e5-9122-a2addfeedfc9"] + }, + "aaa1590a-2f7f-410a-b12c-9119a3264ef9": { + "id": "aaa1590a-2f7f-410a-b12c-9119a3264ef9", + 
"message": null, + "parent": null, + "children": ["f3602996-5cb6-4050-87d9-0001149ba7ee"] + }, + "aaa2fff4-49fb-40e5-9122-a2addfeedfc9": { + "id": "aaa2fff4-49fb-40e5-9122-a2addfeedfc9", + "message": { + "id": "aaa2fff4-49fb-40e5-9122-a2addfeedfc9", + "author": { + "role": "user", + "name": null, + "metadata": {} + }, + "create_time": 1697238139.541115, + "update_time": null, + "content": { + "content_type": "text", + "parts": ["What should I do today?"] + }, + "status": "finished_successfully", + "end_turn": null, + "weight": 1.0, + "metadata": { + "timestamp_": "absolute", + "message_type": null + }, + "recipient": "all" + }, + "parent": "f3602996-5cb6-4050-87d9-0001149ba7ee", + "children": ["fc75f9c1-d769-4d1b-a94f-0ebdf25af1f8"] + }, + "fc75f9c1-d769-4d1b-a94f-0ebdf25af1f8": { + "id": "fc75f9c1-d769-4d1b-a94f-0ebdf25af1f8", + "message": { + "id": "fc75f9c1-d769-4d1b-a94f-0ebdf25af1f8", + "author": { + "role": "assistant", + "name": null, + "metadata": {} + }, + "create_time": 1697238147.462413, + "update_time": null, + "content": { + "content_type": "text", + "parts": ["You should contribute to LangChain!"] + }, + "status": "finished_successfully", + "end_turn": true, + "weight": 1.0, + "metadata": { + "finish_details": { + "type": "stop", + "stop_tokens": [100260] + }, + "is_complete": true, + "message_type": null, + "model_slug": "text-davinci-002-render-sha", + "parent_id": "aaa2fff4-49fb-40e5-9122-a2addfeedfc9", + "timestamp_": "absolute" + }, + "recipient": "all" + }, + "parent": "aaa2fff4-49fb-40e5-9122-a2addfeedfc9", + "children": ["aaa24ae6-3972-4094-8c84-3b31750bc212"] + }, + "aaa24ae6-3972-4094-8c84-3b31750bc212": { + "id": "aaa24ae6-3972-4094-8c84-3b31750bc212", + "message": { + "id": "aaa24ae6-3972-4094-8c84-3b31750bc212", + "author": { + "role": "user", + "name": null, + "metadata": {} + }, + "create_time": 1697238210.739114, + "update_time": null, + "content": { + "content_type": "text", + "parts": ["How can I start?"] + }, + "status": 
"finished_successfully", + "end_turn": null, + "weight": 1.0, + "metadata": { + "timestamp_": "absolute", + "message_type": null + }, + "recipient": "all" + }, + "parent": "fc75f9c1-d769-4d1b-a94f-0ebdf25af1f8", + "children": ["713c8a38-0685-41f2-8a28-36c880d65480"] + }, + "713c8a38-0685-41f2-8a28-36c880d65480": { + "id": "713c8a38-0685-41f2-8a28-36c880d65480", + "message": { + "id": "713c8a38-0685-41f2-8a28-36c880d65480", + "author": { + "role": "assistant", + "name": null, + "metadata": {} + }, + "create_time": 1697238218.628864, + "update_time": null, + "content": { + "content_type": "text", + "parts": [ + "You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file." + ] + }, + "status": "finished_successfully", + "end_turn": true, + "weight": 1.0, + "metadata": { + "finish_details": { + "type": "stop", + "stop_tokens": [100260] + }, + "is_complete": true, + "message_type": null, + "model_slug": "text-davinci-002-render-sha", + "parent_id": "aaa24ae6-3972-4094-8c84-3b31750bc212", + "timestamp_": "absolute" + }, + "recipient": "all" + }, + "parent": "aaa24ae6-3972-4094-8c84-3b31750bc212", + "children": ["aaa266ba-b466-4207-bfbc-19b2c1bb9b78"] + }, + "aaa266ba-b466-4207-bfbc-19b2c1bb9b78": { + "id": "aaa266ba-b466-4207-bfbc-19b2c1bb9b78", + "message": { + "id": "aaa266ba-b466-4207-bfbc-19b2c1bb9b78", + "author": { + "role": "user", + "name": null, + "metadata": {} + }, + "create_time": 1697238564.084255, + "update_time": null, + "content": { + "content_type": "text", + "parts": ["Thank you!"] + }, + "status": "finished_successfully", + "end_turn": null, + "weight": 1.0, + "metadata": { + "timestamp_": "absolute", + "message_type": null + }, + "recipient": "all" + }, + "parent": "713c8a38-0685-41f2-8a28-36c880d65480", + "children": ["6ea89c28-95de-465d-9d25-a8fd67d4ba9c"] + }, + "6ea89c28-95de-465d-9d25-a8fd67d4ba9c": { + "id": "6ea89c28-95de-465d-9d25-a8fd67d4ba9c", + "message": { + "id": 
"6ea89c28-95de-465d-9d25-a8fd67d4ba9c", + "author": { + "role": "assistant", + "name": null, + "metadata": {} + }, + "create_time": 1697238574.056793, + "update_time": null, + "content": { + "content_type": "text", + "parts": [ + "You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out." + ] + }, + "status": "finished_successfully", + "end_turn": true, + "weight": 1.0, + "metadata": { + "finish_details": { + "type": "stop", + "stop_tokens": [100260] + }, + "is_complete": true, + "message_type": null, + "model_slug": "text-davinci-002-render-sha", + "parent_id": "aaa266ba-b466-4207-bfbc-19b2c1bb9b78", + "timestamp_": "absolute" + }, + "recipient": "all" + }, + "parent": "aaa266ba-b466-4207-bfbc-19b2c1bb9b78", + "children": [] + } + }, + "moderation_results": [], + "current_node": "6ea89c28-95de-465d-9d25-a8fd67d4ba9c", + "plugin_ids": null, + "conversation_id": "588ae2e0-a336-475b-b4ef-1904078b8c24", + "conversation_template_id": null, + "id": "588ae2e0-a336-475b-b4ef-1904078b8c24" + } +]