Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

langchain[minor]: Add document loader for ChatGPT data #3439

Merged
merged 17 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# ChatGPT files

This example goes over how to load `conversations.json` from your ChatGPT data export folder. To request a data export, go to: ChatGPT -> (Profile) -> Settings -> Export data -> Confirm export, then check your email for the export.

## Usage, extracting all logs

Example code:

```typescript
import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt";

// With no second argument, numLogs defaults to 0 and every conversation is loaded.
const loader = new ChatGPTLoader("./example_data/example_conversations.json");

// Resolves to one Document per conversation in the file.
const docs = await loader.load();

console.log(docs);
```

## Usage, extracting a single log

Example code:

```typescript
import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt";

// The second argument (numLogs) caps how many conversations are loaded,
// counted from the start of the file; 1 keeps only the first conversation.
const loader = new ChatGPTLoader(
"./example_data/example_conversations.json",
1
);

const docs = await loader.load();

console.log(docs);
```
3 changes: 3 additions & 0 deletions langchain/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,9 @@ document_loaders/fs/openai_whisper_audio.d.ts
document_loaders/fs/pptx.cjs
document_loaders/fs/pptx.js
document_loaders/fs/pptx.d.ts
document_loaders/fs/chatgpt.cjs
document_loaders/fs/chatgpt.js
document_loaders/fs/chatgpt.d.ts
document_transformers/html_to_text.cjs
document_transformers/html_to_text.js
document_transformers/html_to_text.d.ts
Expand Down
3 changes: 3 additions & 0 deletions langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,9 @@
"document_loaders/fs/pptx.cjs",
"document_loaders/fs/pptx.js",
"document_loaders/fs/pptx.d.ts",
"document_loaders/fs/chatgpt.cjs",
"document_loaders/fs/chatgpt.js",
"document_loaders/fs/chatgpt.d.ts",
"document_transformers/html_to_text.cjs",
"document_transformers/html_to_text.js",
"document_transformers/html_to_text.d.ts",
Expand Down
103 changes: 103 additions & 0 deletions langchain/src/document_loaders/fs/chatgpt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import { TextLoader } from "./text.js";
import { Document } from "../../document.js";

/**
 * A single message as read from a conversation's `mapping` entries.
 * Only the fields consumed by `concatenateRows` are declared here.
 */
interface ChatGPTMessage {
  /** Timestamp in seconds; it is multiplied by 1000 before being given to `Date`. */
  create_time: number;
  /** Sender information; `role` is printed as the sender of the message. */
  author: { role: string };
  /** Message body; only the first entry of `parts` is rendered. */
  content: { parts: Array<string> };
}

/**
 * One conversation ("log") from the parsed export array.
 */
interface ChatGPTLog {
  /** Mapping entries keyed by string; each entry wraps one message. */
  mapping: { [key: string]: { message: ChatGPTMessage } };
  /** Conversation title, prefixed to every rendered message. */
  title: string;
}

/**
 * Combine message information in a readable format ready to be used.
 *
 * The result has the form
 * `<title> - <role> on <YYYY-MM-DD HH:MM:SS>: <text>` followed by a blank
 * line. The date is derived from `toISOString()` and is therefore in UTC.
 *
 * @param message - Message to be concatenated. A falsy message yields `""`.
 * @param title - Title of the conversation.
 * @returns Concatenated message.
 * @throws RangeError if `create_time` does not produce a valid date.
 */
function concatenateRows(message: ChatGPTMessage, title: string): string {
  if (!message) {
    return "";
  }

  const sender = message.author ? message.author.role : "unknown";
  // A message may lack `content` or `parts`, or have an empty `parts` array;
  // render empty text instead of throwing or printing "undefined".
  const text = message.content?.parts?.[0] ?? "";
  // `create_time` is in seconds, while `Date` expects milliseconds.
  const date = new Date(message.create_time * 1000)
    .toISOString()
    .slice(0, 19)
    .replace("T", " ");
  return `${title} - ${sender} on ${date}: ${text}\n\n`;
}

/**
 * Document loader for the `conversations.json` file of a ChatGPT data export.
 *
 * The file is expected to contain a JSON array of conversations. Each
 * conversation becomes one `Document` whose `pageContent` is the
 * concatenation of its rendered messages.
 */
export class ChatGPTLoader extends TextLoader {
  /**
   * Maximum number of conversations to load, counted from the start of the
   * file. Any value that is not greater than 0 means "load all".
   */
  public numLogs: number;

  /**
   * @param filePathOrBlob - Path to the export file, or a Blob holding its contents.
   * @param numLogs - Number of conversations to load; defaults to 0 (all).
   */
  constructor(filePathOrBlob: string | Blob, numLogs = 0) {
    super(filePathOrBlob);
    this.numLogs = numLogs;
  }

  /**
   * Parse the raw export text into one string per conversation.
   *
   * @param raw - Raw JSON text of the export file.
   * @returns One concatenated string per (possibly truncated) conversation.
   * @throws Error if the text is not valid JSON or is not a JSON array.
   */
  protected async parse(raw: string): Promise<string[]> {
    let data: unknown;
    try {
      data = JSON.parse(raw);
    } catch (e) {
      console.error(e);
      throw new Error("Failed to parse JSON");
    }

    // Fail with a clear message instead of "slice/map is not a function".
    if (!Array.isArray(data)) {
      throw new Error("Expected the JSON to be an array of conversations");
    }

    const truncatedData = this.numLogs > 0 ? data.slice(0, this.numLogs) : data;

    return truncatedData.map((d: ChatGPTLog) =>
      Object.values(d.mapping)
        .filter(
          // Drop a leading system message. The optional chaining keeps a
          // first entry without a message (or author) from throwing;
          // concatenateRows renders a falsy message as "".
          (msg, idx) => !(idx === 0 && msg.message?.author?.role === "system")
        )
        .map((msg) => concatenateRows(msg.message, d.title))
        .join("")
    );
  }

  /**
   * Read the configured file or blob and build one `Document` per conversation.
   *
   * Metadata carries `source` (the file path, or `"blob"`), `blobType` for
   * blobs, and a 1-based `logIndex` giving the conversation's position.
   *
   * @returns The loaded documents.
   * @throws Error if the file or blob cannot be read, or if parsing fails.
   */
  public async load(): Promise<Document[]> {
    let text: string;
    let metadata: Record<string, string>;
    if (typeof this.filePathOrBlob === "string") {
      const { readFile } = await TextLoader.imports();
      try {
        text = await readFile(this.filePathOrBlob, "utf8");
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read file");
      }
      metadata = { source: this.filePathOrBlob };
    } else {
      try {
        text = await this.filePathOrBlob.text();
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read blob");
      }
      metadata = { source: "blob", blobType: this.filePathOrBlob.type };
    }

    const parsed = await this.parse(text);
    return parsed.map(
      (pageContent, i) =>
        new Document({
          pageContent,
          metadata: {
            ...metadata,
            logIndex: i + 1,
          },
        })
    );
  }
}
52 changes: 52 additions & 0 deletions langchain/src/document_loaders/tests/chatgpt-blob.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import * as url from "node:url";
import * as path from "node:path";
import * as fs from "node:fs/promises";
import { test, expect } from "@jest/globals";
import { Document } from "../../document.js";
import { ChatGPTLoader } from "../fs/chatgpt.js";

// Loads the fixture through a Blob typed "application/json" with no numLogs
// argument, and expects both conversations back as Documents whose metadata
// has source "blob", the blob type, and a 1-based logIndex.
test("Test ChatGPT loader from blob to load all documents", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/chatgpt/example_conversations.json"
);
const loader = new ChatGPTLoader(
new Blob([await fs.readFile(filePath)], { type: "application/json" })
);
const docs = await loader.load();
expect(docs.length).toBe(2);
expect(docs[0]).toEqual(
new Document({
metadata: { source: "blob", blobType: "application/json", logIndex: 1 },
pageContent:
"Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n",
})
);
expect(docs[1]).toEqual(
new Document({
metadata: { source: "blob", blobType: "application/json", logIndex: 2 },
pageContent:
"Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n",
})
);
});

// Same Blob-based setup, but with numLogs = 1: only the first conversation
// is expected to be returned.
test("Test ChatGPT loader from blob to only load 1 document", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/chatgpt/example_conversations.json"
);
const loader = new ChatGPTLoader(
new Blob([await fs.readFile(filePath)], { type: "application/json" }),
1
);
const docs = await loader.load();
expect(docs.length).toBe(1);
expect(docs[0]).toEqual(
new Document({
metadata: { source: "blob", blobType: "application/json", logIndex: 1 },
pageContent:
"Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n",
})
);
});
46 changes: 46 additions & 0 deletions langchain/src/document_loaders/tests/chatgpt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import * as url from "node:url";
import * as path from "node:path";
import { test, expect } from "@jest/globals";
import { Document } from "../../document.js";
import { ChatGPTLoader } from "../fs/chatgpt.js";

// Loads the fixture by file path with no numLogs argument, and expects both
// conversations back with the file path as metadata.source and a 1-based
// logIndex.
test("Test ChatGPT loader to load all documents", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/chatgpt/example_conversations.json"
);
const loader = new ChatGPTLoader(filePath);
const docs = await loader.load();
expect(docs.length).toBe(2);
expect(docs[0]).toEqual(
new Document({
metadata: { source: filePath, logIndex: 1 },
pageContent:
"Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n",
})
);
expect(docs[1]).toEqual(
new Document({
metadata: { source: filePath, logIndex: 2 },
pageContent:
"Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n",
})
);
});

// Path-based load with numLogs = 1: only the first conversation is expected
// to be returned.
test("Test ChatGPT loader to only load 1 document", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/chatgpt/example_conversations.json"
);
const loader = new ChatGPTLoader(filePath, 1);
const docs = await loader.load();
expect(docs.length).toBe(1);
expect(docs[0]).toEqual(
new Document({
metadata: { source: filePath, logIndex: 1 },
pageContent:
"Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n",
})
);
});
Loading