Allow to stream files with GithubRepoLoader #3339

Merged
@@ -55,3 +55,11 @@ import SubmodulesExample from "@examples/document_loaders/github_submodules.ts";
<CodeBlock language="typescript">{SubmodulesExample}</CodeBlock>

Note that the loader will not follow submodules that are located on a different GitHub instance than the one of the current repository.
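If the repository lives on a self-hosted GitHub instance, the loader can be pointed at it through the `baseUrl` and `apiUrl` options (both are forwarded to submodule loaders); only submodules hosted on that same instance will be followed. A minimal sketch, where the Enterprise hostname and API path are illustrative assumptions:

```typescript
import { GithubRepoLoader } from "langchain/document_loaders/web/github";

// "github.example-corp.com" is a hypothetical GitHub Enterprise host.
const enterpriseLoader = new GithubRepoLoader(
  "https://github.example-corp.com/my-org/my-repo",
  {
    baseUrl: "https://github.example-corp.com",
    // GitHub Enterprise commonly serves its REST API under /api/v3 (assumption).
    apiUrl: "https://github.example-corp.com/api/v3",
    branch: "main",
    recursive: true,
    processSubmodules: true,
  }
);
```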

### Stream large repository

For situations where processing a large repository in a memory-efficient manner is required, you can use the `loadAsStream` method to asynchronously stream documents from the entire GitHub repository.

import StreamExample from "@examples/document_loaders/github_stream.ts";

<CodeBlock language="typescript">{StreamExample}</CodeBlock>
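The streaming example collects every document into an array for simplicity; to actually keep memory usage low, each document can be handled as soon as it is yielded instead of being buffered. A minimal sketch of that pattern, using a text splitter as a stand-in for whatever per-document processing (splitting, embedding, indexing) you need:

```typescript
import { GithubRepoLoader } from "langchain/document_loaders/web/github";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

export const processRepoIncrementally = async () => {
  const loader = new GithubRepoLoader(
    "https://github.com/langchain-ai/langchainjs",
    { branch: "main", recursive: false, unknown: "warn" }
  );

  const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });

  for await (const doc of loader.loadAsStream()) {
    // Handle each document as it arrives instead of buffering the whole repository.
    const chunks = await splitter.splitDocuments([doc]);
    console.log(`${doc.metadata.source}: ${chunks.length} chunks`);
  }
};
```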
20 changes: 20 additions & 0 deletions examples/src/document_loaders/github_stream.ts
@@ -0,0 +1,20 @@
import { GithubRepoLoader } from "langchain/document_loaders/web/github";

export const run = async () => {
  const loader = new GithubRepoLoader(
    "https://github.com/langchain-ai/langchainjs",
    {
      branch: "main",
      recursive: false,
      unknown: "warn",
      maxConcurrency: 3, // Defaults to 2
    }
  );

  const docs = [];
  for await (const doc of loader.loadAsStream()) {
    docs.push(doc);
  }

  console.log({ docs });
};
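Because `loadAsStream` returns an async generator, a consumer can also stop early: breaking out of the `for await` loop ends iteration, so the remaining files in the repository are not downloaded. A small sketch with an arbitrary cap on the number of documents:

```typescript
import { GithubRepoLoader } from "langchain/document_loaders/web/github";
import type { Document } from "langchain/document";

export const loadFirstDocuments = async (max = 50): Promise<Document[]> => {
  const loader = new GithubRepoLoader(
    "https://github.com/langchain-ai/langchainjs",
    { branch: "main", recursive: true, unknown: "warn" }
  );

  const docs: Document[] = [];
  for await (const doc of loader.loadAsStream()) {
    docs.push(doc);
    if (docs.length >= max) {
      break; // Ends the generator; remaining files are never fetched.
    }
  }
  return docs;
};
```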
79 changes: 78 additions & 1 deletion langchain/src/document_loaders/tests/github.int.test.ts
@@ -18,7 +18,7 @@ test("Test GithubRepoLoader", async () => {
console.log(documents[0].pageContent);
});

test("Test ignorePaths with GithubRepoLoader", async () => {
test("Test ignoreFiles with GithubRepoLoader", async () => {
const loader = new GithubRepoLoader(
"https://github.com/langchain-ai/langchainjs",
{
@@ -61,3 +61,80 @@ test("Test ignorePaths with GithubRepoLoader", async () => {
).toBe(0);
console.log(documents[0].pageContent);
});

test("Test streaming documents from GithubRepoLoader", async () => {
const loader = new GithubRepoLoader(
"https://github.com/langchain-ai/langchainjs",
{
branch: "main",
recursive: false,
unknown: "warn",
}
);

const documents = [];
for await (const document of loader.loadAsStream()) {
documents.push(document);
}

expect(
documents.filter((document) => document.metadata.source === "yarn.lock")
.length
).toBe(1);
expect(
documents.filter((document) => document.metadata.source === "README.md")
.length
).toBe(1);
});

test("Test ignorePaths streaming with GithubRepoLoader", async () => {
const loader = new GithubRepoLoader(
"https://github.com/langchain-ai/langchainjs",
{
branch: "main",
recursive: false,
unknown: "warn",
ignorePaths: ["yarn.lock", "*.md"],
}
);

const documents = [];
for await (const document of loader.loadAsStream()) {
documents.push(document);
}

expect(
documents.filter((document) => document.metadata.source === "yarn.lock")
.length
).toBe(0);
expect(
documents.filter((document) => document.metadata.source.endsWith(".md"))
.length
).toBe(0);
});

test("Test ignoreFiles streaming with GithubRepoLoader", async () => {
const loader = new GithubRepoLoader(
"https://github.com/langchain-ai/langchainjs",
{
branch: "main",
recursive: false,
unknown: "warn",
ignoreFiles: ["yarn.lock", "README.md"],
}
);

const documents = [];
for await (const document of loader.loadAsStream()) {
documents.push(document);
}

expect(
documents.filter((document) => document.metadata.source === "yarn.lock")
.length
).toBe(0);
expect(
documents.filter((document) => document.metadata.source === "README.md")
.length
).toBe(0);
});
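A note on running these integration tests: they call the GitHub API directly, and unauthenticated requests are heavily rate-limited. Passing a personal access token through the `accessToken` option avoids that; a brief sketch, where the environment variable name is an assumption rather than something this PR defines:

```typescript
import { GithubRepoLoader } from "langchain/document_loaders/web/github";

// The variable name GITHUB_ACCESS_TOKEN is illustrative; use whatever your environment provides.
const authenticatedLoader = new GithubRepoLoader(
  "https://github.com/langchain-ai/langchainjs",
  {
    branch: "main",
    recursive: false,
    unknown: "warn",
    accessToken: process.env.GITHUB_ACCESS_TOKEN,
  }
);
```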
129 changes: 129 additions & 0 deletions langchain/src/document_loaders/web/github.ts
@@ -254,6 +254,27 @@ export class GithubRepoLoader
return documents;
}

/**
* Asynchronously streams documents from the entire GitHub repository.
* It is suitable for situations where processing large repositories in a memory-efficient manner is required.
* @yields Yields a Promise that resolves to a Document object for each file or submodule content found in the repository.
*/
public async *loadAsStream(): AsyncGenerator<Document, void, undefined> {
this.log(
`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`
);
yield* await this.processRepoAsStream(this.initialPath);

if (!this.processSubmodules) {
return;
}

await this.getSubmoduleInfo();
for (const submoduleInfo of this.submoduleInfos) {
yield* await this.loadSubmoduleAsStream(submoduleInfo);
}
}

/**
* Loads the information about Git submodules from the repository, if available.
*/
@@ -376,6 +397,47 @@
}
}

/**
* Asynchronously processes and streams the contents of a specified submodule in the GitHub repository.
* @param submoduleInfo the info about the submodule to be loaded
* @yields Yields a Promise that resolves to a Document object for each file found in the submodule.
*/
private async *loadSubmoduleAsStream(
submoduleInfo: SubmoduleInfo
): AsyncGenerator<Document, void, undefined> {
if (!submoduleInfo.url.startsWith(this.baseUrl)) {
this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
return;
}

if (!submoduleInfo.path.startsWith(this.initialPath)) {
this.log(
`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`
);
return;
}

this.log(
`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`
);
const submoduleLoader = new GithubRepoLoader(submoduleInfo.url, {
accessToken: this.accessToken,
baseUrl: this.baseUrl,
apiUrl: this.apiUrl,
branch: submoduleInfo.ref,
recursive: this.recursive,
processSubmodules: this.processSubmodules,
unknown: this.unknown,
ignoreFiles: this.ignoreFiles,
ignorePaths: this.ignorePaths,
verbose: this.verbose,
maxConcurrency: this.maxConcurrency,
maxRetries: this.maxRetries,
});

yield* await submoduleLoader.processRepoAsStream(submoduleInfo.path);
}

/**
* Determines whether a file or directory should be ignored based on its
* path and type.
@@ -485,6 +547,40 @@
}
}

/**
* Asynchronously processes the contents of the entire GitHub repository,
* streaming each file as a Document object.
* @param path The path of the directory to process.
* @yields Yields a Promise that resolves to a Document object for each file found in the repository.
*/
private async *processRepoAsStream(
path: string
): AsyncGenerator<Document, void, undefined> {
const files = await this.fetchRepoFiles(path);
for (const file of files) {
if (this.shouldIgnore(file.path, file.type)) {
continue;
}

if (file.type === "file") {
try {
const fileResponse = await this.fetchFileContentWrapper(file);

yield new Document({
pageContent: fileResponse.contents,
metadata: fileResponse.metadata,
});
} catch (error) {
this.handleError(
`Failed to fetch file content: ${file.path}, ${error}`
);
}
} else if (this.recursive) {
yield* await this.processDirectoryAsStream(file.path);
}
}
}

/**
* Fetches the contents of a directory and maps the file / directory paths
* to promises that will fetch the file / directory contents.
@@ -503,6 +599,39 @@
}
}

/**
* Asynchronously processes the contents of a given directory in the GitHub repository,
* streaming each file as a Document object.
* @param path The path of the directory to process.
* @yields Yields a Promise that resolves to a Document object for each file in the directory.
*/
private async *processDirectoryAsStream(
path: string
): AsyncGenerator<Document, void, undefined> {
const files = await this.fetchRepoFiles(path);

for (const file of files) {
if (this.shouldIgnore(file.path, file.type)) {
continue;
}

if (file.type === "file") {
try {
const fileResponse = await this.fetchFileContentWrapper(file);

yield new Document({
pageContent: fileResponse.contents,
metadata: fileResponse.metadata,
});
} catch (error) {
this.handleError(
`Failed to fetch file content: ${file.path}, ${error}`
);
}
} else if (this.recursive) {
yield* await this.processDirectoryAsStream(file.path);
}
}
}

/**
* Fetches the files from a GitHub repository.
* If the path denotes a single file, the resulting array contains only one element.