Skip to content

Commit

Permalink
Allow to stream files with GithubRepoLoader (#3339)
Browse files Browse the repository at this point in the history
* add loadAsStream method in GithubRepoLoader

* apply review changes
  • Loading branch information
Njuelle authored Nov 22, 2023
1 parent 750eb2e commit 517ec3e
Show file tree
Hide file tree
Showing 4 changed files with 235 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,11 @@ import SubmodulesExample from "@examples/document_loaders/github_submodules.ts";
<CodeBlock language="typescript">{SubmodulesExample}</CodeBlock>

Note that the loader will not follow submodules that are located on a different GitHub instance than the current repository.

### Stream large repository

For situations where large repositories must be processed in a memory-efficient manner, you can use the `loadAsStream` method to asynchronously stream documents from the entire GitHub repository.

import StreamExample from "@examples/document_loaders/github_stream.ts";

<CodeBlock language="typescript">{StreamExample}</CodeBlock>
20 changes: 20 additions & 0 deletions examples/src/document_loaders/github_stream.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { GithubRepoLoader } from "langchain/document_loaders/web/github";

/**
 * Example: stream documents from the top level of the langchainjs repository
 * with `loadAsStream`, gather them into an array, and log the result.
 */
export const run = async () => {
  const repoUrl = "https://github.com/langchain-ai/langchainjs";
  const repoLoader = new GithubRepoLoader(repoUrl, {
    branch: "main",
    recursive: false,
    unknown: "warn",
    maxConcurrency: 3, // Defaults to 2
  });

  // Documents arrive one at a time from the async generator.
  const docs = [];
  for await (const streamedDoc of repoLoader.loadAsStream()) {
    docs.push(streamedDoc);
  }

  console.log({ docs });
};
79 changes: 78 additions & 1 deletion langchain/src/document_loaders/tests/github.int.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ test("Test GithubRepoLoader", async () => {
console.log(documents[0].pageContent);
});

test("Test ignorePaths with GithubRepoLoader", async () => {
test("Test ignoreFiles with GithubRepoLoader", async () => {
const loader = new GithubRepoLoader(
"https://github.com/langchain-ai/langchainjs",
{
Expand Down Expand Up @@ -61,3 +61,80 @@ test("Test ignorePaths with GithubRepoLoader", async () => {
).toBe(0);
console.log(documents[0].pageContent);
});

// Streams the repository root and checks that both well-known root files
// are yielded exactly once.
test("Test streaming documents from GithubRepoLoader", async () => {
  const repoLoader = new GithubRepoLoader(
    "https://github.com/langchain-ai/langchainjs",
    {
      branch: "main",
      recursive: false,
      unknown: "warn",
    }
  );

  const streamed = [];
  for await (const doc of repoLoader.loadAsStream()) {
    streamed.push(doc);
  }

  const lockFileDocs = streamed.filter(
    (doc) => doc.metadata.source === "yarn.lock"
  );
  expect(lockFileDocs.length).toBe(1);

  const readmeDocs = streamed.filter(
    (doc) => doc.metadata.source === "README.md"
  );
  expect(readmeDocs.length).toBe(1);
});

// Streams the repository root with `ignorePaths` patterns and checks that no
// matching document is yielded.
test("Test ignorePaths streaming with GithubRepoLoader", async () => {
  const repoLoader = new GithubRepoLoader(
    "https://github.com/langchain-ai/langchainjs",
    {
      branch: "main",
      recursive: false,
      unknown: "warn",
      ignorePaths: ["yarn.lock", "*.md"],
    }
  );

  const streamed = [];
  for await (const doc of repoLoader.loadAsStream()) {
    streamed.push(doc);
  }

  const lockFileDocs = streamed.filter(
    (doc) => doc.metadata.source === "yarn.lock"
  );
  expect(lockFileDocs.length).toBe(0);

  const markdownDocs = streamed.filter((doc) =>
    doc.metadata.source.endsWith(".md")
  );
  expect(markdownDocs.length).toBe(0);
});

// Streams the repository root with `ignoreFiles` entries and checks that the
// listed files are not yielded.
test("Test ignoreFiles streaming with GithubRepoLoader", async () => {
  const repoLoader = new GithubRepoLoader(
    "https://github.com/langchain-ai/langchainjs",
    {
      branch: "main",
      recursive: false,
      unknown: "warn",
      ignoreFiles: ["yarn.lock", "README.md"],
    }
  );

  const streamed = [];
  for await (const doc of repoLoader.loadAsStream()) {
    streamed.push(doc);
  }

  const lockFileDocs = streamed.filter(
    (doc) => doc.metadata.source === "yarn.lock"
  );
  expect(lockFileDocs.length).toBe(0);

  const readmeDocs = streamed.filter(
    (doc) => doc.metadata.source === "README.md"
  );
  expect(readmeDocs.length).toBe(0);
});
129 changes: 129 additions & 0 deletions langchain/src/document_loaders/web/github.ts
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,27 @@ export class GithubRepoLoader
return documents;
}

/**
 * Asynchronously streams documents from the entire GitHub repository.
 * Documents are yielded one at a time instead of being collected into an
 * array, which suits processing large repositories in a memory-efficient manner.
 *
 * Content under `this.initialPath` is streamed first. If
 * `this.processSubmodules` is set, the submodule information is then loaded
 * and each submodule is streamed in turn.
 * @yields A Document for each file or submodule file found in the repository.
 */
public async *loadAsStream(): AsyncGenerator<Document, void, undefined> {
  this.log(
    `Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`
  );
  // `yield*` delegates straight to the async generator; awaiting the
  // generator object first is a no-op because it is not a thenable.
  yield* this.processRepoAsStream(this.initialPath);

  if (!this.processSubmodules) {
    return;
  }

  await this.getSubmoduleInfo();
  for (const submoduleInfo of this.submoduleInfos) {
    yield* this.loadSubmoduleAsStream(submoduleInfo);
  }
}

/**
* Loads the information about Git submodules from the repository, if available.
*/
Expand Down Expand Up @@ -376,6 +397,47 @@ export class GithubRepoLoader
}
}

/**
 * Asynchronously processes and streams the contents of a specified submodule in the GitHub repository.
 *
 * A submodule is skipped (nothing is yielded) when its URL does not start
 * with `this.baseUrl`, or when its path does not start with
 * `this.initialPath`. Otherwise a new `GithubRepoLoader` is created for the
 * submodule with this loader's settings and the submodule's ref as branch.
 * @param submoduleInfo the info about the submodule to be loaded
 * @yields A Document for each file found in the submodule.
 */
private async *loadSubmoduleAsStream(
  submoduleInfo: SubmoduleInfo
): AsyncGenerator<Document, void, undefined> {
  if (!submoduleInfo.url.startsWith(this.baseUrl)) {
    this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
    // Must return: `yield* []` yields nothing and would fall through,
    // loading the submodule that was just reported as ignored.
    return;
  }

  if (!submoduleInfo.path.startsWith(this.initialPath)) {
    this.log(
      `Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`
    );
    return;
  }

  this.log(
    `Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`
  );
  const submoduleLoader = new GithubRepoLoader(submoduleInfo.url, {
    accessToken: this.accessToken,
    baseUrl: this.baseUrl,
    apiUrl: this.apiUrl,
    branch: submoduleInfo.ref,
    recursive: this.recursive,
    processSubmodules: this.processSubmodules,
    unknown: this.unknown,
    ignoreFiles: this.ignoreFiles,
    ignorePaths: this.ignorePaths,
    verbose: this.verbose,
    maxConcurrency: this.maxConcurrency,
    maxRetries: this.maxRetries,
  });

  // Delegate directly to the submodule loader's async generator.
  yield* submoduleLoader.processRepoAsStream(submoduleInfo.path);
}

/**
* Determines whether a file or directory should be ignored based on its
* path and type.
Expand Down Expand Up @@ -485,6 +547,40 @@ export class GithubRepoLoader
}
}

/**
 * Asynchronously processes the contents of the GitHub repository starting at
 * the given path, streaming each file as a Document object.
 *
 * Entries for which `this.shouldIgnore` returns true are skipped. Directories
 * are only descended into when `this.recursive` is set. A failure while
 * fetching a file's content is reported through `this.handleError`.
 * @param path The path of the directory to process.
 * @yields A Document for each file found in the repository.
 */
private async *processRepoAsStream(
  path: string
): AsyncGenerator<Document, void, undefined> {
  const files = await this.fetchRepoFiles(path);
  for (const file of files) {
    if (this.shouldIgnore(file.path, file.type)) {
      continue;
    }

    if (file.type === "file") {
      try {
        const fileResponse = await this.fetchFileContentWrapper(file);

        yield new Document({
          pageContent: fileResponse.contents,
          metadata: fileResponse.metadata,
        });
      } catch (error) {
        this.handleError(
          `Failed to fetch file content: ${file.path}, ${error}`
        );
      }
    } else if (this.recursive) {
      // `yield*` accepts the async generator directly; no `await` is needed.
      yield* this.processDirectoryAsStream(file.path);
    }
  }
}

/**
* Fetches the contents of a directory and maps the file / directory paths
* to promises that will fetch the file / directory contents.
Expand All @@ -503,6 +599,39 @@ export class GithubRepoLoader
}
}

/**
 * Asynchronously processes the contents of a given directory in the GitHub repository,
 * streaming each file as a Document object.
 *
 * Entries for which `this.shouldIgnore` returns true are skipped.
 * Subdirectories are only descended into when `this.recursive` is set. A
 * failure while fetching a file's content is reported through
 * `this.handleError`, including the underlying error.
 * @param path The path of the directory to process.
 * @yields A Document for each file in the directory.
 */
private async *processDirectoryAsStream(
  path: string
): AsyncGenerator<Document, void, undefined> {
  const files = await this.fetchRepoFiles(path);

  for (const file of files) {
    if (this.shouldIgnore(file.path, file.type)) {
      continue;
    }

    if (file.type === "file") {
      try {
        const fileResponse = await this.fetchFileContentWrapper(file);

        yield new Document({
          pageContent: fileResponse.contents,
          metadata: fileResponse.metadata,
        });
      } catch (error) {
        // Keep the cause in the message, matching processRepoAsStream,
        // so the reason for the failure is not lost.
        this.handleError(
          `Failed to fetch file content: ${file.path}, ${error}`
        );
      }
    } else if (this.recursive) {
      // `yield*` accepts the async generator directly; no `await` is needed.
      yield* this.processDirectoryAsStream(file.path);
    }
  }
}

/**
* Fetches the files from a GitHub repository.
* If the path denotes a single file, the resulting array contains only one element.
Expand Down

2 comments on commit 517ec3e

@vercel
Copy link

@vercel vercel bot commented on 517ec3e Nov 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

langchainjs-api-refs – ./docs/api_refs

@vercel
Copy link

@vercel vercel bot commented on 517ec3e Nov 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

langchainjs-docs – ./docs/core_docs/

Please sign in to comment.