Skip to content

Commit

Permalink
feat: add metadata loading to document loaders (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
Q1w1N authored Nov 5, 2024
1 parent 09652f3 commit 844ad5b
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 2 deletions.
2 changes: 1 addition & 1 deletion packages/document-loaders/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
export type { DocumentLoader, DocumentLoaderResult, DocumentMetadata } from './loaders/types.js';

export { loadDocument } from './loaders/index.js';
export { loadDocument, loadDocumentMetadata, registerLoader } from './loaders/index.js';

export { notionPdfBlockLoader } from './loaders/notion-pdf-block.js';
export { googleDriveLoader } from './loaders/google-drive.js';
Expand Down
1 change: 1 addition & 0 deletions packages/document-loaders/src/loaders/google-docs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const URL_ROOT = 'https://docs.google.com/document/d/';
/**
 * Loader object for Google Docs sources, exposing the `isSupported`,
 * `loadDocument` and `loadMetadata` functions under the `DocumentLoader` shape.
 */
export const googleDocsLoader: DocumentLoader = {
isSupported,
loadDocument,
loadMetadata,
};

function isSupported(source: string) {
Expand Down
12 changes: 12 additions & 0 deletions packages/document-loaders/src/loaders/google-drive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const URL_ROOT = 'https://drive.google.com/file/d/';
/**
 * Loader object for Google Drive sources, exposing the `isSupported`,
 * `loadDocument` and `loadMetadata` functions under the `DocumentLoader` shape.
 */
export const googleDriveLoader: DocumentLoader = {
isSupported,
loadDocument,
loadMetadata,
};

function isSupported(source: string) {
Expand All @@ -33,6 +34,17 @@ async function loadDocument(url: string) {
};
}

/**
 * Fetches the metadata of the Google Drive file identified by `url`.
 *
 * @param url - Source URL passed on to `fetchFileMetadata`.
 * @returns The value resolved by `fetchFileMetadata` for this URL.
 * @throws Error when `fetchFileMetadata` resolves with a falsy value.
 */
async function loadMetadata(url: string) {
  const client = await getGoogleAuthClient();
  const metadata = await fetchFileMetadata(client, url);

  if (metadata) {
    return metadata;
  }

  throw new Error(`DocumentLoader/GoogleDrive Document name not found: ${url}`);
}

function formatStandardizedUrl(url: string) {
const fileId = extractFileIdFromUrl(url);
return `${URL_ROOT}${fileId}`;
Expand Down
1 change: 1 addition & 0 deletions packages/document-loaders/src/loaders/google-slides.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const URL_ROOT = 'https://docs.google.com/presentation/d/';
/**
 * Loader object for Google Slides sources, exposing the `isSupported`,
 * `loadDocument` and `loadMetadata` functions under the `DocumentLoader` shape.
 */
export const googleSlidesLoader: DocumentLoader = {
isSupported,
loadDocument,
loadMetadata,
};

function isSupported(source: string) {
Expand Down
16 changes: 15 additions & 1 deletion packages/document-loaders/src/loaders/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { googleDriveLoader } from './google-drive.js';
import { googleSlidesLoader } from './google-slides.js';
import { googleDocsLoader } from './google-docs.js';
import { DocumentLoader, DocumentLoaderResult } from './types.js';
import { DocumentLoader, DocumentLoaderResult, DocumentMetadata } from './types.js';
import { notionLoader } from './notion.js';
import { notionPdfBlockLoader } from './notion-pdf-block.js';

Expand All @@ -22,3 +22,17 @@ export function loadDocument(source: string): Promise<DocumentLoaderResult> {

throw new Error(`DocumentLoader Unsupported document type ${source}`);
}

/**
 * Loads the metadata for `source` through the first loader in `loaders`
 * whose `isSupported` accepts it.
 *
 * @param source - Identifier of the document to inspect.
 * @returns The promise returned by the matching loader's `loadMetadata`.
 * @throws Error thrown synchronously (not as a rejected promise) when no
 *   loader supports `source`.
 */
export function loadDocumentMetadata(source: string): Promise<DocumentMetadata> {
  const matchingLoader = loaders.find((candidate) => candidate.isSupported(source));

  if (matchingLoader === undefined) {
    throw new Error(`DocumentLoader Unsupported document type ${source}`);
  }

  return matchingLoader.loadMetadata(source);
}

/**
 * Adds a loader to the end of the `loaders` list.
 *
 * Lookups walk that list in order and stop at the first loader that supports
 * a source, so a loader added here is consulted after the ones already present.
 *
 * @param loader - Loader to append.
 */
export function registerLoader(loader: DocumentLoader): void {
  loaders.push(loader);
}
6 changes: 6 additions & 0 deletions packages/document-loaders/src/loaders/notion-pdf-block.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,18 @@ const BLOCK_ID_REGEX = /^[a-z0-9]{32}$/; // 32 characters
/**
 * Loader object for Notion PDF block sources, exposing the `isSupported`,
 * `loadDocument` and `loadMetadata` functions under the `DocumentLoader` shape.
 */
export const notionPdfBlockLoader: DocumentLoader = {
isSupported,
loadDocument,
loadMetadata,
};

/**
 * Tells whether `source` can be handled by this loader.
 *
 * A source qualifies when it starts with `URL_ROOT` and the value returned by
 * `getUrlHash` for it matches `BLOCK_ID_REGEX`.
 *
 * @param source - Candidate source string.
 * @returns `true` when both conditions hold, otherwise `false`.
 */
function isSupported(source: string) {
  // Checked first so the hash is never extracted for non-matching prefixes.
  if (!source.startsWith(URL_ROOT)) {
    return false;
  }

  const hash = getUrlHash(source);
  return BLOCK_ID_REGEX.test(hash);
}

/**
 * Fetches the PDF block behind `url` and derives its metadata.
 *
 * @param url - Source URL handed to `fetchPdfBlock` and `getMetadataFromBlock`.
 * @returns The result of `getMetadataFromBlock` for the fetched block.
 */
async function loadMetadata(url: string) {
  return getMetadataFromBlock(url, await fetchPdfBlock(url));
}

async function loadDocument(url: string) {
const pdfBlock = await fetchPdfBlock(url);
const metadata = getMetadataFromBlock(url, pdfBlock);
Expand Down
10 changes: 10 additions & 0 deletions packages/document-loaders/src/loaders/notion.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const URL_ROOT = 'https://www.notion.so/';
/**
 * Loader object for Notion page sources, exposing the `isSupported`,
 * `loadDocument` and `loadMetadata` functions under the `DocumentLoader` shape.
 */
export const notionLoader: DocumentLoader = {
isSupported,
loadDocument,
loadMetadata,
};

function isSupported(source: string) {
Expand All @@ -27,6 +28,15 @@ async function loadDocument(url: string) {
return { content, metadata };
}

/**
 * Fetches the metadata of the Notion page referenced by `url`.
 *
 * @param url - Notion URL from which the page id is extracted.
 * @returns The value resolved by `fetchPageMetadata` for the extracted page id.
 * @throws Error when `extractNotionPageId` yields no usable (falsy) id.
 */
async function loadMetadata(url: string) {
  const notionPageId = extractNotionPageId(url);

  if (notionPageId) {
    const pageMetadata = await fetchPageMetadata(notionPageId);
    return pageMetadata;
  }

  throw new Error(`NotionLoader Invalid Notion URL: ${url}`);
}

function extractNotionPageId(url: string) {
const pattern = /[0-9a-f]{32}/;
const match = url.match(pattern);
Expand Down
1 change: 1 addition & 0 deletions packages/document-loaders/src/loaders/types.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
 * Contract implemented by every document loader: a support check plus
 * asynchronous accessors for the full document and for its metadata.
 */
export type DocumentLoader = {
/** Returns whether this loader can handle the given source string. */
isSupported: (source: string) => boolean;
/** Resolves with the loader result for the given source. */
loadDocument: (source: string) => Promise<DocumentLoaderResult>;
/** Resolves with the metadata for the given source. */
loadMetadata: (source: string) => Promise<DocumentMetadata>;
};

export type DocumentLoaderResult = {
Expand Down

0 comments on commit 844ad5b

Please sign in to comment.