From 844ad5b6f6a6e0b857f2634e69c5f8636aaa4287 Mon Sep 17 00:00:00 2001 From: Q1w1N <47570093+Q1w1N@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:36:53 +0100 Subject: [PATCH] feat: add metadata loading to document loaders (#5) --- packages/document-loaders/src/index.ts | 2 +- .../document-loaders/src/loaders/google-docs.ts | 1 + .../document-loaders/src/loaders/google-drive.ts | 12 ++++++++++++ .../src/loaders/google-slides.ts | 1 + packages/document-loaders/src/loaders/index.ts | 16 +++++++++++++++- .../src/loaders/notion-pdf-block.ts | 6 ++++++ packages/document-loaders/src/loaders/notion.ts | 10 ++++++++++ packages/document-loaders/src/loaders/types.ts | 1 + 8 files changed, 47 insertions(+), 2 deletions(-) diff --git a/packages/document-loaders/src/index.ts b/packages/document-loaders/src/index.ts index 101f5c3..eaf46df 100644 --- a/packages/document-loaders/src/index.ts +++ b/packages/document-loaders/src/index.ts @@ -1,6 +1,6 @@ export type { DocumentLoader, DocumentLoaderResult, DocumentMetadata } from './loaders/types.js'; -export { loadDocument } from './loaders/index.js'; +export { loadDocument, loadDocumentMetadata, registerLoader } from './loaders/index.js'; export { notionPdfBlockLoader } from './loaders/notion-pdf-block.js'; export { googleDriveLoader } from './loaders/google-drive.js'; diff --git a/packages/document-loaders/src/loaders/google-docs.ts b/packages/document-loaders/src/loaders/google-docs.ts index ad94cad..9f29e43 100644 --- a/packages/document-loaders/src/loaders/google-docs.ts +++ b/packages/document-loaders/src/loaders/google-docs.ts @@ -11,6 +11,7 @@ const URL_ROOT = 'https://docs.google.com/document/d/'; export const googleDocsLoader: DocumentLoader = { isSupported, loadDocument, + loadMetadata, }; function isSupported(source: string) { diff --git a/packages/document-loaders/src/loaders/google-drive.ts b/packages/document-loaders/src/loaders/google-drive.ts index 005c48c..bcfa2bb 100644 --- 
a/packages/document-loaders/src/loaders/google-drive.ts +++ b/packages/document-loaders/src/loaders/google-drive.ts @@ -11,6 +11,7 @@ const URL_ROOT = 'https://drive.google.com/file/d/'; export const googleDriveLoader: DocumentLoader = { isSupported, loadDocument, + loadMetadata, }; function isSupported(source: string) { @@ -33,6 +34,17 @@ async function loadDocument(url: string) { }; } +async function loadMetadata(url: string) { + const authClient = await getGoogleAuthClient(); + + const fileMetadata = await fetchFileMetadata(authClient, url); + if (!fileMetadata) { + throw new Error(`DocumentLoader/GoogleDrive Document name not found: ${url}`); + } + + return fileMetadata; +} + function formatStandardizedUrl(url: string) { const fileId = extractFileIdFromUrl(url); return `${URL_ROOT}${fileId}`; diff --git a/packages/document-loaders/src/loaders/google-slides.ts b/packages/document-loaders/src/loaders/google-slides.ts index 946985f..f57628e 100644 --- a/packages/document-loaders/src/loaders/google-slides.ts +++ b/packages/document-loaders/src/loaders/google-slides.ts @@ -11,6 +11,7 @@ const URL_ROOT = 'https://docs.google.com/presentation/d/'; export const googleSlidesLoader: DocumentLoader = { isSupported, loadDocument, + loadMetadata, }; function isSupported(source: string) { diff --git a/packages/document-loaders/src/loaders/index.ts b/packages/document-loaders/src/loaders/index.ts index a1e2ab3..80b1a95 100644 --- a/packages/document-loaders/src/loaders/index.ts +++ b/packages/document-loaders/src/loaders/index.ts @@ -1,7 +1,7 @@ import { googleDriveLoader } from './google-drive.js'; import { googleSlidesLoader } from './google-slides.js'; import { googleDocsLoader } from './google-docs.js'; -import { DocumentLoader, DocumentLoaderResult } from './types.js'; +import { DocumentLoader, DocumentLoaderResult, DocumentMetadata } from './types.js'; import { notionLoader } from './notion.js'; import { notionPdfBlockLoader } from './notion-pdf-block.js'; @@ -22,3 
+22,17 @@ export function loadDocument(source: string): Promise<DocumentLoaderResult> { throw new Error(`DocumentLoader Unsupported document type ${source}`); } + +export function loadDocumentMetadata(source: string): Promise<DocumentMetadata> { + for (const loader of loaders) { + if (loader.isSupported(source)) { + return loader.loadMetadata(source); + } + } + + throw new Error(`DocumentLoader Unsupported document type ${source}`); +} + +export function registerLoader(loader: DocumentLoader) { + loaders.push(loader); +} diff --git a/packages/document-loaders/src/loaders/notion-pdf-block.ts b/packages/document-loaders/src/loaders/notion-pdf-block.ts index aad6f36..fe769c7 100644 --- a/packages/document-loaders/src/loaders/notion-pdf-block.ts +++ b/packages/document-loaders/src/loaders/notion-pdf-block.ts @@ -11,12 +11,18 @@ const BLOCK_ID_REGEX = /^[a-z0-9]{32}$/; // 32 characters export const notionPdfBlockLoader: DocumentLoader = { isSupported, loadDocument, + loadMetadata, }; function isSupported(source: string) { return source.startsWith(URL_ROOT) && BLOCK_ID_REGEX.test(getUrlHash(source)); } +async function loadMetadata(url: string) { + const pdfBlock = await fetchPdfBlock(url); + return getMetadataFromBlock(url, pdfBlock); +} + async function loadDocument(url: string) { const pdfBlock = await fetchPdfBlock(url); const metadata = getMetadataFromBlock(url, pdfBlock); diff --git a/packages/document-loaders/src/loaders/notion.ts b/packages/document-loaders/src/loaders/notion.ts index e7dccfe..7835ab0 100644 --- a/packages/document-loaders/src/loaders/notion.ts +++ b/packages/document-loaders/src/loaders/notion.ts @@ -7,6 +7,7 @@ const URL_ROOT = 'https://www.notion.so/'; export const notionLoader: DocumentLoader = { isSupported, loadDocument, + loadMetadata, }; function isSupported(source: string) { @@ -27,6 +28,15 @@ async function loadDocument(url: string) { return { content, metadata }; } +async function loadMetadata(url: string) { + const pageId = extractNotionPageId(url); + if (!pageId) { + throw
new Error(`NotionLoader Invalid Notion URL: ${url}`); + } + + return await fetchPageMetadata(pageId); +} + function extractNotionPageId(url: string) { const pattern = /[0-9a-f]{32}/; const match = url.match(pattern); diff --git a/packages/document-loaders/src/loaders/types.ts b/packages/document-loaders/src/loaders/types.ts index 86fa1c6..2ca5324 100644 --- a/packages/document-loaders/src/loaders/types.ts +++ b/packages/document-loaders/src/loaders/types.ts @@ -1,6 +1,7 @@ export type DocumentLoader = { isSupported: (source: string) => boolean; loadDocument: (source: string) => Promise<DocumentLoaderResult>; + loadMetadata: (source: string) => Promise<DocumentMetadata>; }; export type DocumentLoaderResult = {