diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx index b62d3a824dcf..bb96bdea62e6 100644 --- a/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx +++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx @@ -49,6 +49,21 @@ const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", { }); ``` +## Eliminating extra spaces + +PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but +if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this: + +```typescript +import { PDFLoader } from "langchain/document_loaders/fs/pdf"; + +const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", { + parsedItemSeparator: "", +}); + +const docs = await loader.load(); +``` + ## Loading directories import CodeBlock from "@theme/CodeBlock"; diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx index 92f56ada6b8a..3bd1e677c17b 100644 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx @@ -29,11 +29,25 @@ npm install pdfjs-dist ```typescript import { WebPDFLoader } from "langchain/document_loaders/web/pdf"; -const loader = new WebPDFLoader( - "src/document_loaders/example_data/example.pdf", - { - // you may need to add `.then(m => m.default)` to the end of the import - pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"), - } -); +const blob = new Blob(); // e.g. from a file input + +const loader = new WebPDFLoader(blob, { + // you may need to add `.then(m => m.default)` to the end of the import + pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"), +}); +``` + +## Eliminating extra spaces + +PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but +if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this: + +```typescript +import { WebPDFLoader } from "langchain/document_loaders/web/pdf"; + +const blob = new Blob(); // e.g. from a file input + +const loader = new WebPDFLoader(blob, { + parsedItemSeparator: "", +}); ``` diff --git a/docs/core_docs/docs/modules/data_connection/document_loaders/how_to/pdf.mdx b/docs/core_docs/docs/modules/data_connection/document_loaders/how_to/pdf.mdx index baa6f3e1484f..c87b5f9e0797 100644 --- a/docs/core_docs/docs/modules/data_connection/document_loaders/how_to/pdf.mdx +++ b/docs/core_docs/docs/modules/data_connection/document_loaders/how_to/pdf.mdx @@ -16,6 +16,10 @@ npm install pdf-parse ```typescript import { PDFLoader } from "langchain/document_loaders/fs/pdf"; +// Or, in web environments: +// import { WebPDFLoader } from "langchain/document_loaders/web/pdf"; +// const blob = new Blob(); // e.g. from a file input +// const loader = new WebPDFLoader(blob); const loader = new PDFLoader("src/document_loaders/example_data/example.pdf"); @@ -52,3 +56,18 @@ const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", { pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"), }); ``` + +## Eliminating extra spaces + +PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but +if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this: + +```typescript +import { PDFLoader } from "langchain/document_loaders/fs/pdf"; + +const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", { + parsedItemSeparator: "", +}); + +const docs = await loader.load(); +``` diff --git a/langchain/src/document_loaders/fs/pdf.ts b/langchain/src/document_loaders/fs/pdf.ts index d4e72c54427e..1fa6c9de40a9 100644 --- a/langchain/src/document_loaders/fs/pdf.ts +++ b/langchain/src/document_loaders/fs/pdf.ts @@ -11,13 +11,20 @@ export class PDFLoader extends BufferLoader { private pdfjs: typeof PDFLoaderImports; + protected parsedItemSeparator: string; + constructor( filePathOrBlob: string | Blob, - { splitPages = true, pdfjs = PDFLoaderImports } = {} + { + splitPages = true, + pdfjs = PDFLoaderImports, + parsedItemSeparator = " ", + } = {} ) { super(filePathOrBlob); this.splitPages = splitPages; this.pdfjs = pdfjs; + this.parsedItemSeparator = parsedItemSeparator; } /** @@ -76,7 +83,7 @@ export class PDFLoader extends BufferLoader { } } - const text = textItems.join(" "); + const text = textItems.join(this.parsedItemSeparator); documents.push( new Document({ diff --git a/langchain/src/document_loaders/web/pdf.ts b/langchain/src/document_loaders/web/pdf.ts index a486f4777358..114447828e7d 100644 --- a/langchain/src/document_loaders/web/pdf.ts +++ b/langchain/src/document_loaders/web/pdf.ts @@ -12,14 +12,21 @@ export class WebPDFLoader extends BaseDocumentLoader { private pdfjs: typeof PDFLoaderImports; + protected parsedItemSeparator: string; + constructor( blob: Blob, - { splitPages = true, pdfjs = PDFLoaderImports } = {} + { + splitPages = true, + pdfjs = PDFLoaderImports, + parsedItemSeparator = " ", + } = {} ) { super(); this.blob = blob; this.splitPages = splitPages ?? this.splitPages; this.pdfjs = pdfjs; + this.parsedItemSeparator = parsedItemSeparator; } /** @@ -61,7 +68,7 @@ export class WebPDFLoader extends BaseDocumentLoader { lastY = item.transform[5]; } } - const text = textItems.join(" "); + const text = textItems.join(this.parsedItemSeparator); documents.push( new Document({