Hotfix pdf (#3306)

* fix: remove blank space between words when using pdf-loader
* fix: add skipBlank option in pdf-loader
* Change PDF loader instance method name, add docs
* Formatting

---------

Co-authored-by: ppxu <[email protected]>
langchain-ai · Nov 17, 2023 · 6d91724 · 6d91724 · vercel · Nov 17, 2023
1 parent bb9fb67
commit 6d91724
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 11 deletions.
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx
@@ -49,6 +49,21 @@ const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
 });
 ```
 
+## Eliminating extra spaces
+
+PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but
+if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:
+
+```typescript
+import { PDFLoader } from "langchain/document_loaders/fs/pdf";
+
+const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
+  parsedItemSeparator: "",
+});
+
+const docs = await loader.load();
+```
+
 ## Loading directories
 
 import CodeBlock from "@theme/CodeBlock";

diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx
@@ -29,11 +29,25 @@ npm install pdfjs-dist
 ```typescript
 import { WebPDFLoader } from "langchain/document_loaders/web/pdf";
 
-const loader = new WebPDFLoader(
-  "src/document_loaders/example_data/example.pdf",
-  {
-    // you may need to add `.then(m => m.default)` to the end of the import
-    pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"),
-  }
-);
+const blob = new Blob(); // e.g. from a file input
+
+const loader = new WebPDFLoader(blob, {
+  // you may need to add `.then(m => m.default)` to the end of the import
+  pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"),
+});
+```
+
+## Eliminating extra spaces
+
+PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but
+if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:
+
+```typescript
+import { WebPDFLoader } from "langchain/document_loaders/web/pdf";
+
+const blob = new Blob(); // e.g. from a file input
+
+const loader = new WebPDFLoader(blob, {
+  parsedItemSeparator: "",
+});
 ```
diff --git a/docs/core_docs/docs/modules/data_connection/document_loaders/how_to/pdf.mdx b/docs/core_docs/docs/modules/data_connection/document_loaders/how_to/pdf.mdx
@@ -16,6 +16,10 @@ npm install pdf-parse
 
 ```typescript
 import { PDFLoader } from "langchain/document_loaders/fs/pdf";
+// Or, in web environments:
+// import { WebPDFLoader } from "langchain/document_loaders/web/pdf";
+// const blob = new Blob(); // e.g. from a file input
+// const loader = new WebPDFLoader(blob);
 
 const loader = new PDFLoader("src/document_loaders/example_data/example.pdf");
 
@@ -52,3 +56,18 @@ const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
   pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"),
 });
 ```
+
+## Eliminating extra spaces
+
+PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but
+if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:
+
+```typescript
+import { PDFLoader } from "langchain/document_loaders/fs/pdf";
+
+const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
+  parsedItemSeparator: "",
+});
+
+const docs = await loader.load();
+```
diff --git a/langchain/src/document_loaders/fs/pdf.ts b/langchain/src/document_loaders/fs/pdf.ts
@@ -11,13 +11,20 @@ export class PDFLoader extends BufferLoader {
 
   private pdfjs: typeof PDFLoaderImports;
 
+  protected parsedItemSeparator: string;
+
   constructor(
     filePathOrBlob: string | Blob,
-    { splitPages = true, pdfjs = PDFLoaderImports } = {}
+    {
+      splitPages = true,
+      pdfjs = PDFLoaderImports,
+      parsedItemSeparator = " ",
+    } = {}
   ) {
     super(filePathOrBlob);
     this.splitPages = splitPages;
     this.pdfjs = pdfjs;
+    this.parsedItemSeparator = parsedItemSeparator;
   }
 
   /**
@@ -76,7 +83,7 @@ export class PDFLoader extends BufferLoader {
         }
       }
 
-      const text = textItems.join(" ");
+      const text = textItems.join(this.parsedItemSeparator);
 
       documents.push(
         new Document({

diff --git a/langchain/src/document_loaders/web/pdf.ts b/langchain/src/document_loaders/web/pdf.ts
@@ -12,14 +12,21 @@ export class WebPDFLoader extends BaseDocumentLoader {
 
   private pdfjs: typeof PDFLoaderImports;
 
+  protected parsedItemSeparator: string;
+
   constructor(
     blob: Blob,
-    { splitPages = true, pdfjs = PDFLoaderImports } = {}
+    {
+      splitPages = true,
+      pdfjs = PDFLoaderImports,
+      parsedItemSeparator = " ",
+    } = {}
   ) {
     super();
     this.blob = blob;
     this.splitPages = splitPages ?? this.splitPages;
     this.pdfjs = pdfjs;
+    this.parsedItemSeparator = parsedItemSeparator;
   }
 
   /**
@@ -61,7 +68,7 @@ export class WebPDFLoader extends BaseDocumentLoader {
           lastY = item.transform[5];
         }
       }
-      const text = textItems.join(" ");
+      const text = textItems.join(this.parsedItemSeparator);
 
       documents.push(
         new Document({