Skip to content

Commit

Permalink
"Refactor PDFLoader to extract text from a PDF file and return it as …
Browse files Browse the repository at this point in the history
…a Document object."
  • Loading branch information
buhe committed Nov 20, 2023
1 parent f50b35e commit 4fbc283
Showing 1 changed file with 24 additions and 19 deletions.
43 changes: 24 additions & 19 deletions Sources/LangChain/document_loaders/PDFLoader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,39 @@
import Foundation
import PDFKit

//if let url = Bundle.main.url(forResource: "sample_pdf", withExtension: "pdf") {
//
//}

public class PDFLoader: BaseLoader {
/// Name of the PDF resource, including its extension (for example `"sample.pdf"`).
/// `_load()` resolves it against `Bundle.main`.
let file_path: String

/// Creates a loader for a PDF resource.
///
/// - Parameters:
///   - file_path: Resource file name with extension, e.g. `"sample.pdf"`.
///   - callbacks: Callback handlers forwarded unchanged to `BaseLoader`.
public init(file_path: String, callbacks: [BaseCallbackHandler] = []) {
    self.file_path = file_path
    super.init(callbacks: callbacks)
}

/// Loads the PDF resource named by `file_path` from the main bundle and returns its text.
///
/// The text of every page is concatenated in page order, with no separator, into a
/// single `Document` whose metadata stores the original `file_path` under `"source"`.
///
/// - Returns: An array containing exactly one `Document`.
/// - Throws: `LangChainError.LoaderError` when the resource cannot be found in
///   `Bundle.main`, or when `PDFDocument` cannot be created from the resolved URL.
public override func _load() async throws -> [Document] {
    // Split at the LAST dot of the file name. Splitting on every dot and indexing
    // [0]/[1] traps when there is no extension and mis-resolves names such as
    // "my.report.pdf". A dot that belongs to a directory component (a "/" follows it)
    // or that leads the name (".pdf") is not treated as an extension separator.
    let resourceName: String
    let resourceExtension: String?
    if let dot = file_path.lastIndex(of: "."),
       dot != file_path.startIndex,
       !file_path[dot...].contains("/") {
        resourceName = String(file_path[..<dot])
        resourceExtension = String(file_path[file_path.index(after: dot)...])
    } else {
        resourceName = file_path
        resourceExtension = nil
    }

    guard let url = Bundle.main.url(forResource: resourceName, withExtension: resourceExtension) else {
        throw LangChainError.LoaderError("PDF not exist")
    }
    guard let pdfDocument = PDFDocument(url: url) else {
        throw LangChainError.LoaderError("Parse PDF file fail.")
    }

    // Pages that are missing or have no extractable string are skipped.
    let extractedText = (0 ..< pdfDocument.pageCount)
        .compactMap { pdfDocument.page(at: $0)?.string }
        .joined()

    return [Document(page_content: extractedText, metadata: ["source": file_path])]
}

Expand Down

0 comments on commit 4fbc283

Please sign in to comment.