diff --git a/imixs-archive-documents/README.md b/imixs-archive-documents/README.md index ab182ae..1e35b17 100644 --- a/imixs-archive-documents/README.md +++ b/imixs-archive-documents/README.md @@ -46,6 +46,8 @@ Both, the *OCRDocumentPlugin* as also the *OCRDocumentAdapter* can be configured X-Tika-PDFOcrDPI=72 X-Tika-OCRLanguage=eng+deu (PDF|pdf)$ + 1 + In this example configuration the OCR processing will be started with 4 additional tika options. @@ -76,6 +78,12 @@ Example - parse PDF files only: (PDF|pdf)$ +#### Max Pages of PDF Documents + +With the optional parameter `maxpdfpages` you can control how many pages of a PDF document will be scanned. This parameter can be used to reduce the cost of the very time- and CPU-intensive scan processing of the Tika service. For example, you can set the parameter to 1 to scan only the first page of a PDF document. + + 1 + ### The OCRDocumentService The *OCRDocumentService* is a general service to extract the textual information from file attachments during the processing life cycle independent form a BPMN model. The TikaDocumentService reacts on the CDI event 'BEFORE\_PROCESS' and extracts the data automatically. 
diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentAdapter.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentAdapter.java index 01bb617..6127b40 100644 --- a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentAdapter.java +++ b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentAdapter.java @@ -71,15 +71,17 @@ public ItemCollection execute(ItemCollection document, ItemCollection event) thr try { List tikaOptions = null; String filePattern = null; + int maxPdfPages=0; // read opitonal tika options ItemCollection evalItemCollection = workflowService.evalWorkflowResult(event, "tika", document, false); if (evalItemCollection != null) { tikaOptions = evalItemCollection.getItemValue("options"); filePattern = evalItemCollection.getItemValueString("filepattern"); + maxPdfPages = evalItemCollection.getItemValueInteger("maxpdfpages"); // only for pdf documents } // extract text data.... 
ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions, - filePattern); + filePattern,maxPdfPages); } catch (PluginException e) { String message = "Tika OCRService - unable to extract text: " + e.getMessage(); throw new AdapterException(e.getErrorContext(), e.getErrorCode(), message, e); diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java index 4bb339c..71eae16 100644 --- a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java +++ b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java @@ -58,16 +58,18 @@ public ItemCollection run(ItemCollection document, ItemCollection event) throws if ("model".equalsIgnoreCase(serviceMode)) { List tikaOptions = null; String filePattern = null; + int maxPdfPages=0; // read optional tika options ItemCollection evalItemCollection = this.getWorkflowService().evalWorkflowResult(event, "tika", document, false); if (evalItemCollection != null) { tikaOptions = evalItemCollection.getItemValue("options"); filePattern = evalItemCollection.getItemValueString("filepattern"); + maxPdfPages = evalItemCollection.getItemValueInteger("maxpdfpages"); // only for pdf documents } // update the dms meta data - ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern); + ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern,maxPdfPages); } else { logger.warning("unexpected TIKA_SERVICE_MODE=" + serviceMode + " - running the OCRDocumentAdapter the env TIKA_SERVICE_MODE should be set to 'model'. 
Plugin will be ignored!"); diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java index 9212ad4..dc0f422 100644 --- a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java +++ b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java @@ -1,6 +1,7 @@ package org.imixs.archive.documents; import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; @@ -20,6 +21,7 @@ import javax.ejb.Stateless; import javax.inject.Inject; +import org.apache.pdfbox.pdmodel.PDDocument; import org.eclipse.microprofile.config.inject.ConfigProperty; import org.imixs.workflow.FileData; import org.imixs.workflow.ItemCollection; @@ -102,7 +104,7 @@ public class TikaService { * @throws PluginException */ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException { - extractText(workitem, snapshot, ocrStategy, null, null); + extractText(workitem, snapshot, ocrStategy, null, null, 0); } /** @@ -120,6 +122,11 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws *

* An optional param 'filePattern' can be provided to extract text only from * Attachments mating the given file pattern (regex). + *

+ * The optional param 'maxPdfPages' can be provided to reduce the size of PDF + * documents to a maximum number of pages. This avoids blocking the tika service by + * processing too large documents. For example only the first 5 pages can be + * scanned. * * @param workitem - workitem with file attachments * @param pdf_mode - TEXT_ONLY, OCR_ONLY, TEXT_AND_OCR @@ -128,7 +135,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws * @throws PluginException */ public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrStategy, List options, - String filePatternRegex) throws PluginException { + String filePatternRegex, int maxPdfPages) throws PluginException { boolean debug = logger.isLoggable(Level.FINE); Pattern filePattern = null; @@ -201,7 +208,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String continue; } - textContent = doORCProcessing(originFileData, options); + textContent = doORCProcessing(originFileData, options, maxPdfPages); if (textContent == null) { logger.warning("Unable to extract text-content for '" + fileData.getName() + "'"); @@ -240,7 +247,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String * @return text content * @throws IOException */ - public String doORCProcessing(FileData fileData, List options) throws IOException { + public String doORCProcessing(FileData fileData, List options, int maxPdfPages) throws IOException { boolean debug = logger.isLoggable(Level.FINE); // read the Tika Service Enpoint @@ -264,6 +271,24 @@ public String doORCProcessing(FileData fileData, List options) throws IO return null; } + // remove trailing pages if the page count of a pdf document exceeds maxPdfPages + if (maxPdfPages > 0 && "application/pdf".equals(contentType)) { + PDDocument pdfdoc = PDDocument.load(fileData.getContent()); + if (pdfdoc.getNumberOfPages() > maxPdfPages) { + logger.info("......pdf document '" + fileData.getName() + "' has to many pages 
(max allowed=" + + maxPdfPages + ")"); + while (pdfdoc.getNumberOfPages() > maxPdfPages) { + logger.info("......removing page " + pdfdoc.getNumberOfPages()); + pdfdoc.removePage(pdfdoc.getNumberOfPages() - 1); + } + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + pdfdoc.save(byteArrayOutputStream); + pdfdoc.close(); + // update fileData content.... + fileData.setContent(byteArrayOutputStream.toByteArray()); + } + } + PrintWriter printWriter = null; HttpURLConnection urlConnection = null; PrintWriter writer = null;