implementation, docu

Issue #159
imixs · Dec 2, 2021 · 91b24ec · 91b24ec
1 parent c956a6b
commit 91b24ec
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 6 deletions.
diff --git a/imixs-archive-documents/README.md b/imixs-archive-documents/README.md
@@ -46,6 +46,8 @@ Both, the *OCRDocumentPlugin* as also the *OCRDocumentAdapter* can be configured
 	<tika name="options">X-Tika-PDFOcrDPI=72</tika>
 	<tika name="options">X-Tika-OCRLanguage=eng+deu</tika>
 	<tika name="filepattern">(PDF|pdf)$</tika>
+	<tika name="maxpdfpages">1</tika>
+
 
 In this example configuration the OCR processing will be started with 4 additional tika options. 
 
@@ -76,6 +78,12 @@ Example - parse PDF files only:
 
 	<tika name="filepattern">(PDF|pdf)$</tika>
 
+#### Max Pages of PDF Documents
+
+With the optional parameter `maxpdfpages` you can control how many pages of a PDF document will be scanned. This parameter can be used to reduce the load of the very time- and CPU-intensive scan processing of the Tika service. For example, you can set the parameter to 1 to scan only the first page of a PDF document:
+
+	<tika name="maxpdfpages">1</tika>
+
 ### The OCRDocumentService
 
 The *OCRDocumentService* is a general service to extract the textual information from file attachments during the processing life cycle, independent of a BPMN model. The TikaDocumentService reacts on the CDI event 'BEFORE\_PROCESS' and extracts the data automatically. 

diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentAdapter.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentAdapter.java
@@ -71,15 +71,17 @@ public ItemCollection execute(ItemCollection document, ItemCollection event) thr
             try {
                 List<String> tikaOptions = null;
                 String filePattern = null;
+                int maxPdfPages=0;
                 // read optional tika options
                 ItemCollection evalItemCollection = workflowService.evalWorkflowResult(event, "tika", document, false);
                 if (evalItemCollection != null) {
                     tikaOptions = evalItemCollection.getItemValue("options");
                     filePattern = evalItemCollection.getItemValueString("filepattern");
+                    maxPdfPages = evalItemCollection.getItemValueInteger("maxpdfpages"); // only for pdf documents
                 }
                 // extract text data....
                 ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,
-                        filePattern);
+                        filePattern,maxPdfPages);
             } catch (PluginException e) {
                 String message = "Tika OCRService - unable to extract text: " + e.getMessage();
                 throw new AdapterException(e.getErrorContext(), e.getErrorCode(), message, e);

diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java
@@ -58,16 +58,18 @@ public ItemCollection run(ItemCollection document, ItemCollection event) throws
         if ("model".equalsIgnoreCase(serviceMode)) {
             List<String> tikaOptions = null;
             String filePattern = null;
+            int maxPdfPages=0;
             // read optional tika options
             ItemCollection evalItemCollection = this.getWorkflowService().evalWorkflowResult(event, "tika", document,
                     false);
             if (evalItemCollection != null) {
                 tikaOptions = evalItemCollection.getItemValue("options");
                 filePattern = evalItemCollection.getItemValueString("filepattern");
+                maxPdfPages = evalItemCollection.getItemValueInteger("maxpdfpages"); // only for pdf documents
             }
 
             // update the dms meta data
-            ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern);
+            ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern,maxPdfPages);
         } else {
             logger.warning("unexpected TIKA_SERVICE_MODE=" + serviceMode
                     + " - running the OCRDocumentAdapter the env TIKA_SERVICE_MODE should be set to 'model'. Plugin will be ignored!");

diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java
@@ -1,6 +1,7 @@
 package org.imixs.archive.documents;
 
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
@@ -20,6 +21,7 @@
 import javax.ejb.Stateless;
 import javax.inject.Inject;
 
+import org.apache.pdfbox.pdmodel.PDDocument;
 import org.eclipse.microprofile.config.inject.ConfigProperty;
 import org.imixs.workflow.FileData;
 import org.imixs.workflow.ItemCollection;
@@ -102,7 +104,7 @@ public class TikaService {
      * @throws PluginException
      */
     public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException {
-        extractText(workitem, snapshot, ocrStategy, null, null);
+        extractText(workitem, snapshot, ocrStategy, null, null, 0);
     }
 
     /**
@@ -120,6 +122,11 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws
      * <p>
      * An optional param 'filePattern' can be provided to extract text only from
      * Attachments matching the given file pattern (regex).
+     * <p>
+     * The optional param 'maxPdfPages' can be provided to limit PDF documents
+     * to a maximum number of pages. This avoids blocking the tika service by
+     * processing too large documents. For example only the first 5 pages can be
+     * scanned.
      * 
      * @param workitem         - workitem with file attachments
      * @param pdf_mode         - TEXT_ONLY, OCR_ONLY, TEXT_AND_OCR
@@ -128,7 +135,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws
      * @throws PluginException
      */
     public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrStategy, List<String> options,
-            String filePatternRegex) throws PluginException {
+            String filePatternRegex, int maxPdfPages) throws PluginException {
         boolean debug = logger.isLoggable(Level.FINE);
         Pattern filePattern = null;
 
@@ -201,7 +208,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String
                             continue;
                         }
 
-                        textContent = doORCProcessing(originFileData, options);
+                        textContent = doORCProcessing(originFileData, options, maxPdfPages);
 
                         if (textContent == null) {
                             logger.warning("Unable to extract text-content for '" + fileData.getName() + "'");
@@ -240,7 +247,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String
      * @return text content
      * @throws IOException
      */
-    public String doORCProcessing(FileData fileData, List<String> options) throws IOException {
+    public String doORCProcessing(FileData fileData, List<String> options, int maxPdfPages) throws IOException {
         boolean debug = logger.isLoggable(Level.FINE);
 
         // read the Tika Service Endpoint
@@ -264,6 +271,24 @@ public String doORCProcessing(FileData fileData, List<String> options) throws IO
             return null;
         }
 
+        // remove trailing pages if the page count of a pdf document exceeds maxPdfPages
+        if (maxPdfPages > 0 && "application/pdf".equals(contentType)) {
+            PDDocument pdfdoc = PDDocument.load(fileData.getContent());
+            if (pdfdoc.getNumberOfPages() > maxPdfPages) {
+                logger.info("......pdf document '" + fileData.getName() + "' has to many pages (max allowed="
+                        + maxPdfPages + ")");
+                while (pdfdoc.getNumberOfPages() > maxPdfPages) {
+                    logger.info("......removing page " + pdfdoc.getNumberOfPages());
+                    pdfdoc.removePage(pdfdoc.getNumberOfPages() - 1);
+                }
+                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+                pdfdoc.save(byteArrayOutputStream);
+                pdfdoc.close();
+                // update fileData content....
+                fileData.setContent(byteArrayOutputStream.toByteArray());
+            }
+        }
+
         PrintWriter printWriter = null;
         HttpURLConnection urlConnection = null;
         PrintWriter writer = null;