Skip to content

Commit

Permalink
implementation, docu
Browse files Browse the repository at this point in the history
Issue #159
  • Loading branch information
rsoika committed Dec 2, 2021
1 parent c956a6b commit 91b24ec
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 6 deletions.
8 changes: 8 additions & 0 deletions imixs-archive-documents/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ Both, the *OCRDocumentPlugin* as also the *OCRDocumentAdapter* can be configured
<tika name="options">X-Tika-PDFOcrDPI=72</tika>
<tika name="options">X-Tika-OCRLanguage=eng+deu</tika>
<tika name="filepattern">(PDF|pdf)$</tika>
<tika name="maxpdfpages">1</tika>


In this example configuration the OCR processing will be started with 4 additional tika options.

Expand Down Expand Up @@ -76,6 +78,12 @@ Example - parse PDF files only:

<tika name="filepattern">(PDF|pdf)$</tika>

#### Max Pages of PDF Documents

With the optional parameter `maxpdfpages` you can control how many pages of a PDF document will be scanned. This parameter can be used to reduce the load of the very time- and CPU-intensive scan processing of the Tika service. For example, you can set the parameter to 1 to scan only the first page of a PDF document.

<tika name="maxpdfpages">1</tika>

### The OCRDocumentService

The *OCRDocumentService* is a general service to extract the textual information from file attachments during the processing life cycle independent from a BPMN model. The TikaDocumentService reacts on the CDI event 'BEFORE\_PROCESS' and extracts the data automatically.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,17 @@ public ItemCollection execute(ItemCollection document, ItemCollection event) thr
try {
List<String> tikaOptions = null;
String filePattern = null;
int maxPdfPages=0;
// read optional tika options
ItemCollection evalItemCollection = workflowService.evalWorkflowResult(event, "tika", document, false);
if (evalItemCollection != null) {
tikaOptions = evalItemCollection.getItemValue("options");
filePattern = evalItemCollection.getItemValueString("filepattern");
maxPdfPages = evalItemCollection.getItemValueInteger("maxpdfpages"); // only for pdf documents
}
// extract text data....
ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,
filePattern);
filePattern,maxPdfPages);
} catch (PluginException e) {
String message = "Tika OCRService - unable to extract text: " + e.getMessage();
throw new AdapterException(e.getErrorContext(), e.getErrorCode(), message, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,18 @@ public ItemCollection run(ItemCollection document, ItemCollection event) throws
if ("model".equalsIgnoreCase(serviceMode)) {
List<String> tikaOptions = null;
String filePattern = null;
int maxPdfPages=0;
// read optional tika options
ItemCollection evalItemCollection = this.getWorkflowService().evalWorkflowResult(event, "tika", document,
false);
if (evalItemCollection != null) {
tikaOptions = evalItemCollection.getItemValue("options");
filePattern = evalItemCollection.getItemValueString("filepattern");
maxPdfPages = evalItemCollection.getItemValueInteger("maxpdfpages"); // only for pdf documents
}

// update the dms meta data
ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern);
ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern,maxPdfPages);
} else {
logger.warning("unexpected TIKA_SERVICE_MODE=" + serviceMode
+ " - running the OCRDocumentAdapter the env TIKA_SERVICE_MODE should be set to 'model'. Plugin will be ignored!");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.imixs.archive.documents;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
Expand All @@ -20,6 +21,7 @@
import javax.ejb.Stateless;
import javax.inject.Inject;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
Expand Down Expand Up @@ -102,7 +104,7 @@ public class TikaService {
* @throws PluginException
*/
public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException {
extractText(workitem, snapshot, ocrStategy, null, null);
extractText(workitem, snapshot, ocrStategy, null, null, 0);
}

/**
Expand All @@ -120,6 +122,11 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws
* <p>
* An optional param 'filePattern' can be provided to extract text only from
* Attachments matching the given file pattern (regex).
* <p>
* The optional param 'maxPdfPages' can be provided to limit PDF
* documents to a maximum number of pages. This avoids blocking the tika service by
* processing too large documents. For example only the first 5 pages can be
* scanned.
*
* @param workitem - workitem with file attachments
* @param pdf_mode - TEXT_ONLY, OCR_ONLY, TEXT_AND_OCR
Expand All @@ -128,7 +135,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws
* @throws PluginException
*/
public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrStategy, List<String> options,
String filePatternRegex) throws PluginException {
String filePatternRegex, int maxPdfPages) throws PluginException {
boolean debug = logger.isLoggable(Level.FINE);
Pattern filePattern = null;

Expand Down Expand Up @@ -201,7 +208,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String
continue;
}

textContent = doORCProcessing(originFileData, options);
textContent = doORCProcessing(originFileData, options, maxPdfPages);

if (textContent == null) {
logger.warning("Unable to extract text-content for '" + fileData.getName() + "'");
Expand Down Expand Up @@ -240,7 +247,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String
* @return text content
* @throws IOException
*/
public String doORCProcessing(FileData fileData, List<String> options) throws IOException {
public String doORCProcessing(FileData fileData, List<String> options, int maxPdfPages) throws IOException {
boolean debug = logger.isLoggable(Level.FINE);

// read the Tika Service Endpoint
Expand All @@ -264,6 +271,24 @@ public String doORCProcessing(FileData fileData, List<String> options) throws IO
return null;
}

// remove trailing pages if the page count of a pdf document exceeds maxPdfPages
if (maxPdfPages > 0 && "application/pdf".equals(contentType)) {
PDDocument pdfdoc = PDDocument.load(fileData.getContent());
if (pdfdoc.getNumberOfPages() > maxPdfPages) {
logger.info("......pdf document '" + fileData.getName() + "' has to many pages (max allowed="
+ maxPdfPages + ")");
while (pdfdoc.getNumberOfPages() > maxPdfPages) {
logger.info("......removing page " + pdfdoc.getNumberOfPages());
pdfdoc.removePage(pdfdoc.getNumberOfPages() - 1);
}
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
pdfdoc.save(byteArrayOutputStream);
pdfdoc.close();
// update fileData content....
fileData.setContent(byteArrayOutputStream.toByteArray());
}
}

PrintWriter printWriter = null;
HttpURLConnection urlConnection = null;
PrintWriter writer = null;
Expand Down

0 comments on commit 91b24ec

Please sign in to comment.