From d339c8c3bb08e73d1a63ca4f6565c921ed3cfb1a Mon Sep 17 00:00:00 2001
From: Ralph Soika <ralph.soika@imixs.com>
Date: Fri, 6 Nov 2020 21:47:23 +0100
Subject: [PATCH] docu

---
 imixs-archive-documents/README.md             | 23 +++++++++++++++++--
 imixs-archive-ocr/README.md                   | 13 ++++++-----
 .../org/imixs/archive/ocr/OCRService.java     |  2 +-
 3 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/imixs-archive-documents/README.md b/imixs-archive-documents/README.md
index fb7ee34d..8c10ad1a 100644
--- a/imixs-archive-documents/README.md
+++ b/imixs-archive-documents/README.md
@@ -45,9 +45,28 @@ Both, the *OCRDocumentPlugin* as also the *OCRDocumentAdapter* can be configured
 	<!-- Tika Options -->
 	<tika name="options">X-Tika-PDFocrStrategy=OCR_AND_TEXT_EXTRACTION</tika>
 	<tika name="options">X-Tika-PDFOcrImageType=RGB</tika>
-	<tika name="options">X-Tika-PDFOcrDPI=400</tika>
+	<tika name="options">X-Tika-PDFOcrDPI=72</tika>
+	<tika name="options">X-Tika-OCRLanguage=eng+deu</tika>
 
-In this example configuration the OCR processing will be started with 3 additional tika options. For more details about the OCR configuration see the [Imixs-Archive-OCR project](https://github.com/imixs/imixs-archive/tree/master/imixs-archive-ocr).
+In this example configuration the OCR processing is started with 4 additional Tika options. Besides the OCR strategy, these are:
+
+ - X-Tika-PDFOcrImageType=RGB  - set color mode
+ - X-Tika-PDFOcrDPI=72     - set DPI to 72
+ - X-Tika-OCRLanguage=eng+deu  - set OCR languages to English and German
+
+
+#### Overriding the configured language as part of your request
+
+Different requests may need processing using different language models. These can be specified for specific requests using the X-Tika-OCRLanguage custom header. An example of this is shown below:
+
+	X-Tika-OCRLanguage=deu
+
+Or for multiple languages:
+
+	X-Tika-OCRLanguage=eng+fra
+
+
+For more details about the OCR configuration see the [Imixs-Archive-OCR project](https://github.com/imixs/imixs-archive/tree/master/imixs-archive-ocr).
 
 
 ## Searching Documents
diff --git a/imixs-archive-ocr/README.md b/imixs-archive-ocr/README.md
index ba652ef1..aa60add2 100644
--- a/imixs-archive-ocr/README.md
+++ b/imixs-archive-ocr/README.md
@@ -1,7 +1,6 @@
 # Imixs-Archive-OCR
 
-*Imixs-Archive-OCR* is a sub-project of Imixs-Archive. The project provides methods to extract textual information from documents
-attached to a Workitem. The text content of attachments is either extracted by the PDFBox API or by optical character recognition (OCR). This text content is stored in the $file attribute 'text' and can be use for further processing or to search for document content.
+*Imixs-Archive-OCR* is a sub-project of Imixs-Archive. The project is decoupled from the Imixs-Workflow Engine and provides a service component to extract textual information from documents attached to a Workitem. The text content of attachments is either extracted by the PDFBox API or by optical character recognition (OCR). This text content is stored in the $file attribute 'text' and can be used for further processing or to search for document content.
 
 
 ## OCR 
@@ -43,9 +42,9 @@ For example to set the DPI mode call:
 	// define options
 	List<String> options=new ArrayList<String>();
 	options.add("X-Tika-PDFocrStrategy=OCR_AND_TEXT_EXTRACTION");
-	options.add("X-Tika-PDFOcrImageType=RGB");
-	options.add("X-Tika-PDFOcrDPI=400");
-	
+	options.add("X-Tika-PDFOcrImageType=RGB"); 	//  support colors 
+	options.add("X-Tika-PDFOcrDPI=72");    			// set DPI
+	options.add("X-Tika-OCRLanguage=eng"); 			// set english language	
 	// start ocr 
 	tikaDocumentService.extractText(workitem, "TEXT_AND_OCR", options)
 
@@ -53,7 +52,9 @@ For example to set the DPI mode call:
 
 You have various options to configure the Tika server. Find details about how to configure imixs-tika [here](https://github.com/imixs/imixs-docker/tree/master/tika).	
 
-
+ - https://cwiki.apache.org/confluence/display/TIKA/TikaServer
+ - https://cwiki.apache.org/confluence/display/TIKA/TikaOCR
+ - https://cwiki.apache.org/confluence/display/tika/PDFParser%20(Apache%20PDFBox)
 
 
 ## How to Install
diff --git a/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java b/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java
index 13255e9a..bcf48a5e 100644
--- a/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java
+++ b/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java
@@ -122,7 +122,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String
         // validate OCR MODE....
         if ("TEXT_ONLY, OCR_ONLY, TEXT_AND_OCR".indexOf(pdfMode) == -1) {
             throw new PluginException(OCRService.class.getSimpleName(), PLUGIN_ERROR,
-                    "Invalid TIKA_OCR_MODE - exprected one of the following options: TEXT_ONLY | OCR_ONLY | TEXT_AND_OCR");
+                    "Invalid TIKA_OCR_MODE - expected one of the following options: TEXT_ONLY | OCR_ONLY | TEXT_AND_OCR");
         }
 
         long l = System.currentTimeMillis();