From 198db2ef73b19508ccc38354d96c0c987640768d Mon Sep 17 00:00:00 2001 From: Robin Windey Date: Fri, 13 Dec 2024 22:56:41 +0100 Subject: [PATCH] Add custom CLI args for ocrMyPdf (#284) * Introduce new UI text field for custom CLI args * Rework WorkflowOcr.vue for cleaner binding * Refactor WorkflowSettings.php --- README.md | 1 + lib/Model/WorkflowSettings.php | 39 ++++---- lib/OcrProcessors/OcrMyPdfBasedProcessor.php | 12 ++- src/components/WorkflowOcr.vue | 97 ++++++++----------- src/test/components/WorkflowOcr.spec.js | 58 ++++++++--- tests/Unit/Model/WorkflowSettingsTest.php | 2 +- .../OcrProcessors/PdfOcrProcessorTest.php | 32 +++++- 7 files changed, 151 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index 187e3a0..d138bde 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ Remove tags after OCR | These tags will be removed from the file after it has be OCR mode | Controls the way files are processed, which already have OCR content. For PDF files this setting corresponds to the `--skip-text`, `--redo-ocr` and `--force-ocr` parameters of `ocrmypdf`. See [official docs](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) for additional information.
**Skip text:** skip pages completely that already contain text. Such a page will not be touched and just be copied to the final output.
**Redo OCR:** perform a detailed text analysis to split up pages into areas with and without text.
**Force OCR:** all pages will be rasterized to images and OCR will be performed on every page. | Keep original file version | If the switch is set, the original file (before applying OCR) will be kept. This is done by giving the file version the label `Before OC`. This version will be excluded from the automatic expiration process (see [here](https://docs.nextcloud.com/server/latest/user_manual/en/files/version_control.html#naming-a-version) for details) | Remove background\* | If the switch is set, the OCR processor will try to remove the background of the document before processing and instead set a white background. For PDF files this setting corresponds to the [`--remove-background`](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html?highlight=remove-background#image-processing) parameter of `ocrmypdf`.
:warning: Please note that this flag will currently only work with **`ocrmypdf` versions prior to 13**. It might be added in future versions again. See [here](https://github.com/ocrmypdf/OCRmyPDF/issues/884) for details. :warning:| +Custom ocrMyPdf CLI arguments | If you want to pass custom arguments to the `ocrmypdf` CLI, you can do so here. Please note that the arguments will be passed as they are to the CLI, so make sure to use the correct syntax. Check the [official docs](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html) for more information. | \* *For `ocrmypdf` the parameter `--remove-background` is [incompatible with `--redo-ocr`](https://github.com/ocrmypdf/OCRmyPDF/blob/110c75cba25121dcca7e2b91644206cce29e8430/src/ocrmypdf/_validation.py#L104).* diff --git a/lib/Model/WorkflowSettings.php b/lib/Model/WorkflowSettings.php index b2bb224..637d173 100644 --- a/lib/Model/WorkflowSettings.php +++ b/lib/Model/WorkflowSettings.php @@ -52,6 +52,9 @@ class WorkflowSettings { /** @var bool */ private $keepOriginalFileVersion = false; + /** @var string */ + private $customCliArgs = ''; + /** * @param string $json The serialized JSON string used in frontend as input for the Vue component */ @@ -101,6 +104,13 @@ public function getKeepOriginalFileVersion(): bool { return $this->keepOriginalFileVersion; } + /** + * @return string + */ + public function getCustomCliArgs(): string { + return $this->customCliArgs; + } + /** * Checks if a new WorkflowSettings object can be constructed from the given JSON string * @param string $json The serialized JSON string used in frontend as input for the Vue component @@ -127,23 +137,18 @@ private function setJson(?string $json = null) { if ($data === null) { throw new InvalidArgumentException('Invalid JSON: "' . $json . '"'); } - if (array_key_exists('languages', $data) && is_array($data['languages'])) { - $this->languages = $data['languages']; - } - if (array_key_exists('removeBackground', $data) && is_bool($data['removeBackground'])) { - $this->removeBackground = $data['removeBackground']; - } - if (array_key_exists('ocrMode', $data) && is_int($data['ocrMode'])) { - $this->ocrMode = $data['ocrMode']; - } - if (array_key_exists('tagsToRemoveAfterOcr', $data) && is_array($data['tagsToRemoveAfterOcr'])) { - $this->tagsToRemoveAfterOcr = $data['tagsToRemoveAfterOcr']; - } - if (array_key_exists('tagsToAddAfterOcr', $data) && is_array($data['tagsToAddAfterOcr'])) { - $this->tagsToAddAfterOcr = $data['tagsToAddAfterOcr']; - } - if (array_key_exists('keepOriginalFileVersion', $data) && is_bool($data['keepOriginalFileVersion'])) { - $this->keepOriginalFileVersion = $data['keepOriginalFileVersion']; + $this->setProperty($this->languages, $data, 'languages', fn ($value) => is_array($value)); + $this->setProperty($this->removeBackground, $data, 'removeBackground', fn ($value) => is_bool($value)); + $this->setProperty($this->ocrMode, $data, 'ocrMode', fn ($value) => is_int($value)); + $this->setProperty($this->tagsToRemoveAfterOcr, $data, 'tagsToRemoveAfterOcr', fn ($value) => is_array($value)); + $this->setProperty($this->tagsToAddAfterOcr, $data, 'tagsToAddAfterOcr', fn ($value) => is_array($value)); + $this->setProperty($this->keepOriginalFileVersion, $data, 'keepOriginalFileVersion', fn ($value) => is_bool($value)); + $this->setProperty($this->customCliArgs, $data, 'customCliArgs', fn ($value) => is_string($value)); + } + + private function setProperty(& $property, array $jsonData, string $key, ?callable $dataCheck = null): void { + if (array_key_exists($key, $jsonData) && ($dataCheck === null || $dataCheck($jsonData[$key]))) { + $property = $jsonData[$key]; } } } diff --git a/lib/OcrProcessors/OcrMyPdfBasedProcessor.php b/lib/OcrProcessors/OcrMyPdfBasedProcessor.php index 2b9cefe..c4ba20f 100644 --- a/lib/OcrProcessors/OcrMyPdfBasedProcessor.php +++ b/lib/OcrProcessors/OcrMyPdfBasedProcessor.php @@ -147,8 +147,18 @@ private function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $ $args[] = '--sidecar ' . $sidecarFilePath; } - $resultArgs = array_merge($args, $this->getAdditionalCommandlineArgs($settings, $globalSettings)); + $resultArgs = array_filter(array_merge( + $args, + $this->getAdditionalCommandlineArgs($settings, $globalSettings), + [$this->escapeCustomCliArgs($settings->getCustomCliArgs())] + ), fn ($arg) => !empty($arg)); return implode(' ', $resultArgs); } + + private function escapeCustomCliArgs(string $customCliArgs): string { + $customCliArgs = str_replace('&&', '', $customCliArgs); + $customCliArgs = str_replace(';', '', $customCliArgs); + return $customCliArgs; + } } diff --git a/src/components/WorkflowOcr.vue b/src/components/WorkflowOcr.vue index 0bce0df..fb1077b 100644 --- a/src/components/WorkflowOcr.vue +++ b/src/components/WorkflowOcr.vue @@ -34,18 +34,18 @@ - - {{ tagsToAddAfterOcr }} + {{ model.tagsToAddAfterOcr }} - - {{ tagsToRemoveAfterOcr }} + {{ model.tagsToRemoveAfterOcr }} {{ t('workflow_ocr', 'Remove background') }} {{ t('workflow_ocr', 'Keep original file version') }} +
+ + +
@@ -104,7 +110,7 @@ import { tesseractLanguageMapping } from '../constants.js' import { getInstalledLanguages } from '../service/ocrBackendInfoService.js' import SettingsItem from './SettingsItem.vue' -import { NcSelect, NcSelectTags, NcCheckboxRadioSwitch } from '@nextcloud/vue' +import { NcSelect, NcSelectTags, NcCheckboxRadioSwitch, NcTextField } from '@nextcloud/vue' export default { name: 'WorkflowOcr', @@ -112,6 +118,7 @@ export default { NcSelect: NcSelect, NcSelectTags: NcSelectTags, NcCheckboxRadioSwitch: NcCheckboxRadioSwitch, + NcTextField: NcTextField, SettingsItem: SettingsItem, }, props: { @@ -129,15 +136,24 @@ export default { * Model structure which is captured by NC parent as JSON string: * { * languages: [ 'de', 'en' ], - * assignTagsAfterOcr: [1, 2, 3], - * removeTagsAfterOcr: [42, 43], + * tagsToAddAfterOcr: [1, 2, 3], + * tagsToRemoveAfterOcr: [42, 43], * removeBackground: true, * keepOriginalFileVersion: true, * ocrMode: 0, + * customCliArgs: '--rotate-pages-threshold 8', * } * It's initially set after component creation by 'created'-hook. */ - model: {}, + model: { + languages: [], + tagsToAddAfterOcr: [], + tagsToRemoveAfterOcr: [], + removeBackground: false, + keepOriginalFileVersion: false, + ocrMode: 0, + customCliArgs: '', + }, } }, computed: { @@ -151,43 +167,6 @@ export default { }, set: function(langArray) { this.$set(this.model, 'languages', langArray.map(lang => lang.langCode).filter(lang => lang !== null)) - this.modelChanged() - }, - }, - tagsToAddAfterOcr: { - get: function() { - return this.model.tagsToAddAfterOcr ?? [] - }, - set: function(tagIdArray) { - this.$set(this.model, 'tagsToAddAfterOcr', tagIdArray) - this.modelChanged() - }, - }, - tagsToRemoveAfterOcr: { - get: function() { - return this.model.tagsToRemoveAfterOcr ?? [] - }, - set: function(tagIdArray) { - this.$set(this.model, 'tagsToRemoveAfterOcr', tagIdArray) - this.modelChanged() - }, - }, - removeBackground: { - get: function() { - return !!this.model.removeBackground - }, - set: function(checked) { - this.$set(this.model, 'removeBackground', !!checked) - this.modelChanged() - }, - }, - keepOriginalFileVersion: { - get: function() { - return !!this.model.keepOriginalFileVersion - }, - set: function(checked) { - this.$set(this.model, 'keepOriginalFileVersion', !!checked) - this.modelChanged() }, }, ocrMode: { @@ -200,7 +179,6 @@ export default { if (this.model.ocrMode === 1) { this.$set(this.model, 'removeBackground', false) } - this.modelChanged() }, }, selectedLanguagesPlaceholder: function() { @@ -214,13 +192,22 @@ export default { const installedLanguagesCodes = await getInstalledLanguages() this.availableLanguages = tesseractLanguageMapping.filter(lang => installedLanguagesCodes.includes(lang.langCode)) }, - created: function() { - // Set the initial model by applying the JSON value set by parent after initial mount - this.model = this.value ? JSON.parse(this.value) : {} - }, - methods: { - modelChanged: function() { - this.$emit('input', JSON.stringify(this.model)) + watch: { + value: { + immediate: true, + handler(newValue) { + if (newValue) { + // Merge with defaults + this.model = { ...this.model, ...JSON.parse(newValue) } + } + }, + }, + model: { + deep: true, + handler(newValue) { + // Publish serialized model to parent + this.$emit('input', JSON.stringify(this.model)) + }, }, }, } diff --git a/src/test/components/WorkflowOcr.spec.js b/src/test/components/WorkflowOcr.spec.js index 4ef3cf9..b86db2b 100644 --- a/src/test/components/WorkflowOcr.spec.js +++ b/src/test/components/WorkflowOcr.spec.js @@ -153,15 +153,16 @@ describe('Language settings tests', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"removeBackground":true}') + expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') + }) }) describe('Add/remove tags tests', () => { test('Values assignTagsAfterOcr/removeTagsAfterOcr tags are set to empty array if no value was choosen', () => { const wrapper = mount(WorkflowOcr) - expect(wrapper.vm.tagsToAddAfterOcr).toEqual([]) - expect(wrapper.vm.tagsToRemoveAfterOcr).toEqual([]) + expect(wrapper.vm.model.tagsToAddAfterOcr).toEqual([]) + expect(wrapper.vm.model.tagsToRemoveAfterOcr).toEqual([]) }) test('User input for assignTagsAfterOcr is applied correctly on empty component', async () => { @@ -181,7 +182,7 @@ describe('Add/remove tags tests', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":["de"],"removeBackground":true,"tagsToAddAfterOcr":[1,2]}') + expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[1,2],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') }) test('User input for removeTagsAfterOcr is applied correctly on empty component', async () => { @@ -201,14 +202,14 @@ describe('Add/remove tags tests', () => { const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":["de"],"removeBackground":true,"tagsToRemoveAfterOcr":[1,2]}') + expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[1,2],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') }) }) describe('Remove background tests', () => { test('RemoveBackground default is false if value is not set', () => { const wrapper = mount(WorkflowOcr) - expect(wrapper.vm.removeBackground).toBe(false) + expect(wrapper.vm.model.removeBackground).toBe(false) }) test('RemoveBackground default is false if property not set', () => { @@ -217,10 +218,10 @@ describe('Remove background tests', () => { value: '{ "languages": [ "de" ] }', }, }) - expect(wrapper.vm.removeBackground).toBe(false) + expect(wrapper.vm.model.removeBackground).toBe(false) }) - test('Should set removeBackground to false', () => { + test('Should set removeBackground to false', async () => { const wrapper = mount(WorkflowOcr, { propsData: { value: '{ "languages": [ "de" ], "removeBackground": true }', @@ -234,9 +235,11 @@ describe('Remove background tests', () => { // Simulate user input radioSwitch.vm.$emit('update:checked', false) + await wrapper.vm.$nextTick() + const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() - expect(inputEvent[0][0]).toBe('{"languages":["de"],"removeBackground":false}') + expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}') }) }) @@ -246,13 +249,20 @@ describe('OCR mode tests', () => { expect(wrapper.vm.ocrMode).toBe('0') }) - test.each([0, 1, 2])('Should set OCR mode to %i', (mode) => { - const wrapper = mount(WorkflowOcr) + test.each([0, 1, 2])('Should set OCR mode to %i', async (mode) => { + const wrapper = mount(WorkflowOcr, { + propsData: { + // simulate that ocr mode is currently set to something diffferent + value: `{ "ocrMode": ${mode + 1 % 3}}`, + }, + }) const radioButton = wrapper.findComponent({ ref: `ocrMode${mode}` }) // Simulate user click on radiobutton radioButton.vm.$emit('update:checked', mode) + await wrapper.vm.$nextTick() + const inputEvent = wrapper.emitted().input expect(inputEvent).toBeTruthy() expect(inputEvent[0][0]).toContain(`"ocrMode":${mode}`) @@ -307,3 +317,29 @@ describe('OCR mode tests', () => { expect(removeBackgroundSwitchPost.vm.disabled).toBe(false) }) }) + +describe('Custom CLI args test', () => { + test('Default value for customCliArgs is empty string', () => { + const wrapper = mount(WorkflowOcr) + expect(wrapper.vm.model.customCliArgs).toBe('') + }) + + test('Should set input element value to customCliArgs', async () => { + const wrapper = mount(WorkflowOcr, { + propsData: { + value: '{}', + }, + }) + + const textInput = wrapper.findComponent({ ref: 'customCliArgs' }) + + // Simulate user input + textInput.vm.$emit('update:value', '--dpi 300') + + await wrapper.vm.$nextTick() + + const inputEvent = wrapper.emitted().input + expect(inputEvent).toBeTruthy() + expect(inputEvent[0][0]).toBe('{"languages":[],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":"--dpi 300"}') + }) +}) diff --git a/tests/Unit/Model/WorkflowSettingsTest.php b/tests/Unit/Model/WorkflowSettingsTest.php index 6f30f80..a3e1cd1 100644 --- a/tests/Unit/Model/WorkflowSettingsTest.php +++ b/tests/Unit/Model/WorkflowSettingsTest.php @@ -46,7 +46,7 @@ public function testWorkflowSettingsConstructorThrowsInvalidArgumentExceptionOnI public function dataProvider_testConstruction() { return [ [ - '{"removeBackground":true,"languages":["eng","deu","spa","fra","ita"]}', + '{"removeBackground":true,"languages":["eng","deu","spa","fra","ita"],"keepOriginalFileVersion":false}', true, ['eng', 'deu', 'spa', 'fra', 'ita'] ] diff --git a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php index 2e45409..dbcd30e 100644 --- a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php +++ b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php @@ -312,7 +312,7 @@ public function testAppliesSidecarParameterIfSidecarFileCanBeCreated() { public function testAppliesOcrModeParameter(int $simulatedOcrMode, string $expectedOcrMyPdfFlag) { $this->command->expects($this->once()) ->method('setCommand') - ->with('ocrmypdf -q ' . $expectedOcrMyPdfFlag . ' --sidecar /tmp/sidecar.txt - - | cat'); + ->with('ocrmypdf -q' . $expectedOcrMyPdfFlag . '--sidecar /tmp/sidecar.txt - - | cat'); $this->command->expects($this->once()) ->method('execute') ->willReturn(true); @@ -356,12 +356,34 @@ public function testRemoveBackgroundIsNotAppliedIfOcrModeIsRedoOcr() { $processor->ocrFile($this->fileBefore, new WorkflowSettings('{"ocrMode": ' . WorkflowSettings::OCR_MODE_REDO_OCR . ', "removeBackground": true}'), $this->defaultGlobalSettings); } + public function testAppliesCustomCliArgsCorrectly() { + $this->command->expects($this->once()) + ->method('setCommand') + ->with('ocrmypdf -q --skip-text --sidecar /tmp/sidecar.txt --output-type pdf - - | cat'); + $this->command->expects($this->once()) + ->method('execute') + ->willReturn(true); + $this->command->expects($this->once()) + ->method('getOutput') + ->willReturn('someOcrContent'); + $this->sidecarFileAccessor->expects($this->once()) + ->method('getSidecarFileContent') + ->willReturn('someOcrContent'); + $this->sidecarFileAccessor->expects($this->once()) + ->method('getOrCreateSidecarFile') + ->willReturn('/tmp/sidecar.txt'); + + $workflowSettings = new WorkflowSettings('{"customCliArgs": "--output-type pdf"}'); + $processor = new PdfOcrProcessor($this->command, $this->logger, $this->sidecarFileAccessor); + $processor->ocrFile($this->fileBefore, $workflowSettings, $this->defaultGlobalSettings); + } + public function dataProvider_testAppliesOcrModeParameter() { return [ - [WorkflowSettings::OCR_MODE_SKIP_TEXT, '--skip-text'], - [WorkflowSettings::OCR_MODE_REDO_OCR, '--redo-ocr'], - [WorkflowSettings::OCR_MODE_FORCE_OCR, '--force-ocr'], - [WorkflowSettings::OCR_MODE_SKIP_FILE, ''] + [WorkflowSettings::OCR_MODE_SKIP_TEXT, ' --skip-text '], + [WorkflowSettings::OCR_MODE_REDO_OCR, ' --redo-ocr '], + [WorkflowSettings::OCR_MODE_FORCE_OCR, ' --force-ocr '], + [WorkflowSettings::OCR_MODE_SKIP_FILE, ' '] ]; } }