Skip to content

Commit

Permalink
Add custom CLI args for ocrMyPdf (R0Wi-DEV#284)
Browse files Browse the repository at this point in the history
* Introduce new UI text field for custom CLI args
* Rework WorkflowOcr.vue for cleaner binding
* Refactor WorkflowSettings.php
  • Loading branch information
R0Wi authored and XueSheng-GIT committed Dec 14, 2024
1 parent aa81fe1 commit 198db2e
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 90 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ Remove tags after OCR | These tags will be removed from the file after it has be
OCR mode | Controls the way files are processed, which already have OCR content. For PDF files this setting corresponds to the `--skip-text`, `--redo-ocr` and `--force-ocr` parameters of `ocrmypdf`. See [official docs](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) for additional information.<br>**Skip text:** skip pages completely that already contain text. Such a page will not be touched and just be copied to the final output.<br>**Redo OCR:** perform a detailed text analysis to split up pages into areas with and without text.<br>**Force OCR:** all pages will be rasterized to images and OCR will be performed on every page. |
Keep original file version | If the switch is set, the original file (before applying OCR) will be kept. This is done by giving the file version the label `Before OC`. This version will be excluded from the automatic expiration process (see [here](https://docs.nextcloud.com/server/latest/user_manual/en/files/version_control.html#naming-a-version) for details) |
Remove background\* | If the switch is set, the OCR processor will try to remove the background of the document before processing and instead set a white background. For PDF files this setting corresponds to the [`--remove-background`](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html?highlight=remove-background#image-processing) parameter of `ocrmypdf`.<br/>:warning: Please note that this flag will currently only work with **`ocrmypdf` versions prior to 13**. It might be added in future versions again. See [here](https://github.com/ocrmypdf/OCRmyPDF/issues/884) for details. :warning:|
Custom ocrMyPdf CLI arguments | If you want to pass custom arguments to the `ocrmypdf` CLI, you can do so here. Please note that the arguments will be passed as they are to the CLI, so make sure to use the correct syntax. Check the [official docs](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html) for more information. |


\* *For `ocrmypdf` the parameter `--remove-background` is [incompatible with `--redo-ocr`](https://github.com/ocrmypdf/OCRmyPDF/blob/110c75cba25121dcca7e2b91644206cce29e8430/src/ocrmypdf/_validation.py#L104).*
Expand Down
39 changes: 22 additions & 17 deletions lib/Model/WorkflowSettings.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class WorkflowSettings {
/** @var bool */
private $keepOriginalFileVersion = false;

/** @var string */
private $customCliArgs = '';

/**
* @param string $json The serialized JSON string used in frontend as input for the Vue component
*/
Expand Down Expand Up @@ -101,6 +104,13 @@ public function getKeepOriginalFileVersion(): bool {
return $this->keepOriginalFileVersion;
}

/**
 * Returns the custom CLI arguments held by this settings object.
 *
 * The value is the raw string stored in the customCliArgs property; this
 * getter performs no escaping or validation of its own.
 *
 * @return string The stored custom CLI arguments
 */
public function getCustomCliArgs(): string {
return $this->customCliArgs;
}

/**
* Checks if a new WorkflowSettings object can be constructed from the given JSON string
* @param string $json The serialized JSON string used in frontend as input for the Vue component
Expand All @@ -127,23 +137,18 @@ private function setJson(?string $json = null) {
if ($data === null) {
throw new InvalidArgumentException('Invalid JSON: "' . $json . '"');
}
if (array_key_exists('languages', $data) && is_array($data['languages'])) {
$this->languages = $data['languages'];
}
if (array_key_exists('removeBackground', $data) && is_bool($data['removeBackground'])) {
$this->removeBackground = $data['removeBackground'];
}
if (array_key_exists('ocrMode', $data) && is_int($data['ocrMode'])) {
$this->ocrMode = $data['ocrMode'];
}
if (array_key_exists('tagsToRemoveAfterOcr', $data) && is_array($data['tagsToRemoveAfterOcr'])) {
$this->tagsToRemoveAfterOcr = $data['tagsToRemoveAfterOcr'];
}
if (array_key_exists('tagsToAddAfterOcr', $data) && is_array($data['tagsToAddAfterOcr'])) {
$this->tagsToAddAfterOcr = $data['tagsToAddAfterOcr'];
}
if (array_key_exists('keepOriginalFileVersion', $data) && is_bool($data['keepOriginalFileVersion'])) {
$this->keepOriginalFileVersion = $data['keepOriginalFileVersion'];
$this->setProperty($this->languages, $data, 'languages', fn ($value) => is_array($value));
$this->setProperty($this->removeBackground, $data, 'removeBackground', fn ($value) => is_bool($value));
$this->setProperty($this->ocrMode, $data, 'ocrMode', fn ($value) => is_int($value));
$this->setProperty($this->tagsToRemoveAfterOcr, $data, 'tagsToRemoveAfterOcr', fn ($value) => is_array($value));
$this->setProperty($this->tagsToAddAfterOcr, $data, 'tagsToAddAfterOcr', fn ($value) => is_array($value));
$this->setProperty($this->keepOriginalFileVersion, $data, 'keepOriginalFileVersion', fn ($value) => is_bool($value));
$this->setProperty($this->customCliArgs, $data, 'customCliArgs', fn ($value) => is_string($value));
}

/**
 * Copies a single entry of the decoded JSON data into the given property.
 *
 * The property is left untouched when the key is absent from the data or
 * when the optional check callback rejects the value.
 *
 * @param mixed $property Target property, passed by reference and overwritten on success
 * @param array $jsonData Decoded JSON data to read from
 * @param string $key Key to look up in $jsonData
 * @param callable|null $dataCheck Optional predicate receiving the value; must return a truthy result for the value to be applied
 */
private function setProperty(& $property, array $jsonData, string $key, ?callable $dataCheck = null): void {
	// Nothing to apply if the key is missing (a present key with a null value still counts).
	if (!array_key_exists($key, $jsonData)) {
		return;
	}

	$candidate = $jsonData[$key];

	// Reject values which do not pass the caller-supplied check.
	if ($dataCheck !== null && !$dataCheck($candidate)) {
		return;
	}

	$property = $candidate;
}
}
12 changes: 11 additions & 1 deletion lib/OcrProcessors/OcrMyPdfBasedProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,18 @@ private function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $
$args[] = '--sidecar ' . $sidecarFilePath;
}

$resultArgs = array_merge($args, $this->getAdditionalCommandlineArgs($settings, $globalSettings));
$resultArgs = array_filter(array_merge(
$args,
$this->getAdditionalCommandlineArgs($settings, $globalSettings),
[$this->escapeCustomCliArgs($settings->getCustomCliArgs())]
), fn ($arg) => !empty($arg));

return implode(' ', $resultArgs);
}

/**
 * Sanitizes the user-supplied custom CLI arguments before they are joined
 * into the command line string.
 *
 * Despite its name, this method does not quote anything. It strips shell
 * control characters so the custom arguments cannot chain, pipe, background,
 * substitute or redirect commands. Ordinary arguments such as "--dpi 300"
 * are returned unchanged.
 *
 * @param string $customCliArgs Raw custom arguments as entered by the user
 * @return string The arguments with shell control characters removed
 */
private function escapeCustomCliArgs(string $customCliArgs): string {
	// strtr() with an array works in a single pass and never re-scans its own
	// output. Sequential str_replace() calls could re-create an operator:
	// "&;&" turned into "&&" once the ";" had been stripped.
	return strtr($customCliArgs, [
		';' => '',   // command separator
		'&' => '',   // "&&" chaining and "&" backgrounding
		'|' => '',   // pipes and "||" chaining
		'`' => '',   // backtick command substitution
		'$' => '',   // "$(...)" substitution and variable expansion
		'<' => '',   // input redirection
		'>' => '',   // output redirection
		"\0" => '',
		// Line breaks also terminate a shell command; keep the arguments
		// separated by turning them into plain spaces.
		"\n" => ' ',
		"\r" => ' ',
	]);
}
}
97 changes: 42 additions & 55 deletions src/components/WorkflowOcr.vue
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,18 @@
</SettingsItem>
<SettingsItem :label="t('workflow_ocr', 'Assign tags after OCR')"
:info-text="t('workflow_ocr', 'These tags will be assigned to the file after OCR processing has finished')">
<NcSelectTags v-model="tagsToAddAfterOcr"
<NcSelectTags v-model="model.tagsToAddAfterOcr"
:labelOutside="true"
:multiple="true">
{{ tagsToAddAfterOcr }}
{{ model.tagsToAddAfterOcr }}
</NcSelectTags>
</SettingsItem>
<SettingsItem :label="t('workflow_ocr', 'Remove tags after OCR')"
:info-text="t('workflow_ocr', 'These tags will be removed from the file after OCR processing has finished')">
<NcSelectTags v-model="tagsToRemoveAfterOcr"
<NcSelectTags v-model="model.tagsToRemoveAfterOcr"
:labelOutside="true"
:multiple="true">
{{ tagsToRemoveAfterOcr }}
{{ model.tagsToRemoveAfterOcr }}
</NcSelectTags>
</SettingsItem>
<SettingsItem :label="t('workflow_ocr', 'OCR mode')"
Expand Down Expand Up @@ -85,17 +85,23 @@
<div>
<NcCheckboxRadioSwitch ref="removeBackgroundSwitch"
:disabled="removeBackgroundDisabled"
:checked.sync="removeBackground"
:checked.sync="model.removeBackground"
type="switch">
{{ t('workflow_ocr', 'Remove background') }}
</NcCheckboxRadioSwitch>
<NcCheckboxRadioSwitch ref="keepOriginalFileVersionSwitch"
:checked.sync="keepOriginalFileVersion"
:checked.sync="model.keepOriginalFileVersion"
type="switch">
{{ t('workflow_ocr', 'Keep original file version') }}
</NcCheckboxRadioSwitch>
</div>
</SettingsItem>
<div>
<NcTextField :value.sync="model.customCliArgs"
:label="t('workflow_ocr', 'Custom ocrMyPdf CLI arguments')"
ref="customCliArgs">
</NcTextField>
</div>
</div>
</template>

Expand All @@ -104,14 +110,15 @@
import { tesseractLanguageMapping } from '../constants.js'
import { getInstalledLanguages } from '../service/ocrBackendInfoService.js'
import SettingsItem from './SettingsItem.vue'
import { NcSelect, NcSelectTags, NcCheckboxRadioSwitch } from '@nextcloud/vue'
import { NcSelect, NcSelectTags, NcCheckboxRadioSwitch, NcTextField } from '@nextcloud/vue'

export default {
name: 'WorkflowOcr',
components: {
NcSelect: NcSelect,
NcSelectTags: NcSelectTags,
NcCheckboxRadioSwitch: NcCheckboxRadioSwitch,
NcTextField: NcTextField,
SettingsItem: SettingsItem,
},
props: {
Expand All @@ -129,15 +136,24 @@ export default {
* Model structure which is captured by NC parent as JSON string:
* {
* languages: [ 'de', 'en' ],
* assignTagsAfterOcr: [1, 2, 3],
* removeTagsAfterOcr: [42, 43],
* tagsToAddAfterOcr: [1, 2, 3],
* tagsToRemoveAfterOcr: [42, 43],
* removeBackground: true,
* keepOriginalFileVersion: true,
* ocrMode: 0,
* customCliArgs: '--rotate-pages-threshold 8',
* }
* It's initially set after component creation by 'created'-hook.
*/
model: {},
model: {
languages: [],
tagsToAddAfterOcr: [],
tagsToRemoveAfterOcr: [],
removeBackground: false,
keepOriginalFileVersion: false,
ocrMode: 0,
customCliArgs: '',
},
}
},
computed: {
Expand All @@ -151,43 +167,6 @@ export default {
},
set: function(langArray) {
this.$set(this.model, 'languages', langArray.map(lang => lang.langCode).filter(lang => lang !== null))
this.modelChanged()
},
},
tagsToAddAfterOcr: {
get: function() {
return this.model.tagsToAddAfterOcr ?? []
},
set: function(tagIdArray) {
this.$set(this.model, 'tagsToAddAfterOcr', tagIdArray)
this.modelChanged()
},
},
tagsToRemoveAfterOcr: {
get: function() {
return this.model.tagsToRemoveAfterOcr ?? []
},
set: function(tagIdArray) {
this.$set(this.model, 'tagsToRemoveAfterOcr', tagIdArray)
this.modelChanged()
},
},
removeBackground: {
get: function() {
return !!this.model.removeBackground
},
set: function(checked) {
this.$set(this.model, 'removeBackground', !!checked)
this.modelChanged()
},
},
keepOriginalFileVersion: {
get: function() {
return !!this.model.keepOriginalFileVersion
},
set: function(checked) {
this.$set(this.model, 'keepOriginalFileVersion', !!checked)
this.modelChanged()
},
},
ocrMode: {
Expand All @@ -200,7 +179,6 @@ export default {
if (this.model.ocrMode === 1) {
this.$set(this.model, 'removeBackground', false)
}
this.modelChanged()
},
},
selectedLanguagesPlaceholder: function() {
Expand All @@ -214,13 +192,22 @@ export default {
const installedLanguagesCodes = await getInstalledLanguages()
this.availableLanguages = tesseractLanguageMapping.filter(lang => installedLanguagesCodes.includes(lang.langCode))
},
created: function() {
// Set the initial model by applying the JSON value set by parent after initial mount
this.model = this.value ? JSON.parse(this.value) : {}
},
methods: {
modelChanged: function() {
this.$emit('input', JSON.stringify(this.model))
watch: {
value: {
immediate: true,
handler(newValue) {
if (newValue) {
// Merge with defaults
this.model = { ...this.model, ...JSON.parse(newValue) }
}
},
},
model: {
deep: true,
handler(newValue) {
// Publish serialized model to parent
this.$emit('input', JSON.stringify(this.model))
},
},
},
}
Expand Down
58 changes: 47 additions & 11 deletions src/test/components/WorkflowOcr.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,16 @@ describe('Language settings tests', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"removeBackground":true}')
expect(inputEvent[0][0]).toBe('{"languages":["de","en"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')

})
})

describe('Add/remove tags tests', () => {
test('Values assignTagsAfterOcr/removeTagsAfterOcr tags are set to empty array if no value was choosen', () => {
const wrapper = mount(WorkflowOcr)
expect(wrapper.vm.tagsToAddAfterOcr).toEqual([])
expect(wrapper.vm.tagsToRemoveAfterOcr).toEqual([])
expect(wrapper.vm.model.tagsToAddAfterOcr).toEqual([])
expect(wrapper.vm.model.tagsToRemoveAfterOcr).toEqual([])
})

test('User input for assignTagsAfterOcr is applied correctly on empty component', async () => {
Expand All @@ -181,7 +182,7 @@ describe('Add/remove tags tests', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de"],"removeBackground":true,"tagsToAddAfterOcr":[1,2]}')
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[1,2],"tagsToRemoveAfterOcr":[],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')
})

test('User input for removeTagsAfterOcr is applied correctly on empty component', async () => {
Expand All @@ -201,14 +202,14 @@ describe('Add/remove tags tests', () => {

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de"],"removeBackground":true,"tagsToRemoveAfterOcr":[1,2]}')
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[1,2],"removeBackground":true,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')
})
})

describe('Remove background tests', () => {
test('RemoveBackground default is false if value is not set', () => {
const wrapper = mount(WorkflowOcr)
expect(wrapper.vm.removeBackground).toBe(false)
expect(wrapper.vm.model.removeBackground).toBe(false)
})

test('RemoveBackground default is false if property not set', () => {
Expand All @@ -217,10 +218,10 @@ describe('Remove background tests', () => {
value: '{ "languages": [ "de" ] }',
},
})
expect(wrapper.vm.removeBackground).toBe(false)
expect(wrapper.vm.model.removeBackground).toBe(false)
})

test('Should set removeBackground to false', () => {
test('Should set removeBackground to false', async () => {
const wrapper = mount(WorkflowOcr, {
propsData: {
value: '{ "languages": [ "de" ], "removeBackground": true }',
Expand All @@ -234,9 +235,11 @@ describe('Remove background tests', () => {
// Simulate user input
radioSwitch.vm.$emit('update:checked', false)

await wrapper.vm.$nextTick()

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toBe('{"languages":["de"],"removeBackground":false}')
expect(inputEvent[0][0]).toBe('{"languages":["de"],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":""}')
})
})

Expand All @@ -246,13 +249,20 @@ describe('OCR mode tests', () => {
expect(wrapper.vm.ocrMode).toBe('0')
})

test.each([0, 1, 2])('Should set OCR mode to %i', (mode) => {
const wrapper = mount(WorkflowOcr)
test.each([0, 1, 2])('Should set OCR mode to %i', async (mode) => {
const wrapper = mount(WorkflowOcr, {
propsData: {
// simulate that ocr mode is currently set to something different
value: `{ "ocrMode": ${mode + 1 % 3}}`,
},
})
const radioButton = wrapper.findComponent({ ref: `ocrMode${mode}` })

// Simulate user click on radiobutton
radioButton.vm.$emit('update:checked', mode)

await wrapper.vm.$nextTick()

const inputEvent = wrapper.emitted().input
expect(inputEvent).toBeTruthy()
expect(inputEvent[0][0]).toContain(`"ocrMode":${mode}`)
Expand Down Expand Up @@ -307,3 +317,29 @@ describe('OCR mode tests', () => {
expect(removeBackgroundSwitchPost.vm.disabled).toBe(false)
})
})

// Covers the customCliArgs text field: its default value and the JSON emitted after user input.
describe('Custom CLI args test', () => {
	test('Default value for customCliArgs is empty string', () => {
		// Mounted without a value prop, so the model keeps its defaults.
		const { vm } = mount(WorkflowOcr)
		expect(vm.model.customCliArgs).toBe('')
	})

	test('Should set input element value to customCliArgs', async () => {
		const propsData = { value: '{}' }
		const wrapper = mount(WorkflowOcr, { propsData })

		// Simulate user input on the text field
		const customCliArgsField = wrapper.findComponent({ ref: 'customCliArgs' })
		customCliArgsField.vm.$emit('update:value', '--dpi 300')

		// The assertion below expects the input event only after the next tick.
		await wrapper.vm.$nextTick()

		const emittedInput = wrapper.emitted().input
		expect(emittedInput).toBeTruthy()

		const [[serializedModel]] = emittedInput
		expect(serializedModel).toBe('{"languages":[],"tagsToAddAfterOcr":[],"tagsToRemoveAfterOcr":[],"removeBackground":false,"keepOriginalFileVersion":false,"ocrMode":0,"customCliArgs":"--dpi 300"}')
	})
})
2 changes: 1 addition & 1 deletion tests/Unit/Model/WorkflowSettingsTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public function testWorkflowSettingsConstructorThrowsInvalidArgumentExceptionOnI
public function dataProvider_testConstruction() {
return [
[
'{"removeBackground":true,"languages":["eng","deu","spa","fra","ita"]}',
'{"removeBackground":true,"languages":["eng","deu","spa","fra","ita"],"keepOriginalFileVersion":false}',
true,
['eng', 'deu', 'spa', 'fra', 'ita']
]
Expand Down
Loading

0 comments on commit 198db2e

Please sign in to comment.