Skip to content

Commit

Permalink
Cleanup & Add tests for feature "keep original version"
Browse files Browse the repository at this point in the history
  • Loading branch information
R0Wi committed Dec 3, 2024
1 parent 04722c3 commit f12016f
Show file tree
Hide file tree
Showing 12 changed files with 5,865 additions and 6,900 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ OCR language | The languages to be used for OCR processing. The languages can be
Assign tags after OCR | These tags will be assigned to the file after it has been successfully processed. |
Remove tags after OCR | These tags will be removed from the file after it has been successfully processed. If the file does not have the tag, it will just be skipped. |
OCR mode | Controls the way files are processed, which already have OCR content. For PDF files this setting corresponds to the `--skip-text`, `--redo-ocr` and `--force-ocr` parameters of `ocrmypdf`. See [official docs](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) for additional information.<br>**Skip text:** skip pages completely that already contain text. Such a page will not be touched and just be copied to the final output.<br>**Redo OCR:** perform a detailed text analysis to split up pages into areas with and without text.<br>**Force OCR:** all pages will be rasterized to images and OCR will be performed on every page. |
Keep original file version | If the switch is set, the original file (before applying OCR) will be kept. This is done by giving the file version the label `Before OCR`. This version will be excluded from the automatic expiration process (see [here](https://docs.nextcloud.com/server/latest/user_manual/en/files/version_control.html#naming-a-version) for details). |
Remove background\* | If the switch is set, the OCR processor will try to remove the background of the document before processing and instead set a white background. For PDF files this setting corresponds to the [`--remove-background`](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html?highlight=remove-background#image-processing) parameter of `ocrmypdf`.<br/>:warning: Please note that this flag will currently only work with **`ocrmypdf` versions prior to 13**. It might be added in future versions again. See [here](https://github.com/ocrmypdf/OCRmyPDF/issues/884) for details. :warning:|


Expand Down
4 changes: 4 additions & 0 deletions jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@ const ignorePatterns = [
'@mdi/svg',
'bail',
'comma-separated-tokens',
'ccount',
'char-regex',
'decode-named-character-reference',
'devlop',
'escape-string-regexp',
'hast-.*',
'is-.*',
'longest-streak',
'mdast-.*',
'micromark',
'micromark-.*',
'markdown-table',
'property-information',
'rehype-.*',
'remark-.*',
Expand All @@ -28,6 +31,7 @@ const ignorePatterns = [
'vfile-.*',
'vue-material-design-icons',
'web-namespaces',
'zwitch',
]

module.exports = {
Expand Down
13 changes: 13 additions & 0 deletions lib/Model/WorkflowSettings.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class WorkflowSettings {
/** @var array string */
private $tagsToAddAfterOcr = [];

/** @var bool */
private $keepOriginalFileVersion = false;

/**
* @param string $json The serialized JSON string used in frontend as input for the Vue component
*/
Expand Down Expand Up @@ -91,6 +94,13 @@ public function getTagsToAddAfterOcr(): array {
return $this->tagsToAddAfterOcr;
}

/**
 * Tells whether the original (pre-OCR) version of the file should be kept.
 *
 * The flag defaults to false and is only overwritten when the settings JSON
 * carries a boolean "keepOriginalFileVersion" entry.
 *
 * @return bool true if the original file version shall be kept, false otherwise
 */
public function getKeepOriginalFileVersion(): bool {
return $this->keepOriginalFileVersion;
}

/**
* Checks if a new WorkflowSettings object can be constructed from the given JSON string
* @param string $json The serialized JSON string used in frontend as input for the Vue component
Expand Down Expand Up @@ -132,5 +142,8 @@ private function setJson(?string $json = null) {
if (array_key_exists('tagsToAddAfterOcr', $data) && is_array($data['tagsToAddAfterOcr'])) {
$this->tagsToAddAfterOcr = $data['tagsToAddAfterOcr'];
}
if (array_key_exists('keepOriginalFileVersion', $data) && is_bool($data['keepOriginalFileVersion'])) {
$this->keepOriginalFileVersion = $data['keepOriginalFileVersion'];
}
}
}
57 changes: 43 additions & 14 deletions lib/Service/OcrService.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
namespace OCA\WorkflowOcr\Service;

use OC\User\NoUserException;
use OCA\Files_Versions\Versions\IMetadataVersion;
use OCA\Files_Versions\Versions\IMetadataVersionBackend;
use OCA\Files_Versions\Versions\IVersionManager;
use OCA\WorkflowOcr\Exception\OcrResultEmptyException;
use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
Expand All @@ -47,14 +49,17 @@
use Psr\Log\LoggerInterface;

class OcrService implements IOcrService {
private const FILE_VERSION_LABEL_KEY = 'label';
private const FILE_VERSION_LABEL_VALUE = 'Before OCR';

/** @var IOcrProcessorFactory */
private $ocrProcessorFactory;

/** @var IGlobalSettingsService */
private $globalSettingsService;

/** @var IVersionManager */
private $versionManager;
private $versionManager;

/** @var ISystemTagObjectMapper */
private $systemTagObjectMapper;
Expand Down Expand Up @@ -139,19 +144,10 @@ public function runOcrProcess(int $fileId, string $uid, WorkflowSettings $settin

// Only create a new file version if the file OCR result was not empty #130
if ($result->getRecognizedText() !== '') {
$fileMTime = $file->getMTime();
$user = $this->userManager->get($uid);
$versions = $this->versionManager->getVersionsForFile($user, $file);

foreach ($versions as $version) {
$versionTimestamp = $version->getTimestamp();
$versionLabel = $version->getMetadataValue('label');

if ($fileMTime === $versionTimestamp && ($versionLabel === null || $versionLabel === '')) {
// Add label to current file version to prevent its expiry
$this->versionManager->setMetadataValue($file, $version->getRevisionId(), 'label', 'PreOCR');
}
}
if ($settings->getKeepOriginalFileVersion()) {
// Add label to original file to prevent its expiry
$this->setFileVersionsLabel($file, $uid, self::FILE_VERSION_LABEL_VALUE);
}

$newFilePath = $originalFileExtension === $newFileExtension ?
$filePath :
Expand Down Expand Up @@ -245,4 +241,37 @@ private function createNewFileVersion(string $filePath, string $ocrContent, int
$this->processingFileAccessor->setCurrentlyProcessedFileId(null);
}
}

/**
 * Labels the file version(s) which represent the current state of the given file.
 *
 * A version is labelled only if its timestamp equals the file's mtime and it
 * does not carry a label yet. Versions (or version backends) which do not
 * support metadata are skipped with a debug log entry.
 *
 * @param File $file The file to set the label for
 * @param string $uid The userId of the file owner
 * @param string $label The label to set
 */
private function setFileVersionsLabel(File $file, string $uid, string $label): void {
	$user = $this->userManager->get($uid);
	if ($user === null) {
		// Without a user object the versions cannot be queried, so there is nothing to label
		$this->logger->warning('Could not set file version label on file {fileId} because user "{uid}" was not found', ['fileId' => $file->getId(), 'uid' => $uid]);
		return;
	}

	$fileMTime = $file->getMTime();
	$versions = $this->versionManager->getVersionsForFile($user, $file);

	foreach ($versions as $version) {
		$revisionId = $version->getRevisionId();
		if (!$version instanceof IMetadataVersion) {
			$this->logger->debug('Skipping version with revision id {versionId} because "{versionClass}" is not an IMetadataVersion', ['versionId' => $revisionId, 'versionClass' => get_class($version)]);
			continue;
		}

		$versionBackend = $version->getBackend();
		if (!$versionBackend instanceof IMetadataVersionBackend) {
			$this->logger->debug('Skipping version with revision id {versionId} because its backend "{versionBackendClass}" does not implement IMetadataVersionBackend', ['versionId' => $revisionId, 'versionBackendClass' => get_class($versionBackend)]);
			continue;
		}

		// Only the version which matches the current file state is of interest
		if ($fileMTime !== $version->getTimestamp()) {
			continue;
		}

		// Explicit check instead of empty(): a label like '0' is a real label and must not be overwritten
		$versionLabel = $version->getMetadataValue(self::FILE_VERSION_LABEL_KEY);
		if ($versionLabel !== null && $versionLabel !== '') {
			continue;
		}

		$this->logger->debug('Setting pre OCR label for version with revision id {versionId} on file {fileId}', ['versionId' => $revisionId, 'fileId' => $file->getId()]);
		$versionBackend->setMetadataValue($file, $revisionId, self::FILE_VERSION_LABEL_KEY, $label);
	}
}
}
Loading

0 comments on commit f12016f

Please sign in to comment.