Skip to content

Commit

Permalink
Label current version to prevent expiry of original (#271)
Browse files Browse the repository at this point in the history
* Label current version to prevent expiry of original

* Cleanup & Add tests for feature "keep original version"

---------

Co-authored-by: Robin Windey <[email protected]>
  • Loading branch information
XueSheng-GIT and R0Wi authored Dec 3, 2024
1 parent 0777888 commit e1190b9
Show file tree
Hide file tree
Showing 12 changed files with 5,871 additions and 6,886 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ OCR language | The languages to be used for OCR processing. The languages can be
Assign tags after OCR | These tags will be assigned to the file after it has been successfully processed. |
Remove tags after OCR | These tags will be removed from the file after it has been successfully processed. If the file does not have the tag, it will just be skipped. |
OCR mode | Controls the way files are processed, which already have OCR content. For PDF files this setting corresponds to the `--skip-text`, `--redo-ocr` and `--force-ocr` parameters of `ocrmypdf`. See [official docs](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) for additional information.<br>**Skip text:** skip pages completely that already contain text. Such a page will not be touched and just be copied to the final output.<br>**Redo OCR:** perform a detailed text analysis to split up pages into areas with and without text.<br>**Force OCR:** all pages will be rasterized to images and OCR will be performed on every page. |
Keep original file version | If the switch is set, the original file (before applying OCR) will be kept. This is done by giving the file version the label `Before OCR`. This version will be excluded from the automatic expiration process (see [here](https://docs.nextcloud.com/server/latest/user_manual/en/files/version_control.html#naming-a-version) for details). |
Remove background\* | If the switch is set, the OCR processor will try to remove the background of the document before processing and instead set a white background. For PDF files this setting corresponds to the [`--remove-background`](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html?highlight=remove-background#image-processing) parameter of `ocrmypdf`.<br/>:warning: Please note that this flag will currently only work with **`ocrmypdf` versions prior to 13**. It might be added in future versions again. See [here](https://github.com/ocrmypdf/OCRmyPDF/issues/884) for details. :warning:|


Expand Down
4 changes: 4 additions & 0 deletions jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@ const ignorePatterns = [
'@mdi/svg',
'bail',
'comma-separated-tokens',
'ccount',
'char-regex',
'decode-named-character-reference',
'devlop',
'escape-string-regexp',
'hast-.*',
'is-.*',
'longest-streak',
'mdast-.*',
'micromark',
'micromark-.*',
'markdown-table',
'property-information',
'rehype-.*',
'remark-.*',
Expand All @@ -28,6 +31,7 @@ const ignorePatterns = [
'vfile-.*',
'vue-material-design-icons',
'web-namespaces',
'zwitch',
]

module.exports = {
Expand Down
13 changes: 13 additions & 0 deletions lib/Model/WorkflowSettings.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class WorkflowSettings {
/** @var array string */
private $tagsToAddAfterOcr = [];

/** @var bool */
private $keepOriginalFileVersion = false;

/**
* @param string $json The serialized JSON string used in frontend as input for the Vue component
*/
Expand Down Expand Up @@ -91,6 +94,13 @@ public function getTagsToAddAfterOcr(): array {
return $this->tagsToAddAfterOcr;
}

/**
 * Tells whether the "keep original file version" switch is enabled for this workflow.
 *
 * The flag defaults to false and is only overwritten when the settings JSON
 * carries a boolean 'keepOriginalFileVersion' entry.
 *
 * @return bool The current value of the flag
 */
public function getKeepOriginalFileVersion(): bool {
	return $this->keepOriginalFileVersion;
}

/**
* Checks if a new WorkflowSettings object can be constructed from the given JSON string
* @param string $json The serialized JSON string used in frontend as input for the Vue component
Expand Down Expand Up @@ -132,5 +142,8 @@ private function setJson(?string $json = null) {
if (array_key_exists('tagsToAddAfterOcr', $data) && is_array($data['tagsToAddAfterOcr'])) {
$this->tagsToAddAfterOcr = $data['tagsToAddAfterOcr'];
}
if (array_key_exists('keepOriginalFileVersion', $data) && is_bool($data['keepOriginalFileVersion'])) {
$this->keepOriginalFileVersion = $data['keepOriginalFileVersion'];
}
}
}
49 changes: 49 additions & 0 deletions lib/Service/OcrService.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
namespace OCA\WorkflowOcr\Service;

use OC\User\NoUserException;
use OCA\Files_Versions\Versions\IMetadataVersion;
use OCA\Files_Versions\Versions\IMetadataVersionBackend;
use OCA\Files_Versions\Versions\IVersionManager;
use OCA\WorkflowOcr\Exception\OcrResultEmptyException;
use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
use OCA\WorkflowOcr\Model\WorkflowSettings;
Expand All @@ -46,12 +49,18 @@
use Psr\Log\LoggerInterface;

class OcrService implements IOcrService {
private const FILE_VERSION_LABEL_KEY = 'label';
private const FILE_VERSION_LABEL_VALUE = 'Before OCR';

/** @var IOcrProcessorFactory */
private $ocrProcessorFactory;

/** @var IGlobalSettingsService */
private $globalSettingsService;

/** @var IVersionManager */
private $versionManager;

/** @var ISystemTagObjectMapper */
private $systemTagObjectMapper;

Expand Down Expand Up @@ -82,6 +91,7 @@ class OcrService implements IOcrService {
public function __construct(
IOcrProcessorFactory $ocrProcessorFactory,
IGlobalSettingsService $globalSettingsService,
IVersionManager $versionManager,
ISystemTagObjectMapper $systemTagObjectMapper,
IUserManager $userManager,
IFilesystem $filesystem,
Expand All @@ -93,6 +103,7 @@ public function __construct(
LoggerInterface $logger) {
$this->ocrProcessorFactory = $ocrProcessorFactory;
$this->globalSettingsService = $globalSettingsService;
$this->versionManager = $versionManager;
$this->systemTagObjectMapper = $systemTagObjectMapper;
$this->userManager = $userManager;
$this->filesystem = $filesystem;
Expand Down Expand Up @@ -133,6 +144,11 @@ public function runOcrProcess(int $fileId, string $uid, WorkflowSettings $settin

// Only create a new file version if the file OCR result was not empty #130
if ($result->getRecognizedText() !== '') {
if ($settings->getKeepOriginalFileVersion()) {
// Add label to original file to prevent its expiry
$this->setFileVersionsLabel($file, $uid, self::FILE_VERSION_LABEL_VALUE);
}

$newFilePath = $originalFileExtension === $newFileExtension ?
$filePath :
$filePath . '.pdf';
Expand Down Expand Up @@ -225,4 +241,37 @@ private function createNewFileVersion(string $filePath, string $ocrContent, int
$this->processingFileAccessor->setCurrentlyProcessedFileId(null);
}
}

/**
 * Labels the stored version(s) of a file whose timestamp equals the file's
 * current modification time, provided they do not carry a label yet.
 *
 * Versions which are not an IMetadataVersion, or whose backend is not an
 * IMetadataVersionBackend, cannot carry metadata and are skipped with a
 * debug log entry. Versions that already have a label are left untouched.
 *
 * @param File $file The file to set the label for
 * @param string $uid The userId of the file owner
 * @param string $label The label to set
 * @throws NoUserException If no user can be resolved for the given $uid
 */
private function setFileVersionsLabel(File $file, string $uid, string $label): void {
	$fileMTime = $file->getMTime();

	$user = $this->userManager->get($uid);
	if ($user === null) {
		// Fail with a descriptive error instead of handing null to the version manager
		throw new NoUserException("User with uid '$uid' was not found");
	}

	foreach ($this->versionManager->getVersionsForFile($user, $file) as $version) {
		$revisionId = $version->getRevisionId();

		if (!$version instanceof IMetadataVersion) {
			$this->logger->debug('Skipping version with revision id {versionId} because "{versionClass}" is not an IMetadataVersion', ['versionId' => $revisionId, 'versionClass' => get_class($version)]);
			continue;
		}

		$versionBackend = $version->getBackend();
		if (!$versionBackend instanceof IMetadataVersionBackend) {
			$this->logger->debug('Skipping version with revision id {versionId} because its backend "{versionBackendClass}" does not implement IMetadataVersionBackend', ['versionId' => $revisionId, 'versionBackendClass' => get_class($versionBackend)]);
			continue;
		}

		// Only the version matching the file's current mtime is of interest
		if ($version->getTimestamp() !== $fileMTime) {
			continue;
		}

		// Compare explicitly instead of using empty(): a label such as '0' is a
		// real label and must not be overwritten
		$versionLabel = $version->getMetadataValue(self::FILE_VERSION_LABEL_KEY);
		if ($versionLabel !== null && $versionLabel !== '') {
			continue;
		}

		$this->logger->debug('Setting pre OCR label for version with revision id {versionId} on file {fileId}', ['versionId' => $revisionId, 'fileId' => $file->getId()]);
		$versionBackend->setMetadataValue($file, $revisionId, self::FILE_VERSION_LABEL_KEY, $label);
	}
}
}
Loading

0 comments on commit e1190b9

Please sign in to comment.