Skip to content

Commit

Permalink
Add Verbose arg to oce image-ocr-extract-text (#37372)
Browse files Browse the repository at this point in the history
* Added release notes

* added UT

* added description to the command

* added to the release notes
  • Loading branch information
omerKarkKatz authored Nov 26, 2024
1 parent c9f9914 commit 5a76ff0
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 6 deletions.
12 changes: 9 additions & 3 deletions Packs/ImageOCR/Integrations/ImageOCR/ImageOCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@ def list_languages() -> list[str]:
return sorted(lines[1:]) # ignore first line


def extract_text(image_path: str, languages: list[str] = None) -> str:
exe_params = [TESSERACT_EXE, "-v", image_path, 'stdout']
def extract_text(image_path: str, languages: "list[str] | None" = None, verbose: bool = False) -> str:
    """Run tesseract on an image and return what it writes to stdout.

    Args:
        image_path: Path of the image file handed to tesseract.
        languages: Language codes; when non-empty they are joined with ``+``
            and passed through ``-l``. ``None`` or an empty list omits the flag.
        verbose: When true, ``-v`` is appended to the command line.

    Returns:
        The captured stdout of the tesseract process, as text.

    Raises:
        subprocess.CalledProcessError: If tesseract exits with a non-zero
            status (the process is run with ``check=True``).
    """
    exe_params = [TESSERACT_EXE, image_path, 'stdout']
    if verbose:
        exe_params.append("-v")

    if languages:
        exe_params.extend(['-l', '+'.join(languages)])

    res = subprocess.run(exe_params, capture_output=True, check=True, text=True)
    if res.stderr:
        # A zero exit status can still come with warnings on stderr; keep them in the debug log.
        demisto.debug(f'tesseract returned ok but stderr contains warnings: {res.stderr}')

    return res.stdout


Expand All @@ -37,6 +42,7 @@ def list_languages_command() -> CommandResults:

def extract_text_command(args: dict, instance_languages: list, skip_corrupted: bool) -> tuple[list, list]:
langs = argToList(args.get('langs')) or instance_languages
verbose = argToBoolean(args.get('verbose', False))
demisto.debug(f"Using langs settings: {langs}")
results, errors = [], []

Expand All @@ -48,7 +54,7 @@ def extract_text_command(args: dict, instance_languages: list, skip_corrupted: b
raise DemistoException(f"Couldn't find entry id: {entry_id}")

demisto.debug(f'Extracting text from file: {file_path}')
res = extract_text(file_path['path'], langs)
res = extract_text(file_path['path'], langs, verbose)
file_entry = {'EntryID': entry_id, 'Text': res}
results.append(
CommandResults(
Expand Down
8 changes: 6 additions & 2 deletions Packs/ImageOCR/Integrations/ImageOCR/ImageOCR.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ configuration:
section: Connect
advanced: true
required: false
- additionalinfo: If true, will not raise an error if the image is corrupted and could not be processed.
- additionalinfo: If true, will not raise an error if the image is corrupt and could not be processed.
display: Skip on corrupted images
name: skip_corrupted
section: Connect
Expand All @@ -36,8 +36,12 @@ script:
- description: A comma-separated list of language codes to use for OCR. Overrides the default configured language list.
isArray: true
name: langs
description: Extracts text from an image.
- description: Turn on the verbose flag to display the versions of tesseract and of the other libraries it uses.
required: false
defaultValue: False
name: verbose
name: image-ocr-extract-text
description: Extract text from images.
outputs:
- contextPath: File.Text
description: Extracted text from the passed image file.
Expand Down
20 changes: 20 additions & 0 deletions Packs/ImageOCR/Integrations/ImageOCR/ImageOCR_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,26 @@ def test_extract_text(image, expected_text, langs):
assert expected_text in res


def test_extract_text_verbose_params():
    """
    Given:
        - The path of a test image that contains text.
    When:
        - Calling extract_text once with verbose=True and once with verbose=False.
    Then:
        - The verbose output contains "tesseract" and the non-verbose output does not.
    """
    image_path = 'test_data/bomb.jpg'

    # With the verbose flag, part of the extra output is expected in the result.
    verbose_output = extract_text(image_path, verbose=True)
    assert "tesseract" in verbose_output

    # Without the flag, that extra output must be absent.
    plain_output = extract_text(image_path, verbose=False)
    assert "tesseract" not in plain_output


def test_extract_text_command(mocker):
"""
Given:
Expand Down
1 change: 1 addition & 0 deletions Packs/ImageOCR/Integrations/ImageOCR/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Extracts text from an image.
| --- | --- | --- |
| entryid | A comma-separated list of Entry IDs of image files to process. | Required |
| langs | A comma-separated list of language codes to use for OCR. Overrides the default configured language list. | Optional |
| verbose | Turn on the verbose flag to display the versions of tesseract and of the other libraries it uses. | Optional |

#### Context Output

Expand Down
6 changes: 6 additions & 0 deletions Packs/ImageOCR/ReleaseNotes/1_1_35.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

#### Integrations

##### Image OCR

- Added the *verbose* argument to the ***image-ocr-extract-text*** command.
2 changes: 1 addition & 1 deletion Packs/ImageOCR/pack_metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "Image OCR",
"description": "Extracts text from images.",
"support": "xsoar",
"currentVersion": "1.1.34",
"currentVersion": "1.1.35",
"author": "Cortex XSOAR",
"url": "https://www.paloaltonetworks.com/cortex",
"email": "",
Expand Down

0 comments on commit 5a76ff0

Please sign in to comment.