Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add checkpoint_dir content-type, remove checkpoint variant #70

Merged
merged 9 commits into from
Feb 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ jobs:
- run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
- checkout
- run: pip3 install --upgrade pip
- run: make install PIP_INSTALL="pip3 install"
- run: pip3 install -r requirements-test.txt
- run: make install deps-test PIP_INSTALL="pip3 install"
- run: make coverage LC_ALL=en_US.utf8
- codecov/upload

Expand Down
29 changes: 14 additions & 15 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ PIP_INSTALL = pip3 install
GIT_CLONE = git clone
PYTHON = python3
PYTEST_ARGS = -W 'ignore::DeprecationWarning' -W 'ignore::FutureWarning'
MODEL = qurator-gt4histocr-1.0

# BEGIN-EVAL makefile-parser --make-help Makefile

Expand All @@ -11,7 +12,7 @@ help:
@echo " Targets"
@echo ""
@echo " install Install ocrd_calamari"
@echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
@echo " $(MODEL) Get Calamari model (from SBB)"
@echo " actevedef_718448162 Download example data"
@echo " deps-test Install testing python deps via pip"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
Expand All @@ -25,6 +26,7 @@ help:
@echo " PYTHON '$(PYTHON)'"
@echo " PIP_INSTALL '$(PIP_INSTALL)'"
@echo " GIT_CLONE '$(GIT_CLONE)'"
@echo " MODEL '$(MODEL)'"

# END-EVAL

Expand All @@ -34,17 +36,14 @@ install:


# Get GT4HistOCR Calamari model (from SBB)
gt4histocr-calamari1:
mkdir -p gt4histocr-calamari1
cd gt4histocr-calamari1 && \
wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
tar xfv model.tar.xz && \
rm model.tar.xz

# Download example data
$(MODEL):
ocrd resmgr download ocrd-calamari-recognize $@

# Download example data (not used currently)
actevedef_718448162:
wget https://qurator-data.de/examples/actevedef_718448162.zip && \
unzip actevedef_718448162.zip
wget https://qurator-data.de/examples/actevedef_718448162.zip \
&& unzip actevedef_718448162.zip \
&& rm actevedef_718448162.zip



Expand All @@ -54,7 +53,7 @@ actevedef_718448162:

# Install testing python deps via pip
deps-test:
$(PIP) install -r requirements_test.txt
$(PIP_INSTALL) -r requirements-test.txt


# Clone OCR-D/assets to ./repo/assets
Expand All @@ -73,15 +72,15 @@ assets-clean:
rm -rf test/assets

# Run unit tests
test: test/assets gt4histocr-calamari1
test: test/assets $(MODEL)
# declare -p HTTP_PROXY
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

# Run unit tests and determine test coverage
coverage: test/assets gt4histocr-calamari1
coverage: test/assets $(MODEL)
coverage erase
make test PYTHON="coverage run"
coverage report
coverage html

.PHONY: assets-clean test
.PHONY: install assets-clean deps-test test coverage $(MODEL)
10 changes: 5 additions & 5 deletions ocrd_calamari/ocrd-tool.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
"parameters": {
"checkpoint_dir": {
"description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
"type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
},
"checkpoint": {
"description": "The calamari model files (*.ckpt.json)",
"type": "string", "format": "file", "cacheable": true
"type": "string",
"format": "uri",
"content-type": "text/directory",
"cacheable": true,
"default": "qurator-gt4histocr-1.0"
},
"voter": {
"description": "The voting algorithm to use",
Expand Down
19 changes: 3 additions & 16 deletions ocrd_calamari/recognize.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,8 @@ def setup(self):
"""
Set up the model prior to processing.
"""
if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
checkpoints = glob(self.parameter['checkpoint'])
resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
checkpoints = glob('%s/*.ckpt.json' % resolved)
self.predictor = MultiPredictor(checkpoints=checkpoints)

self.network_input_channels = self.predictor.predictors[0].network.input_channels
Expand Down Expand Up @@ -244,18 +242,7 @@ def _words(s):


# Add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name, value=self.parameter[name])
for name in self.parameter.keys()])]))


self.add_metadata(pcgts)
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(
Expand Down
3 changes: 3 additions & 0 deletions test/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import sys

from test.assets import assets
from ocrd_utils import initLogging

PWD = os.path.dirname(os.path.realpath(__file__))
sys.path.append(PWD + '/../ocrd')

initLogging()
58 changes: 17 additions & 41 deletions test/test_recognize.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@

METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
CHECKPOINT_DIR = os.getenv('MODEL')

# Because XML namespace versions are so much fun, we not only use one, we use TWO!
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
Expand All @@ -31,14 +30,6 @@ def workspace():
resolver = Resolver()
workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)

# XXX Work around data bug(?):
# PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
urllib.request.urlretrieve(
"https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))

# The binarization options I have are:
#
# a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
Expand All @@ -47,45 +38,30 @@ def workspace():
# c. just fumble with the original files
#
# So I'm going for option c.
for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
subprocess.call(['convert', ff, '-threshold', '50%', ff])
for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"):
imgf = workspace.download_file(imgf)
path = os.path.join(workspace.directory, imgf.local_filename)
subprocess.call(['mogrify', '-threshold', '50%', path])

# Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
# XXX Review data again
# XXX Make this more robust against namespace version changes
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
workspace.download_file(of)
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
tree = etree.parse(ff)
path = os.path.join(workspace.directory, of.local_filename)
tree = etree.parse(path)
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
e.getparent().remove(e)
tree.write(ff, xml_declaration=True, encoding="utf-8")
tree.write(path, xml_declaration=True, encoding="utf-8")

return workspace


def test_recognize(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
}
).process()
workspace.save_mets()

page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()

def test_recognize_with_checkpoint_dir(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint_dir": CHECKPOINT_DIR,
Expand All @@ -103,9 +79,9 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
caplog.set_level(logging.WARNING)
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
parameter={'checkpoint': CHECKPOINT}
parameter={'checkpoint_dir': CHECKPOINT_DIR}
).process()

interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
Expand All @@ -115,10 +91,10 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
def test_word_segmentation(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"checkpoint_dir": CHECKPOINT_DIR,
"textequiv_level": "word", # Note that we're going down to word level here
}
).process()
Expand Down Expand Up @@ -147,10 +123,10 @@ def test_word_segmentation(workspace):
def test_glyphs(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"checkpoint_dir": CHECKPOINT_DIR,
"textequiv_level": "glyph", # Note that we're going down to glyph level here
}
).process()
Expand Down