OCR-D · mikegerber · Feb 23, 2022 · Feb 10, 2022 · Feb 10, 2022 · Feb 10, 2022
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -14,8 +14,7 @@ jobs:
       - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
       - checkout
       - run: pip3 install --upgrade pip
-      - run: make install PIP_INSTALL="pip3 install"
-      - run: pip3 install -r requirements-test.txt
+      - run: make install deps-test PIP_INSTALL="pip3 install"
       - run: make coverage LC_ALL=en_US.utf8
       - codecov/upload
 

diff --git a/Makefile b/Makefile
@@ -3,6 +3,7 @@ PIP_INSTALL = pip3 install
 GIT_CLONE = git clone
 PYTHON = python3
 PYTEST_ARGS = -W 'ignore::DeprecationWarning' -W 'ignore::FutureWarning'
+MODEL = qurator-gt4histocr-1.0
 
 # BEGIN-EVAL makefile-parser --make-help Makefile
 
@@ -11,7 +12,7 @@ help:
 	@echo "  Targets"
 	@echo ""
 	@echo "    install          Install ocrd_calamari"
-	@echo "    gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
+	@echo "    $(MODEL)         Get Calamari model (from SBB)"
 	@echo "    actevedef_718448162 Download example data"
 	@echo "    deps-test        Install testing python deps via pip"
 	@echo "    repo/assets      Clone OCR-D/assets to ./repo/assets"
@@ -25,6 +26,7 @@ help:
 	@echo "    PYTHON       '$(PYTHON)'"
 	@echo "    PIP_INSTALL  '$(PIP_INSTALL)'"
 	@echo "    GIT_CLONE    '$(GIT_CLONE)'"
+	@echo "    MODEL        '$(MODEL)'"
 
 # END-EVAL
 
@@ -34,17 +36,14 @@ install:
 
 
 # Get GT4HistOCR Calamari model (from SBB)
-gt4histocr-calamari1:
-	mkdir -p gt4histocr-calamari1
-	cd gt4histocr-calamari1 && \
-	wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
-	tar xfv model.tar.xz && \
-	rm model.tar.xz
-
-# Download example data
+$(MODEL):
+	ocrd resmgr download ocrd-calamari-recognize $@
+
+# Download example data (not used currently)
 actevedef_718448162:
-	wget https://qurator-data.de/examples/actevedef_718448162.zip && \
-	unzip actevedef_718448162.zip
+	wget https://qurator-data.de/examples/$@.zip \
+	&& unzip $@.zip \
+	&& rm $@.zip
 
 
 
@@ -54,7 +53,7 @@ actevedef_718448162:
 
 # Install testing python deps via pip
 deps-test:
-	$(PIP) install -r requirements_test.txt
+	$(PIP_INSTALL) -r requirements-test.txt
 
 
 # Clone OCR-D/assets to ./repo/assets
@@ -73,15 +72,15 @@ assets-clean:
 	rm -rf test/assets
 
 # Run unit tests
-test: test/assets gt4histocr-calamari1
+test: test/assets $(MODEL)
 	# declare -p HTTP_PROXY
 	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
 
 # Run unit tests and determine test coverage
-coverage: test/assets gt4histocr-calamari1
+coverage: test/assets $(MODEL)
 	coverage erase
-	make test PYTHON="coverage run"
+	$(MAKE) test PYTHON="coverage run"
 	coverage report
 	coverage html
 
-.PHONY: assets-clean test
+.PHONY: install assets-clean deps-test test coverage $(MODEL)
diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json
@@ -20,11 +20,11 @@
       "parameters": {
         "checkpoint_dir": {
           "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
-          "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
-        },
-        "checkpoint": {
-          "description": "The calamari model files (*.ckpt.json)",
-          "type": "string", "format": "file", "cacheable": true
+          "type": "string",
+          "format": "uri",
+          "content-type": "text/directory",
+          "cacheable": true,
+          "default": "qurator-gt4histocr-1.0"
         },
         "voter": {
           "description": "The voting algorithm to use",

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
@@ -47,10 +47,8 @@ def setup(self):
         """
         Set up the model prior to processing.
         """
-        if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
-            resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
-            self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
-        checkpoints = glob(self.parameter['checkpoint'])
+        resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
+        checkpoints = glob('%s/*.ckpt.json' % resolved)
         self.predictor = MultiPredictor(checkpoints=checkpoints)
 
         self.network_input_channels = self.predictor.predictors[0].network.input_channels
@@ -244,18 +242,7 @@ def _words(s):
 
 
             # Add metadata about this operation and its runtime parameters:
-            metadata = pcgts.get_Metadata()  # ensured by from_file()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name, value=self.parameter[name])
-                                            for name in self.parameter.keys()])]))
-
-
+            self.add_metadata(pcgts)
             file_id = make_file_id(input_file, self.output_file_grp)
             pcgts.set_pcGtsId(file_id)
             self.workspace.add_file(

diff --git a/test/base.py b/test/base.py
@@ -4,6 +4,9 @@
 import sys
 
 from test.assets import assets
+from ocrd_utils import initLogging
 
 PWD = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(PWD + '/../ocrd')
+
+initLogging()
diff --git a/test/test_recognize.py b/test/test_recognize.py
@@ -14,8 +14,7 @@
 
 METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
-CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
-CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
+CHECKPOINT_DIR = os.getenv('MODEL', 'qurator-gt4histocr-1.0')  # default model name, so pytest also works when MODEL is not in the environment
 
 # Because XML namespace versions are so much fun, we not only use one, we use TWO!
 NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
@@ -31,14 +30,6 @@ def workspace():
     resolver = Resolver()
     workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
 
-    # XXX Work around data bug(?):
-    #     PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
-    os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
-    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
-        urllib.request.urlretrieve(
-            "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
-            os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
-
     # The binarization options I have are:
     #
     # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@@ -47,45 +38,30 @@ def workspace():
     # c. just fumble with the original files
     #
     # So I'm going for option c.
-    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
-        ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
-        subprocess.call(['convert', ff, '-threshold', '50%', ff])
+    for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"):
+        imgf = workspace.download_file(imgf)
+        path = os.path.join(workspace.directory, imgf.local_filename)
+        subprocess.call(['mogrify', '-threshold', '50%', path])
 
     # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
     # XXX Review data again
     # XXX Make this more robust against namespace version changes
-    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
+    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
         workspace.download_file(of)
-    for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
-        for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
-            tree = etree.parse(ff)
+        path = os.path.join(workspace.directory, of.local_filename)
+        tree = etree.parse(path)
+        for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
             for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
                 e.getparent().remove(e)
-            tree.write(ff, xml_declaration=True, encoding="utf-8")
+        tree.write(path, xml_declaration=True, encoding="utf-8")
 
     return workspace
 
 
 def test_recognize(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
-        output_file_grp="OCR-D-OCR-CALAMARI",
-        parameter={
-            "checkpoint": CHECKPOINT,
-        }
-    ).process()
-    workspace.save_mets()
-
-    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
-    assert os.path.exists(page1)
-    with open(page1, "r", encoding="utf-8") as f:
-        assert "verſchuldeten" in f.read()
-
-def test_recognize_with_checkpoint_dir(workspace):
-    CalamariRecognize(
-        workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
             "checkpoint_dir": CHECKPOINT_DIR,
@@ -103,9 +79,9 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
     caplog.set_level(logging.WARNING)
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
-        parameter={'checkpoint': CHECKPOINT}
+        parameter={'checkpoint_dir': CHECKPOINT_DIR}
     ).process()
 
     interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
@@ -115,10 +91,10 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
 def test_word_segmentation(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
-            "checkpoint": CHECKPOINT,
+            "checkpoint_dir": CHECKPOINT_DIR,
             "textequiv_level": "word",   # Note that we're going down to word level here
         }
     ).process()
@@ -147,10 +123,10 @@ def test_word_segmentation(workspace):
 def test_glyphs(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
-            "checkpoint": CHECKPOINT,
+            "checkpoint_dir": CHECKPOINT_DIR,
             "textequiv_level": "glyph",   # Note that we're going down to glyph level here
         }
     ).process()