From f69acbf150486037b852c5bccf2429ad5f2f7e16 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Feb 2024 11:10:36 +0100 Subject: [PATCH 01/11] project: clip coords to parent's parent (instead of parent) --- ocrd_segment/project.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ocrd_segment/project.py b/ocrd_segment/project.py index e6ad5b1..3c96793 100644 --- a/ocrd_segment/project.py +++ b/ocrd_segment/project.py @@ -120,22 +120,22 @@ def process(self): content=to_xml(pcgts)) def _process_segment(self, segment, constituents, page_id): - """Shrink segment outline to become the minimal convex hull of its constituent segments.""" + """Overwrite segment outline to become the minimal convex hull of its constituent segments.""" LOG = getLogger('processor.ProjectHull') polygons = [make_valid(Polygon(polygon_from_points(constituent.get_Coords().points))) for constituent in constituents] polygon = join_polygons(polygons).buffer(self.parameter['padding']).exterior.coords[:-1] + # make sure the segment still fits into its parent's parent if isinstance(segment, PageType): - oldborder = segment.Border - segment.Border = None # ensure interim parent is the page frame itself - # make sure the segment still fits into its own parent - polygon2 = polygon_for_parent(polygon, segment) - if polygon2 is None: + # ensure interim parent is the page frame itself + parent = PageType(**segment.__dict__) + parent.Border = None + else: + parent = segment.parent_object_ + polygon = polygon_for_parent(polygon, parent) + if polygon is None: LOG.info('Ignoring extant segment: %s', segment.id) - if isinstance(segment, PageType): - segment.Border = oldborder else: - polygon = polygon2 points = points_from_polygon(polygon) coords = CoordsType(points=points) LOG.debug('Using new coordinates from %d constituents for segment "%s"', From 85a3d3afd0e7fb493421b2c8f7fde6e95fdda41a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Feb 2024 
14:02:46 +0100 Subject: [PATCH 02/11] adapt to deprecated Shapely .type --- ocrd_segment/project.py | 8 ++++---- ocrd_segment/repair.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ocrd_segment/project.py b/ocrd_segment/project.py index 3c96793..afa0009 100644 --- a/ocrd_segment/project.py +++ b/ocrd_segment/project.py @@ -154,7 +154,7 @@ def join_polygons(polygons, scale=20): """construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points""" # ensure input polygons are simply typed polygons = list(itertools.chain.from_iterable([ - poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] + poly.geoms if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] else [poly] for poly in polygons])) npoly = len(polygons) @@ -178,7 +178,7 @@ def join_polygons(polygons, scale=20): bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) - assert jointp.type == 'Polygon', jointp.wkt + assert jointp.geom_type == 'Polygon', jointp.wkt if jointp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity @@ -227,10 +227,10 @@ def make_intersection(poly1, poly2): # post-process if interp.is_empty or interp.area == 0.0: return None - if interp.type == 'GeometryCollection': + if interp.geom_type == 'GeometryCollection': # heterogeneous result: filter zero-area shapes (LineString, Point) interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) - if interp.type == 'MultiPolygon': + if interp.geom_type == 'MultiPolygon': # homogeneous result: construct convex hull to connect interp = join_polygons(interp.geoms) if interp.minimum_clearance < 1.0: diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py index e9bfb29..28af414 100644 --- a/ocrd_segment/repair.py +++ b/ocrd_segment/repair.py @@ -482,7 +482,7 @@ def _plausibilize_segments(segpolys, 
rogroup, marked_for_deletion, marked_for_me _tag_name(otherseg), otherseg.id) otherpoly = make_valid(Polygon(polygon_from_points(otherseg.get_Coords().points))) poly = poly.difference(otherpoly) - if poly.type == 'MultiPolygon': + if poly.geom_type == 'MultiPolygon': poly = join_polygons(poly.geoms) if poly.minimum_clearance < 1.0: poly = Polygon(np.round(poly.exterior.coords)) @@ -599,7 +599,7 @@ def simplify(segment, tolerance=0): def merge_poly(poly1, poly2): poly = poly1.union(poly2) - if poly.type == 'MultiPolygon': + if poly.geom_type == 'MultiPolygon': #poly = poly.convex_hull poly = join_polygons(poly.geoms) if poly.minimum_clearance < 1.0: @@ -611,10 +611,10 @@ def clip_poly(poly1, poly2): poly = poly1.intersection(poly2) if poly.is_empty or poly.area == 0.0: return None - if poly.type == 'GeometryCollection': + if poly.geom_type == 'GeometryCollection': # heterogeneous result: filter zero-area shapes (LineString, Point) poly = unary_union([geom for geom in poly.geoms if geom.area > 0]) - if poly.type == 'MultiPolygon': + if poly.geom_type == 'MultiPolygon': # homogeneous result: construct convex hull to connect #poly = poly.convex_hull poly = join_polygons(poly.geoms) From 2bd54ab206d4aa9a9e212cad3f8b80c84ae6c536 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Feb 2024 15:28:31 +0100 Subject: [PATCH 03/11] repair: use make_valid from project --- ocrd_segment/repair.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py index 28af414..6e1820d 100644 --- a/ocrd_segment/repair.py +++ b/ocrd_segment/repair.py @@ -41,7 +41,7 @@ PageValidator ) from .config import OCRD_TOOL -from .project import join_polygons +from .project import join_polygons, make_valid TOOL = 'ocrd-segment-repair' @@ -556,8 +556,8 @@ def shrink_regions(page_image, page_coords, page, page_id, padding=0): continue # pick contour and convert to absolute: region_polygon = 
join_polygons([make_valid(Polygon(contour[:, 0, ::])) - for contour in contours - if len(contour) >= 3], scale=scale) + for area, contour in zip(areas, contours) + if len(contour) >= 3 and area > 0], scale=scale) if padding: region_polygon = region_polygon.buffer(padding) region_polygon = coordinates_for_segment(region_polygon.exterior.coords[:-1], page_image, page_coords) @@ -719,20 +719,5 @@ def ensure_valid(element): points = points_from_polygon(polygon) coords.set_points(points) -def make_valid(polygon): - """Ensures shapely.geometry.Polygon object is valid by repeated simplification""" - for split in range(1, len(polygon.exterior.coords)-1): - if polygon.is_valid or polygon.simplify(polygon.area).is_valid: - break - # simplification may not be possible (at all) due to ordering - # in that case, try another starting point - polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) - for tolerance in range(1, int(polygon.area)): - if polygon.is_valid: - break - # simplification may require a larger tolerance - polygon = polygon.simplify(tolerance) - return polygon - def _tag_name(element): return element.__class__.__name__[0:-4] From a4bea735ecaa39004db7564c70f143e671099c84 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Feb 2024 15:29:13 +0100 Subject: [PATCH 04/11] project: make make_valid even more robust --- ocrd_segment/project.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ocrd_segment/project.py b/ocrd_segment/project.py index afa0009..2be7c1e 100644 --- a/ocrd_segment/project.py +++ b/ocrd_segment/project.py @@ -241,16 +241,26 @@ def make_intersection(poly1, poly2): return interp def make_valid(polygon): + """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" points = list(polygon.exterior.coords) + # try by re-arranging points for split in range(1, len(points)): if polygon.is_valid or polygon.simplify(polygon.area).is_valid: 
break # simplification may not be possible (at all) due to ordering # in that case, try another starting point polygon = Polygon(points[-split:]+points[:-split]) - for tolerance in range(int(polygon.area)): + # try by simplification + for tolerance in range(int(polygon.area + 1.5)): if polygon.is_valid: break # simplification may require a larger tolerance polygon = polygon.simplify(tolerance + 1) + # try by enlarging + for tolerance in range(1, int(polygon.area + 2.5)): + if polygon.is_valid: + break + # enlargement may require a larger tolerance + polygon = polygon.buffer(tolerance) + assert polygon.is_valid, polygon.wkt return polygon From 15206aa9a76fd763abb1429b26c4e1d696f4ac2b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Feb 2024 15:47:30 +0100 Subject: [PATCH 05/11] update chlog --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d88f9bd..8732972 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## [Unreleased] +### Fixed + + * repair/project: adapt to Shapely deprecations + +### Changed + + * project: clip coords to parent's parent instead of parent + * repair/project: make make_valid even more robust + ## [0.1.22] - 2023-06-29 ### Added From 03540b5c2375a71a302fe1eefcb878c275e1d777 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Feb 2024 01:58:13 +0100 Subject: [PATCH 06/11] =?UTF-8?q?repair/project:=20make=20join=5Fpolygons?= =?UTF-8?q?=20more=20robust=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ensure alternating orientation of parts - round via Shapely 2.x set_precision --- ocrd_segment/project.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ocrd_segment/project.py b/ocrd_segment/project.py index 2be7c1e..75dc396 100644 --- a/ocrd_segment/project.py +++ b/ocrd_segment/project.py @@ -5,6 +5,8 @@ import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from shapely.geometry import Polygon, LineString +from shapely.geometry.polygon import orient +from shapely import set_precision from shapely.ops import unary_union, nearest_points from ocrd import Processor @@ -152,11 +154,13 @@ def pairwise(iterable): def join_polygons(polygons, scale=20): """construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points""" - # ensure input polygons are simply typed - polygons = list(itertools.chain.from_iterable([ - poly.geoms if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] - else [poly] - for poly in polygons])) + # ensure input polygons are simply typed and all oriented equally + polygons = [orient(poly) + for poly in itertools.chain.from_iterable( + [poly.geoms + if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])] npoly = len(polygons) if npoly == 1: return polygons[0] @@ -175,15 +179,11 @@ def join_polygons(polygons, scale=20): prevp = 
polygons[prevp] nextp = polygons[nextp] nearest = nearest_points(prevp, nextp) - bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + bridgep = orient(LineString(nearest).buffer(max(1, scale/5), resolution=1), -1) polygons.append(bridgep) jointp = unary_union(polygons) assert jointp.geom_type == 'Polygon', jointp.wkt - if jointp.minimum_clearance < 1.0: - # follow-up calculations will necessarily be integer; - # so anticipate rounding here and then ensure validity - jointp = Polygon(np.round(jointp.exterior.coords)) - jointp = make_valid(jointp) + jointp = set_precision(jointp, 1.0) return jointp def polygon_for_parent(polygon, parent): From f8a13a45375874e351409fadad5d154b021deb53 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Feb 2024 02:06:28 +0100 Subject: [PATCH 07/11] =?UTF-8?q?repair/project:=20make=20make=5Fintersect?= =?UTF-8?q?ion=20more=20robust=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit round via Shapely 2.x set_precision --- ocrd_segment/project.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocrd_segment/project.py b/ocrd_segment/project.py index 75dc396..2bc1c4a 100644 --- a/ocrd_segment/project.py +++ b/ocrd_segment/project.py @@ -183,6 +183,8 @@ def join_polygons(polygons, scale=20): polygons.append(bridgep) jointp = unary_union(polygons) assert jointp.geom_type == 'Polygon', jointp.wkt + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity jointp = set_precision(jointp, 1.0) return jointp @@ -233,11 +235,9 @@ def make_intersection(poly1, poly2): if interp.geom_type == 'MultiPolygon': # homogeneous result: construct convex hull to connect interp = join_polygons(interp.geoms) - if interp.minimum_clearance < 1.0: - # follow-up calculations will necessarily be integer; - # so anticipate rounding here and then ensure validity - interp = 
Polygon(np.round(interp.exterior.coords)) - interp = make_valid(interp) + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + interp = set_precision(interp, 1.0) return interp def make_valid(polygon): From 9b55241ff0f24e6f7e21b8875a83637781ba2d8d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Feb 2024 02:06:43 +0100 Subject: [PATCH 08/11] require Shapely 2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ff728f0..bc83f8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ ocrd >= 2.20.0 -shapely >= 1.7.1 +shapely >= 2.0 scikit-image numpy xlsxwriter From e232212a1b61f4e7d60b2ef10f24e75dbb7f3d9c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 10 Mar 2024 00:18:21 +0100 Subject: [PATCH 09/11] =?UTF-8?q?repair/project:=20more=20make=5Fintersect?= =?UTF-8?q?ion=20robustness=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit round via set_precision unless that would split the shape (in which case try to round the exterior and join its parts) --- ocrd_segment/project.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocrd_segment/project.py b/ocrd_segment/project.py index 2bc1c4a..d990541 100644 --- a/ocrd_segment/project.py +++ b/ocrd_segment/project.py @@ -185,8 +185,12 @@ def join_polygons(polygons, scale=20): assert jointp.geom_type == 'Polygon', jointp.wkt # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity - jointp = set_precision(jointp, 1.0) - return jointp + jointp2 = set_precision(jointp, 1.0) + if jointp2.geom_type != 'Polygon' or not jointp2.is_valid: + jointp2 = Polygon(np.round(jointp.exterior.coords)) + jointp2 = make_valid(jointp2) + assert jointp2.geom_type == 'Polygon', jointp2.wkt + return jointp2 def polygon_for_parent(polygon, parent): 
"""Clip polygon to parent polygon range. From 5f28d6ee1e067945b73601c55d702d683e59ac3f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 10 Mar 2024 00:42:40 +0100 Subject: [PATCH 10/11] repair/sanitize: shrink regions before attempting repair --- ocrd_segment/repair.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py index 6e1820d..cf13eb1 100644 --- a/ocrd_segment/repair.py +++ b/ocrd_segment/repair.py @@ -115,6 +115,14 @@ def process(self): pcgts.set_pcGtsId(file_id) page = pcgts.get_Page() + # shrink/expand text regions to the hull of their text lines + if sanitize: + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized', + feature_filter='clipped') + shrink_regions(page_image, page_coords, page, page_id, + padding=self.parameter['sanitize_padding']) # # validate segmentation (warn of children extending beyond their parents) # @@ -180,14 +188,6 @@ def process(self): # delete/merge/split redundant text regions (or its text lines) if plausibilize: self.plausibilize_page(page, page_id) - # shrink/expand text regions to the hull of their text lines - if sanitize: - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized', - feature_filter='clipped') - shrink_regions(page_image, page_coords, page, page_id, - padding=self.parameter['sanitize_padding']) self.workspace.add_file( ID=file_id, From 3d9e0d66b064d0513856ec6de44c1aa463bd684c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 10 Mar 2024 00:46:49 +0100 Subject: [PATCH 11/11] update changelog --- CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8732972..1864f3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,12 +6,14 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
### Fixed - * repair/project: adapt to Shapely deprecations + * repair/project: adapt to Shapely deprecations, + * repair/project: more robust `join_polygons`, `make_intersection`, `make_valid` ### Changed + * :fire: require Shapely 2 * project: clip coords to parent's parent instead of parent - * repair/project: make make_valid even more robust + * repair (`sanitize`): shrink before attempting repair (hierarchical consistency) ## [0.1.22] - 2023-06-29