From 1cc1b961c5074ef213e009f7f7d9c17303699a29 Mon Sep 17 00:00:00 2001 From: Jake Stockwin Date: Mon, 23 Mar 2020 21:38:39 +0000 Subject: [PATCH] Also group center-aligned text lines in addition to left-aligned and right-aligned text lines (#382) (#384) * Group text lines if they are centered (#382) Closes #382 * Add comparison private methods to LTTextLines * Add missing docstrings * Add tests for find_neighbors * Update changelog * Cosmetic changes from code review --- CHANGELOG.md | 3 ++ pdfminer/layout.py | 86 ++++++++++++++++++++++++++++++++++++------ tests/test_layout.py | 90 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 166 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 962ef77d..fc8f8b0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389)) - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386)) +### Changed +- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382)) + ## [20200124] - 2020-01-24 ### Security diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 312cdec8..0a22c5a5 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -409,20 +409,51 @@ def __init__(self, word_margin): def add(self, obj): if isinstance(obj, LTChar) and self.word_margin: margin = self.word_margin * max(obj.width, obj.height) - if self._x1 < obj.x0-margin: + if self._x1 < obj.x0 - margin: LTContainer.add(self, LTAnno(' ')) self._x1 = obj.x1 LTTextLine.add(self, obj) return def find_neighbors(self, plane, ratio): - d = ratio*self.height - objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d)) + """ + Finds neighboring LTTextLineHorizontals in the plane. 
+ + Returns a list of other LTTextLineHorizontals in the plane which are + close to self. "Close" can be controlled by ratio. The returned objects + will be the same height as self, and also either left-, right-, or + centrally-aligned. + """ + d = ratio * self.height + objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d)) return [obj for obj in objs if (isinstance(obj, LTTextLineHorizontal) and - abs(obj.height-self.height) < d and - (abs(obj.x0-self.x0) < d or - abs(obj.x1-self.x1) < d))] + self._is_same_height_as(obj, tolerance=d) and + (self._is_left_aligned_with(obj, tolerance=d) or + self._is_right_aligned_with(obj, tolerance=d) or + self._is_centrally_aligned_with(obj, tolerance=d)))] + + def _is_left_aligned_with(self, other, tolerance=0): + """ + Whether the left-hand edge of `other` is within `tolerance`. + """ + return abs(other.x0 - self.x0) <= tolerance + + def _is_right_aligned_with(self, other, tolerance=0): + """ + Whether the right-hand edge of `other` is within `tolerance`. + """ + return abs(other.x1 - self.x1) <= tolerance + + def _is_centrally_aligned_with(self, other, tolerance=0): + """ + Whether the horizontal center of `other` is within `tolerance`. + """ + return abs( + (other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance + + def _is_same_height_as(self, other, tolerance): + return abs(other.height - self.height) <= tolerance class LTTextLineVertical(LTTextLine): @@ -434,20 +465,51 @@ def __init__(self, word_margin): def add(self, obj): if isinstance(obj, LTChar) and self.word_margin: margin = self.word_margin * max(obj.width, obj.height) - if obj.y1+margin < self._y0: + if obj.y1 + margin < self._y0: LTContainer.add(self, LTAnno(' ')) self._y0 = obj.y0 LTTextLine.add(self, obj) return def find_neighbors(self, plane, ratio): - d = ratio*self.width - objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1)) + """ + Finds neighboring LTTextLineVerticals in the plane. 
+ + Returns a list of other LTTextLineVerticals in the plane which are + close to self. "Close" can be controlled by ratio. The returned objects + will be the same width as self, and also either upper-, lower-, or + centrally-aligned. + """ + d = ratio * self.width + objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1)) return [obj for obj in objs if (isinstance(obj, LTTextLineVertical) and - abs(obj.width-self.width) < d and - (abs(obj.y0-self.y0) < d or - abs(obj.y1-self.y1) < d))] + self._is_same_width_as(obj, tolerance=d) and + (self._is_lower_aligned_with(obj, tolerance=d) or + self._is_upper_aligned_with(obj, tolerance=d) or + self._is_centrally_aligned_with(obj, tolerance=d)))] + + def _is_lower_aligned_with(self, other, tolerance=0): + """ + Whether the lower edge of `other` is within `tolerance`. + """ + return abs(other.y0 - self.y0) <= tolerance + + def _is_upper_aligned_with(self, other, tolerance=0): + """ + Whether the upper edge of `other` is within `tolerance`. + """ + return abs(other.y1 - self.y1) <= tolerance + + def _is_centrally_aligned_with(self, other, tolerance=0): + """ + Whether the vertical center of `other` is within `tolerance`. 
+ """ + return abs( + (other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance + + def _is_same_width_as(self, other, tolerance): + return abs(other.width - self.width) <= tolerance class LTTextBox(LTTextContainer): diff --git a/tests/test_layout.py b/tests/test_layout.py index a6788a33..1d062317 100644 --- a/tests/test_layout.py +++ b/tests/test_layout.py @@ -1,6 +1,12 @@ import unittest -from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal +from pdfminer.layout import ( + LTLayoutContainer, + LAParams, + LTTextLineHorizontal, + LTTextLineVertical, +) +from pdfminer.utils import Plane class TestGroupTextLines(unittest.TestCase): @@ -21,3 +27,85 @@ def test_parent_with_wrong_bbox_returns_non_empty_neighbour_list(self): textboxes = list(layout.group_textlines(laparams, lines)) self.assertEqual(len(textboxes), 2) + + +class TestFindNeigbors(unittest.TestCase): + def test_find_neighbors_horizontal(self): + laparams = LAParams() + plane = Plane((0, 0, 50, 50)) + + line = LTTextLineHorizontal(laparams.word_margin) + line.set_bbox((10, 4, 20, 6)) + plane.add(line) + + left_aligned_above = LTTextLineHorizontal(laparams.word_margin) + left_aligned_above.set_bbox((10, 6, 15, 8)) + plane.add(left_aligned_above) + + right_aligned_below = LTTextLineHorizontal(laparams.word_margin) + right_aligned_below.set_bbox((15, 2, 20, 4)) + plane.add(right_aligned_below) + + centrally_aligned_overlapping = LTTextLineHorizontal( + laparams.word_margin) + centrally_aligned_overlapping.set_bbox((13, 5, 17, 7)) + plane.add(centrally_aligned_overlapping) + + not_aligned = LTTextLineHorizontal(laparams.word_margin) + not_aligned.set_bbox((0, 6, 5, 8)) + plane.add(not_aligned) + + wrong_height = LTTextLineHorizontal(laparams.word_margin) + wrong_height.set_bbox((10, 6, 15, 10)) + plane.add(wrong_height) + + neighbors = line.find_neighbors(plane, laparams.line_margin) + self.assertCountEqual( + neighbors, + [ + line, + left_aligned_above, + right_aligned_below, + 
centrally_aligned_overlapping, + ], + ) + + def test_find_neighbors_vertical(self): + laparams = LAParams() + plane = Plane((0, 0, 50, 50)) + + line = LTTextLineVertical(laparams.word_margin) + line.set_bbox((4, 10, 6, 20)) + plane.add(line) + + bottom_aligned_right = LTTextLineVertical(laparams.word_margin) + bottom_aligned_right.set_bbox((6, 10, 8, 15)) + plane.add(bottom_aligned_right) + + top_aligned_left = LTTextLineVertical(laparams.word_margin) + top_aligned_left.set_bbox((2, 15, 4, 20)) + plane.add(top_aligned_left) + + centrally_aligned_overlapping = LTTextLineVertical( + laparams.word_margin) + centrally_aligned_overlapping.set_bbox((5, 13, 7, 17)) + plane.add(centrally_aligned_overlapping) + + not_aligned = LTTextLineVertical(laparams.word_margin) + not_aligned.set_bbox((6, 0, 8, 5)) + plane.add(not_aligned) + + wrong_width = LTTextLineVertical(laparams.word_margin) + wrong_width.set_bbox((6, 10, 10, 15)) + plane.add(wrong_width) + + neighbors = line.find_neighbors(plane, laparams.line_margin) + self.assertCountEqual( + neighbors, + [ + line, + bottom_aligned_right, + top_aligned_left, + centrally_aligned_overlapping, + ], + )