Skip to content

Commit

Permalink
Also group center-aligned text lines in addition to left-aligned and …
Browse files Browse the repository at this point in the history
…right-aligned text lines (pdfminer#382) (pdfminer#384)

* Group text lines if they are centered (pdfminer#382)

Closes pdfminer#382

* Add comparison private methods to LTTextLines

* Add missing docstrings

* Add tests for find_neighbors

* Update changelog

* Cosmetic changes from code review
  • Loading branch information
jstockwin authored Mar 23, 2020
1 parent 9d7fe2d commit 1cc1b96
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 13 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))

### Changed
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))

## [20200124] - 2020-01-24

### Security
Expand Down
86 changes: 74 additions & 12 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,20 +409,51 @@ def __init__(self, word_margin):
def add(self, obj):
    """
    Append `obj` to this horizontal text line.

    When `obj` is an LTChar and `word_margin` is non-zero, an LTAnno(' ')
    is inserted first if the horizontal gap between the previously added
    object's right edge (`self._x1`) and the left edge of `obj` exceeds
    `word_margin` times the larger of the character's width and height.
    """
    # NOTE(review): nesting reconstructed from a flattened diff that also
    # carried the pre-change `if` line — confirm against upstream.
    if isinstance(obj, LTChar) and self.word_margin:
        margin = self.word_margin * max(obj.width, obj.height)
        if self._x1 < obj.x0 - margin:
            # The gap is wide enough to count as a word break.
            LTContainer.add(self, LTAnno(' '))
    # Remember where this object ends so the next gap can be measured.
    self._x1 = obj.x1
    LTTextLine.add(self, obj)

def find_neighbors(self, plane, ratio):
    """
    Finds neighboring LTTextLineHorizontals in the plane.

    Returns a list of LTTextLineHorizontals in the plane which are close
    to self. "Close" is controlled by `ratio`: the tolerance used for every
    check is `ratio * self.height`. The search area is the bounding box of
    self, extended by that tolerance above and below. A returned object has
    a height within the tolerance of self's height and is left-, right-,
    or centrally-aligned with self, again within the tolerance.

    If self is contained in the plane it is part of the result as well.
    """
    d = ratio * self.height
    objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
    return [obj for obj in objs
            if (isinstance(obj, LTTextLineHorizontal) and
                self._is_same_height_as(obj, tolerance=d) and
                (self._is_left_aligned_with(obj, tolerance=d) or
                 self._is_right_aligned_with(obj, tolerance=d) or
                 self._is_centrally_aligned_with(obj, tolerance=d)))]

def _is_left_aligned_with(self, other, tolerance=0):
"""
Whether the left-hand edge of `other` is within `tolerance`.
"""
return abs(other.x0 - self.x0) <= tolerance

def _is_right_aligned_with(self, other, tolerance=0):
"""
Whether the right-hand edge of `other` is within `tolerance`.
"""
return abs(other.x1 - self.x1) <= tolerance

def _is_centrally_aligned_with(self, other, tolerance=0):
"""
Whether the horizontal center of `other` is within `tolerance`.
"""
return abs(
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance

def _is_same_height_as(self, other, tolerance):
return abs(other.height - self.height) <= tolerance


class LTTextLineVertical(LTTextLine):
Expand All @@ -434,20 +465,51 @@ def __init__(self, word_margin):
def add(self, obj):
    """
    Append `obj` to this vertical text line.

    When `obj` is an LTChar and `word_margin` is non-zero, an LTAnno(' ')
    is inserted first if the vertical gap between the upper edge of `obj`
    and the previously added object's lower edge (`self._y0`) exceeds
    `word_margin` times the larger of the character's width and height.
    """
    # NOTE(review): nesting reconstructed from a flattened diff that also
    # carried the pre-change `if` line — confirm against upstream.
    if isinstance(obj, LTChar) and self.word_margin:
        margin = self.word_margin * max(obj.width, obj.height)
        if obj.y1 + margin < self._y0:
            # The gap is tall enough to count as a word break.
            LTContainer.add(self, LTAnno(' '))
    # Remember where this object ends so the next gap can be measured.
    self._y0 = obj.y0
    LTTextLine.add(self, obj)

def find_neighbors(self, plane, ratio):
    """
    Finds neighboring LTTextLineVerticals in the plane.

    Returns a list of LTTextLineVerticals in the plane which are close to
    self. "Close" is controlled by `ratio`: the tolerance used for every
    check is `ratio * self.width`. The search area is the bounding box of
    self, extended by that tolerance to the left and right. A returned
    object has a width within the tolerance of self's width and is lower-,
    upper-, or centrally-aligned with self, again within the tolerance.

    If self is contained in the plane it is part of the result as well.
    """
    d = ratio * self.width
    objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
    return [obj for obj in objs
            if (isinstance(obj, LTTextLineVertical) and
                self._is_same_width_as(obj, tolerance=d) and
                (self._is_lower_aligned_with(obj, tolerance=d) or
                 self._is_upper_aligned_with(obj, tolerance=d) or
                 self._is_centrally_aligned_with(obj, tolerance=d)))]

def _is_lower_aligned_with(self, other, tolerance=0):
"""
Whether the lower edge of `other` is within `tolerance`.
"""
return abs(other.y0 - self.y0) <= tolerance

def _is_upper_aligned_with(self, other, tolerance=0):
"""
Whether the upper edge of `other` is within `tolerance`.
"""
return abs(other.y1 - self.y1) <= tolerance

def _is_centrally_aligned_with(self, other, tolerance=0):
"""
Whether the vertical center of `other` is within `tolerance`.
"""
return abs(
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

def _is_same_width_as(self, other, tolerance):
return abs(other.width - self.width) <= tolerance


class LTTextBox(LTTextContainer):
Expand Down
90 changes: 89 additions & 1 deletion tests/test_layout.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import unittest

from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal
from pdfminer.layout import (
LTLayoutContainer,
LAParams,
LTTextLineHorizontal,
LTTextLineVertical,
)
from pdfminer.utils import Plane


class TestGroupTextLines(unittest.TestCase):
Expand All @@ -21,3 +27,85 @@ def test_parent_with_wrong_bbox_returns_non_empty_neighbour_list(self):
textboxes = list(layout.group_textlines(laparams, lines))

self.assertEqual(len(textboxes), 2)


class TestFindNeigbors(unittest.TestCase):
    """Check which text lines `find_neighbors` reports for a reference line."""

    @staticmethod
    def _place(line_cls, word_margin, bbox, plane):
        """Create a `line_cls` with bounding box `bbox` and add it to `plane`."""
        placed = line_cls(word_margin)
        placed.set_bbox(bbox)
        plane.add(placed)
        return placed

    def test_find_neighbors_horizontal(self):
        laparams = LAParams()
        plane = Plane((0, 0, 50, 50))

        def place(bbox):
            return self._place(
                LTTextLineHorizontal, laparams.word_margin, bbox, plane)

        # Reference line whose neighbors are looked up.
        line = place((10, 4, 20, 6))

        # Lines that are expected to be found.
        left_aligned_above = place((10, 6, 15, 8))
        right_aligned_below = place((15, 2, 20, 4))
        centrally_aligned_overlapping = place((13, 5, 17, 7))

        # Lines that must be left out: one not aligned, one of wrong height.
        place((0, 6, 5, 8))
        place((10, 6, 15, 10))

        expected = [
            line,  # the reference line is in the plane as well
            left_aligned_above,
            right_aligned_below,
            centrally_aligned_overlapping,
        ]
        neighbors = line.find_neighbors(plane, laparams.line_margin)
        self.assertCountEqual(neighbors, expected)

    def test_find_neighbors_vertical(self):
        laparams = LAParams()
        plane = Plane((0, 0, 50, 50))

        def place(bbox):
            return self._place(
                LTTextLineVertical, laparams.word_margin, bbox, plane)

        # Reference line whose neighbors are looked up.
        line = place((4, 10, 6, 20))

        # Lines that are expected to be found.
        bottom_aligned_right = place((6, 10, 8, 15))
        top_aligned_left = place((2, 15, 4, 20))
        centrally_aligned_overlapping = place((5, 13, 7, 17))

        # Lines that must be left out: one not aligned, one of wrong width.
        place((6, 0, 8, 5))
        place((6, 10, 10, 15))

        expected = [
            line,  # the reference line is in the plane as well
            bottom_aligned_right,
            top_aligned_left,
            centrally_aligned_overlapping,
        ]
        neighbors = line.find_neighbors(plane, laparams.line_margin)
        self.assertCountEqual(neighbors, expected)

0 comments on commit 1cc1b96

Please sign in to comment.