Skip to content

Commit

Permalink
hocr: allow parsing more hOCR documents
Browse files Browse the repository at this point in the history
Some hOCR documents apparently do not declare the XHTML namespace, and
use ocrx_block rather than ocr_par for their paragraph-level elements.
The parser now accepts both forms, but it still assumes that the direct
children of these nodes are lines.
  • Loading branch information
MerlijnWajer committed Jan 8, 2022
1 parent a03ae9a commit 6cdb14d
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions hocr/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from lxml import etree

from .util import open_if_required
from .util import open_if_required, HOCR_SCHEMA


WRITING_DIRECTION_UNSPECIFIED = 0
Expand Down Expand Up @@ -43,7 +43,7 @@ def hocr_page_iterator(fd_or_path):

# TODO: Add gzip loading, specify what file_like should be (I suggest just
# file descriptor or just path)
doc = etree.iterparse(fp, tag='{http://www.w3.org/1999/xhtml}div')
doc = etree.iterparse(fp, tag=(HOCR_SCHEMA + 'div', 'div'))
for act, elem in doc:
if elem.tag[-3:] == 'div' and elem.attrib['class'] == 'ocr_page':
page = elem
Expand Down Expand Up @@ -130,13 +130,14 @@ def hocr_page_to_word_data(hocr_page, scaler=1):
"""
paragraphs = []

for par in hocr_page.xpath('.//*[@class="ocr_par"]'):
for par in hocr_page.xpath('.//*[@class="ocrx_block" or @class="ocr_par"]'):
paragraph_data = {'lines': []}

paragraph_writing_direction = WRITING_DIRECTION_UNSPECIFIED
if 'dir' in par.attrib:
paragraph_writing_direction = wdmap[par.attrib['dir']]

# We assume that the direct children are all the lines
for line in par.getchildren():
line_data = {}

Expand Down Expand Up @@ -260,9 +261,10 @@ def hocr_page_to_word_data_fast(hocr_page):

has_ocrx_cinfo = 0

for par in hocr_page.xpath('.//*[@class="ocr_par"]'):
for par in hocr_page.xpath('.//*[@class="ocrx_block" or @class="ocr_par"]'):
paragraph_data = {'lines': []}

# We assume that the direct children are all the lines
for line in par.getchildren():
line_data = {}

Expand Down

0 comments on commit 6cdb14d

Please sign in to comment.