diff --git a/hocr/parse.py b/hocr/parse.py index 0b265c9..fb1c98f 100644 --- a/hocr/parse.py +++ b/hocr/parse.py @@ -3,7 +3,7 @@ from lxml import etree -from .util import open_if_required +from .util import open_if_required, HOCR_SCHEMA WRITING_DIRECTION_UNSPECIFIED = 0 @@ -43,7 +43,7 @@ def hocr_page_iterator(fd_or_path): # TODO: Add gzip loading, specify what file_like should be (I suggest just # file descriptor or just path) - doc = etree.iterparse(fp, tag='{http://www.w3.org/1999/xhtml}div') + doc = etree.iterparse(fp, tag=(HOCR_SCHEMA + 'div', 'div')) for act, elem in doc: if elem.tag[-3:] == 'div' and elem.attrib['class'] == 'ocr_page': page = elem @@ -130,13 +130,14 @@ def hocr_page_to_word_data(hocr_page, scaler=1): """ paragraphs = [] - for par in hocr_page.xpath('.//*[@class="ocr_par"]'): + for par in hocr_page.xpath('.//*[@class="ocrx_block" or @class="ocr_par"]'): paragraph_data = {'lines': []} paragraph_writing_direction = WRITING_DIRECTION_UNSPECIFIED if 'dir' in par.attrib: paragraph_writing_direction = wdmap[par.attrib['dir']] + # We assume that the direct children are all the lines for line in par.getchildren(): line_data = {} @@ -260,9 +261,10 @@ def hocr_page_to_word_data_fast(hocr_page): has_ocrx_cinfo = 0 - for par in hocr_page.xpath('.//*[@class="ocr_par"]'): + for par in hocr_page.xpath('.//*[@class="ocrx_block" or @class="ocr_par"]'): paragraph_data = {'lines': []} + # We assume that the direct children are all the lines for line in par.getchildren(): line_data = {}