From 1dab4e086cb6d1b7d938ab835217741aa5d8e5e9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Dec 2019 19:41:08 +0100 Subject: [PATCH] ALTO renderer: use proper BlockTypes - use TextBlock, Illustration, GraphicalElement (not just TextBlock), as appropriate for the internal block types - do not enter RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL and ChoiceIterator on anything other than TextBlocks - refactor loop to make it more readable --- src/api/altorenderer.cpp | 178 +++++++++++++++++++++------------------ 1 file changed, 95 insertions(+), 83 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 346e818e2e..9e5c06d285 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -164,97 +164,109 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { << " HEIGHT=\"" << rect_height_ << "\">\n"; ResultIterator* res_it = GetIterator(); - while (!res_it->Empty(RIL_BLOCK)) { - if (res_it->Empty(RIL_WORD)) { - res_it->Next(RIL_WORD); - continue; + for (; !res_it->Empty(RIL_BLOCK); res_it->Next(RIL_BLOCK)) { + alto_str << "\t\t\t\tBlockType()) { + case PT_FLOWING_TEXT: + case PT_HEADING_TEXT: + case PT_PULLOUT_TEXT: + case PT_CAPTION_TEXT: + case PT_VERTICAL_TEXT: + case PT_TABLE: // nothing special here + case PT_EQUATION: + case PT_INLINE_EQUATION: + block_type = "TextBlock"; + break; + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: + block_type = "Illustration"; + break; + case PT_HORZ_LINE: + case PT_VERT_LINE: + block_type = "GraphicalElement"; + break; + default: + block_type = "ComposedBlock"; } - if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - alto_str << "\t\t\t\tIsAtBeginningOf(RIL_PARA)) { - alto_str << "\t\t\t\t\tEmpty(RIL_PARA); res_it->Next(RIL_PARA)) { + alto_str << "\t\t\t\t\t<" << block_type << " ID=\"block_" << tcnt << "\""; AddBoxToAlto(res_it, RIL_PARA, alto_str); alto_str << "\n"; - } - - if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { - alto_str << "\t\t\t\t\t\tGetUTF8Text(RIL_WORD)).c_str() << "\">"; - - bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); - bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); - bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - - int left, top, right, bottom; - res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); - - do { - alto_str << "\n\t\t\t\t\t\t\t\t grapheme( - res_it->GetUTF8Text(RIL_SYMBOL)); - if (grapheme && grapheme[0] != 0) { - alto_str << HOcrEscape(grapheme.get()).c_str(); + if (strcmp(block_type, "TextBlock") == 0) { + for (; !res_it->Empty(RIL_TEXTLINE); res_it->Next(RIL_TEXTLINE)) { + alto_str << "\t\t\t\t\t\tEmpty(RIL_WORD); res_it->Next(RIL_WORD)) { + int left = 0, top = 0, right = 0, bottom = 0; + if (!res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + int hpos = right; + int vpos = top; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + int width = left - hpos; + int height = bottom - top; + alto_str << "\n\t\t\t\t\t\t\t\n"; + } + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + + alto_str << "\t\t\t\t\t\t\tGetUTF8Text(RIL_WORD)).c_str() << "\">"; + + for (; !res_it->Empty(RIL_SYMBOL); res_it->Next(RIL_SYMBOL)) { + alto_str << "\n\t\t\t\t\t\t\t\t grapheme(res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) + alto_str << HOcrEscape(grapheme.get()).c_str(); + alto_str << "\">"; + + ChoiceIterator choice_it(*res_it); + do { + int vc = choice_it.Confidence(); + alto_str << "\n\t\t\t\t\t\t\t\t\t"; + } while (choice_it.Next()); + alto_str << "\n\t\t\t\t\t\t\t\t"; + scnt++; + if (res_it->IsAtFinalElement(RIL_WORD, RIL_SYMBOL)) + break; + } + alto_str << "\n\t\t\t\t\t\t\t"; + wcnt++; + if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) + break; + } + alto_str << "\n\t\t\t\t\t\t\n"; + lcnt++; + if (res_it->IsAtFinalElement(RIL_PARA, RIL_TEXTLINE)) + break; + } } - alto_str << "\">"; - ChoiceIterator choice_it(*res_it); - do { - int vc = choice_it.Confidence(); - alto_str << "\n\t\t\t\t\t\t\t\t\t"; - } while (choice_it.Next()); - alto_str << "\n\t\t\t\t\t\t\t\t"; - res_it->Next(RIL_SYMBOL); - - scnt++; - } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - - alto_str << "\n\t\t\t\t\t\t\t"; - - wcnt++; - - if (last_word_in_line) { - alto_str << "\n\t\t\t\t\t\t\n"; - lcnt++; - } else { - int hpos = right; - int vpos = top; - res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); - int width = left - hpos; - int height = bottom - top; - alto_str << "\n\t\t\t\t\t\t\t\n"; - } - - if (last_word_in_tblock) { - alto_str << "\t\t\t\t\t\n"; + alto_str << "\t\t\t\t\t\n"; tcnt++; + if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_PARA)) + break; } - - if (last_word_in_cblock) { - alto_str << "\t\t\t\t\n"; - bcnt++; - } + alto_str << "\t\t\t\t\n"; + bcnt++; } alto_str << "\t\t\t\n"