diff --git a/.travis.yml b/.travis.yml
index f8e543a..8395b4b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,6 +10,9 @@ notifications:
 
 install:
   - sudo apt-get -qq update
+  - sudo apt-get install libmagickwand-dev ghostscript # required by Wand
+  - sudo rm -rf /etc/ImageMagick-6/policy.xml # HazyResearch/fonduer#170
+  - pip install -U pip
   - make dev
   - pip install coveralls
@@ -21,7 +24,6 @@ before_script:
 
 script:
   - coverage run --source=pdftotree -m pytest tests -v -rsXx
-  - python setup.py -q install
   - pdftotree tests/input/112823.pdf
 
 after_success:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d34db52..e96cb42 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 
 - [@HiromuHota][HiromuHota]: Fix a bug that an html file is not created at a given path.
   ([#64](https://github.com/HazyResearch/pdftotree/pull/64))
-
+- [@HiromuHota][HiromuHota]: Switch the output format from "HTML-like" to hOCR.
+  ([#62](https://github.com/HazyResearch/pdftotree/pull/62))
 
 ## 0.4.1 - 2020-09-21
diff --git a/Makefile b/Makefile
index 3483385..c7d3115 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ TESTDATA=tests/input
 
 dev:
 	pip install -r requirements-dev.txt
-	pip install -e .
+	pip install -e . --use-feature=2020-resolver
 	pre-commit install
 
 test: $(TESTDATA)/paleo_visual_model.h5 dev check
diff --git a/README.rst b/README.rst
index 27b1d30..949aac3 100644
--- a/README.rst
+++ b/README.rst
@@ -3,7 +3,7 @@ pdftotree
 
 |License| |Stars| |PyPI| |Version| |Issues| |Travis| |Coveralls| |CodeStyle|
 
-**WARNING**: ``pdftotree`` *is experimental code and is NOT stable or maintained. It is not integrated with or supported by Fonduer.*
+**WARNING**: ``pdftotree`` *is experimental code and is NOT stable. It is not integrated with or supported by Fonduer.*
 
 Fonduer_ performs knowledge base construction from richly formatted data such
 as tables. A crucial step in this process is the construction of the
@@ -16,8 +16,10 @@ This package is the result of building our own module as replacement to Adobe
 Acrobat. Several open source tools are available for pdf to html conversion
 but these tools do not preserve the cell structure in a table. Our goal in this
 project is to develop a tool that extracts text, figures and tables in a pdf
-document and maintains the structure of the document using a tree data
-structure.
+document and returns them in an easily consumable format.
+
+Up to v0.4.1, pdftotree's output was formatted in its own "HTML-like" format.
+From v0.5.0, it conforms to hOCR_, an open-standard format for OCR results.
 
 Dependencies
 ------------
@@ -49,19 +51,14 @@ pdftotree
 ~~~~~~~~~
 
 This is the primary command-line utility provided with this Python package.
-This takes a PDF file as input, and produces an HTML-like representation of the
-data::
+This takes a PDF file as input and produces an hOCR file as output::
 
     usage: pdftotree [options] pdf_file
 
-    Script to extract tree structure from PDF files. Takes a PDF as input and
-    outputs an HTML-like representation of the document's structure. By default,
-    this conversion is done using heuristics. However, a model can be provided as
-    a parameter to use a machine-learning-based approach.
+    Convert PDF into hOCR.
 
     positional arguments:
-      pdf_file              PDF file name for which tree structure needs to be
-                            extracted
+      pdf_file              Path to input PDF file.
 
     optional arguments:
       -h, --help            show this help message and exit
@@ -71,12 +68,12 @@ data::
       -m MODEL_PATH, --model_path MODEL_PATH
                             Pretrained model, generated by extract_tables tool
       -o OUTPUT, --output OUTPUT
-                            Path where tree structure should be saved. If none,
-                            HTML is printed to stdout.
+                            Path to output hOCR file. If not given, it will be
+                            printed to stdout.
       -f FAVOR_FIGURES, --favor_figures FAVOR_FIGURES
                             Whether figures must be favored over other parts such
                             as tables and section headers
-      -V, --visualize       Whether to output visualization images for the tree
+      -V, --visualize       Whether to output visualization images
       -d, --dry-run         Run pdftotree, but do not save any output or print to
                             console.
       -v, --verbose         Output INFO level logging.
@@ -207,3 +204,4 @@ Then you can run our tests::
 .. _version file: https://github.com/HazyResearch/pdftotree/blob/master/pdftotree/_version.py
 .. _editable mode: https://packaging.python.org/tutorials/distributing-packages/#working-in-development-mode
 .. _flake8: http://flake8.pycqa.org/en/latest/
+.. _hOCR: http://kba.cloud/hocr-spec/1.2/
diff --git a/bin/pdftotree b/bin/pdftotree
index e1e2f40..93ab0fc 100755
--- a/bin/pdftotree
+++ b/bin/pdftotree
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-"""Simple commandline interface for parsing PDF to HTML."""
+"""Simple commandline interface for parsing PDF to hOCR."""
 import argparse
 import logging
 import os
@@ -9,10 +9,7 @@ import pdftotree
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="""
-            Script to extract tree structure from PDF files. Takes a PDF as input
-            and outputs an HTML-like representation of the document's structure. By
-            default, this conversion is done using heuristics. However, a model can
-            be provided as a parameter to use a machine-learning-based approach.
+            Convert PDF into hOCR.
         """,
         usage="%(prog)s [options] pdf_file",
     )
@@ -34,19 +31,22 @@ if __name__ == "__main__":
     parser.add_argument(
         "pdf_file",
         type=str,
-        help="PDF file name for which tree structure needs to be extracted",
+        help="Path to input PDF file.",
     )
     parser.add_argument(
         "-o",
         "--output",
         type=str,
-        help="Path where tree structure should be saved. If none, HTML is printed to stdout.",
+        help="Path to output hOCR file. If not given, it will be printed to stdout.",
     )
     parser.add_argument(
         "-f",
         "--favor_figures",
         type=str,
-        help="Whether figures must be favored over other parts such as tables and section headers",
+        help="""
+            Whether figures must be favored over other parts such as tables and section
+            headers
+        """,
         default="True",
     )
     parser.add_argument(
@@ -103,7 +103,7 @@ if __name__ == "__main__":
     log.addHandler(ch)
 
     if args.dry_run:
-        print("This is just a dry run. No HTML will be output.")
+        print("This is just a dry run. No hOCR will be output.")
         args.output = None
 
     # Call the main routine
@@ -120,4 +120,4 @@ if __name__ == "__main__":
     if not args.dry_run:
         print(result)
     else:
-        print("HTML output to {}".format(args.output))
+        print("hOCR output to {}".format(args.output))
diff --git a/pdftotree/TreeExtract.py b/pdftotree/TreeExtract.py
index c498f32..502c5bf 100644
--- a/pdftotree/TreeExtract.py
+++ b/pdftotree/TreeExtract.py
@@ -1,14 +1,15 @@
 import html
 import logging
-import re
 from functools import cmp_to_key
-from typing import Any, Dict
+from typing import Any, Dict, List, Tuple
+from xml.dom.minidom import Document, Element
 
 import numpy as np
 import tabula
-from pdfminer.layout import LTChar
+from pdfminer.layout import LTChar, LTTextLine
 from pdfminer.utils import Plane
 
+from pdftotree._version import __version__
 from pdftotree.ml.features import get_lines_features, get_mentions_within_bbox
 from pdftotree.utils.bbox_utils import get_rectangles
 from pdftotree.utils.lines_utils import (
@@ -43,7 +44,6 @@ def __init__(self, pdf_file):
         self.iou_thresh = 0.8
         self.scanned = False
         self.tree: Dict[int, Any] = {}  # key represents page_num
-        self.html = ""
 
     def identify_scanned_page(self, boxes, page_bbox, page_width, page_height):
         plane = Plane(page_bbox)
@@ -238,10 +238,30 @@ def get_tree_structure(self, model_type, model, favor_figures) -> Dict[str, Any]
         )
         return self.tree
 
-    def get_html_tree(self):
-        self.html = ""
-        for page_num in self.elems.keys():
-            page_html = ""
" + def get_html_tree(self) -> str: + doc = Document() + self.doc = doc + html = doc.createElement("html") + doc.appendChild(html) + head = doc.createElement("head") + html.appendChild(head) + # meta + meta = doc.createElement("meta") + head.appendChild(meta) + meta.setAttribute("name", "ocr-system") + meta.setAttribute("content", f"Converted from PDF by pdftotree {__version__}") + meta = doc.createElement("meta") + head.appendChild(meta) + meta.setAttribute("name", "ocr-capabilities") + meta.setAttribute("content", "ocr_page ocr_table ocrx_block ocrx_word") + meta = doc.createElement("meta") + head.appendChild(meta) + meta.setAttribute("name", "ocr-number-of-pages") + meta.setAttribute("content", f"{len(self.elems.keys())}") + # body + body = doc.createElement("body") + html.appendChild(body) + for page_num in self.elems.keys(): # 1-based boxes = [] for clust in self.tree[page_num]: for (pnum, pwidth, pheight, top, left, bottom, right) in self.tree[ @@ -250,61 +270,44 @@ def get_html_tree(self): boxes += [ [clust.lower().replace(" ", "_"), top, left, bottom, right] ] - + page = doc.createElement("div") + page.setAttribute("class", "ocr_page") + page.setAttribute("id", f"page_{page_num}") + page.setAttribute( + "title", f"bbox 0 0 {int(pwidth)} {int(pheight)}; ppageno {page_num-1}" + ) + body.appendChild(page) # TODO: We need to detect columns and sort acccordingly. boxes.sort(key=cmp_to_key(column_order)) for box in boxes: if box[0] == "table": - table = box[1:] - table_html = self.get_html_table(table, page_num) - page_html += table_html + table = box[1:] # bbox + table_element = self.get_html_table(table, page_num) + page.appendChild(table_element) elif box[0] == "figure": - fig_str = [str(i) for i in box[1:]] - fig_html = "
" - page_html += fig_html - else: - ( - box_html, - char_html, - top_html, - left_html, - bottom_html, - right_html, - ) = self.get_html_others(box[1:], page_num) - page_html += ( - "<" - + box[0] - + " char='" - + char_html - + "', top='" - + top_html - + "', left='" - + left_html - + "', bottom='" - + bottom_html - + "', right='" - + right_html - + "'>" - + box_html - + "" + fig_element = doc.createElement("figure") + page.appendChild(fig_element) + top, left, bottom, right = [int(i) for i in box[1:]] + fig_element.setAttribute( + "title", f"bbox {left} {top} {right} {bottom}" ) - page_html += "
" - self.html += page_html - self.html += "" - return self.html + else: + element = self.get_html_others(box[0], box[1:], page_num) + page.appendChild(element) + return doc.toprettyxml() - def get_word_boundaries(self, mention): + def get_word_boundaries( + self, mention: LTTextLine + ) -> List[Tuple[str, float, float, float, float]]: mention_text = mention.get_text() - mention_chars = [] + mention_chars: List[Tuple[str, int, int, int, int]] = [] for obj in mention: if isinstance(obj, LTChar): x0, y0, x1, y1 = obj.bbox mention_chars.append([obj.get_text(), y0, x0, y1, x1]) words = [] - mention_words = mention_text.split() + mention_words: List[str] = mention_text.split() # word split by " " (space) char_idx = 0 for word in mention_words: curr_word = [word, float("Inf"), float("Inf"), float("-Inf"), float("-Inf")] @@ -335,111 +338,75 @@ def get_char_boundaries(self, mention): mention_chars.append([obj.get_text(), y0, x0, y1, x1]) return mention_chars - def get_html_others(self, box, page_num): - node_html = "" - top_html = "" - left_html = "" - bottom_html = "" - right_html = "" - char_html = "" - sep = " " - elems = get_mentions_within_bbox(box, self.elems[page_num].mentions) + def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element: + element = self.doc.createElement("div") + element.setAttribute("class", "ocrx_block") + element.setAttribute("pdftotree", tag) # for backward-compatibility + top, left, bottom, right = [int(x) for x in box] + element.setAttribute("title", f"bbox {left} {top} {right} {bottom}") + elems: List[LTTextLine] = get_mentions_within_bbox( + box, self.elems[page_num].mentions + ) elems.sort(key=cmp_to_key(reading_order)) for elem in elems: - chars = self.get_char_boundaries(elem) - for char in chars: - temp = char[0] - if not re.match(r"[\x00-\x1F]", temp): - char_html += char[0] + sep - top_html += str(char[1]) + sep - left_html += str(char[2]) + sep - bottom_html += str(char[3]) + sep - right_html += str(char[4]) + sep + line_element = self.doc.createElement("span") + element.appendChild(line_element) + line_element.setAttribute("class", "ocrx_line") + line_element.setAttribute( + "title", + f"bbox {int(elem.x0)} {int(elem.y0)} {int(elem.x1)} {int(elem.y1)}", + ) words = self.get_word_boundaries(elem) for word in words: - # node_html += ( - # "" + str(word[0].encode('utf-8')) + " ") - node_html += word[0] + " " + top, left, bottom, right = [int(x) for x in word[1:]] + # escape special HTML chars + text = html.escape(word[0]) - # escape special HTML chars - node_html = html.escape(node_html) - char_html = html.escape(char_html) - return node_html, char_html, top_html, left_html, bottom_html, right_html + word_element = self.doc.createElement("span") + line_element.appendChild(word_element) + word_element.setAttribute("class", "ocrx_word") + word_element.setAttribute( + "title", f"bbox {left} {top} {right} {bottom}" + ) + word_element.appendChild(self.doc.createTextNode(text)) + return element - def get_html_table(self, table, page_num): + def get_html_table(self, table, page_num) -> Element: table_str = [str(i) for i in table] table_json = tabula.read_pdf( self.pdf_file, pages=page_num, area=table_str, output_format="json" ) - table_html = "" if len(table_json) > 0: - table_html = "" + table_element = self.doc.createElement("table") for i, row in enumerate(table_json[0]["data"]): - row_str = "" + row_element = self.doc.createElement("tr") + table_element.appendChild(row_element) for j, column in enumerate(row): + col_element = 
self.doc.createElement("td") + row_element.appendChild(col_element) box = [ column["top"], column["left"], column["top"] + column["height"], column["left"] + column["width"], ] - top_html = "" - left_html = "" - bottom_html = "" - right_html = "" - char_html = "" - sep = " " elems = get_mentions_within_bbox(box, self.elems[page_num].mentions) elems.sort(key=cmp_to_key(reading_order)) - word_td = "" for elem in elems: - chars = self.get_char_boundaries(elem) - for char in chars: - temp = char[0] - if not re.match(r"[\x00-\x1F]", temp): - char_html += char[0].replace("'", '"') + sep - top_html += str(char[1]) + sep - left_html += str(char[2]) + sep - bottom_html += str(char[3]) + sep - right_html += str(char[4]) + sep words = self.get_word_boundaries(elem) for word in words: - temp = word[0] - if not re.match(r"[\x00-\x1F]", temp): - word_td += word[0] + sep - # escape special HTML chars - word_td = html.escape(word_td) - char_html = html.escape(char_html) - row_str += ( - "" - ) - # row_str += ( - # "" - # row_str += ("" - row_str += "" - table_html += row_str - table_html += "
" - + word_td.strip() - + "") + str( - # column["text"].encode('utf-8')) + "" - # row_str += str(column["text"].encode('utf-8')) - # row_str += "
" - return table_html + top = int(word[1]) + left = int(word[2]) + bottom = int(word[3]) + right = int(word[4]) + # escape special HTML chars + text = html.escape(word[0]) + + word_element = self.doc.createElement("span") + col_element.appendChild(word_element) + word_element.setAttribute("class", "ocrx_word") + word_element.setAttribute( + "title", f"bbox {left} {top} {right} {bottom}" + ) + word_element.appendChild(self.doc.createTextNode(text)) + return table_element diff --git a/pdftotree/TreeVisualizer.py b/pdftotree/TreeVisualizer.py index 879ffd8..35ca373 100644 --- a/pdftotree/TreeVisualizer.py +++ b/pdftotree/TreeVisualizer.py @@ -1,7 +1,19 @@ -import subprocess +from typing import Tuple + +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFParser + +try: + from IPython import get_ipython + + if "IPKernelApp" not in get_ipython().config: + raise ImportError("console") +except (AttributeError, ImportError): + from wand.display import display +else: + from IPython.display import display -from bs4 import BeautifulSoup -from IPython.display import display from wand.color import Color from wand.drawing import Drawing from wand.image import Image @@ -83,14 +95,11 @@ def pdf_to_img(self, page_num, pdf_dim=None): return img -def get_pdf_dim(pdf_file): - html_content = subprocess.check_output( - "pdftotext -f {} -l {} -bbox '{}' -".format("1", "1", pdf_file), shell=True - ) - soup = BeautifulSoup(html_content, "html.parser") - pages = soup.find_all("page") - page_width, page_height = ( - int(float(pages[0].get("width"))), - int(float(pages[0].get("height"))), - ) +def get_pdf_dim(pdf_file) -> Tuple[int, int]: + with open(pdf_file, "rb") as f: + parser = PDFParser(f) + doc = PDFDocument(parser) + # Look at the 1st page only. + page = next(PDFPage.create_pages(doc)) + _, _, page_width, page_height = page.mediabox return page_width, page_height diff --git a/pdftotree/core.py b/pdftotree/core.py index af87fda..8636854 100644 --- a/pdftotree/core.py +++ b/pdftotree/core.py @@ -22,7 +22,6 @@ import logging import os import pickle -import re from pdftotree.TreeExtract import TreeExtractor from pdftotree.TreeVisualizer import TreeVisualizer @@ -66,8 +65,8 @@ def parse( log.info("Tree structure built, creating html...") pdf_html = extractor.get_html_tree() log.info("HTML created.") - # Check html_path exists, create if not - pdf_html = re.sub(r"[\x00-\x1F]+", "", pdf_html) + # TODO: what is the following substition for and is it required? 
+    # pdf_html = re.sub(r"[\x00-\x1F]+", "", pdf_html)
 
     if html_path is None:
         return pdf_html
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 57d5765..125c9af 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,4 +3,5 @@ flake8
 isort>=5.0.0
 pre-commit
 pytest
+hocr-tools
 sphinx
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 70dd258..877a82b 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import os
+from subprocess import PIPE, Popen
 
 import pdftotree
 
@@ -17,6 +18,20 @@ def test_cli_should_output_at_given_path(tmp_path):
     assert os.path.isfile(html_path)
 
 
+def test_output_should_conform_to_hocr(tmp_path):
+    """Test if an exported file conforms to hOCR."""
+    html_path = os.path.join(tmp_path, "md.html")
+    pdftotree.parse("tests/input/md.pdf", html_path)
+    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
+        assert all([line.decode("utf-8").startswith("ok") for line in proc.stderr])
+
+
+def test_visualize_output(tmp_path):
+    """Test if an output can be visualized."""
+    html_path = os.path.join(tmp_path, "md.html")
+    pdftotree.parse("tests/input/md.pdf", html_path, visualize=True)
+
+
 def test_ml_completion():
     """Simply test that ML-based parse runs without errors."""
     output = pdftotree.parse(
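Not part of the patch: a minimal sketch of how the hOCR emitted by the new get_html_tree can be consumed with the standard library alone. The file name and the pdftotree.parse call mirror the new tests; the ocrx_word class and title="bbox ..." attribute are the ones set in TreeExtract.py above.

    from xml.dom.minidom import parse

    # Assumes md.html was produced beforehand, e.g. by:
    #   pdftotree.parse("tests/input/md.pdf", "md.html")
    doc = parse("md.html")

    # Walk every word span and print its bounding box and text.
    for span in doc.getElementsByTagName("span"):
        if span.getAttribute("class") != "ocrx_word":
            continue
        bbox = span.getAttribute("title")  # e.g. "bbox 72 90 120 104"
        text = span.firstChild.nodeValue.strip() if span.firstChild else ""
        print(bbox, text)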