From 688e4211225b697f781519a3a53cac0497b601fb Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Thu, 12 Mar 2020 15:26:56 +0000 Subject: [PATCH] Refactor parsing process (#59) * Add initial implementation * add some more comments * Update parser.py * add comments * update mistletoe/myst_parser versions * Update basic.ipynb * fix pre-commit * try adding widget state after metadata --- docs/use/basic.ipynb | 6 +- myst_nb/parser.py | 190 +++++++++++++++++++++++++++++-------------- setup.py | 2 +- 3 files changed, 134 insertions(+), 64 deletions(-) diff --git a/docs/use/basic.ipynb b/docs/use/basic.ipynb index 2643f492..66656951 100644 --- a/docs/use/basic.ipynb +++ b/docs/use/basic.ipynb @@ -17,7 +17,9 @@ "![](../_static/logo.png)\n", "\n", "because MyST-NB is using the MyST-markdown parser, you can include rich markdown with Sphinx\n", - "in your notebook. For example, here's a note block:\n", + "in your notebook.[^note] For example, here's a note block:\n", + "\n", + "[^note]: Even footnotes!\n", "\n", "`````{note}\n", "Wow, a note! 
It was generated with this code:\n", @@ -256,7 +258,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.6-final" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/myst_nb/parser.py b/myst_nb/parser.py index c71986da..dbdb44fc 100644 --- a/myst_nb/parser.py +++ b/myst_nb/parser.py @@ -1,14 +1,23 @@ from docutils import nodes import nbformat as nbf from pathlib import Path +from sphinx.util import logging -from myst_parser.docutils_renderer import SphinxRenderer, dict_to_docinfo -from myst_parser.block_tokens import Document +from myst_parser.docutils_renderer import SphinxRenderer from myst_parser.sphinx_parser import MystParser + +from mistletoe.base_elements import BlockToken, Position, SourceLines +from mistletoe.parse_context import ParseContext, get_parse_context, set_parse_context +from mistletoe.block_tokenizer import tokenize_block +from mistletoe.block_tokens import Document, FrontMatter + from jupyter_sphinx.ast import get_widgets, JupyterWidgetStateNode from jupyter_sphinx.execute import contains_widgets, write_notebook_output +SPHINX_LOGGER = logging.getLogger(__name__) + + class NotebookParser(MystParser): """Docutils parser for IPynb + CommonMark + Math + Tables + RST Extensions """ @@ -21,6 +30,92 @@ class NotebookParser(MystParser): config_section_dependencies = ("parsers",) def parse(self, inputstring, document): + + # de-serialize the notebook + ntbk = nbf.reads(inputstring, nbf.NO_CONVERT) + + # This is a container for top level markdown tokens + # which we will add to as we walk the document + mkdown_tokens = [] # type: list[BlockToken] + + # First we ensure that we are using a 'clean' global context + # for parsing, which is set up with the MyST parsing tokens + # the logger will report on duplicate link/footnote definitions, etc + parse_context = ParseContext( + find_blocks=SphinxNBRenderer.default_block_tokens, 
find_spans=SphinxNBRenderer.default_span_tokens, + logger=SPHINX_LOGGER, + ) + set_parse_context(parse_context) + + for cell_index, nb_cell in enumerate(ntbk.cells): + + # Skip empty cells + if len(nb_cell["source"].strip()) == 0: + continue + + # skip cells tagged for removal + tags = nb_cell.metadata.get("tags", []) + if "remove_cell" in tags: + continue + + if nb_cell["cell_type"] == "markdown": + + # we add the document path and cell index + # to the source lines, so they can be included in the error logging + # NOTE: currently the logic to report metadata is not written + # into SphinxRenderer, but this will be introduced in a later update + lines = SourceLines( + nb_cell["source"], + uri=document["source"], + metadata={"cell_index": cell_index}, + standardize_ends=True, + ) + + # parse the source markdown text; + # at this point span/inline level tokens are not yet processed, but + # link/footnote definitions are collected/stored in the global context + mkdown_tokens.extend(tokenize_block(lines)) + + # TODO for md cells, think of a way to implement the previous + # `if "hide_input" in tags:` logic + + elif nb_cell["cell_type"] == "code": + # here we do nothing but store the cell as a custom token + mkdown_tokens.append( + NbCodeCell( + cell=nb_cell, + position=Position( + line_start=0, + uri=document["source"], + data={"cell_index": cell_index}, + ), + ) + ) + + # Now all definitions have been gathered, we walk the tokens and + # process any inline text + for token in mkdown_tokens + list( + get_parse_context().foot_definitions.values() + ): + token.expand_spans() + + # If there are widgets, this will embed the state of all widgets in a script + if contains_widgets(ntbk): + mkdown_tokens.insert(0, JupyterWidgetState(state=get_widgets(ntbk))) + + # create the front matter token + front_matter = FrontMatter(content=ntbk.metadata, position=None) + + # Finally, we create the top-level markdown document + markdown_doc = Document( + children=mkdown_tokens, + 
front_matter=front_matter, + link_definitions=parse_context.link_definitions, + footnotes=parse_context.foot_definitions, + footref_order=parse_context.foot_references, + ) + self.reporter = document.reporter self.config = self.default_config.copy() try: @@ -29,8 +124,6 @@ def parse(self, inputstring, document): except AttributeError: pass - ntbk = nbf.reads(inputstring, nbf.NO_CONVERT) - # Write the notebook's output to disk path_doc = Path(document.settings.env.docname) doc_relpath = path_doc.parent @@ -39,69 +132,44 @@ def parse(self, inputstring, document): output_dir = build_dir.joinpath("jupyter_execute", doc_relpath) write_notebook_output(ntbk, str(output_dir), doc_filename) - # Parse notebook-level metadata as front-matter - # For now, only keep key/val pairs that point to int/float/string - metadata = ntbk.metadata - docinfo = dict_to_docinfo(metadata) - document += docinfo + # render the Markdown AST to docutils AST + renderer = SphinxNBRenderer( + parse_context=parse_context, document=document, current_node=None + ) + renderer.render(markdown_doc) + + +class JupyterWidgetState(BlockToken): + def __init__(self, state): + self.state = state + + +class NbCodeCell(BlockToken): + def __init__(self, cell, position): + self.cell = cell + self.position = position + + +class SphinxNBRenderer(SphinxRenderer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.render_map["NbCodeCell"] = self.render_nb_code_cell + self.render_map["JupyterWidgetState"] = self.render_jupyter_widget_state + + def render_jupyter_widget_state(self, token): + self.document.append(JupyterWidgetStateNode(state=token.state)) + + def render_nb_code_cell(self, token: NbCodeCell): + """Render a Jupyter notebook cell.""" + cell = token.cell + tags = cell.metadata.get("tags", []) - # If there are widgets, this will embed the state of all widgets in a script - if contains_widgets(ntbk): - document.append(JupyterWidgetStateNode(state=get_widgets(ntbk))) - renderer = 
SphinxRenderer(document=document, current_node=None) - with renderer: - # Loop through cells and render them - for ii, cell in enumerate(ntbk.cells): - # Skip empty cells - if len(cell["source"]) == 0: - continue - try: - _render_cell(cell, renderer) - except Exception as exc: - source = cell["source"][:50] - if len(cell["source"]) > 50: - source = source + "..." - msg_node = self.reporter.error( - ( - f"\nError parsing notebook cell #{ii+1}: {exc}\n" - f"Type: {cell['cell_type']}\n" - f"Source:\n{source}\n\n" - ) - ) - msg_node += nodes.literal_block(cell["source"], cell["source"]) - renderer.current_node += [msg_node] - continue - - -def _render_cell(cell, renderer): - """Render a cell with a SphinxRenderer instance. - - Returns nothing because the renderer updates itself. - """ - tags = cell.metadata.get("tags", []) - if "remove_cell" in tags: - return - - # If a markdown cell, simply call the Myst parser and append children - if cell["cell_type"] == "markdown": - document = Document.read(cell["source"], front_matter=False) - # Check for tag-specific behavior because markdown isn't wrapped in a cell - if "hide_input" in tags: - container = nodes.container() - container["classes"].extend(["toggle"]) - with renderer.current_node_context(container, append=True): - renderer.render(document) - else: - renderer.render(document) - - # If a code cell, convert the code + outputs - elif cell["cell_type"] == "code": # Cell container will wrap whatever is in the cell classes = ["cell"] for tag in tags: classes.append(f"tag_{tag}") sphinx_cell = CellNode(classes=classes, cell_type=cell["cell_type"]) - renderer.current_node += sphinx_cell + self.current_node += sphinx_cell if "remove_input" not in tags: cell_input = CellInputNode(classes=["cell_input"]) sphinx_cell += cell_input diff --git a/setup.py b/setup.py index b31a6455..abe01552 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ python_requires=">=3.6", package_data={"myst_nb": ["_static/mystnb.css"]}, 
install_requires=[ - "myst-parser~=0.5", + "myst-parser~=0.6.0", "docutils>=0.15", "sphinx>=2,<3", "jupyter_sphinx==0.2.4a1",