From 90ba199bef233cf354615e53764a490152e8818c Mon Sep 17 00:00:00 2001 From: Morgan Fouesneau Date: Mon, 27 Feb 2023 14:06:18 -0800 Subject: [PATCH] removed debugging; added citation treatement --- docs/MPIA daily digest.ipynb | 186 ++--------------------------------- 1 file changed, 10 insertions(+), 176 deletions(-) diff --git a/docs/MPIA daily digest.ipynb b/docs/MPIA daily digest.ipynb index e6552f5b..29647b2e 100644 --- a/docs/MPIA daily digest.ipynb +++ b/docs/MPIA daily digest.ipynb @@ -30,7 +30,8 @@ " get_paper_from_identifier,\n", " retrieve_document_source, \n", " get_markdown_badge)\n", - "from arxiv_on_deck_2 import (latex, \n", + "from arxiv_on_deck_2 import (latex,\n", + " latex_bib,\n", " mpia,\n", " highlight_authors_in_list)\n", "\n", @@ -204,6 +205,13 @@ "\n", " full_md = doc.generate_markdown_text()\n", " \n", + " # replace citations\n", + " try:\n", + " bibdata = latex_bib.LatexBib.from_doc(doc)\n", + " full_md = latex_bib.replace_citations(full_md, bibdata)\n", + " except Exception as e:\n", + " print(e)\n", + " \n", " documents.append((paper_id, full_md))\n", " except Exception as e:\n", " warnings.warn(latex.LatexWarning(f\"{paper_id:s} did not run properly\\n\" +\n", @@ -643,180 +651,6 @@ "with open(\"_build/html/index_daily.html\", 'w') as fout:\n", " fout.write(page)" ] - }, - { - "cell_type": "markdown", - "id": "1cebacbc", - "metadata": {}, - "source": [ - "# Debugging papers" - ] - }, - { - "cell_type": "raw", - "id": "debe7cc6", - "metadata": {}, - "source": [ - "raise NotImplementedError(\"Manual Stop\")" - ] - }, - { - "cell_type": "raw", - "id": "eb169192", - "metadata": { - "scrolled": true - }, - "source": [ - "from IPython.display import display, Markdown\n", - "from TexSoup import TexSoup\n", - "import re\n", - "\n", - "def bracket_error(source: str):\n", - " \"\"\" Find problematic portions of the document \"\"\"\n", - " \n", - " print(\"len(source)\", len(source))\n", - " \n", - " # Checking header\n", - " begin_doc = next(re.finditer(r'\\\\begin\\{document\\}', doc.source)).span()[1]\n", - " header = source[:begin_doc]\n", - " text = header + r\"\\n\\end{document}\"\n", - "\n", - " try:\n", - " # print(\"Header check... \", end='')\n", - " TexSoup(text)\n", - " display(Markdown(f\"**[OK]** - Header\"))\n", - " except:\n", - " raise RuntimeError(\"Error in the header\")\n", - " \n", - " # Check the text per section until the end.\n", - " # Do not stop and try them all.\n", - " \n", - " problematic_text = []\n", - " \n", - " sections = ([(0, begin_doc, 'until first section')] + \n", - " [(g.span()[0], g.span()[1], g.group()) for g in re.finditer(r'\\\\section\\{.*\\}', source)] +\n", - " [(g.span()[0], g.span()[1], g.group()) for g in re.finditer(r'\\\\begin\\{appendix\\}', source)]\n", - " )\n", - " sections.append([len(source), len(source), 'end'])\n", - " \n", - " sections = sorted(sections, key=lambda x: x[0])\n", - " \n", - " prev_pos, prev_name = (0, 'header')\n", - " parsed = []\n", - " \n", - " for span, span_end, name in sections:\n", - "\n", - " if span - prev_pos <= 0:\n", - " continue\n", - " \n", - "\n", - " text = source[prev_pos:span]\n", - " if prev_pos > begin_doc:\n", - " text = r\"\\n\\begin{document}\" + text + r\"\\n\\end{document}\"\n", - " else:\n", - " text = text + r\"\\n\\end{document}\"\n", - " try:\n", - " #print(f\"{prev_pos}:{prev_name}-->{span}:{name} check... \", end='')\n", - " parsed.append(TexSoup(text, tolerance=1)) # allow not ending env\n", - " display(Markdown(f\"**[OK]** - *{prev_pos}*:{prev_name} → *{span}*:{name}\"))\n", - " # print(\"ok\")\n", - "\n", - " prev_pos = span\n", - " prev_name = name\n", - " except:\n", - " # print(f\"error between {prev_pos} and {span}\")\n", - " display(Markdown(f\"**[ERR]** *{prev_pos}*:{prev_name} → *{span}*:{name}\"))\n", - " problematic_text.append((prev_pos, source[prev_pos:span]))\n", - " prev_pos = span\n", - " prev_name = name\n", - " # raise\n", - " return problematic_text, parsed\n", - "\n", - "\n", - "def check_environment(text, offset=0):\n", - " \"\"\" Check environment \"\"\"\n", - " env = re.compile(r\"\\\\begin\\{(?P.*)\\}(.*)\\\\end\\{(?P=env)\\}\", re.DOTALL)\n", - "\n", - " for match in env.finditer(text):\n", - " beg, end = match.span()\n", - " beg += offset\n", - " end += offset\n", - " envname = match.groups()[0]\n", - " try:\n", - " latex.TexSoup(match.group())\n", - " except Exception as e:\n", - " display(e)\n", - " print(f\"Error in {envname:s} between {beg} and {end}\")\n", - " return match.groups()[1], beg, end" - ] - }, - { - "cell_type": "raw", - "id": "2315e835", - "metadata": { - "scrolled": false - }, - "source": [ - "import importlib\n", - "importlib.reload(latex)\n", - "which = \"2204.03253\"\n", - "paper_id = f'{which:s}'\n", - "folder = f'tmp_{paper_id:s}'\n", - "\n", - "if not os.path.isdir(folder):\n", - " folder = retrieve_document_source(f\"{paper_id}\", f'tmp_{paper_id}')\n", - "\n", - "try:\n", - " doc = latex.LatexDocument(folder, validation=validation) \n", - "except AffiliationError as affilerror:\n", - " msg = f\"ArXiv:{paper_id:s} is not an MPIA paper... \" + str(affilerror)\n", - " print(msg)\n", - "\n", - "\n", - "# Hack because sometimes author parsing does not work well\n", - "if (len(doc.authors) != len(paper['authors'])):\n", - " doc._authors = paper['authors']\n", - "if (doc.abstract) in (None, ''):\n", - " doc._abstract = paper['abstract']\n", - "\n", - "doc.comment = get_markdown_badge(paper_id) + \" _\" + paper['comments'] + \"_\"\n", - "doc.highlight_authors_in_list(hl_list)\n", - "\n", - "full_md = doc.generate_markdown_text()" - ] - }, - { - "cell_type": "raw", - "id": "dd3781db", - "metadata": { - "scrolled": false - }, - "source": [ - "doc" - ] - }, - { - "cell_type": "raw", - "id": "c6eed834", - "metadata": {}, - "source": [ - "# [check_environment(k) for k in bracket_error(doc.source)]\n", - "_, _, a = latex.get_content_per_section(doc.source, verbose=True)\n", - "if not a:\n", - " print(\"no issues per section\")\n", - "for ak in a:\n", - " r = check_environment(ak[1], offset=ak[0])\n", - " print(r[1], r[2])\n", - " print(r[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71ce768a", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -835,7 +669,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.9" }, "vscode": { "interpreter": {