removed debugging; added citation treatment
Morgan Fouesneau committed Feb 27, 2023
1 parent d876442 commit 90ba199
Showing 1 changed file with 10 additions and 176 deletions.
186 changes: 10 additions & 176 deletions docs/MPIA daily digest.ipynb
@@ -30,7 +30,8 @@
" get_paper_from_identifier,\n",
" retrieve_document_source, \n",
" get_markdown_badge)\n",
"from arxiv_on_deck_2 import (latex, \n",
"from arxiv_on_deck_2 import (latex,\n",
" latex_bib,\n",
" mpia,\n",
" highlight_authors_in_list)\n",
"\n",
@@ -204,6 +205,13 @@
"\n",
" full_md = doc.generate_markdown_text()\n",
" \n",
" # replace citations\n",
" try:\n",
" bibdata = latex_bib.LatexBib.from_doc(doc)\n",
" full_md = latex_bib.replace_citations(full_md, bibdata)\n",
" except Exception as e:\n",
" print(e)\n",
" \n",
" documents.append((paper_id, full_md))\n",
" except Exception as e:\n",
" warnings.warn(latex.LatexWarning(f\"{paper_id:s} did not run properly\\n\" +\n",
@@ -643,180 +651,6 @@
"with open(\"_build/html/index_daily.html\", 'w') as fout:\n",
" fout.write(page)"
]
},
{
"cell_type": "markdown",
"id": "1cebacbc",
"metadata": {},
"source": [
"# Debugging papers"
]
},
{
"cell_type": "raw",
"id": "debe7cc6",
"metadata": {},
"source": [
"raise NotImplementedError(\"Manual Stop\")"
]
},
{
"cell_type": "raw",
"id": "eb169192",
"metadata": {
"scrolled": true
},
"source": [
"from IPython.display import display, Markdown\n",
"from TexSoup import TexSoup\n",
"import re\n",
"\n",
"def bracket_error(source: str):\n",
" \"\"\" Find problematic portions of the document \"\"\"\n",
" \n",
" print(\"len(source)\", len(source))\n",
" \n",
" # Checking header\n",
" begin_doc = next(re.finditer(r'\\\\begin\\{document\\}', doc.source)).span()[1]\n",
" header = source[:begin_doc]\n",
" text = header + r\"\\n\\end{document}\"\n",
"\n",
" try:\n",
" # print(\"Header check... \", end='')\n",
" TexSoup(text)\n",
" display(Markdown(f\"**[OK]** - Header\"))\n",
" except:\n",
" raise RuntimeError(\"Error in the header\")\n",
" \n",
" # Check the text per section until the end.\n",
" # Do not stop and try them all.\n",
" \n",
" problematic_text = []\n",
" \n",
" sections = ([(0, begin_doc, 'until first section')] + \n",
" [(g.span()[0], g.span()[1], g.group()) for g in re.finditer(r'\\\\section\\{.*\\}', source)] +\n",
" [(g.span()[0], g.span()[1], g.group()) for g in re.finditer(r'\\\\begin\\{appendix\\}', source)]\n",
" )\n",
" sections.append([len(source), len(source), 'end'])\n",
" \n",
" sections = sorted(sections, key=lambda x: x[0])\n",
" \n",
" prev_pos, prev_name = (0, 'header')\n",
" parsed = []\n",
" \n",
" for span, span_end, name in sections:\n",
"\n",
" if span - prev_pos <= 0:\n",
" continue\n",
" \n",
"\n",
" text = source[prev_pos:span]\n",
" if prev_pos > begin_doc:\n",
" text = r\"\\n\\begin{document}\" + text + r\"\\n\\end{document}\"\n",
" else:\n",
" text = text + r\"\\n\\end{document}\"\n",
" try:\n",
" #print(f\"{prev_pos}:{prev_name}-->{span}:{name} check... \", end='')\n",
" parsed.append(TexSoup(text, tolerance=1)) # allow not ending env\n",
" display(Markdown(f\"**[OK]** - *{prev_pos}*:{prev_name} &rarr; *{span}*:{name}\"))\n",
" # print(\"ok\")\n",
"\n",
" prev_pos = span\n",
" prev_name = name\n",
" except:\n",
" # print(f\"error between {prev_pos} and {span}\")\n",
" display(Markdown(f\"**[ERR]** *{prev_pos}*:{prev_name} &rarr; *{span}*:{name}\"))\n",
" problematic_text.append((prev_pos, source[prev_pos:span]))\n",
" prev_pos = span\n",
" prev_name = name\n",
" # raise\n",
" return problematic_text, parsed\n",
"\n",
"\n",
"def check_environment(text, offset=0):\n",
" \"\"\" Check environment \"\"\"\n",
" env = re.compile(r\"\\\\begin\\{(?P<env>.*)\\}(.*)\\\\end\\{(?P=env)\\}\", re.DOTALL)\n",
"\n",
" for match in env.finditer(text):\n",
" beg, end = match.span()\n",
" beg += offset\n",
" end += offset\n",
" envname = match.groups()[0]\n",
" try:\n",
" latex.TexSoup(match.group())\n",
" except Exception as e:\n",
" display(e)\n",
" print(f\"Error in {envname:s} between {beg} and {end}\")\n",
" return match.groups()[1], beg, end"
]
},
{
"cell_type": "raw",
"id": "2315e835",
"metadata": {
"scrolled": false
},
"source": [
"import importlib\n",
"importlib.reload(latex)\n",
"which = \"2204.03253\"\n",
"paper_id = f'{which:s}'\n",
"folder = f'tmp_{paper_id:s}'\n",
"\n",
"if not os.path.isdir(folder):\n",
" folder = retrieve_document_source(f\"{paper_id}\", f'tmp_{paper_id}')\n",
"\n",
"try:\n",
" doc = latex.LatexDocument(folder, validation=validation) \n",
"except AffiliationError as affilerror:\n",
" msg = f\"ArXiv:{paper_id:s} is not an MPIA paper... \" + str(affilerror)\n",
" print(msg)\n",
"\n",
"\n",
"# Hack because sometimes author parsing does not work well\n",
"if (len(doc.authors) != len(paper['authors'])):\n",
" doc._authors = paper['authors']\n",
"if (doc.abstract) in (None, ''):\n",
" doc._abstract = paper['abstract']\n",
"\n",
"doc.comment = get_markdown_badge(paper_id) + \" _\" + paper['comments'] + \"_\"\n",
"doc.highlight_authors_in_list(hl_list)\n",
"\n",
"full_md = doc.generate_markdown_text()"
]
},
{
"cell_type": "raw",
"id": "dd3781db",
"metadata": {
"scrolled": false
},
"source": [
"doc"
]
},
{
"cell_type": "raw",
"id": "c6eed834",
"metadata": {},
"source": [
"# [check_environment(k) for k in bracket_error(doc.source)]\n",
"_, _, a = latex.get_content_per_section(doc.source, verbose=True)\n",
"if not a:\n",
" print(\"no issues per section\")\n",
"for ak in a:\n",
" r = check_environment(ak[1], offset=ak[0])\n",
" print(r[1], r[2])\n",
" print(r[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71ce768a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -835,7 +669,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.10.9"
},
"vscode": {
"interpreter": {