From 88d0dd12191cfddb7c78e0dbcfa1aa2de0ec4e88 Mon Sep 17 00:00:00 2001
From: Eric Arellano <14852634+Eric-Arellano@users.noreply.github.com>
Date: Tue, 9 Jan 2024 11:07:20 -0500
Subject: [PATCH] Split sphinxHtmlToMarkdown into helper functions (#596)
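Part of https://github.com/Qiskit/documentation/issues/223. There are
still improvements we can make, such as better testing.
This PR tries to avoid changing the code's logic and mostly just moves
the existing code into helper functions. Note that `imageDestination`,
`baseSourceUrl`, and `releaseNotesTitle` are now required options, and
the `"/images/api/"` default for `imageDestination` is removed.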
---
.../lib/sphinx/sphinxHtmlToMarkdown.test.ts | 18 +-
scripts/lib/sphinx/sphinxHtmlToMarkdown.ts | 391 ++++++++++--------
2 files changed, 234 insertions(+), 175 deletions(-)
diff --git a/scripts/lib/sphinx/sphinxHtmlToMarkdown.test.ts b/scripts/lib/sphinx/sphinxHtmlToMarkdown.test.ts
index 0646ecbafe0..f5f4bd34d04 100644
--- a/scripts/lib/sphinx/sphinxHtmlToMarkdown.test.ts
+++ b/scripts/lib/sphinx/sphinxHtmlToMarkdown.test.ts
@@ -13,6 +13,12 @@
import { describe, test, expect } from "@jest/globals";
import { sphinxHtmlToMarkdown } from "./sphinxHtmlToMarkdown";
+const DEFAULT_ARGS = {
+ imageDestination: "/images/qiskit",
+ baseSourceUrl: "https://github.com/Qiskit/qiskit-ibm-runtime/tree/0.9.2/",
+ releaseNotesTitle: "My Quantum release notes",
+};
+
describe("sphinxHtmlToMarkdown", () => {
test("remove .html extension from relative links", async () => {
expect(
@@ -114,7 +120,7 @@ describe("sphinxHtmlToMarkdown", () => {
`,
url: "http://qiskit.org/docs/quantum-circuit.html",
- imageDestination: "/images/qiskit",
+ ...DEFAULT_ARGS,
}),
).toMatchInlineSnapshot(`
{
@@ -506,7 +512,7 @@ Can be either (1) a dictionary mapping XX angle values to fidelity at that angle
`,
url: "https://qiskit.org/documentation/partners/qiskit_ibm_runtime/stubs/qiskit_ibm_runtime.Sampler.html",
- baseSourceUrl: `https://github.com/Qiskit/qiskit-ibm-runtime/tree/0.9.2/`,
+ ...DEFAULT_ARGS,
})
).markdown,
).toMatchInlineSnapshot(`
@@ -1418,7 +1424,7 @@ compilation flow follows the structure given below:
Most circuits must undergo a series of transformations that make them compatible with a given target device, and optimize them to reduce the effects of noise on the resulting outcomes. Rewriting quantum circuits to match hardware constraints and optimizing for performance can be far from trivial. The flow of logic in the rewriting tool chain need not be linear, and can often have iterative sub-loops, conditional branches, and other complex behaviors. That being said, the standard compilation flow follows the structure given below:
- ![../\\_images/transpiling\\_core\\_steps.png](/images/api//transpiling_core_steps.png)
+ ![../\\_images/transpiling\\_core\\_steps.png](/images/qiskit/transpiling_core_steps.png)
Qiskit has four pre-built transpilation pipelines available here:
"
@@ -1443,13 +1449,13 @@ test("identify release notes", async () => {
`,
url: "http://qiskit.org/docs/release_notes.html",
- imageDestination: "/images/qiskit",
+ ...DEFAULT_ARGS,
}),
).toMatchInlineSnapshot(`
{
"images": [],
"isReleaseNotes": true,
- "markdown": "# Release Notes
+ "markdown": "# My Quantum release notes
@@ -1528,6 +1534,7 @@ async function toMd(html: string) {
await sphinxHtmlToMarkdown({
url: "https://qiskit.org/documentation/partners/qiskit_ibm_runtime/stubs/qiskit_ibm_runtime.Sampler.html",
html,
+ ...DEFAULT_ARGS,
})
).markdown;
}
@@ -1536,5 +1543,6 @@ async function toMdWithMeta(html: string) {
return await sphinxHtmlToMarkdown({
url: "https://qiskit.org/documentation/partners/qiskit_ibm_runtime/stubs/qiskit_ibm_runtime.Sampler.html",
html,
+ ...DEFAULT_ARGS,
});
}
diff --git a/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts b/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts
index a2e078eb1f0..6ee1fd0c3e7 100644
--- a/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts
+++ b/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts
@@ -36,28 +36,208 @@ import { Root } from "mdast";
export async function sphinxHtmlToMarkdown(options: {
html: string;
url: string;
- imageDestination?: string;
+ imageDestination: string;
// url links to a fixed version and ending in /
// https://github.com/Qiskit/qiskit-ibm-runtime/tree/0.9.2/
- baseSourceUrl?: string;
- releaseNotesTitle?: string;
+ baseSourceUrl: string;
+ releaseNotesTitle: string;
}): Promise {
- const {
- html,
- url,
- imageDestination = "/images/api/",
- baseSourceUrl,
- releaseNotesTitle,
- } = options;
- const meta: PythonObjectMeta = {};
- const isReleaseNotes = url.endsWith("release_notes.html") ? true : false;
-
+ const { html, url, imageDestination, baseSourceUrl, releaseNotesTitle } =
+ options;
const $ = load(html);
const $main = $(`[role='main']`);
+
+ const isReleaseNotes = url.endsWith("release_notes.html");
const images = loadImages($, $main, url, imageDestination, isReleaseNotes);
+ if (url.endsWith("release_notes.html")) {
+ setReleaseNotesHeading($, releaseNotesTitle);
+ }
+
+ removeHtmlExtensionsInRelativeLinks($, $main);
+ removePermalinks($main);
+ removeDownloadSourceCode($main);
+ handleTabs($, $main);
+ addLanguageClassToCodeBlocks($, $main);
+ replaceSourceLinksWithGitHub($, $main, baseSourceUrl);
+ convertRubricsToHeaders($, $main);
+ processSimpleFieldLists($, $main);
+ removeColons($main);
+ preserveMathBlockWhitespace($, $main);
+
+ const meta: PythonObjectMeta = {};
+ processMembersAndSetMeta($, $main, meta);
+ maybeExtractAndSetModuleMetadata($, $main, meta);
+ if (meta.python_api_type === "module") {
+ updateModuleHeadings($, $main, meta);
+ }
+
+ const markdown = await generateMarkdownFile($main.html()!, meta);
+ return { markdown, meta, images, isReleaseNotes };
+}
+
+function loadImages(
+ $: CheerioAPI,
+ $main: Cheerio,
+ url: string,
+ imageDestination: string,
+ isReleaseNotes: boolean,
+): Array<{ src: string; dest: string }> {
+ const images: Array<{ src: string; dest: string }> = [];
+ $main
+ .find("img")
+ .toArray()
+ .forEach((img) => {
+ const $img = $(img);
+
+ const imageUrl = new URL($img.attr("src")!, url);
+ const src = imageUrl.toString();
+
+ const filename = last(src.split("/"));
+ const dest = `${imageDestination}/${filename}`;
+
+ $img.attr("src", dest);
+
+ if (isReleaseNotes) {
+ // Release notes links should point to the current version
+ $img.attr("src", dest.replace(/[0-9].*\//, ""));
+ }
+
+ images.push({ src, dest: dest });
+ });
+
+ return images;
+}
+
+function removeHtmlExtensionsInRelativeLinks(
+ $: CheerioAPI,
+ $main: Cheerio,
+): void {
+ $main.find("a").each((_, link) => {
+ const $link = $(link);
+ const href = $link.attr("href");
+ if (href && !href.startsWith("http")) {
+ $link.attr("href", href.replaceAll(".html", ""));
+ }
+ });
+}
+
+function setReleaseNotesHeading(
+ $: CheerioAPI,
+ releaseNotesTitle: string,
+): void {
+ $("h1").html(releaseNotesTitle);
+}
+
+function removePermalinks($main: Cheerio): void {
+ $main.find('a[title="Permalink to this headline"]').remove();
+ $main.find('a[title="Permalink to this heading"]').remove();
+ $main.find('a[title="Permalink to this definition"]').remove();
+ $main.find('a[title="Link to this heading"]').remove();
+ $main.find('a[title="Link to this definition"]').remove();
+}
+
+function removeDownloadSourceCode($main: Cheerio): void {
+ $main.find("p > a.reference.download.internal").closest("p").remove();
+}
+
+/**
+ * Convert sphinx-design tabs.
+ *
+ * Uses the heading for the summary and removes the blockquote.
+ */
+function handleTabs($: CheerioAPI, $main: Cheerio): void {
+ $main.find(".sd-summary-title").each((_, quote) => {
+ const $quote = $(quote);
+ $quote.replaceWith(`${$quote.html()}
`);
+ });
+
+ $main.find(".sd-card-body blockquote").each((_, quote) => {
+ const $quote = $(quote);
+ $quote.replaceWith($quote.children());
+ });
+}
+
+function addLanguageClassToCodeBlocks(
+ $: CheerioAPI,
+ $main: Cheerio,
+): void {
+ $main.find("pre").each((_, pre) => {
+ const $pre = $(pre);
+ $pre.replaceWith(
+ `${$pre.html()}
`,
+ );
+ });
+}
+
+// TODO(#519): figure out if this is working.
+function replaceSourceLinksWithGitHub(
+ $: CheerioAPI,
+ $main: Cheerio,
+ baseSourceUrl: string,
+): void {
+ $main.find("a").each((_, a) => {
+ const $a = $(a);
+ const href = $a.attr("href");
+ if (href?.startsWith("http:")) return;
+ if (href?.includes(`/_modules/`)) {
+ //_modules/qiskit_ibm_runtime/ibm_backend
+ const match = href?.match(/_modules\/(.*?)(#|$)/);
+ if (match) {
+ const newHref = `${baseSourceUrl}${match[1]}.py`;
+ $a.attr("href", newHref);
+ }
+ }
+ });
+}
+
+function convertRubricsToHeaders($: CheerioAPI, $main: Cheerio): void {
+ // Rubrics correspond to method and attribute headers.
+ // TODO(#479): ensure our understanding of what .rubric corresponds to is correct and figure out
+ // if always using makes sense.
+ $main.find(".rubric").each((_, el) => {
+ const $el = $(el);
+ $el.replaceWith(`${$el.html()}
`);
+ });
+}
+
+function processSimpleFieldLists($: CheerioAPI, $main: Cheerio): void {
+ // TODO(#479): Have a better understanding of what dl.field-list.simple corresponds to
+ // and confirm this behavior makes sense.
+ $main
+ .find("dl.field-list.simple")
+ .toArray()
+ .map((dl) => {
+ const $dl = $(dl);
+
+ $dl
+ .find("dt")
+ .toArray()
+ .forEach((dt) => {
+ const $dt = $(dt);
+ $dt.replaceWith(`${$dt.html()}`);
+ });
+
+ $dl
+ .find("dd")
+ .toArray()
+ .forEach((dd) => {
+ const $dd = $(dd);
+ $dd.replaceWith(`${$dd.html()}
`);
+ });
+
+ $dl.replaceWith(`${$dl.html()}
`);
+ });
+}
- preprocessHtml($, $main, baseSourceUrl, isReleaseNotes, releaseNotesTitle);
+function removeColons($main: Cheerio): void {
+ $main.find(".colon").remove();
+}
+function processMembersAndSetMeta(
+ $: CheerioAPI,
+ $main: Cheerio,
+ meta: PythonObjectMeta,
+): void {
let continueMapMembers = true;
while (continueMapMembers) {
// members can be recursive, so we need to pick elements one by one
@@ -181,17 +361,13 @@ export async function sphinxHtmlToMarkdown(options: {
$dl.replaceWith(`${replacement}
`);
}
+}
- // preserve math block whitespace
- $main
- .find("div.math")
- .toArray()
- .map((el) => {
- const $el = $(el);
- $el.replaceWith(`${$el.html()}
`);
- });
-
- // extract module metadata
+function maybeExtractAndSetModuleMetadata(
+ $: CheerioAPI,
+ $main: Cheerio,
+ meta: PythonObjectMeta,
+): void {
const modulePrefix = "module-";
const moduleIdWithPrefix = $main
.find("span, section")
@@ -202,164 +378,39 @@ export async function sphinxHtmlToMarkdown(options: {
meta.python_api_type = "module";
meta.python_api_name = moduleIdWithPrefix.slice(modulePrefix.length);
}
-
- // Update headings of modules
- if (meta.python_api_type === "module") {
- $main
- .find("h1,h2")
- .toArray()
- .forEach((el) => {
- const $el = $(el);
- const $a = $($el.find("a"));
- const signature = $a.text();
- $a.remove();
-
- let title = $el.text();
- title = title.replace("()", "");
- let replacement = `<${el.tagName}>${title}${el.tagName}>`;
- if (signature.trim().length > 0) {
- replacement += `${signature}
`;
- }
- $el.replaceWith(replacement);
- });
- }
-
- // convert to markdown
- const markdown = await generateMarkdownFile($main.html()!, meta);
-
- return { markdown, meta, images, isReleaseNotes };
}
-function loadImages(
- $: CheerioAPI,
- $main: Cheerio,
- url: string,
- imageDestination: string,
- isReleaseNotes: boolean,
-): Array<{ src: string; dest: string }> {
- const images: Array<{ src: string; dest: string }> = [];
+function preserveMathBlockWhitespace($: CheerioAPI, $main: Cheerio): void {
$main
- .find("img")
+ .find("div.math")
.toArray()
- .forEach((img) => {
- const $img = $(img);
-
- const imageUrl = new URL($img.attr("src")!, url);
- const src = imageUrl.toString();
-
- const filename = last(src.split("/"));
- const dest = `${imageDestination}/${filename}`;
-
- $img.attr("src", dest);
-
- if (isReleaseNotes) {
- // Release notes links should point to the current version
- $img.attr("src", dest.replace(/[0-9].*\//, ""));
- }
-
- images.push({ src, dest: dest });
+ .map((el) => {
+ const $el = $(el);
+ $el.replaceWith(`${$el.html()}
`);
});
-
- return images;
}
-function preprocessHtml(
+function updateModuleHeadings(
$: CheerioAPI,
$main: Cheerio,
- baseSourceUrl: string | undefined,
- isReleaseNotes: boolean,
- releaseNotesTitle: string | undefined,
+ meta: PythonObjectMeta,
): void {
- // remove html extensions in relative links
- $main.find("a").each((_, link) => {
- const $link = $(link);
- const href = $link.attr("href");
- if (href && !href.startsWith("http")) {
- $link.attr("href", href.replaceAll(".html", ""));
- }
- });
-
- // Custom heading for release notes
- if (isReleaseNotes && releaseNotesTitle) {
- $("h1").html(releaseNotesTitle);
- }
-
- // remove permalink links
- $main.find('a[title="Permalink to this headline"]').remove();
- $main.find('a[title="Permalink to this heading"]').remove();
- $main.find('a[title="Permalink to this definition"]').remove();
- $main.find('a[title="Link to this heading"]').remove();
- $main.find('a[title="Link to this definition"]').remove();
-
- // remove download source code
- $main.find("p > a.reference.download.internal").closest("p").remove();
-
- // handle tabs, use heading for the summary and remove the blockquote
- $main.find(".sd-summary-title").each((_, quote) => {
- const $quote = $(quote);
- $quote.replaceWith(`${$quote.html()}
`);
- });
-
- $main.find(".sd-card-body blockquote").each((_, quote) => {
- const $quote = $(quote);
- $quote.replaceWith($quote.children());
- });
-
- // add language class to code blocks
- $main.find("pre").each((_, pre) => {
- const $pre = $(pre);
- $pre.replaceWith(
- `${$pre.html()}
`,
- );
- });
-
- // replace source links
- $main.find("a").each((_, a) => {
- const $a = $(a);
- const href = $a.attr("href");
- if (href?.startsWith("http:")) return;
- if (href?.includes(`/_modules/`)) {
- //_modules/qiskit_ibm_runtime/ibm_backend
- const match = href?.match(/_modules\/(.*?)(#|$)/);
- if (match) {
- const newHref = `${baseSourceUrl ?? ""}${match[1]}.py`;
- $a.attr("href", newHref);
- }
- }
- });
-
- // use titles for method and attribute headers
- $main.find(".rubric").each((_, el) => {
- const $el = $(el);
- $el.replaceWith(`${$el.html()}
`);
- });
-
- // delete colons
- $main.find(".colon").remove();
-
$main
- .find("dl.field-list.simple")
+ .find("h1,h2")
.toArray()
- .map((dl) => {
- const $dl = $(dl);
-
- $dl
- .find("dt")
- .toArray()
- .forEach((dt) => {
- const $dt = $(dt);
- $dt.replaceWith(`${$dt.html()}`);
- });
-
- $dl
- .find("dd")
- .toArray()
- .forEach((dd) => {
- const $dd = $(dd);
- $dd.replaceWith(`${$dd.html()}
`);
- });
-
- $dl.replaceWith(`${$dl.html()}
`);
+ .forEach((el) => {
+ const $el = $(el);
+ const $a = $($el.find("a"));
+ const signature = $a.text();
+ $a.remove();
+
+ let title = $el.text();
+ title = title.replace("()", "");
+ let replacement = `<${el.tagName}>${title}${el.tagName}>`;
+ if (signature.trim().length > 0) {
+ replacement += `${signature}
`;
+ }
+ $el.replaceWith(replacement);
});
}