diff --git a/scripts/lib/sphinx/PythonObjectMeta.ts b/scripts/lib/sphinx/PythonObjectMeta.ts index f46066b2f51..b51396d1b39 100644 --- a/scripts/lib/sphinx/PythonObjectMeta.ts +++ b/scripts/lib/sphinx/PythonObjectMeta.ts @@ -10,15 +10,17 @@ // copyright notice, and modified files need to carry a notice indicating // that they have been altered from the originals. +export type PythonApiType = + | "class" + | "method" + | "property" + | "attribute" + | "module" + | "function" + | "exception"; + export type PythonObjectMeta = { python_api_name?: string; - python_api_type?: - | "class" - | "method" - | "property" - | "attribute" - | "module" - | "function" - | "exception"; + python_api_type?: PythonApiType; hardcoded_frontmatter?: string; }; diff --git a/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts b/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts index 3dbd046c50b..5dc83d4b3ae 100644 --- a/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts +++ b/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts @@ -10,19 +10,19 @@ // copyright notice, and modified files need to carry a notice indicating // that they have been altered from the originals. -import { load } from "cheerio"; import { unified } from "unified"; import rehypeParse from "rehype-parse"; import rehypeRemark from "rehype-remark"; import remarkStringify from "remark-stringify"; import remarkGfm from "remark-gfm"; import { last, first, without, initial, tail } from "lodash"; +import { CheerioAPI, Cheerio, load } from "cheerio"; import { defaultHandlers, Handle, toMdast, all } from "hast-util-to-mdast"; import { toText } from "hast-util-to-text"; import remarkMath from "remark-math"; import remarkMdx from "remark-mdx"; import { SphinxToMdResult } from "./SphinxToMdResult"; -import { PythonObjectMeta } from "./PythonObjectMeta"; +import { PythonObjectMeta, PythonApiType } from "./PythonObjectMeta"; import { getLastPartFromFullIdentifier, removePrefix, @@ -42,7 +42,6 @@ export async function sphinxHtmlToMarkdown(options: { baseSourceUrl?: string; releaseNotesTitle?: string; }): Promise { - const images: Array<{ src: string; dest: string }> = []; const { html, url, @@ -53,130 +52,11 @@ export async function sphinxHtmlToMarkdown(options: { const meta: PythonObjectMeta = {}; const isReleaseNotes = url.endsWith("release_notes.html") ? true : false; - const $page = load(html); - const main = $page(`[role='main']`); - const $main = $page(main); - - // remove html extensions in relative links - $main.find("a").each((_, link) => { - const $link = $page(link); - const href = $link.attr("href"); - if (href && !href.startsWith("http")) { - $link.attr("href", href.replaceAll(".html", "")); - } - }); - - if (isReleaseNotes && releaseNotesTitle) { - // Replace heading with custom heading - $page("h1").html(releaseNotesTitle); - } - - $main - .find("img") - .toArray() - .forEach((el) => { - const $img = $page(el); - - const imageUrl = new URL($img.attr("src")!, url); - const src = imageUrl.toString(); - - const filename = last(src.split("/")); - const dest = `${imageDestination}/${filename}`; - - $img.attr("src", dest); - - if (isReleaseNotes) { - // Release notes links should point to the current version - $img.attr("src", dest.replace(/[0-9].*\//, "")); - } - - images.push({ src, dest: dest }); - }); - - // remove permalink links - $main.find('a[title="Permalink to this headline"]').remove(); - $main.find('a[title="Permalink to this heading"]').remove(); - $main.find('a[title="Permalink to this definition"]').remove(); - $main.find('a[title="Link to this heading"]').remove(); - $main.find('a[title="Link to this definition"]').remove(); - - // remove download source code - $main.find("p > a.reference.download.internal").closest("p").remove(); - - // handle tabs, use heading for the summary and remove the blockquote - $main.find(".sd-summary-title").each((_, quote) => { - const $quote = $page(quote); - $quote.replaceWith(`

${$quote.html()}

`); - }); - - $main.find(".sd-card-body blockquote").each((_, quote) => { - const $quote = $page(quote); - $quote.replaceWith($quote.children()); - }); - - // add language class to code blocks - $main.find("pre").each((_, pre) => { - const $pre = $page(pre); - $pre.replaceWith( - `
${$pre.html()}
`, - ); - }); - - // replace source links - $main.find("a").each((_, a) => { - const $a = $page(a); - const href = $a.attr("href"); - if (href?.startsWith("http:")) return; - if (href?.includes(`/_modules/`)) { - //_modules/qiskit_ibm_runtime/ibm_backend - const match = href?.match(/_modules\/(.*?)(#|$)/); - if (match) { - const newHref = `${baseSourceUrl ?? ""}${match[1]}.py`; - $a.attr("href", newHref); - } - } - }); + const $ = load(html); + const $main = $(`[role='main']`); + const images = loadImages($, $main, url, imageDestination, isReleaseNotes); - // use titles for method and attribute headers - $main.find(".rubric").each((_, el) => { - const $el = $page(el); - $el.replaceWith(`

${$el.html()}

`); - }); - - // delete colons - $main.find(".colon").remove(); - - // translate type headings to titles - function findByText(selector: string, text: string) { - return $main - .find(selector) - .filter((i, el) => $page(el).text().trim() === text); - } - - $main - .find("dl.field-list.simple") - .toArray() - .map((dl) => { - const $dl = $page(dl); - - $dl - .find("dt") - .toArray() - .forEach((dt) => { - const $dt = $page(dt); - $dt.replaceWith(`${$dt.html()}`); - }); - - $dl - .find("dd") - .toArray() - .forEach((dd) => { - const $dd = $page(dd); - $dd.replaceWith(`
${$dd.html()}
`); - }); - - $dl.replaceWith(`
${$dl.html()}
`); - }); + preprocessHtml($, $main, baseSourceUrl, isReleaseNotes, releaseNotesTitle); let continueMapMembers = true; while (continueMapMembers) { @@ -192,12 +72,12 @@ export async function sphinxHtmlToMarkdown(options: { continue; } - const $dl = $page(dl); + const $dl = $(dl); const replacement = $dl .children() .toArray() .map((child) => { - const $child = $page(child); + const $child = $(child); $child.find(".viewcode-link").closest("a").remove(); const id = $dl.find("dt").attr("id") || ""; @@ -207,7 +87,7 @@ export async function sphinxHtmlToMarkdown(options: { meta.python_api_name = id; } - findByText("em.property", "class").remove(); + findByText($, $main, "em.property", "class").remove(); return `

${$child.html()}

`; } else if (child.name === "dt" && $dl.hasClass("property")) { if (!meta.python_api_type) { @@ -219,7 +99,7 @@ export async function sphinxHtmlToMarkdown(options: { } } - findByText("em.property", "property").remove(); + findByText($, $main, "em.property", "property").remove(); const signature = $child.find("em").text()?.replace(/^:\s+/, ""); if (signature.trim().length === 0) return; return `

${signature}

`; @@ -233,13 +113,13 @@ export async function sphinxHtmlToMarkdown(options: { } else { // Inline methods if (id) { - $page( - `

${getLastPartFromFullIdentifier(id)}

`, - ).insertBefore($dl); + $(`

${getLastPartFromFullIdentifier(id)}

`).insertBefore( + $dl, + ); } } - findByText("em.property", "method").remove(); + findByText($, $main, "em.property", "method").remove(); return `

${$child.html()}

`; } else if (child.name === "dt" && $dl.hasClass("attribute")) { if (!meta.python_api_type) { @@ -250,7 +130,7 @@ export async function sphinxHtmlToMarkdown(options: { $dl.siblings("h1").text(getLastPartFromFullIdentifier(id)); } - findByText("em.property", "attribute").remove(); + findByText($, $main, "em.property", "attribute").remove(); const signature = $child.find("em").text()?.replace(/^:\s+/, ""); if (signature.trim().length === 0) return; return `

${signature}

`; @@ -289,7 +169,7 @@ export async function sphinxHtmlToMarkdown(options: { meta.python_api_type = "function"; meta.python_api_name = id; } - findByText("em.property", "function").remove(); + findByText($, $main, "em.property", "function").remove(); return `

${$child.html()}

`; } else if (child.name === "dt" && $dl.hasClass("exception")) { if (!meta.python_api_type) { @@ -297,7 +177,7 @@ export async function sphinxHtmlToMarkdown(options: { meta.python_api_name = id; } - findByText("em.property", "exception").remove(); + findByText($, $main, "em.property", "exception").remove(); return `

${$child.html()}

`; } @@ -313,7 +193,7 @@ export async function sphinxHtmlToMarkdown(options: { .find("div.math") .toArray() .map((el) => { - const $el = $page(el); + const $el = $(el); $el.replaceWith(`
${$el.html()}
`); }); @@ -322,7 +202,7 @@ export async function sphinxHtmlToMarkdown(options: { const moduleIdWithPrefix = $main .find("span, section") .toArray() - .map((el) => $page(el).attr("id")) + .map((el) => $(el).attr("id")) .find((id) => id?.startsWith(modulePrefix)); if (moduleIdWithPrefix) { meta.python_api_type = "module"; @@ -335,8 +215,8 @@ export async function sphinxHtmlToMarkdown(options: { .find("h1,h2") .toArray() .forEach((el) => { - const $el = $page(el); - const $a = $page($el.find("a")); + const $el = $(el); + const $a = $($el.find("a")); const signature = $a.text(); $a.remove(); @@ -351,8 +231,148 @@ export async function sphinxHtmlToMarkdown(options: { } // convert to markdown - const mainHtml = main.html()!; + const markdown = await generateMarkdownFile($main.html()!, meta); + + return { markdown, meta, images, isReleaseNotes }; +} + +function loadImages( + $: CheerioAPI, + $main: Cheerio, + url: string, + imageDestination: string, + isReleaseNotes: boolean, +): Array<{ src: string; dest: string }> { + const images: Array<{ src: string; dest: string }> = []; + $main + .find("img") + .toArray() + .forEach((img) => { + const $img = $(img); + + const imageUrl = new URL($img.attr("src")!, url); + const src = imageUrl.toString(); + const filename = last(src.split("/")); + const dest = `${imageDestination}/${filename}`; + + $img.attr("src", dest); + + if (isReleaseNotes) { + // Release notes links should point to the current version + $img.attr("src", dest.replace(/[0-9].*\//, "")); + } + + images.push({ src, dest: dest }); + }); + + return images; +} + +function preprocessHtml( + $: CheerioAPI, + $main: Cheerio, + baseSourceUrl: string | undefined, + isReleaseNotes: boolean, + releaseNotesTitle: string | undefined, +): void { + // remove html extensions in relative links + $main.find("a").each((_, link) => { + const $link = $(link); + const href = $link.attr("href"); + if (href && !href.startsWith("http")) { + $link.attr("href", href.replaceAll(".html", "")); + } + }); + + // Custom heading for release notes + if (isReleaseNotes && releaseNotesTitle) { + $("h1").html(releaseNotesTitle); + } + + // remove permalink links + $main.find('a[title="Permalink to this headline"]').remove(); + $main.find('a[title="Permalink to this heading"]').remove(); + $main.find('a[title="Permalink to this definition"]').remove(); + $main.find('a[title="Link to this heading"]').remove(); + $main.find('a[title="Link to this definition"]').remove(); + + // remove download source code + $main.find("p > a.reference.download.internal").closest("p").remove(); + + // handle tabs, use heading for the summary and remove the blockquote + $main.find(".sd-summary-title").each((_, quote) => { + const $quote = $(quote); + $quote.replaceWith(`

${$quote.html()}

`); + }); + + $main.find(".sd-card-body blockquote").each((_, quote) => { + const $quote = $(quote); + $quote.replaceWith($quote.children()); + }); + + // add language class to code blocks + $main.find("pre").each((_, pre) => { + const $pre = $(pre); + $pre.replaceWith( + `
${$pre.html()}
`, + ); + }); + + // replace source links + $main.find("a").each((_, a) => { + const $a = $(a); + const href = $a.attr("href"); + if (href?.startsWith("http:")) return; + if (href?.includes(`/_modules/`)) { + //_modules/qiskit_ibm_runtime/ibm_backend + const match = href?.match(/_modules\/(.*?)(#|$)/); + if (match) { + const newHref = `${baseSourceUrl ?? ""}${match[1]}.py`; + $a.attr("href", newHref); + } + } + }); + + // use titles for method and attribute headers + $main.find(".rubric").each((_, el) => { + const $el = $(el); + $el.replaceWith(`

${$el.html()}

`); + }); + + // delete colons + $main.find(".colon").remove(); + + $main + .find("dl.field-list.simple") + .toArray() + .map((dl) => { + const $dl = $(dl); + + $dl + .find("dt") + .toArray() + .forEach((dt) => { + const $dt = $(dt); + $dt.replaceWith(`${$dt.html()}`); + }); + + $dl + .find("dd") + .toArray() + .forEach((dd) => { + const $dd = $(dd); + $dd.replaceWith(`
${$dd.html()}
`); + }); + + $dl.replaceWith(`
${$dl.html()}
`); + }); +} + +async function generateMarkdownFile( + mainHtml: string, + meta: PythonObjectMeta, +): Promise { const handlers: Record = { br(h, node: any) { return all(h, node); @@ -483,63 +503,55 @@ export async function sphinxHtmlToMarkdown(options: { handlers, }) .use(remarkStringify, remarkStringifyOptions) - .use(() => { - return (root: Root) => { - // merge contiguous emphasis - visit(root, "emphasis", (node, index, parent) => { - if (index === null || parent === null) return; - let nextIndex = index + 1; - while (parent.children[nextIndex]?.type === "emphasis") { - node.children.push( - ...((parent.children[nextIndex] as any).children ?? []), - ); - nextIndex++; - } - parent.children.splice(index + 1, nextIndex - (index + 1)); - }); + .use(() => (root: Root) => { + // merge contiguous emphasis + visit(root, "emphasis", (node, index, parent) => { + if (index === null || parent === null) return; + let nextIndex = index + 1; + while (parent.children[nextIndex]?.type === "emphasis") { + node.children.push( + ...((parent.children[nextIndex] as any).children ?? []), + ); + nextIndex++; + } + parent.children.splice(index + 1, nextIndex - (index + 1)); // remove initial and trailing spaces from emphasis - visit(root, "emphasis", (node, index, parent) => { - if (index === null || parent === null) return; - const firstChild = first(node.children); - if (firstChild?.type === "text") { - const match = firstChild.value.match(/^\s+/); - if (match) { - if (match[0] === firstChild.value) { - node.children = tail(node.children); - } else { - firstChild.value = removePrefix(firstChild.value, match[0]); - } - parent.children.splice(index, 0, { - type: "text", - value: match[0], - }); + const firstChild = first(node.children); + if (firstChild?.type === "text") { + const match = firstChild.value.match(/^\s+/); + if (match) { + if (match[0] === firstChild.value) { + node.children = tail(node.children); + } else { + firstChild.value = removePrefix(firstChild.value, match[0]); } + parent.children.splice(index, 0, { + type: "text", + value: match[0], + }); } - const lastChild = last(node.children); - if (lastChild?.type === "text") { - const match = lastChild.value.match(/\s+$/); - if (match) { - if (match[0] === lastChild.value) { - node.children = initial(node.children); - } else { - lastChild.value = removeSuffix(lastChild.value, match[0]); - } - parent.children.splice(index + 1, 0, { - type: "text", - value: match[0], - }); + } + const lastChild = last(node.children); + if (lastChild?.type === "text") { + const match = lastChild.value.match(/\s+$/); + if (match) { + if (match[0] === lastChild.value) { + node.children = initial(node.children); + } else { + lastChild.value = removeSuffix(lastChild.value, match[0]); } + parent.children.splice(index + 1, 0, { + type: "text", + value: match[0], + }); } - }); - }; + } + }); }) .process(mainHtml); - let markdown = mdFile.toString(); - markdown = markdown.replaceAll(``, ""); - - return { markdown, meta, images, isReleaseNotes }; + return mdFile.toString().replaceAll(``, ""); } function buildAdmonition(options: { @@ -581,3 +593,33 @@ function buildSpanId(id: string): MdxJsxFlowElement { children: [], }; } + +/** + * Find the element that both matches the `selector` and whose content is the same as `text` + */ +function findByText( + $: CheerioAPI, + $main: Cheerio, + selector: string, + text: string, +): Cheerio { + return $main.find(selector).filter((i, el) => $(el).text().trim() === text); +} + +function getPythonApiType($dl: Cheerio): PythonApiType | undefined { + for (const className of [ + "function", + "class", + "exception", + "method", + "property", + "attribute", + "module", + ]) { + if ($dl.hasClass(className)) { + return className as PythonApiType; + } + } + + return undefined; +}