From 037fa35045efc9f1f9c832aa9b2bd484b98f2f71 Mon Sep 17 00:00:00 2001 From: Claas Augner Date: Thu, 17 Nov 2022 12:39:57 +0100 Subject: [PATCH] refactor(build): split up document-extractor --- build/document-extractor.ts | 833 -------------------------------- build/extract-bcd.ts | 143 ++++++ build/extract-sections.ts | 454 +++++++++++++++++ build/extract-sidebar.ts | 43 ++ build/extract-specifications.ts | 126 +++++ build/extract-summary.ts | 59 +++ build/index.ts | 8 +- libs/types/document.ts | 10 +- 8 files changed, 834 insertions(+), 842 deletions(-) delete mode 100644 build/document-extractor.ts create mode 100644 build/extract-bcd.ts create mode 100644 build/extract-sections.ts create mode 100644 build/extract-sidebar.ts create mode 100644 build/extract-specifications.ts create mode 100644 build/extract-summary.ts diff --git a/build/document-extractor.ts b/build/document-extractor.ts deleted file mode 100644 index e19377597bcd..000000000000 --- a/build/document-extractor.ts +++ /dev/null @@ -1,833 +0,0 @@ -import * as cheerio from "cheerio"; -import { packageBCD } from "./resolve-bcd"; -import * as bcd from "@mdn/browser-compat-data/types"; -import { - BCDSection, - Doc, - ProseSection, - Section, - SpecificationsSection, -} from "../libs/types"; -import specs from "browser-specs"; -import web from "../kumascript/src/api/web"; - -interface SimpleSupportStatementWithReleaseDate - extends bcd.SimpleSupportStatement { - release_date?: string; -} - -type SectionsAndFlaws = [Section[], string[]]; - -/** Extract and mutate the $ if it as a "Quick_links" section. - * But only if it exists. - * - * If you had this: - * - * const $ = cheerio.load(` - * - *

Headline

- *

Text

- * `) - * const sidebar = extractSidebar($); - * console.log(sidebar); - * // '' - * console.log($.html()); - * // '

Headline

\n

Text

' - * - * ...give or take some whitespace. - */ -export function extractSidebar($: cheerio.CheerioAPI, doc: Partial) { - const search = $("#Quick_links"); - - if (!search.length) { - doc.sidebarHTML = ""; - return; - } - - // Open menu and highlight current page. - search.find(`a[href='${doc.mdn_url}']`).each((_i, el) => { - $(el).parents("details").prop("open", true); - $(el).attr("aria-current", "page"); - // Highlight, unless it already is highlighted (e.g. heading). - if ($(el).find("em,strong").length === 0) { - $(el).parent().wrapInner(""); - } - }); - - doc.sidebarHTML = search.html(); - search.remove(); -} - -export function extractSections($: cheerio.CheerioAPI): [Section[], string[]] { - const flaws: string[] = []; - const sections: Section[] = []; - const section = cheerio - .load("
", { - // decodeEntities: false - })("div") - .eq(0); - - const body = $("body")[0] as cheerio.ParentNode; - const iterable = [...(body.childNodes as cheerio.Element[])]; - - let c = 0; - iterable.forEach((child) => { - if ( - (child as cheerio.Element).tagName === "h2" || - (child as cheerio.Element).tagName === "h3" - ) { - if (c) { - const [subSections, subFlaws] = addSections(section.clone()); - sections.push(...subSections); - flaws.push(...subFlaws); - section.empty(); - } - c = 0; - } - // We *could* wrap this in something like `if (child.tagName) {` - // which would exclude any node that isn't a tag, such as comments. - // That might make the DOM nodes more compact and memory efficient. - c++; - section.append(child); - }); - if (c) { - // last straggler - const [subSections, subFlaws] = addSections(section); - sections.push(...subSections); - flaws.push(...subFlaws); - } - - // Check for and mutate possible duplicated IDs. - // If a HTML document has...: - // - //

Check these examples

- // ... - //

Examples

- // - // then this can cause various problems. For example, the anchor links - // won't work. The Table of Contents won't be able to do a loop with unique - // `key={section.id}` values. - // The reason we need to loop through to get a list of all existing IDs - // first is because we might have this: - // - //

Foo X

- //

Foo Y

- //

Foo Z

- // - // So when you encounter `

Foo Y

` you'll know that you - // can't suggest it to be `

Foo Y

` because that ID - // is taken by another one, later. - const allIDs = new Set( - sections - .map((section) => section.value.id) - .filter(Boolean) - .map((id) => id.toLowerCase()) - ); - - const seenIDs = new Set(); - for (const section of sections) { - const originalID = section.value.id; - if (!originalID) { - // Not all sections have an ID. For example, prose sections that don't - // start with a

. - // Since we're primarily concerned about *uniqueness* here, let's just - // skip worrying about these. - continue; - } - // We normalize all IDs to lowercase so that `id="Foo"` === `id="foo"`. - const id = originalID.toLowerCase(); - if (seenIDs.has(id)) { - // That's bad! We have to come up with a new ID but it can't be one - // that's used by another other section. - let increment = 2; - let newID = `${originalID}_${increment}`; - while ( - seenIDs.has(newID.toLowerCase()) || - allIDs.has(newID.toLowerCase()) - ) { - increment++; - newID = `${originalID}_${increment}`; - } - section.value.id = newID; - seenIDs.add(newID.toLowerCase()); - flaws.push( - `'${originalID}' is not a unique ID in this HTML (temporarily changed to ${section.value.id})` - ); - } else { - seenIDs.add(id); - } - } - - return [sections, flaws]; -} - -/** Return an array of new sections to be added to the complete document. - * - * Generally, this function is called with a cheerio (`$`) section that - * has HTML in it. The task is to structure that a little bit. - * If the HTML inside the '$' is: - * - *

Foo

- *

Bla bla

- *
  • One
- * - * then, the expected output is to return: - * - * [{ - * type: "prose", - * id: "foo", - * title: "Foo" - * content: "

Bla bla

\n

  • One
" - * }] - * - * The reason it's always returning an array is because of special - * sections. A special section is one where we try to transform it - * first. For example BCD tables. If the input is this: - * - *

Browser Compat

- *
...
- * - * Then, extract the ID, get the structured data and eventually return this: - * - * [{ - * type: "browser_compatibility", - * value: { - * query: "foo.bar.thing", - * id: "browser_compat", - * title: "Browser Compat", - * data: {....} - * }] - * - * Another example is for the specification section. If the input is this: - * - *

Specifications

- *
...
- * - * Then, extract the data-bcd-query and return this: - * - * [{ - * type: "specifications", - * value: { - * query: "foo.bar.thing", - * id: "specifications", - * title: "Specifications", - * specifications: {....} - * }] - */ -function addSections($: cheerio.Cheerio): SectionsAndFlaws { - const flaws: string[] = []; - - const countPotentialSpecialDivs = $.find("div.bc-data, div.bc-specs").length; - if (countPotentialSpecialDivs) { - /** If there's exactly 1 special table the only section to add is something - * like this: - * { - * "type": "browser_compatibility", - * "value": { - * "title": "Browser compatibility", - * "id": "browser_compatibility", - * "query": "html.elements.video", - * "data": {....} - * } - * - * Where the 'title' and 'id' values comes from the

tag (if available). - * - * However, if there are **multiple special tables**, - * it needs to return something like this: - * - * [{ - * "type": "prose", - * "value": { - * "id": "browser_compatibility", - * "title": "Browser compatibility" - * "content": "Possible stuff before the table" - * }, - * { - * "type": "browser_compatibility", - * "value": { - * "query": "html.elements.video", - * "data": {.... - * }, - * { - * "type": "prose", - * "value": { - * "content": "Any other stuff before table maybe" - * }, - */ - if (countPotentialSpecialDivs > 1) { - const subSections: Section[] = []; - const section = cheerio - .load("
", { - // decodeEntities: false - })("div") - .eq(0); - - // Loop over each and every "root element" in the node and keep piling - // them up in a buffer, until you encounter a `div.bc-data` or `div.bc-specs` then - // add that to the stack, clear and repeat. - const div = $[0] as cheerio.ParentNode; - const iterable = [...(div.childNodes as cheerio.Element[])]; - let c = 0; - let countSpecialDivsFound = 0; - iterable.forEach((child) => { - if ( - child.tagName === "div" && - child.attribs && - child.attribs.class && - (child.attribs.class.includes("bc-data") || - child.attribs.class.includes("bc-specs")) - ) { - countSpecialDivsFound++; - if (c) { - const [proseSections, proseFlaws] = _addSectionProse( - section.clone() - ); - subSections.push(...proseSections); - flaws.push(...proseFlaws); - section.empty(); - c = 0; // reset the counter - } - section.append(child); - // XXX That `_addSingleSpecialSection(section.clone())` might return a - // and empty array and that means it failed and we should - // bail. - subSections.push(..._addSingleSpecialSection(section.clone())); - section.empty(); - } else { - section.append(child); - c++; - } - }); - if (c) { - const [proseSections, proseFlaws] = _addSectionProse(section.clone()); - subSections.push(...proseSections); - flaws.push(...proseFlaws); - } - if (countSpecialDivsFound !== countPotentialSpecialDivs) { - const leftoverCount = countPotentialSpecialDivs - countSpecialDivsFound; - const explanation = `${leftoverCount} 'div.bc-data' or 'div.bc-specs' element${ - leftoverCount > 1 ? "s" : "" - } found but deeply nested.`; - flaws.push(explanation); - } - return [subSections, flaws]; - } - const specialSections = _addSingleSpecialSection($); - - // The _addSingleSpecialSection() function will have sucked up the

or

- // and the `div.bc-data` or `div.bc-specs` to turn it into a special section. - // First remove that, then put whatever HTML is left as a prose - // section underneath. - $.find("div.bc-data, h2, h3").remove(); - $.find("div.bc-specs, h2, h3").remove(); - const [proseSections, proseFlaws] = _addSectionProse($); - specialSections.push(...proseSections); - flaws.push(...proseFlaws); - - if (specialSections.length) { - return [specialSections, flaws]; - } - } - - // all else, leave as is - const [proseSections, proseFlaws] = _addSectionProse($); - flaws.push(...proseFlaws); - - return [proseSections, flaws]; -} - -function _addSingleSpecialSection( - $: cheerio.Cheerio -): Section[] { - let id: string | null = null; - let title: string | null = null; - let isH3 = false; - - const h2s = $.find("h2"); - if (h2s.length === 1) { - id = h2s.attr("id"); - title = h2s.text(); - } else { - const h3s = $.find("h3"); - if (h3s.length === 1) { - id = h3s.attr("id"); - title = h3s.text(); - isH3 = true; - } - } - - let dataQuery = ""; - let hasMultipleQueries = false; - let specURLsString = ""; - let specialSectionType: string | null = null; - if ($.find("div.bc-data").length) { - specialSectionType = "browser_compatibility"; - const elem = $.find("div.bc-data"); - // Macro adds "data-query", but some translated-content still uses "id". - dataQuery = (elem.attr("data-query") || elem.attr("id")) ?? ""; - hasMultipleQueries = elem.attr("data-multiple") === "true"; - } else if ($.find("div.bc-specs").length) { - specialSectionType = "specifications"; - dataQuery = $.find("div.bc-specs").attr("data-bcd-query") ?? ""; - specURLsString = $.find("div.bc-specs").attr("data-spec-urls") ?? ""; - } - - // Some old legacy documents haven't been re-rendered yet, since it - // was added, so the `div.bc-data` tag doesn't have a `id="bcd:..."` - // or `data-bcd="..."` attribute. If that's the case, bail and fall - // back on a regular prose section :( - if (!dataQuery && specURLsString === "") { - // I wish there was a good place to log this! - return _addSectionProse($)[0]; - } - const query = dataQuery.replace(/^bcd:/, ""); - const { browsers, data }: { browsers: bcd.Browsers; data: bcd.Identifier } = - packageBCD(query); - - if (specialSectionType === "browser_compatibility") { - if (data === undefined) { - return [ - { - type: specialSectionType, - value: { - title, - id, - isH3, - data: null, - query, - browsers: null, - }, - }, - ]; - } - return _buildSpecialBCDSection(); - } else if (specialSectionType === "specifications") { - if (query === undefined && specURLsString === "") { - return [ - { - type: specialSectionType, - value: { - title, - id, - isH3, - query, - specifications: [], - }, - }, - ]; - } - return _buildSpecialSpecSection(); - } - - throw new Error(`Unrecognized special section type '${specialSectionType}'`); - - function _buildSpecialBCDSection(): [BCDSection] { - // First extract a map of all release data, keyed by (normalized) browser - // name and the versions. - // You'll have a map that looks like this: - // - // 'chrome_android': { - // '28': { - // release_date: '2012-06-01', - // release_notes: '...', - // ... - // - // The reason we extract this to a locally scoped map, is so we can - // use it to augment the `__compat` blocks for the latest version - // when (if known) it was added. - const browserReleaseData = new Map(); - for (const [name, browser] of Object.entries(browsers)) { - const releaseData = new Map(); - for (const [version, data] of Object.entries(browser.releases || [])) { - if (data) { - releaseData.set(version, data); - } - } - browserReleaseData.set(name, releaseData); - } - - for (const block of _extractCompatBlocks(data)) { - for (const [browser, originalInfo] of Object.entries(block.support)) { - // `originalInfo` here will be one of the following: - // - a single simple_support_statement: - // { version_added: 42 } - // - an array of simple_support_statements: - // [ { version_added: 42 }, { prefix: '-moz', version_added: 35 } ] - // - // Standardize the first version to an array of one, so we don't have - // to deal with two different forms below - - const infos: SimpleSupportStatementWithReleaseDate[] = Array.isArray( - originalInfo - ) - ? originalInfo - : [originalInfo]; - - for (const infoEntry of infos) { - const added = - typeof infoEntry.version_added === "string" && - infoEntry.version_added.startsWith("≤") - ? infoEntry.version_added.slice(1) - : infoEntry.version_added; - if (browserReleaseData.has(browser)) { - if (browserReleaseData.get(browser).has(added)) { - infoEntry.release_date = browserReleaseData - .get(browser) - .get(added).release_date; - } - } - } - - infos.sort((a, b) => - _compareVersions(_getFirstVersion(b), _getFirstVersion(a)) - ); - - block.support[browser] = infos; - } - } - - if (hasMultipleQueries) { - title = query; - id = query; - isH3 = true; - } - return [ - { - type: "browser_compatibility", - value: { - title, - id, - isH3, - data, - query, - browsers, - }, - }, - ]; - } - - function _getFirstVersion(support: bcd.SimpleSupportStatement): string { - if (typeof support.version_added === "string") { - return support.version_added; - } else if (typeof support.version_removed === "string") { - return support.version_removed; - } else { - return "0"; - } - } - - function _compareVersions(a: string, b: string) { - const x = _splitVersion(a); - const y = _splitVersion(b); - - return _compareNumberArray(x, y); - } - - function _compareNumberArray(a: number[], b: number[]): number { - while (a.length || b.length) { - const x = a.shift() || 0; - const y = b.shift() || 0; - if (x !== y) { - return x - y; - } - } - - return 0; - } - function _splitVersion(version: string): number[] { - if (version.startsWith("≤")) { - version = version.slice(1); - } - - return version.split(".").map(Number); - } - - /** - * Recursively extracts `__compat` objects from the `feature` and from all - * nested features at any depth. - * - * @param {bcd.Identifier} feature The feature. - * @returns {bcd.CompatStatement[]} The array of `__compat` objects. - */ - function _extractCompatBlocks( - feature: bcd.Identifier - ): bcd.CompatStatement[] { - const blocks: bcd.CompatStatement[] = []; - for (const [key, value] of Object.entries(feature)) { - if (key === "__compat") { - blocks.push(value as bcd.CompatStatement); - } else if (typeof value === "object") { - blocks.push(..._extractCompatBlocks(value as bcd.Identifier)); - } - } - return blocks; - } - - function _buildSpecialSpecSection(): [SpecificationsSection] { - // Collect spec URLs from a BCD feature, a 'spec-urls' value, or both; - // For a BCD feature, it can either be a string or an array of strings. - let specURLs: string[] = []; - - function getSpecURLs(data: bcd.Identifier) { - // If we’re processing data for just one feature, then the 'data' - // variable will have a __compat key. So we get the one spec_url - // value from that, and move on. - // - // The value may have data for subfeatures too — each subfeature with - // its own __compat key that may have a spec_url — but in that case, - // for the purposes of the Specifications section, we don’t want to - // recurse through all the subfeatures to get those spec_url values; - // instead we only want the spec_url from the top-level __compat key. - if (data && data.__compat) { - const compat = data.__compat; - if (compat.spec_url) { - if (Array.isArray(compat.spec_url)) { - specURLs.push(...compat.spec_url); - } else { - specURLs.push(compat.spec_url); - } - } - } else { - // If we get here, we’re processing data for two or more features - // and the 'data' variable will contain multiple blocks (objects) - // — one for each feature. - if (!data) { - return; - } - for (const block of Object.values(data)) { - if (!block) { - continue; - } - if (!("__compat" in block)) { - // Some features — e.g., css.properties.justify-content — have - // no compat data themselves but have subfeatures with compat - // data. So we recurse through the nested property values until - // we either do or don’t find any subfeatures with spec URLs. - // Otherwise, if we’re processing multiple top-level features - // (that is, from a browser-compat value which is an array), - // we’d end up entirely missing the data for this feature. - getSpecURLs(block as bcd.Identifier); - } else { - // If we get here, we’ve got a __compat key, and we can extract - // any spec URLs its value may contain. - const compat = block.__compat; - if (compat && compat.spec_url) { - if (Array.isArray(compat.spec_url)) { - specURLs.push(...compat.spec_url); - } else { - specURLs.push(compat.spec_url); - } - } - } - } - } - } - - if (query) { - for (const feature of query.split(",").map((id) => id.trim())) { - const { data } = packageBCD(feature); - // If 'data' is non-null, we have data for one or more BCD features - // that we can extract spec URLs from. - getSpecURLs(data); - } - } - - if (specURLsString !== "") { - // If specURLsString is non-empty, then it has the string contents - // of the document’s 'spec-urls' frontmatter key: one or more URLs. - specURLs.push(...specURLsString.split(",").map((url) => url.trim())); - } - - // Eliminate any duplicate spec URLs - specURLs = [...new Set(specURLs)]; - - // Use BCD specURLs to look up more specification data - // from the browser-specs package - const specifications = specURLs - .map((specURL) => { - const spec = specs.find( - (spec) => - specURL.startsWith(spec.url) || - specURL.startsWith(spec.nightly.url) || - spec.nightly.alternateUrls.some((s) => specURL.startsWith(s)) || - // When grabbing series nightly, make sure we're grabbing the latest spec version - (spec.shortname === spec.series.currentSpecification && - specURL.startsWith(spec.series.nightlyUrl)) - ); - const specificationsData = { - bcdSpecificationURL: specURL, - title: "Unknown specification", - }; - if (spec) { - specificationsData.title = spec.title; - } else { - const specList = web.getJSONData("SpecData"); - const titleFromSpecData = Object.keys(specList).find( - (key) => specList[key]["url"] === specURL.split("#")[0] - ); - if (titleFromSpecData) { - specificationsData.title = titleFromSpecData; - } - } - - return specificationsData; - }) - .filter(Boolean); - - return [ - { - type: "specifications", - value: { - title, - id, - isH3, - specifications, - query, - }, - }, - ]; - } -} - -function _addSectionProse( - $: cheerio.Cheerio -): SectionsAndFlaws { - let id: string | null = null; - let title: string | null = null; - let titleAsText = ""; - let isH3 = false; - - const flaws: string[] = []; - - // The way this works... - // Given a section of HTML, try to extract a id, title, - - let h2found = false; - const h2s = $.find("h2"); - h2s.each((i) => { - const h2 = h2s.eq(i); - - if (i) { - // Excess! - flaws.push( - `Excess

tag that is NOT at root-level (id='${h2.attr( - "id" - )}', text='${h2.text()}')` - ); - } else { - // First element - id = h2.attr("id") ?? ""; - title = h2.html() ?? ""; - titleAsText = h2.text(); - h2.remove(); - } - h2found = true; - }); - - // If there was no

, look through all the

s. - if (!h2found) { - const h3s = $.find("h3"); - h3s.each((i) => { - const h3 = h3s.eq(i); - if (i) { - // Excess! - flaws.push( - `Excess

tag that is NOT at root-level (id='${h3.attr( - "id" - )}', text='${h3.text()}')` - ); - } else { - id = h3.attr("id") ?? ""; - title = h3.html() ?? ""; - titleAsText = h3.text(); - if (id && title) { - isH3 = true; - h3.remove(); - } - } - }); - } - - if (id) { - // Remove trailing underscores (https://github.com/mdn/yari/issues/5492). - id = id.replace(/_+$/g, ""); - } - - const value: ProseSection["value"] = { - id, - title, - isH3, - content: $.html()?.trim(), - }; - - // Only include it if it's useful. It's an optional property and it's - // potentially a waste of space to include it if it's not different. - if (titleAsText && titleAsText !== title) { - value["titleAsText"] = titleAsText; - } - - const sections: ProseSection[] = []; - if (value.content || value.title) { - sections.push({ - type: "prose", - value, - }); - } - - return [sections, flaws]; -} - -/** - * Given an array of sections, return a plain text - * string of a summary. No HTML or Kumascript allowed. - */ -export function extractSummary(sections: Section[]): string { - let summary = ""; // default and fallback is an empty string. - - function extractFirstGoodParagraph($): string { - const seoSummary = $("span.seoSummary, .summary"); - if (seoSummary.length && seoSummary.text()) { - return seoSummary.text(); - } - let summary = ""; - $("p").each((i, p) => { - // The `.each()` can only take a callback, so we need a solution - // to exit early once we've found the first working summary. - if (summary) return; // it already been found! - const text = $(p).text().trim(); - // Avoid those whose paragraph is just a failing KS macro - if (text && !text.includes("Redirect") && !text.startsWith("{{")) { - summary = text; - } - }); - return summary; - } - // If the sections contains a "Summary" one, use that, otherwise - // use the first prose one. - const summarySections = sections.filter( - (section: Section): section is ProseSection => - section.type === "prose" && section.value.title === "Summary" - ); - if (summarySections.length) { - const $ = cheerio.load(summarySections[0].value.content ?? ""); - summary = extractFirstGoodParagraph($); - } else { - for (const section of sections) { - if ( - section.type !== "prose" || - !section.value || - !section.value.content - ) { - continue; - } - const $ = cheerio.load(section.value.content); - // Remove non-p tags that we should not be looking inside. - $("div.notecard, div.note, div.blockIndicator").remove(); - summary = extractFirstGoodParagraph($); - if (summary) { - break; - } - } - } - return summary; -} diff --git a/build/extract-bcd.ts b/build/extract-bcd.ts new file mode 100644 index 000000000000..0a1adccdf953 --- /dev/null +++ b/build/extract-bcd.ts @@ -0,0 +1,143 @@ +import * as bcd from "@mdn/browser-compat-data/types"; +import { packageBCD } from "./resolve-bcd"; + +interface SimpleSupportStatementWithReleaseDate + extends bcd.SimpleSupportStatement { + release_date?: string; +} + +export function extractBCD(query: string): { + browsers: bcd.Browsers | null; + data: bcd.Identifier | null; +} { + const { browsers, data }: { browsers: bcd.Browsers; data: bcd.Identifier } = + packageBCD(query); + + if (data === undefined) { + return { browsers: null, data: null }; + } + + // First extract a map of all release data, keyed by (normalized) browser + // name and the versions. + // You'll have a map that looks like this: + // + // 'chrome_android': { + // '28': { + // release_date: '2012-06-01', + // release_notes: '...', + // ... + // + // The reason we extract this to a locally scoped map, is so we can + // use it to augment the `__compat` blocks for the latest version + // when (if known) it was added. + const browserReleaseData = new Map(); + for (const [name, browser] of Object.entries(browsers)) { + const releaseData = new Map(); + for (const [version, data] of Object.entries(browser.releases || [])) { + if (data) { + releaseData.set(version, data); + } + } + browserReleaseData.set(name, releaseData); + } + + for (const block of _extractCompatBlocks(data)) { + for (const [browser, originalInfo] of Object.entries(block.support)) { + // `originalInfo` here will be one of the following: + // - a single simple_support_statement: + // { version_added: 42 } + // - an array of simple_support_statements: + // [ { version_added: 42 }, { prefix: '-moz', version_added: 35 } ] + // + // Standardize the first version to an array of one, so we don't have + // to deal with two different forms below + + const infos: SimpleSupportStatementWithReleaseDate[] = Array.isArray( + originalInfo + ) + ? originalInfo + : [originalInfo]; + + for (const infoEntry of infos) { + const added = + typeof infoEntry.version_added === "string" && + infoEntry.version_added.startsWith("≤") + ? infoEntry.version_added.slice(1) + : infoEntry.version_added; + if (browserReleaseData.has(browser)) { + if (browserReleaseData.get(browser).has(added)) { + infoEntry.release_date = browserReleaseData + .get(browser) + .get(added).release_date; + } + } + } + + infos.sort((a, b) => + _compareVersions(_getFirstVersion(b), _getFirstVersion(a)) + ); + + block.support[browser] = infos; + } + } + + return { + data, + browsers, + }; +} + +function _getFirstVersion(support: bcd.SimpleSupportStatement): string { + if (typeof support.version_added === "string") { + return support.version_added; + } else if (typeof support.version_removed === "string") { + return support.version_removed; + } else { + return "0"; + } +} + +function _compareVersions(a: string, b: string) { + const x = _splitVersion(a); + const y = _splitVersion(b); + + return _compareNumberArray(x, y); +} + +function _compareNumberArray(a: number[], b: number[]): number { + while (a.length || b.length) { + const x = a.shift() || 0; + const y = b.shift() || 0; + if (x !== y) { + return x - y; + } + } + + return 0; +} +function _splitVersion(version: string): number[] { + if (version.startsWith("≤")) { + version = version.slice(1); + } + + return version.split(".").map(Number); +} + +/** + * Recursively extracts `__compat` objects from the `feature` and from all + * nested features at any depth. + * + * @param {bcd.Identifier} feature The feature. + * @returns {bcd.CompatStatement[]} The array of `__compat` objects. + */ +function _extractCompatBlocks(feature: bcd.Identifier): bcd.CompatStatement[] { + const blocks: bcd.CompatStatement[] = []; + for (const [key, value] of Object.entries(feature)) { + if (key === "__compat") { + blocks.push(value as bcd.CompatStatement); + } else if (typeof value === "object") { + blocks.push(..._extractCompatBlocks(value as bcd.Identifier)); + } + } + return blocks; +} diff --git a/build/extract-sections.ts b/build/extract-sections.ts new file mode 100644 index 000000000000..ac15cc0ef576 --- /dev/null +++ b/build/extract-sections.ts @@ -0,0 +1,454 @@ +import * as cheerio from "cheerio"; +import { extractBCD } from "./extract-bcd"; +import { extractSpecifications } from "./extract-specifications"; +import { ProseSection } from "../libs/types"; +import { Section } from "../libs/types/document"; + +export type SectionsAndFlaws = [Section[], string[]]; + +export function extractSections($: cheerio.CheerioAPI): [Section[], string[]] { + const flaws: string[] = []; + const sections: Section[] = []; + const section = cheerio + .load("
", { + // decodeEntities: false + })("div") + .eq(0); + + const body = $("body")[0] as cheerio.ParentNode; + const iterable = [...(body.childNodes as cheerio.Element[])]; + + let c = 0; + iterable.forEach((child) => { + if ( + (child as cheerio.Element).tagName === "h2" || + (child as cheerio.Element).tagName === "h3" + ) { + if (c) { + const [subSections, subFlaws] = addSections(section.clone()); + sections.push(...subSections); + flaws.push(...subFlaws); + section.empty(); + } + c = 0; + } + // We *could* wrap this in something like `if (child.tagName) {` + // which would exclude any node that isn't a tag, such as comments. + // That might make the DOM nodes more compact and memory efficient. + c++; + section.append(child); + }); + if (c) { + // last straggler + const [subSections, subFlaws] = addSections(section); + sections.push(...subSections); + flaws.push(...subFlaws); + } + + // Check for and mutate possible duplicated IDs. + // If a HTML document has...: + // + //

Check these examples

+ // ... + //

Examples

+ // + // then this can cause various problems. For example, the anchor links + // won't work. The Table of Contents won't be able to do a loop with unique + // `key={section.id}` values. + // The reason we need to loop through to get a list of all existing IDs + // first is because we might have this: + // + //

Foo X

+ //

Foo Y

+ //

Foo Z

+ // + // So when you encounter `

Foo Y

` you'll know that you + // can't suggest it to be `

Foo Y

` because that ID + // is taken by another one, later. + const allIDs = new Set( + sections + .map((section) => section.value.id) + .filter(Boolean) + .map((id) => id.toLowerCase()) + ); + + const seenIDs = new Set(); + for (const section of sections) { + const originalID = section.value.id; + if (!originalID) { + // Not all sections have an ID. For example, prose sections that don't + // start with a

. + // Since we're primarily concerned about *uniqueness* here, let's just + // skip worrying about these. + continue; + } + // We normalize all IDs to lowercase so that `id="Foo"` === `id="foo"`. + const id = originalID.toLowerCase(); + if (seenIDs.has(id)) { + // That's bad! We have to come up with a new ID but it can't be one + // that's used by another other section. + let increment = 2; + let newID = `${originalID}_${increment}`; + while ( + seenIDs.has(newID.toLowerCase()) || + allIDs.has(newID.toLowerCase()) + ) { + increment++; + newID = `${originalID}_${increment}`; + } + section.value.id = newID; + seenIDs.add(newID.toLowerCase()); + flaws.push( + `'${originalID}' is not a unique ID in this HTML (temporarily changed to ${section.value.id})` + ); + } else { + seenIDs.add(id); + } + } + + return [sections, flaws]; +} + +/** Return an array of new sections to be added to the complete document. + * + * Generally, this function is called with a cheerio (`$`) section that + * has HTML in it. The task is to structure that a little bit. + * If the HTML inside the '$' is: + * + *

Foo

+ *

Bla bla

+ *
  • One
+ * + * then, the expected output is to return: + * + * [{ + * type: "prose", + * id: "foo", + * title: "Foo" + * content: "

Bla bla

\n

  • One
" + * }] + * + * The reason it's always returning an array is because of special + * sections. A special section is one where we try to transform it + * first. For example BCD tables. If the input is this: + * + *

Browser Compat

+ *
...
+ * + * Then, extract the ID, get the structured data and eventually return this: + * + * [{ + * type: "browser_compatibility", + * value: { + * query: "foo.bar.thing", + * id: "browser_compat", + * title: "Browser Compat", + * data: {....} + * }] + * + * Another example is for the specification section. If the input is this: + * + *

Specifications

+ *
...
+ * + * Then, extract the data-bcd-query and return this: + * + * [{ + * type: "specifications", + * value: { + * query: "foo.bar.thing", + * id: "specifications", + * title: "Specifications", + * specifications: {....} + * }] + */ +function addSections($: cheerio.Cheerio): SectionsAndFlaws { + const flaws: string[] = []; + + const countPotentialSpecialDivs = $.find("div.bc-data, div.bc-specs").length; + if (countPotentialSpecialDivs) { + /** If there's exactly 1 special table the only section to add is something + * like this: + * { + * "type": "browser_compatibility", + * "value": { + * "title": "Browser compatibility", + * "id": "browser_compatibility", + * "query": "html.elements.video", + * "data": {....} + * } + * + * Where the 'title' and 'id' values comes from the

tag (if available). + * + * However, if there are **multiple special tables**, + * it needs to return something like this: + * + * [{ + * "type": "prose", + * "value": { + * "id": "browser_compatibility", + * "title": "Browser compatibility" + * "content": "Possible stuff before the table" + * }, + * { + * "type": "browser_compatibility", + * "value": { + * "query": "html.elements.video", + * "data": {.... + * }, + * { + * "type": "prose", + * "value": { + * "content": "Any other stuff before table maybe" + * }, + */ + if (countPotentialSpecialDivs > 1) { + const subSections: Section[] = []; + const section = cheerio + .load("
", { + // decodeEntities: false + })("div") + .eq(0); + + // Loop over each and every "root element" in the node and keep piling + // them up in a buffer, until you encounter a `div.bc-data` or `div.bc-specs` then + // add that to the stack, clear and repeat. + const div = $[0] as cheerio.ParentNode; + const iterable = [...(div.childNodes as cheerio.Element[])]; + let c = 0; + let countSpecialDivsFound = 0; + iterable.forEach((child) => { + if ( + child.tagName === "div" && + child.attribs && + child.attribs.class && + (child.attribs.class.includes("bc-data") || + child.attribs.class.includes("bc-specs")) + ) { + countSpecialDivsFound++; + if (c) { + const [proseSections, proseFlaws] = buildSection(section.clone()); + subSections.push(...proseSections); + flaws.push(...proseFlaws); + section.empty(); + c = 0; // reset the counter + } + section.append(child); + // XXX That `_addSingleSpecialSection(section.clone())` might return a + // and empty array and that means it failed and we should + // bail. + subSections.push(...addSpecialSection(section.clone())); + section.empty(); + } else { + section.append(child); + c++; + } + }); + if (c) { + const [proseSections, proseFlaws] = buildSection(section.clone()); + subSections.push(...proseSections); + flaws.push(...proseFlaws); + } + if (countSpecialDivsFound !== countPotentialSpecialDivs) { + const leftoverCount = countPotentialSpecialDivs - countSpecialDivsFound; + const explanation = `${leftoverCount} 'div.bc-data' or 'div.bc-specs' element${ + leftoverCount > 1 ? "s" : "" + } found but deeply nested.`; + flaws.push(explanation); + } + return [subSections, flaws]; + } + const specialSections = addSpecialSection($); + + // The _addSingleSpecialSection() function will have sucked up the

or

+ // and the `div.bc-data` or `div.bc-specs` to turn it into a special section. + // First remove that, then put whatever HTML is left as a prose + // section underneath. + $.find("div.bc-data, h2, h3").remove(); + $.find("div.bc-specs, h2, h3").remove(); + const [proseSections, proseFlaws] = buildSection($); + specialSections.push(...proseSections); + flaws.push(...proseFlaws); + + if (specialSections.length) { + return [specialSections, flaws]; + } + } + + // all else, leave as is + const [proseSections, proseFlaws] = buildSection($); + flaws.push(...proseFlaws); + + return [proseSections, flaws]; +} + +function addSpecialSection($: cheerio.Cheerio): Section[] { + let id: string | null = null; + let title: string | null = null; + let isH3 = false; + + const h2s = $.find("h2"); + if (h2s.length === 1) { + id = h2s.attr("id"); + title = h2s.text(); + } else { + const h3s = $.find("h3"); + if (h3s.length === 1) { + id = h3s.attr("id"); + title = h3s.text(); + isH3 = true; + } + } + + let dataQuery = ""; + let hasMultipleQueries = false; + let specURLsString = ""; + let specialSectionType: string | null = null; + if ($.find("div.bc-data").length) { + specialSectionType = "browser_compatibility"; + const elem = $.find("div.bc-data"); + // Macro adds "data-query", but some translated-content still uses "id". + dataQuery = (elem.attr("data-query") || elem.attr("id")) ?? ""; + hasMultipleQueries = elem.attr("data-multiple") === "true"; + } else if ($.find("div.bc-specs").length) { + specialSectionType = "specifications"; + dataQuery = $.find("div.bc-specs").attr("data-bcd-query") ?? ""; + specURLsString = $.find("div.bc-specs").attr("data-spec-urls") ?? ""; + } + + // Some old legacy documents haven't been re-rendered yet, since it + // was added, so the `div.bc-data` tag doesn't have a `id="bcd:..."` + // or `data-bcd="..."` attribute. If that's the case, bail and fall + // back on a regular prose section :( + if (!dataQuery && specURLsString === "") { + // I wish there was a good place to log this! + return buildSection($)[0]; + } + const query = dataQuery.replace(/^bcd:/, ""); + + if (specialSectionType === "browser_compatibility") { + const { data, browsers } = extractBCD(query); + + if (hasMultipleQueries) { + title = query; + id = query; + isH3 = true; + } + + return [ + { + type: "browser_compatibility", + value: { + title, + id, + isH3, + data, + query, + browsers, + }, + }, + ]; + } else if (specialSectionType === "specifications") { + const specifications = extractSpecifications(query, specURLsString); + + return [ + { + type: specialSectionType, + value: { + title, + id, + isH3, + query, + specifications, + }, + }, + ]; + } + + throw new Error(`Unrecognized special section type '${specialSectionType}'`); +} + +function buildSection($: cheerio.Cheerio): SectionsAndFlaws { + let id: string | null = null; + let title: string | null = null; + let titleAsText = ""; + let isH3 = false; + + const flaws: string[] = []; + + // The way this works... + // Given a section of HTML, try to extract a id, title, + let h2found = false; + const h2s = $.find("h2"); + h2s.each((i) => { + const h2 = h2s.eq(i); + + if (i) { + // Excess! + flaws.push( + `Excess

tag that is NOT at root-level (id='${h2.attr( + "id" + )}', text='${h2.text()}')` + ); + } else { + // First element + id = h2.attr("id") ?? ""; + title = h2.html() ?? ""; + titleAsText = h2.text(); + h2.remove(); + } + h2found = true; + }); + + // If there was no

, look through all the

s. + if (!h2found) { + const h3s = $.find("h3"); + h3s.each((i) => { + const h3 = h3s.eq(i); + if (i) { + // Excess! + flaws.push( + `Excess

tag that is NOT at root-level (id='${h3.attr( + "id" + )}', text='${h3.text()}')` + ); + } else { + id = h3.attr("id") ?? ""; + title = h3.html() ?? ""; + titleAsText = h3.text(); + if (id && title) { + isH3 = true; + h3.remove(); + } + } + }); + } + + if (id) { + // Remove trailing underscores (https://github.com/mdn/yari/issues/5492). + id = id.replace(/_+$/g, ""); + } + + const value: ProseSection["value"] = { + id, + title, + isH3, + content: $.html()?.trim(), + }; + + // Only include it if it's useful. It's an optional property and it's + // potentially a waste of space to include it if it's not different. + if (titleAsText && titleAsText !== title) { + value["titleAsText"] = titleAsText; + } + + const sections: ProseSection[] = []; + if (value.content || value.title) { + sections.push({ + type: "prose", + value, + }); + } + + return [sections, flaws]; +} diff --git a/build/extract-sidebar.ts b/build/extract-sidebar.ts new file mode 100644 index 000000000000..37774eef39d8 --- /dev/null +++ b/build/extract-sidebar.ts @@ -0,0 +1,43 @@ +import * as cheerio from "cheerio"; +import { Doc } from "../libs/types/document"; + +/** Extract and mutate the $ if it as a "Quick_links" section. + * But only if it exists. + * + * If you had this: + * + * const $ = cheerio.load(` + * + *

Headline

+ *

Text

+ * `) + * const sidebar = extractSidebar($); + * console.log(sidebar); + * // '' + * console.log($.html()); + * // '

Headline

\n

Text

' + * + * ...give or take some whitespace. + */ + +export function extractSidebar($: cheerio.CheerioAPI, doc: Partial) { + const search = $("#Quick_links"); + + if (!search.length) { + doc.sidebarHTML = ""; + return; + } + + // Open menu and highlight current page. + search.find(`a[href='${doc.mdn_url}']`).each((_i, el) => { + $(el).parents("details").prop("open", true); + $(el).attr("aria-current", "page"); + // Highlight, unless it already is highlighted (e.g. heading). + if ($(el).find("em,strong").length === 0) { + $(el).parent().wrapInner(""); + } + }); + + doc.sidebarHTML = search.html(); + search.remove(); +} diff --git a/build/extract-specifications.ts b/build/extract-specifications.ts new file mode 100644 index 000000000000..94562f0a0edb --- /dev/null +++ b/build/extract-specifications.ts @@ -0,0 +1,126 @@ +import { packageBCD } from "./resolve-bcd"; +import * as bcd from "@mdn/browser-compat-data/types"; +import { Specification } from "../libs/types/document"; +import specs from "browser-specs"; +import web from "../kumascript/src/api/web"; + +export function extractSpecifications( + query: string, + specURLsString +): Specification[] { + if (query === undefined && specURLsString === "") { + return []; + } + + // Collect spec URLs from a BCD feature, a 'spec-urls' value, or both; + // For a BCD feature, it can either be a string or an array of strings. + let specURLs: string[] = []; + + function getSpecURLs(data: bcd.Identifier) { + // If we’re processing data for just one feature, then the 'data' + // variable will have a __compat key. So we get the one spec_url + // value from that, and move on. + // + // The value may have data for subfeatures too — each subfeature with + // its own __compat key that may have a spec_url — but in that case, + // for the purposes of the Specifications section, we don’t want to + // recurse through all the subfeatures to get those spec_url values; + // instead we only want the spec_url from the top-level __compat key. + if (data && data.__compat) { + const compat = data.__compat; + if (compat.spec_url) { + if (Array.isArray(compat.spec_url)) { + specURLs.push(...compat.spec_url); + } else { + specURLs.push(compat.spec_url); + } + } + } else { + // If we get here, we’re processing data for two or more features + // and the 'data' variable will contain multiple blocks (objects) + // — one for each feature. + if (!data) { + return; + } + for (const block of Object.values(data)) { + if (!block) { + continue; + } + if (!("__compat" in block)) { + // Some features — e.g., css.properties.justify-content — have + // no compat data themselves but have subfeatures with compat + // data. So we recurse through the nested property values until + // we either do or don’t find any subfeatures with spec URLs. + // Otherwise, if we’re processing multiple top-level features + // (that is, from a browser-compat value which is an array), + // we’d end up entirely missing the data for this feature. + getSpecURLs(block as bcd.Identifier); + } else { + // If we get here, we’ve got a __compat key, and we can extract + // any spec URLs its value may contain. + const compat = block.__compat; + if (compat && compat.spec_url) { + if (Array.isArray(compat.spec_url)) { + specURLs.push(...compat.spec_url); + } else { + specURLs.push(compat.spec_url); + } + } + } + } + } + } + + if (query) { + for (const feature of query.split(",").map((id) => id.trim())) { + const { data } = packageBCD(feature); + // If 'data' is non-null, we have data for one or more BCD features + // that we can extract spec URLs from. + getSpecURLs(data); + } + } + + if (specURLsString !== "") { + // If specURLsString is non-empty, then it has the string contents + // of the document’s 'spec-urls' frontmatter key: one or more URLs. + specURLs.push(...specURLsString.split(",").map((url) => url.trim())); + } + + // Eliminate any duplicate spec URLs + specURLs = [...new Set(specURLs)]; + + // Use BCD specURLs to look up more specification data + // from the browser-specs package + const specifications = specURLs + .map((specURL) => { + const spec = specs.find( + (spec) => + specURL.startsWith(spec.url) || + specURL.startsWith(spec.nightly.url) || + spec.nightly.alternateUrls.some((s) => specURL.startsWith(s)) || + // When grabbing series nightly, make sure we're grabbing the latest spec version + (spec.shortname === spec.series.currentSpecification && + specURL.startsWith(spec.series.nightlyUrl)) + ); + const specificationsData = { + bcdSpecificationURL: specURL, + title: "Unknown specification", + }; + if (spec) { + specificationsData.title = spec.title; + } else { + const specList = web.getJSONData("SpecData"); + const titleFromSpecData = Object.keys(specList).find( + (key) => specList[key]["url"] === specURL.split("#")[0] + ); + if (titleFromSpecData) { + specificationsData.title = titleFromSpecData; + } + } + + return specificationsData; + }) + .filter(Boolean); + + return specifications; +} diff --git a/build/extract-summary.ts b/build/extract-summary.ts new file mode 100644 index 000000000000..589f8cca66d4 --- /dev/null +++ b/build/extract-summary.ts @@ -0,0 +1,59 @@ +import * as cheerio from "cheerio"; +import { ProseSection, Section } from "../libs/types/document"; + +/** + * Given an array of sections, return a plain text + * string of a summary. No HTML or Kumascript allowed. + */ + +export function extractSummary(sections: Section[]): string { + let summary = ""; // default and fallback is an empty string. + + // If the sections contains a "Summary" one, use that, otherwise + // use the first prose one. + const summarySections = sections.filter( + (section: Section): section is ProseSection => + section.type === "prose" && section.value.title === "Summary" + ); + if (summarySections.length) { + const $ = cheerio.load(summarySections[0].value.content ?? ""); + summary = extractFirstGoodParagraph($); + } else { + for (const section of sections) { + if ( + section.type !== "prose" || + !section.value || + !section.value.content + ) { + continue; + } + const $ = cheerio.load(section.value.content); + // Remove non-p tags that we should not be looking inside. + $("div.notecard, div.note, div.blockIndicator").remove(); + summary = extractFirstGoodParagraph($); + if (summary) { + break; + } + } + } + return summary; +} + +function extractFirstGoodParagraph($: cheerio.CheerioAPI): string { + const seoSummary = $("span.seoSummary, .summary"); + if (seoSummary.length && seoSummary.text()) { + return seoSummary.text(); + } + let summary = ""; + $("p").each((i, p) => { + // The `.each()` can only take a callback, so we need a solution + // to exit early once we've found the first working summary. + if (summary) return; // it already been found! + const text = $(p).text().trim(); + // Avoid those whose paragraph is just a failing KS macro + if (text && !text.includes("Redirect") && !text.startsWith("{{")) { + summary = text; + } + }); + return summary; +} diff --git a/build/index.ts b/build/index.ts index 1cba26a2897d..47721d252c23 100644 --- a/build/index.ts +++ b/build/index.ts @@ -13,11 +13,9 @@ import { CONTENT_ROOT, REPOSITORY_URLS } from "../libs/env"; import * as kumascript from "../kumascript"; import { FLAW_LEVELS } from "../libs/constants"; -import { - extractSections, - extractSidebar, - extractSummary, -} from "./document-extractor"; +import { extractSidebar } from "./extract-sidebar"; +import { extractSections } from "./extract-sections"; +import { extractSummary } from "./extract-summary"; export { default as SearchIndex } from "./search-index"; import { addBreadcrumbData } from "./document-utils"; import { fixFixableFlaws, injectFlaws, injectSectionFlaws } from "./flaws"; diff --git a/libs/types/document.ts b/libs/types/document.ts index 9581a8b02702..9b525a1959fa 100644 --- a/libs/types/document.ts +++ b/libs/types/document.ts @@ -187,6 +187,11 @@ export interface ProseSection { titleAsText?: string; }; } + +export interface Specification { + bcdSpecificationURL: any; + title: string; +} export interface SpecificationsSection { type: "specifications"; value: { @@ -194,10 +199,7 @@ export interface SpecificationsSection { title: string; isH3: boolean; query: string; - specifications: { - bcdSpecificationURL: any; - title: string; - }[]; + specifications: Specification[]; }; }