From 037fa35045efc9f1f9c832aa9b2bd484b98f2f71 Mon Sep 17 00:00:00 2001
From: Claas Augner <caugner@mozilla.com>
Date: Thu, 17 Nov 2022 12:39:57 +0100
Subject: [PATCH] refactor(build): split up document-extractor

---
 build/document-extractor.ts     | 833 --------------------------------
 build/extract-bcd.ts            | 143 ++++++
 build/extract-sections.ts       | 454 +++++++++++++++++
 build/extract-sidebar.ts        |  43 ++
 build/extract-specifications.ts | 126 +++++
 build/extract-summary.ts        |  59 +++
 build/index.ts                  |   8 +-
 libs/types/document.ts          |  10 +-
 8 files changed, 834 insertions(+), 842 deletions(-)
 delete mode 100644 build/document-extractor.ts
 create mode 100644 build/extract-bcd.ts
 create mode 100644 build/extract-sections.ts
 create mode 100644 build/extract-sidebar.ts
 create mode 100644 build/extract-specifications.ts
 create mode 100644 build/extract-summary.ts
diff --git a/build/document-extractor.ts b/build/document-extractor.ts
deleted file mode 100644
index e19377597bcd..000000000000
--- a/build/document-extractor.ts
+++ /dev/null
@@ -1,833 +0,0 @@
-import * as cheerio from "cheerio";
-import { packageBCD } from "./resolve-bcd";
-import * as bcd from "@mdn/browser-compat-data/types";
-import {
-  BCDSection,
-  Doc,
-  ProseSection,
-  Section,
-  SpecificationsSection,
-} from "../libs/types";
-import specs from "browser-specs";
-import web from "../kumascript/src/api/web";
-
-interface SimpleSupportStatementWithReleaseDate
-  extends bcd.SimpleSupportStatement {
-  release_date?: string;
-}
-
-type SectionsAndFlaws = [Section[], string[]];
-
-/** Extract and mutate the $ if it as a "Quick_links" section.
- * But only if it exists.
- *
- * If you had this:
- *
- *   const $ = cheerio.load(`
- *      <div id="Quick_links">Stuff</div>
- *      <h2>Headline<h2>
- *      <p>Text</p>
- *    `)
- *   const sidebar = extractSidebar($);
- *   console.log(sidebar);
- *   // '<div id="Quick_links">Stuff</div>'
- *   console.log($.html());
- *   // '<h2>Headline<h2>\n<p>Text</p>'
- *
- * ...give or take some whitespace.
- */
-export function extractSidebar($: cheerio.CheerioAPI, doc: Partial<Doc>) {
-  const search = $("#Quick_links");
-
-  if (!search.length) {
-    doc.sidebarHTML = "";
-    return;
-  }
-
-  // Open menu and highlight current page.
-  search.find(`a[href='${doc.mdn_url}']`).each((_i, el) => {
-    $(el).parents("details").prop("open", true);
-    $(el).attr("aria-current", "page");
-    // Highlight, unless it already is highlighted (e.g. heading).
-    if ($(el).find("em,strong").length === 0) {
-      $(el).parent().wrapInner("<em></em>");
-    }
-  });
-
-  doc.sidebarHTML = search.html();
-  search.remove();
-}
-
-export function extractSections($: cheerio.CheerioAPI): [Section[], string[]] {
-  const flaws: string[] = [];
-  const sections: Section[] = [];
-  const section = cheerio
-    .load("<div></div>", {
-      // decodeEntities: false
-    })("div")
-    .eq(0);
-
-  const body = $("body")[0] as cheerio.ParentNode;
-  const iterable = [...(body.childNodes as cheerio.Element[])];
-
-  let c = 0;
-  iterable.forEach((child) => {
-    if (
-      (child as cheerio.Element).tagName === "h2" ||
-      (child as cheerio.Element).tagName === "h3"
-    ) {
-      if (c) {
-        const [subSections, subFlaws] = addSections(section.clone());
-        sections.push(...subSections);
-        flaws.push(...subFlaws);
-        section.empty();
-      }
-      c = 0;
-    }
-    // We *could* wrap this in something like `if (child.tagName) {`
-    // which would exclude any node that isn't a tag, such as comments.
-    // That might make the DOM nodes more compact and memory efficient.
-    c++;
-    section.append(child);
-  });
-  if (c) {
-    // last straggler
-    const [subSections, subFlaws] = addSections(section);
-    sections.push(...subSections);
-    flaws.push(...subFlaws);
-  }
-
-  // Check for and mutate possible duplicated IDs.
-  // If a HTML document has...:
-  //
-  //   <h2 id="Examples">Check these examples</h2>
-  //   ...
-  //   <h2 id="examples">Examples</h2>
-  //
-  // then this can cause various problems. For example, the anchor links
-  // won't work. The Table of Contents won't be able to do a loop with unique
-  // `key={section.id}` values.
-  // The reason we need to loop through to get a list of all existing IDs
-  // first is because we might have this:
-  //
-  //  <h2 id="foo">Foo X</h2>
-  //  <h2 id="foo">Foo Y</h2>
-  //  <h2 id="foo_2">Foo Z</h2>
-  //
-  // So when you encounter `<h2 id="foo">Foo Y</h2>` you'll know that you
-  // can't suggest it to be `<h2 id="foo_2">Foo Y</h2>` because that ID
-  // is taken by another one, later.
-  const allIDs = new Set(
-    sections
-      .map((section) => section.value.id)
-      .filter(Boolean)
-      .map((id) => id.toLowerCase())
-  );
-
-  const seenIDs = new Set();
-  for (const section of sections) {
-    const originalID = section.value.id;
-    if (!originalID) {
-      // Not all sections have an ID. For example, prose sections that don't
-      // start with a <h2>.
-      // Since we're primarily concerned about *uniqueness* here, let's just
-      // skip worrying about these.
-      continue;
-    }
-    // We normalize all IDs to lowercase so that `id="Foo"` === `id="foo"`.
-    const id = originalID.toLowerCase();
-    if (seenIDs.has(id)) {
-      // That's bad! We have to come up with a new ID but it can't be one
-      // that's used by another other section.
-      let increment = 2;
-      let newID = `${originalID}_${increment}`;
-      while (
-        seenIDs.has(newID.toLowerCase()) ||
-        allIDs.has(newID.toLowerCase())
-      ) {
-        increment++;
-        newID = `${originalID}_${increment}`;
-      }
-      section.value.id = newID;
-      seenIDs.add(newID.toLowerCase());
-      flaws.push(
-        `'${originalID}' is not a unique ID in this HTML (temporarily changed to ${section.value.id})`
-      );
-    } else {
-      seenIDs.add(id);
-    }
-  }
-
-  return [sections, flaws];
-}
-
-/** Return an array of new sections to be added to the complete document.
- *
- * Generally, this function is called with a cheerio (`$`) section that
- * has HTML in it. The task is to structure that a little bit.
- * If the HTML inside the '$' is:
- *
- *   <h2 id="foo">Foo</h2>
- *   <p>Bla bla</p>
- *   <ul><li>One</li></ul>
- *
- * then, the expected output is to return:
- *
- *   [{
- *       type: "prose",
- *       id: "foo",
- *       title: "Foo"
- *       content: "<p>Bla bla<p>\n<ul><li>One</li></ul>"
- *   }]
- *
- * The reason it's always returning an array is because of special
- * sections. A special section is one where we try to transform it
- * first. For example BCD tables. If the input is this:
- *
- *   <h2 id="browser_compat">Browser Compat</h2>
- *   <div class="bc-data" data-query="foo.bar.thing">...</div>
- *
- * Then, extract the ID, get the structured data and eventually return this:
- *
- *   [{
- *     type: "browser_compatibility",
- *     value: {
- *        query: "foo.bar.thing",
- *        id: "browser_compat",
- *        title: "Browser Compat",
- *        data: {....}
- *   }]
- *
- * Another example is for the specification section. If the input is this:
- *
- *   <h2 id="Specifications">Specifications</h2>
- *   <div class="bc-specs" data-bcd-query="foo.bar.thing">...</div>
- *
- * Then, extract the data-bcd-query and return this:
- *
- *   [{
- *     type: "specifications",
- *     value: {
- *        query: "foo.bar.thing",
- *        id: "specifications",
- *        title: "Specifications",
- *        specifications: {....}
- *   }]
- */
-function addSections($: cheerio.Cheerio<cheerio.Element>): SectionsAndFlaws {
-  const flaws: string[] = [];
-
-  const countPotentialSpecialDivs = $.find("div.bc-data, div.bc-specs").length;
-  if (countPotentialSpecialDivs) {
-    /** If there's exactly 1 special table the only section to add is something
-     * like this:
-     *    {
-     *     "type": "browser_compatibility",
-     *     "value": {
-     *       "title": "Browser compatibility",
-     *       "id": "browser_compatibility",
-     *       "query": "html.elements.video",
-     *       "data": {....}
-     *    }
-     *
-     * Where the 'title' and 'id' values comes from the <h2> tag (if available).
-     *
-     * However, if there are **multiple special tables**,
-     * it needs to return something like this:
-     *
-     *   [{
-     *     "type": "prose",
-     *     "value": {
-     *       "id": "browser_compatibility",
-     *       "title": "Browser compatibility"
-     *       "content": "Possible stuff before the table"
-     *    },
-     *    {
-     *     "type": "browser_compatibility",
-     *     "value": {
-     *        "query": "html.elements.video",
-     *        "data": {....
-     *    },
-     *   {
-     *     "type": "prose",
-     *     "value": {
-     *       "content": "Any other stuff before table maybe"
-     *    },
-     */
-    if (countPotentialSpecialDivs > 1) {
-      const subSections: Section[] = [];
-      const section = cheerio
-        .load("<div></div>", {
-          // decodeEntities: false
-        })("div")
-        .eq(0);
-
-      // Loop over each and every "root element" in the node and keep piling
-      // them up in a buffer, until you encounter a `div.bc-data` or `div.bc-specs` then
-      // add that to the stack, clear and repeat.
-      const div = $[0] as cheerio.ParentNode;
-      const iterable = [...(div.childNodes as cheerio.Element[])];
-      let c = 0;
-      let countSpecialDivsFound = 0;
-      iterable.forEach((child) => {
-        if (
-          child.tagName === "div" &&
-          child.attribs &&
-          child.attribs.class &&
-          (child.attribs.class.includes("bc-data") ||
-            child.attribs.class.includes("bc-specs"))
-        ) {
-          countSpecialDivsFound++;
-          if (c) {
-            const [proseSections, proseFlaws] = _addSectionProse(
-              section.clone()
-            );
-            subSections.push(...proseSections);
-            flaws.push(...proseFlaws);
-            section.empty();
-            c = 0; // reset the counter
-          }
-          section.append(child);
-          // XXX That `_addSingleSpecialSection(section.clone())` might return a
-          // and empty array and that means it failed and we should
-          // bail.
-          subSections.push(..._addSingleSpecialSection(section.clone()));
-          section.empty();
-        } else {
-          section.append(child);
-          c++;
-        }
-      });
-      if (c) {
-        const [proseSections, proseFlaws] = _addSectionProse(section.clone());
-        subSections.push(...proseSections);
-        flaws.push(...proseFlaws);
-      }
-      if (countSpecialDivsFound !== countPotentialSpecialDivs) {
-        const leftoverCount = countPotentialSpecialDivs - countSpecialDivsFound;
-        const explanation = `${leftoverCount} 'div.bc-data' or 'div.bc-specs' element${
-          leftoverCount > 1 ? "s" : ""
-        } found but deeply nested.`;
-        flaws.push(explanation);
-      }
-      return [subSections, flaws];
-    }
-    const specialSections = _addSingleSpecialSection($);
-
-    // The _addSingleSpecialSection() function will have sucked up the <h2> or <h3>
-    // and the `div.bc-data` or `div.bc-specs` to turn it into a special section.
-    // First remove that, then put whatever HTML is left as a prose
-    // section underneath.
-    $.find("div.bc-data, h2, h3").remove();
-    $.find("div.bc-specs, h2, h3").remove();
-    const [proseSections, proseFlaws] = _addSectionProse($);
-    specialSections.push(...proseSections);
-    flaws.push(...proseFlaws);
-
-    if (specialSections.length) {
-      return [specialSections, flaws];
-    }
-  }
-
-  // all else, leave as is
-  const [proseSections, proseFlaws] = _addSectionProse($);
-  flaws.push(...proseFlaws);
-
-  return [proseSections, flaws];
-}
-
-function _addSingleSpecialSection(
-  $: cheerio.Cheerio<cheerio.Element>
-): Section[] {
-  let id: string | null = null;
-  let title: string | null = null;
-  let isH3 = false;
-
-  const h2s = $.find("h2");
-  if (h2s.length === 1) {
-    id = h2s.attr("id");
-    title = h2s.text();
-  } else {
-    const h3s = $.find("h3");
-    if (h3s.length === 1) {
-      id = h3s.attr("id");
-      title = h3s.text();
-      isH3 = true;
-    }
-  }
-
-  let dataQuery = "";
-  let hasMultipleQueries = false;
-  let specURLsString = "";
-  let specialSectionType: string | null = null;
-  if ($.find("div.bc-data").length) {
-    specialSectionType = "browser_compatibility";
-    const elem = $.find("div.bc-data");
-    // Macro adds "data-query", but some translated-content still uses "id".
-    dataQuery = (elem.attr("data-query") || elem.attr("id")) ?? "";
-    hasMultipleQueries = elem.attr("data-multiple") === "true";
-  } else if ($.find("div.bc-specs").length) {
-    specialSectionType = "specifications";
-    dataQuery = $.find("div.bc-specs").attr("data-bcd-query") ?? "";
-    specURLsString = $.find("div.bc-specs").attr("data-spec-urls") ?? "";
-  }
-
-  // Some old legacy documents haven't been re-rendered yet, since it
-  // was added, so the `div.bc-data` tag doesn't have a `id="bcd:..."`
-  // or `data-bcd="..."` attribute. If that's the case, bail and fall
-  // back on a regular prose section :(
-  if (!dataQuery && specURLsString === "") {
-    // I wish there was a good place to log this!
-    return _addSectionProse($)[0];
-  }
-  const query = dataQuery.replace(/^bcd:/, "");
-  const { browsers, data }: { browsers: bcd.Browsers; data: bcd.Identifier } =
-    packageBCD(query);
-
-  if (specialSectionType === "browser_compatibility") {
-    if (data === undefined) {
-      return [
-        {
-          type: specialSectionType,
-          value: {
-            title,
-            id,
-            isH3,
-            data: null,
-            query,
-            browsers: null,
-          },
-        },
-      ];
-    }
-    return _buildSpecialBCDSection();
-  } else if (specialSectionType === "specifications") {
-    if (query === undefined && specURLsString === "") {
-      return [
-        {
-          type: specialSectionType,
-          value: {
-            title,
-            id,
-            isH3,
-            query,
-            specifications: [],
-          },
-        },
-      ];
-    }
-    return _buildSpecialSpecSection();
-  }
-
-  throw new Error(`Unrecognized special section type '${specialSectionType}'`);
-
-  function _buildSpecialBCDSection(): [BCDSection] {
-    // First extract a map of all release data, keyed by (normalized) browser
-    // name and the versions.
-    // You'll have a map that looks like this:
-    //
-    //   'chrome_android': {
-    //      '28': {
-    //        release_date: '2012-06-01',
-    //        release_notes: '...',
-    //        ...
-    //
-    // The reason we extract this to a locally scoped map, is so we can
-    // use it to augment the `__compat` blocks for the latest version
-    // when (if known) it was added.
-    const browserReleaseData = new Map();
-    for (const [name, browser] of Object.entries(browsers)) {
-      const releaseData = new Map();
-      for (const [version, data] of Object.entries(browser.releases || [])) {
-        if (data) {
-          releaseData.set(version, data);
-        }
-      }
-      browserReleaseData.set(name, releaseData);
-    }
-
-    for (const block of _extractCompatBlocks(data)) {
-      for (const [browser, originalInfo] of Object.entries(block.support)) {
-        // `originalInfo` here will be one of the following:
-        //  - a single simple_support_statement:
-        //    { version_added: 42 }
-        //  - an array of simple_support_statements:
-        //    [ { version_added: 42 }, { prefix: '-moz', version_added: 35 } ]
-        //
-        // Standardize the first version to an array of one, so we don't have
-        // to deal with two different forms below
-
-        const infos: SimpleSupportStatementWithReleaseDate[] = Array.isArray(
-          originalInfo
-        )
-          ? originalInfo
-          : [originalInfo];
-
-        for (const infoEntry of infos) {
-          const added =
-            typeof infoEntry.version_added === "string" &&
-            infoEntry.version_added.startsWith("≤")
-              ? infoEntry.version_added.slice(1)
-              : infoEntry.version_added;
-          if (browserReleaseData.has(browser)) {
-            if (browserReleaseData.get(browser).has(added)) {
-              infoEntry.release_date = browserReleaseData
-                .get(browser)
-                .get(added).release_date;
-            }
-          }
-        }
-
-        infos.sort((a, b) =>
-          _compareVersions(_getFirstVersion(b), _getFirstVersion(a))
-        );
-
-        block.support[browser] = infos;
-      }
-    }
-
-    if (hasMultipleQueries) {
-      title = query;
-      id = query;
-      isH3 = true;
-    }
-    return [
-      {
-        type: "browser_compatibility",
-        value: {
-          title,
-          id,
-          isH3,
-          data,
-          query,
-          browsers,
-        },
-      },
-    ];
-  }
-
-  function _getFirstVersion(support: bcd.SimpleSupportStatement): string {
-    if (typeof support.version_added === "string") {
-      return support.version_added;
-    } else if (typeof support.version_removed === "string") {
-      return support.version_removed;
-    } else {
-      return "0";
-    }
-  }
-
-  function _compareVersions(a: string, b: string) {
-    const x = _splitVersion(a);
-    const y = _splitVersion(b);
-
-    return _compareNumberArray(x, y);
-  }
-
-  function _compareNumberArray(a: number[], b: number[]): number {
-    while (a.length || b.length) {
-      const x = a.shift() || 0;
-      const y = b.shift() || 0;
-      if (x !== y) {
-        return x - y;
-      }
-    }
-
-    return 0;
-  }
-  function _splitVersion(version: string): number[] {
-    if (version.startsWith("≤")) {
-      version = version.slice(1);
-    }
-
-    return version.split(".").map(Number);
-  }
-
-  /**
-   * Recursively extracts `__compat` objects from the `feature` and from all
-   * nested features at any depth.
-   *
-   * @param {bcd.Identifier} feature The feature.
-   * @returns {bcd.CompatStatement[]} The array of `__compat` objects.
-   */
-  function _extractCompatBlocks(
-    feature: bcd.Identifier
-  ): bcd.CompatStatement[] {
-    const blocks: bcd.CompatStatement[] = [];
-    for (const [key, value] of Object.entries(feature)) {
-      if (key === "__compat") {
-        blocks.push(value as bcd.CompatStatement);
-      } else if (typeof value === "object") {
-        blocks.push(..._extractCompatBlocks(value as bcd.Identifier));
-      }
-    }
-    return blocks;
-  }
-
-  function _buildSpecialSpecSection(): [SpecificationsSection] {
-    // Collect spec URLs from a BCD feature, a 'spec-urls' value, or both;
-    // For a BCD feature, it can either be a string or an array of strings.
-    let specURLs: string[] = [];
-
-    function getSpecURLs(data: bcd.Identifier) {
-      // If we’re processing data for just one feature, then the 'data'
-      // variable will have a __compat key. So we get the one spec_url
-      // value from that, and move on.
-      //
-      // The value may have data for subfeatures too — each subfeature with
-      // its own __compat key that may have a spec_url — but in that case,
-      // for the purposes of the Specifications section, we don’t want to
-      // recurse through all the subfeatures to get those spec_url values;
-      // instead we only want the spec_url from the top-level __compat key.
-      if (data && data.__compat) {
-        const compat = data.__compat;
-        if (compat.spec_url) {
-          if (Array.isArray(compat.spec_url)) {
-            specURLs.push(...compat.spec_url);
-          } else {
-            specURLs.push(compat.spec_url);
-          }
-        }
-      } else {
-        // If we get here, we’re processing data for two or more features
-        // and the 'data' variable will contain multiple blocks (objects)
-        // — one for each feature.
-        if (!data) {
-          return;
-        }
-        for (const block of Object.values(data)) {
-          if (!block) {
-            continue;
-          }
-          if (!("__compat" in block)) {
-            // Some features — e.g., css.properties.justify-content — have
-            // no compat data themselves but have subfeatures with compat
-            // data. So we recurse through the nested property values until
-            // we either do or don’t find any subfeatures with spec URLs.
-            // Otherwise, if we’re processing multiple top-level features
-            // (that is, from a browser-compat value which is an array),
-            // we’d end up entirely missing the data for this feature.
-            getSpecURLs(block as bcd.Identifier);
-          } else {
-            // If we get here, we’ve got a __compat key, and we can extract
-            // any spec URLs its value may contain.
-            const compat = block.__compat;
-            if (compat && compat.spec_url) {
-              if (Array.isArray(compat.spec_url)) {
-                specURLs.push(...compat.spec_url);
-              } else {
-                specURLs.push(compat.spec_url);
-              }
-            }
-          }
-        }
-      }
-    }
-
-    if (query) {
-      for (const feature of query.split(",").map((id) => id.trim())) {
-        const { data } = packageBCD(feature);
-        // If 'data' is non-null, we have data for one or more BCD features
-        // that we can extract spec URLs from.
-        getSpecURLs(data);
-      }
-    }
-
-    if (specURLsString !== "") {
-      // If specURLsString is non-empty, then it has the string contents
-      // of the document’s 'spec-urls' frontmatter key: one or more URLs.
-      specURLs.push(...specURLsString.split(",").map((url) => url.trim()));
-    }
-
-    // Eliminate any duplicate spec URLs
-    specURLs = [...new Set(specURLs)];
-
-    // Use BCD specURLs to look up more specification data
-    // from the browser-specs package
-    const specifications = specURLs
-      .map((specURL) => {
-        const spec = specs.find(
-          (spec) =>
-            specURL.startsWith(spec.url) ||
-            specURL.startsWith(spec.nightly.url) ||
-            spec.nightly.alternateUrls.some((s) => specURL.startsWith(s)) ||
-            // When grabbing series nightly, make sure we're grabbing the latest spec version
-            (spec.shortname === spec.series.currentSpecification &&
-              specURL.startsWith(spec.series.nightlyUrl))
-        );
-        const specificationsData = {
-          bcdSpecificationURL: specURL,
-          title: "Unknown specification",
-        };
-        if (spec) {
-          specificationsData.title = spec.title;
-        } else {
-          const specList = web.getJSONData("SpecData");
-          const titleFromSpecData = Object.keys(specList).find(
-            (key) => specList[key]["url"] === specURL.split("#")[0]
-          );
-          if (titleFromSpecData) {
-            specificationsData.title = titleFromSpecData;
-          }
-        }
-
-        return specificationsData;
-      })
-      .filter(Boolean);
-
-    return [
-      {
-        type: "specifications",
-        value: {
-          title,
-          id,
-          isH3,
-          specifications,
-          query,
-        },
-      },
-    ];
-  }
-}
-
-function _addSectionProse(
-  $: cheerio.Cheerio<cheerio.Element>
-): SectionsAndFlaws {
-  let id: string | null = null;
-  let title: string | null = null;
-  let titleAsText = "";
-  let isH3 = false;
-
-  const flaws: string[] = [];
-
-  // The way this works...
-  // Given a section of HTML, try to extract a id, title,
-
-  let h2found = false;
-  const h2s = $.find("h2");
-  h2s.each((i) => {
-    const h2 = h2s.eq(i);
-
-    if (i) {
-      // Excess!
-      flaws.push(
-        `Excess <h2> tag that is NOT at root-level (id='${h2.attr(
-          "id"
-        )}', text='${h2.text()}')`
-      );
-    } else {
-      // First element
-      id = h2.attr("id") ?? "";
-      title = h2.html() ?? "";
-      titleAsText = h2.text();
-      h2.remove();
-    }
-    h2found = true;
-  });
-
-  // If there was no <h2>, look through all the <h3>s.
-  if (!h2found) {
-    const h3s = $.find("h3");
-    h3s.each((i) => {
-      const h3 = h3s.eq(i);
-      if (i) {
-        // Excess!
-        flaws.push(
-          `Excess <h3> tag that is NOT at root-level (id='${h3.attr(
-            "id"
-          )}', text='${h3.text()}')`
-        );
-      } else {
-        id = h3.attr("id") ?? "";
-        title = h3.html() ?? "";
-        titleAsText = h3.text();
-        if (id && title) {
-          isH3 = true;
-          h3.remove();
-        }
-      }
-    });
-  }
-
-  if (id) {
-    // Remove trailing underscores (https://github.com/mdn/yari/issues/5492).
-    id = id.replace(/_+$/g, "");
-  }
-
-  const value: ProseSection["value"] = {
-    id,
-    title,
-    isH3,
-    content: $.html()?.trim(),
-  };
-
-  // Only include it if it's useful. It's an optional property and it's
-  // potentially a waste of space to include it if it's not different.
-  if (titleAsText && titleAsText !== title) {
-    value["titleAsText"] = titleAsText;
-  }
-
-  const sections: ProseSection[] = [];
-  if (value.content || value.title) {
-    sections.push({
-      type: "prose",
-      value,
-    });
-  }
-
-  return [sections, flaws];
-}
-
-/**
- * Given an array of sections, return a plain text
- * string of a summary. No HTML or Kumascript allowed.
- */
-export function extractSummary(sections: Section[]): string {
-  let summary = ""; // default and fallback is an empty string.
-
-  function extractFirstGoodParagraph($): string {
-    const seoSummary = $("span.seoSummary, .summary");
-    if (seoSummary.length && seoSummary.text()) {
-      return seoSummary.text();
-    }
-    let summary = "";
-    $("p").each((i, p) => {
-      // The `.each()` can only take a callback, so we need a solution
-      // to exit early once we've found the first working summary.
-      if (summary) return; // it already been found!
-      const text = $(p).text().trim();
-      // Avoid those whose paragraph is just a failing KS macro
-      if (text && !text.includes("Redirect") && !text.startsWith("{{")) {
-        summary = text;
-      }
-    });
-    return summary;
-  }
-  // If the sections contains a "Summary" one, use that, otherwise
-  // use the first prose one.
-  const summarySections = sections.filter(
-    (section: Section): section is ProseSection =>
-      section.type === "prose" && section.value.title === "Summary"
-  );
-  if (summarySections.length) {
-    const $ = cheerio.load(summarySections[0].value.content ?? "");
-    summary = extractFirstGoodParagraph($);
-  } else {
-    for (const section of sections) {
-      if (
-        section.type !== "prose" ||
-        !section.value ||
-        !section.value.content
-      ) {
-        continue;
-      }
-      const $ = cheerio.load(section.value.content);
-      // Remove non-p tags that we should not be looking inside.
-      $("div.notecard, div.note, div.blockIndicator").remove();
-      summary = extractFirstGoodParagraph($);
-      if (summary) {
-        break;
-      }
-    }
-  }
-  return summary;
-}
diff --git a/build/extract-bcd.ts b/build/extract-bcd.ts
new file mode 100644
index 000000000000..0a1adccdf953
--- /dev/null
+++ b/build/extract-bcd.ts
@@ -0,0 +1,143 @@
+import * as bcd from "@mdn/browser-compat-data/types";
+import { packageBCD } from "./resolve-bcd";
+
+interface SimpleSupportStatementWithReleaseDate
+  extends bcd.SimpleSupportStatement {
+  release_date?: string; // set by extractBCD() from the browser's release data
+}
+
+export function extractBCD(query: string): {
+  browsers: bcd.Browsers | null;
+  data: bcd.Identifier | null;
+} {
+  const { browsers, data }: { browsers: bcd.Browsers; data: bcd.Identifier } =
+    packageBCD(query);
+
+  if (data === undefined) {
+    return { browsers: null, data: null };
+  }
+
+  // First build a lookup of all release data, keyed by browser name and then
+  // by version.
+  // The resulting map looks like this:
+  //
+  //   'chrome_android': {
+  //      '28': {
+  //        release_date: '2012-06-01',
+  //        release_notes: '...',
+  //        ...
+  //
+  // We keep this in a locally scoped map so that, further down, each support
+  // statement in the `__compat` blocks can be augmented with the release date
+  // of the version in which it was added (when that release is known).
+  const browserReleaseData = new Map();
+  for (const [name, browser] of Object.entries(browsers)) {
+    const releaseData = new Map();
+    for (const [version, data] of Object.entries(browser.releases || [])) {
+      if (data) {
+        releaseData.set(version, data);
+      }
+    }
+    browserReleaseData.set(name, releaseData);
+  }
+
+  for (const block of _extractCompatBlocks(data)) {
+    for (const [browser, originalInfo] of Object.entries(block.support)) {
+      // `originalInfo` here will be one of the following:
+      //  - a single simple_support_statement:
+      //    { version_added: 42 }
+      //  - an array of simple_support_statements:
+      //    [ { version_added: 42 }, { prefix: '-moz', version_added: 35 } ]
+      //
+      // Normalize the single-statement form to a one-element array, so that
+      // the code below only has to deal with the array form.
+
+      const infos: SimpleSupportStatementWithReleaseDate[] = Array.isArray(
+        originalInfo
+      )
+        ? originalInfo
+        : [originalInfo];
+      // Copy the release date of each statement's added version, when known.
+      for (const infoEntry of infos) {
+        const added =
+          typeof infoEntry.version_added === "string" &&
+          infoEntry.version_added.startsWith("≤")
+            ? infoEntry.version_added.slice(1)
+            : infoEntry.version_added;
+        if (browserReleaseData.has(browser)) {
+          if (browserReleaseData.get(browser).has(added)) {
+            infoEntry.release_date = browserReleaseData
+              .get(browser)
+              .get(added).release_date;
+          }
+        }
+      }
+      // Sort the statements by version, highest first (note the swapped b/a).
+      infos.sort((a, b) =>
+        _compareVersions(_getFirstVersion(b), _getFirstVersion(a))
+      );
+
+      block.support[browser] = infos;
+    }
+  }
+
+  return {
+    data,
+    browsers,
+  };
+}
+
+// Picks the version string used to sort a support statement.
+function _getFirstVersion(support: bcd.SimpleSupportStatement): string {
+  const { version_added: added, version_removed: removed } = support;
+  if (typeof added === "string") {
+    return added;
+  }
+  // Fall back to the removal version, and finally to "0".
+  return typeof removed === "string" ? removed : "0";
+}
+
+// Compares two version strings numerically, component by component.
+// The result is negative, zero or positive, as expected from a comparator
+// passed to Array.prototype.sort().
+function _compareVersions(a: string, b: string) {
+  return _compareNumberArray(_splitVersion(a), _splitVersion(b));
+}
+
+// Compares two numeric version arrays component by component, treating missing
+// (or NaN) components as 0. Unlike a shift()-based loop, it leaves a and b intact.
+function _compareNumberArray(a: number[], b: number[]): number {
+  const length = Math.max(a.length, b.length);
+  for (let i = 0; i < length; i++) {
+    const x = a[i] || 0;
+    const y = b[i] || 0;
+    if (x !== y) return x - y;
+  }
+  return 0;
+}
+// Turns a version string such as "≤12.1" into its numeric parts ([12, 1]): a
+// leading "≤" is dropped and each dot-separated part goes through Number().
+function _splitVersion(version: string): number[] {
+  const bare = version.startsWith("≤") ? version.slice(1) : version;
+  const parts = bare.split(".");
+  return parts.map(Number);
+}
+
+/**
+ * Recursively extracts `__compat` objects from the `feature` and from all
+ * nested features at any depth, skipping `null` and non-object values.
+ *
+ * @param feature The BCD feature (sub)tree to walk.
+ * @returns The `__compat` objects, in the order they are encountered.
+ */
+function _extractCompatBlocks(feature: bcd.Identifier): bcd.CompatStatement[] {
+  const blocks: bcd.CompatStatement[] = [];
+  for (const [key, value] of Object.entries(feature)) {
+    if (key === "__compat") {
+      blocks.push(value as bcd.CompatStatement);
+    } else if (value !== null && typeof value === "object") {
+      blocks.push(..._extractCompatBlocks(value as bcd.Identifier));
+    }
+  }
+  return blocks;
+}
diff --git a/build/extract-sections.ts b/build/extract-sections.ts
new file mode 100644
index 000000000000..ac15cc0ef576
--- /dev/null
+++ b/build/extract-sections.ts
@@ -0,0 +1,454 @@
+import * as cheerio from "cheerio";
+import { extractBCD } from "./extract-bcd";
+import { extractSpecifications } from "./extract-specifications";
+import { ProseSection } from "../libs/types";
+import { Section } from "../libs/types/document";
+
+export type SectionsAndFlaws = [Section[], string[]];
+/** Cuts <body> into sections at each root-level <h2>/<h3>; returns [sections, flaws]. */
+export function extractSections($: cheerio.CheerioAPI): [Section[], string[]] {
+  const flaws: string[] = [];
+  const sections: Section[] = [];
+  const section = cheerio
+    .load("<div></div>", {
+      // decodeEntities: false
+    })("div")
+    .eq(0); // scratch <div> buffering the nodes of the section being built
+
+  const body = $("body")[0] as cheerio.ParentNode;
+  const iterable = [...(body.childNodes as cheerio.Element[])];
+
+  let c = 0; // number of nodes buffered since the last flush
+  iterable.forEach((child) => {
+    if (
+      (child as cheerio.Element).tagName === "h2" ||
+      (child as cheerio.Element).tagName === "h3"
+    ) {
+      if (c) {
+        const [subSections, subFlaws] = addSections(section.clone());
+        sections.push(...subSections);
+        flaws.push(...subFlaws);
+        section.empty();
+      }
+      c = 0;
+    }
+    // We *could* wrap this in something like `if (child.tagName) {`
+    // which would exclude any node that isn't a tag, such as comments.
+    // That might make the DOM nodes more compact and memory efficient.
+    c++;
+    section.append(child);
+  });
+  if (c) {
+    // Last straggler: flush whatever came after the final heading.
+    const [subSections, subFlaws] = addSections(section);
+    sections.push(...subSections);
+    flaws.push(...subFlaws);
+  }
+
+  // Check for and mutate possible duplicated IDs.
+  // If an HTML document has...:
+  //
+  //   <h2 id="Examples">Check these examples</h2>
+  //   ...
+  //   <h2 id="examples">Examples</h2>
+  //
+  // then this can cause various problems. For example, the anchor links
+  // won't work. The Table of Contents won't be able to do a loop with unique
+  // `key={section.id}` values.
+  // The reason we need to loop through to get a list of all existing IDs
+  // first is because we might have this:
+  //
+  //  <h2 id="foo">Foo X</h2>
+  //  <h2 id="foo">Foo Y</h2>
+  //  <h2 id="foo_2">Foo Z</h2>
+  //
+  // So when you encounter `<h2 id="foo">Foo Y</h2>` you'll know that you
+  // can't suggest it to be `<h2 id="foo_2">Foo Y</h2>` because that ID
+  // is taken by another one, later.
+  const allIDs = new Set(
+    sections
+      .map((section) => section.value.id)
+      .filter(Boolean)
+      .map((id) => id.toLowerCase())
+  );
+
+  const seenIDs = new Set(); // lowercased IDs handed out so far
+  for (const section of sections) {
+    const originalID = section.value.id;
+    if (!originalID) {
+      // Not all sections have an ID. For example, prose sections that don't
+      // start with a <h2>.
+      // Since we're primarily concerned about *uniqueness* here, let's just
+      // skip worrying about these.
+      continue;
+    }
+    // We normalize all IDs to lowercase so that `id="Foo"` === `id="foo"`.
+    const id = originalID.toLowerCase();
+    if (seenIDs.has(id)) {
+      // That's bad! We have to come up with a new ID but it can't be one
+      // that's used by any other section, earlier or later.
+      let increment = 2;
+      let newID = `${originalID}_${increment}`;
+      while (
+        seenIDs.has(newID.toLowerCase()) ||
+        allIDs.has(newID.toLowerCase())
+      ) {
+        increment++;
+        newID = `${originalID}_${increment}`;
+      }
+      section.value.id = newID;
+      seenIDs.add(newID.toLowerCase());
+      flaws.push(
+        `'${originalID}' is not a unique ID in this HTML (temporarily changed to ${section.value.id})`
+      );
+    } else {
+      seenIDs.add(id);
+    }
+  }
+
+  return [sections, flaws];
+}
+
+/** Return an array of new sections to be added to the complete document.
+ *
+ * Generally, this function is called with a cheerio (`$`) section that
+ * has HTML in it. The task is to structure that a little bit.
+ * If the HTML inside the '$' is:
+ *
+ *   <h2 id="foo">Foo</h2>
+ *   <p>Bla bla</p>
+ *   <ul><li>One</li></ul>
+ *
+ * then the sections (first element of the returned tuple) are expected to be:
+ *
+ *   [{
+ *       type: "prose",
+ *       value: {
+ *         id: "foo", title: "Foo", isH3: false,
+ *         content: "<p>Bla bla</p>\n<ul><li>One</li></ul>" }
+ *   }]
+ *
+ * The reason it's always returning an array is because of special
+ * sections. A special section is one where we try to transform it
+ * first. For example BCD tables. If the input is this:
+ *
+ *   <h2 id="browser_compat">Browser Compat</h2>
+ *   <div class="bc-data" data-query="foo.bar.thing">...</div>
+ *
+ * Then, extract the ID, get the structured data and eventually return this:
+ *
+ *   [{
+ *     type: "browser_compatibility",
+ *     value: {
+ *        query: "foo.bar.thing",
+ *        id: "browser_compat",
+ *        title: "Browser Compat",
+ *        data: {....}, browsers: ...., isH3: false }
+ *   }]
+ *
+ * Another example is for the specification section. If the input is this:
+ *
+ *   <h2 id="Specifications">Specifications</h2>
+ *   <div class="bc-specs" data-bcd-query="foo.bar.thing">...</div>
+ *
+ * Then, extract the data-bcd-query and return this:
+ *
+ *   [{
+ *     type: "specifications",
+ *     value: {
+ *        query: "foo.bar.thing",
+ *        id: "Specifications",
+ *        title: "Specifications",
+ *        specifications: [....], isH3: false }
+ *   }]
+ */
+function addSections($: cheerio.Cheerio<cheerio.Element>): SectionsAndFlaws {
+  const flaws: string[] = [];
+
+  const countPotentialSpecialDivs = $.find("div.bc-data, div.bc-specs").length;
+  if (countPotentialSpecialDivs) {
+    /** If there's exactly 1 special table the only section to add is something
+     * like this:
+     *    {
+     *     "type": "browser_compatibility",
+     *     "value": {
+     *       "title": "Browser compatibility",
+     *       "id": "browser_compatibility",
+     *       "query": "html.elements.video",
+     *       "data": {....}
+     *    }
+     *
+     * Where the 'title' and 'id' values come from the <h2> tag (if available).
+     *
+     * However, if there are **multiple special tables**,
+     * it needs to return something like this:
+     *
+     *   [{
+     *     "type": "prose",
+     *     "value": {
+     *       "id": "browser_compatibility",
+     *       "title": "Browser compatibility"
+     *       "content": "Possible stuff before the table"
+     *    },
+     *    {
+     *     "type": "browser_compatibility",
+     *     "value": {
+     *        "query": "html.elements.video",
+     *        "data": {....
+     *    },
+     *   {
+     *     "type": "prose",
+     *     "value": {
+     *       "content": "Any other stuff between or after the tables"
+     *    },
+     */
+    if (countPotentialSpecialDivs > 1) {
+      const subSections: Section[] = [];
+      const section = cheerio
+        .load("<div></div>", {
+          // decodeEntities: false
+        })("div")
+        .eq(0);
+
+      // Loop over each and every "root element" in the node and keep piling
+      // them up in a buffer, until you encounter a `div.bc-data` or `div.bc-specs` then
+      // add that to the stack, clear and repeat.
+      const div = $[0] as cheerio.ParentNode;
+      const iterable = [...(div.childNodes as cheerio.Element[])];
+      let c = 0; // nodes buffered since the last special div
+      let countSpecialDivsFound = 0;
+      iterable.forEach((child) => {
+        if (
+          child.tagName === "div" &&
+          child.attribs &&
+          child.attribs.class &&
+          (child.attribs.class.includes("bc-data") ||
+            child.attribs.class.includes("bc-specs"))
+        ) {
+          countSpecialDivsFound++;
+          if (c) {
+            const [proseSections, proseFlaws] = buildSection(section.clone());
+            subSections.push(...proseSections);
+            flaws.push(...proseFlaws);
+            section.empty();
+            c = 0; // reset the counter
+          }
+          section.append(child);
+          // XXX That `addSpecialSection(section.clone())` might return an
+          // empty array and that means it failed and we should
+          // bail.
+          subSections.push(...addSpecialSection(section.clone()));
+          section.empty();
+        } else {
+          section.append(child);
+          c++;
+        }
+      });
+      if (c) {
+        const [proseSections, proseFlaws] = buildSection(section.clone());
+        subSections.push(...proseSections);
+        flaws.push(...proseFlaws);
+      }
+      if (countSpecialDivsFound !== countPotentialSpecialDivs) {
+        const leftoverCount = countPotentialSpecialDivs - countSpecialDivsFound;
+        const explanation = `${leftoverCount} 'div.bc-data' or 'div.bc-specs' element${
+          leftoverCount > 1 ? "s" : ""
+        } found but deeply nested.`;
+        flaws.push(explanation);
+      }
+      return [subSections, flaws];
+    }
+    const specialSections = addSpecialSection($);
+
+    // addSpecialSection() has read the <h2> or <h3> and the `div.bc-data`
+    // or `div.bc-specs` to build the special section.
+    // Remove those now, then put whatever HTML is left as a prose
+    // section underneath.
+    $.find("div.bc-data, h2, h3").remove();
+    $.find("div.bc-specs, h2, h3").remove();
+    const [proseSections, proseFlaws] = buildSection($);
+    specialSections.push(...proseSections);
+    flaws.push(...proseFlaws);
+
+    if (specialSections.length) {
+      return [specialSections, flaws];
+    }
+  }
+
+  // No special div, or nothing was produced above: build a prose section.
+  const [proseSections, proseFlaws] = buildSection($);
+  flaws.push(...proseFlaws);
+
+  return [proseSections, flaws];
+}
+/** Builds the special (BCD or specifications) section for `$`; falls back to prose when no query or spec URLs are found. */
+function addSpecialSection($: cheerio.Cheerio<cheerio.Element>): Section[] {
+  let id: string | null = null;
+  let title: string | null = null;
+  let isH3 = false;
+
+  const h2s = $.find("h2"); // a heading is only used if there is exactly one
+  if (h2s.length === 1) {
+    id = h2s.attr("id");
+    title = h2s.text();
+  } else {
+    const h3s = $.find("h3");
+    if (h3s.length === 1) {
+      id = h3s.attr("id");
+      title = h3s.text();
+      isH3 = true;
+    }
+  }
+
+  let dataQuery = "";
+  let hasMultipleQueries = false;
+  let specURLsString = "";
+  let specialSectionType: string | null = null;
+  if ($.find("div.bc-data").length) {
+    specialSectionType = "browser_compatibility";
+    const elem = $.find("div.bc-data");
+    // Macro adds "data-query", but some translated-content still uses "id".
+    dataQuery = (elem.attr("data-query") || elem.attr("id")) ?? "";
+    hasMultipleQueries = elem.attr("data-multiple") === "true";
+  } else if ($.find("div.bc-specs").length) {
+    specialSectionType = "specifications";
+    dataQuery = $.find("div.bc-specs").attr("data-bcd-query") ?? "";
+    specURLsString = $.find("div.bc-specs").attr("data-spec-urls") ?? "";
+  }
+
+  // Some old legacy documents haven't been re-rendered since this was
+  // added, so the `div.bc-data` tag has neither a `data-query="..."` nor an
+  // `id="bcd:..."` attribute. If that's the case (and there are no spec
+  // URLs either), bail and fall back on a regular prose section :(
+  if (!dataQuery && specURLsString === "") {
+    // I wish there was a good place to log this!
+    return buildSection($)[0];
+  }
+  const query = dataQuery.replace(/^bcd:/, ""); // drop a leading "bcd:"
+
+  if (specialSectionType === "browser_compatibility") {
+    const { data, browsers } = extractBCD(query);
+
+    if (hasMultipleQueries) {
+      // data-multiple="true": the query itself serves as title and ID.
+      title = query;
+      id = query;
+      isH3 = true;
+    }
+
+    return [
+      {
+        type: "browser_compatibility",
+        value: {
+          title,
+          id,
+          isH3,
+          data,
+          query,
+          browsers,
+        },
+      },
+    ];
+  } else if (specialSectionType === "specifications") {
+    const specifications = extractSpecifications(query, specURLsString);
+
+    return [
+      {
+        type: specialSectionType,
+        value: {
+          title,
+          id,
+          isH3,
+          query,
+          specifications,
+        },
+      },
+    ];
+  }
+
+  throw new Error(`Unrecognized special section type '${specialSectionType}'`);
+}
+/** Builds at most one prose section from `$`, taking id/title from its first <h2> (else first <h3>); returns [sections, flaws]. */
+function buildSection($: cheerio.Cheerio<cheerio.Element>): SectionsAndFlaws {
+  let id: string | null = null;
+  let title: string | null = null;
+  let titleAsText = "";
+  let isH3 = false;
+
+  const flaws: string[] = [];
+
+  // The way this works...
+  // Given a section of HTML, try to extract an id and a title from a heading.
+  let h2found = false;
+  const h2s = $.find("h2");
+  h2s.each((i) => {
+    const h2 = h2s.eq(i);
+
+    if (i) {
+      // Excess! Any <h2> after the first is only reported, never removed.
+      flaws.push(
+        `Excess <h2> tag that is NOT at root-level (id='${h2.attr(
+          "id"
+        )}', text='${h2.text()}')`
+      );
+    } else {
+      // First element: it becomes the heading and leaves the content.
+      id = h2.attr("id") ?? "";
+      title = h2.html() ?? "";
+      titleAsText = h2.text();
+      h2.remove();
+    }
+    h2found = true;
+  });
+
+  // If there was no <h2>, look through all the <h3>s.
+  if (!h2found) {
+    const h3s = $.find("h3");
+    h3s.each((i) => {
+      const h3 = h3s.eq(i);
+      if (i) {
+        // Excess! Any <h3> after the first is only reported, never removed.
+        flaws.push(
+          `Excess <h3> tag that is NOT at root-level (id='${h3.attr(
+            "id"
+          )}', text='${h3.text()}')`
+        );
+      } else {
+        id = h3.attr("id") ?? "";
+        title = h3.html() ?? "";
+        titleAsText = h3.text();
+        if (id && title) {
+          // Only an <h3> with both an id and a title leaves the content.
+          isH3 = true;
+          h3.remove();
+        }
+      }
+    });
+  }
+
+  if (id) {
+    // Remove trailing underscores (https://github.com/mdn/yari/issues/5492).
+    id = id.replace(/_+$/g, "");
+  }
+
+  const value: ProseSection["value"] = {
+    id,
+    title,
+    isH3,
+    content: $.html()?.trim(),
+  };
+
+  // Only include it if it's useful. It's an optional property and it's
+  // potentially a waste of space to include it if it's not different.
+  if (titleAsText && titleAsText !== title) {
+    value["titleAsText"] = titleAsText;
+  }
+
+  const sections: ProseSection[] = [];
+  if (value.content || value.title) {
+    // Sections with neither content nor title are dropped entirely.
+    sections.push({
+      type: "prose",
+      value,
+    });
+  }
+
+  return [sections, flaws];
+}
diff --git a/build/extract-sidebar.ts b/build/extract-sidebar.ts
new file mode 100644
index 000000000000..37774eef39d8
--- /dev/null
+++ b/build/extract-sidebar.ts
@@ -0,0 +1,43 @@
+import * as cheerio from "cheerio";
+import { Doc } from "../libs/types/document";
+
+/** Extract the "Quick_links" section from the $ (mutating it) and store
+ * its HTML in `doc.sidebarHTML`. Without one, `doc.sidebarHTML` becomes "".
+ *
+ * If you had this:
+ *
+ *   const $ = cheerio.load(`
+ *      <div id="Quick_links">Stuff</div>
+ *      <h2>Headline</h2>
+ *      <p>Text</p>
+ *    `)
+ *   extractSidebar($, doc);
+ *   console.log(doc.sidebarHTML);
+ *   // 'Stuff' (the result of `.html()` on the "Quick_links" element)
+ *   console.log($.html());
+ *   // '<h2>Headline</h2>\n<p>Text</p>'
+ *
+ * ...give or take some whitespace.
+ */
+
+export function extractSidebar($: cheerio.CheerioAPI, doc: Partial<Doc>) {
+  const search = $("#Quick_links");
+
+  if (!search.length) {
+    doc.sidebarHTML = "";
+    return;
+  }
+
+  // Open menu and highlight current page (links pointing at `doc.mdn_url`).
+  search.find(`a[href='${doc.mdn_url}']`).each((_i, el) => {
+    $(el).parents("details").prop("open", true);
+    $(el).attr("aria-current", "page");
+    // Highlight, unless it already is highlighted (e.g. heading).
+    if ($(el).find("em,strong").length === 0) {
+      $(el).parent().wrapInner("<em></em>");
+    }
+  });
+
+  doc.sidebarHTML = search.html();
+  search.remove();
+}
diff --git a/build/extract-specifications.ts b/build/extract-specifications.ts
new file mode 100644
index 000000000000..94562f0a0edb
--- /dev/null
+++ b/build/extract-specifications.ts
@@ -0,0 +1,126 @@
+import { packageBCD } from "./resolve-bcd";
+import * as bcd from "@mdn/browser-compat-data/types";
+import { Specification } from "../libs/types/document";
+import specs from "browser-specs";
+import web from "../kumascript/src/api/web";
+/** Resolves the spec URLs of the BCD `query` (comma-separated features) and of `specURLsString` into deduplicated { bcdSpecificationURL, title } entries. */
+export function extractSpecifications(
+  query: string,
+  specURLsString
+): Specification[] {
+  if (query === undefined && specURLsString === "") {
+    return [];
+  }
+
+  // Collect spec URLs from a BCD feature, a 'spec-urls' value, or both;
+  // For a BCD feature, it can either be a string or an array of strings.
+  let specURLs: string[] = [];
+
+  function getSpecURLs(data: bcd.Identifier) {
+    // If we’re processing data for just one feature, then the 'data'
+    // variable will have a __compat key. So we get the one spec_url
+    // value from that, and move on.
+    //
+    // The value may have data for subfeatures too — each subfeature with
+    // its own __compat key that may have a spec_url — but in that case,
+    // for the purposes of the Specifications section, we don’t want to
+    // recurse through all the subfeatures to get those spec_url values;
+    // instead we only want the spec_url from the top-level __compat key.
+    if (data && data.__compat) {
+      const compat = data.__compat;
+      if (compat.spec_url) {
+        if (Array.isArray(compat.spec_url)) {
+          specURLs.push(...compat.spec_url);
+        } else {
+          specURLs.push(compat.spec_url);
+        }
+      }
+    } else {
+      // If we get here, we’re processing data for two or more features
+      // and the 'data' variable will contain multiple blocks (objects)
+      // — one for each feature.
+      if (!data) {
+        return;
+      }
+      for (const block of Object.values(data)) {
+        if (!block) {
+          continue;
+        }
+        if (!("__compat" in block)) {
+          // Some features — e.g., css.properties.justify-content — have
+          // no compat data themselves but have subfeatures with compat
+          // data. So we recurse through the nested property values until
+          // we either do or don’t find any subfeatures with spec URLs.
+          // Otherwise, if we’re processing multiple top-level features
+          // (that is, from a browser-compat value which is an array),
+          // we’d end up entirely missing the data for this feature.
+          getSpecURLs(block as bcd.Identifier);
+        } else {
+          // If we get here, we’ve got a __compat key, and we can extract
+          // any spec URLs its value may contain.
+          const compat = block.__compat;
+          if (compat && compat.spec_url) {
+            if (Array.isArray(compat.spec_url)) {
+              specURLs.push(...compat.spec_url);
+            } else {
+              specURLs.push(compat.spec_url);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (query) {
+    for (const feature of query.split(",").map((id) => id.trim())) {
+      const { data } = packageBCD(feature);
+      // If 'data' is non-null, we have data for one or more BCD features
+      // that we can extract spec URLs from (getSpecURLs ignores null data).
+      getSpecURLs(data);
+    }
+  }
+
+  if (specURLsString !== "") {
+    // If specURLsString is non-empty, then it has the string contents
+    // of the document’s 'spec-urls' frontmatter key: one or more URLs.
+    specURLs.push(...specURLsString.split(",").map((url) => url.trim()));
+  }
+
+  // Eliminate any duplicate spec URLs
+  specURLs = [...new Set(specURLs)];
+
+  // Use BCD specURLs to look up more specification data
+  // from the browser-specs package
+  const specifications = specURLs
+    .map((specURL) => {
+      const spec = specs.find(
+        (spec) =>
+          specURL.startsWith(spec.url) ||
+          specURL.startsWith(spec.nightly.url) ||
+          spec.nightly.alternateUrls.some((s) => specURL.startsWith(s)) ||
+          // When grabbing series nightly, make sure we're grabbing the latest spec version
+          (spec.shortname === spec.series.currentSpecification &&
+            specURL.startsWith(spec.series.nightlyUrl))
+      );
+      const specificationsData = {
+        bcdSpecificationURL: specURL,
+        title: "Unknown specification",
+      };
+      if (spec) {
+        specificationsData.title = spec.title;
+      } else {
+        // Not in browser-specs: fall back to the "SpecData" JSON, matching
+        // on the URL without its "#..." part.
+        const specList = web.getJSONData("SpecData");
+        const titleFromSpecData = Object.keys(specList).find(
+          (key) => specList[key]["url"] === specURL.split("#")[0]
+        );
+        if (titleFromSpecData) {
+          specificationsData.title = titleFromSpecData;
+        }
+      }
+
+      return specificationsData;
+    })
+    .filter(Boolean); // every mapped entry is an object, so nothing is dropped
+
+  return specifications;
+}
diff --git a/build/extract-summary.ts b/build/extract-summary.ts
new file mode 100644
index 000000000000..589f8cca66d4
--- /dev/null
+++ b/build/extract-summary.ts
@@ -0,0 +1,59 @@
+import * as cheerio from "cheerio";
+import { ProseSection, Section } from "../libs/types/document";
+
+/**
+ * Given an array of sections, return the summary as a plain text
+ * string. No HTML or Kumascript allowed. Returns "" if none is found.
+ *
+ * A prose section titled "Summary" takes precedence. Without one, the
+ * prose sections are tried in order until one yields a good paragraph.
+ */
+export function extractSummary(sections: Section[]): string {
+  const isSummarySection = (section: Section): section is ProseSection =>
+    section.type === "prose" && section.value.title === "Summary";
+
+  const [dedicated] = sections.filter(isSummarySection);
+  if (dedicated) {
+    // A dedicated section is final, even if nothing good is found in it.
+    return extractFirstGoodParagraph(
+      cheerio.load(dedicated.value.content ?? "")
+    );
+  }
+
+  for (const section of sections) {
+    // Only prose sections that actually have content are candidates.
+    const content = section.type === "prose" && section.value?.content;
+    if (!content) {
+      continue;
+    }
+    const $ = cheerio.load(content);
+    // Remove non-p tags that we should not be looking inside.
+    $("div.notecard, div.note, div.blockIndicator").remove();
+    const summary = extractFirstGoodParagraph($);
+    if (summary) {
+      return summary;
+    }
+  }
+
+  // Nothing usable anywhere: fall back to an empty string.
+  return "";
+}
+
+// Returns the text of an explicitly marked summary element if it has
+// any, otherwise the text of the first usable <p>, otherwise "".
+function extractFirstGoodParagraph($: cheerio.CheerioAPI): string {
+  const marked = $("span.seoSummary, .summary");
+  const markedText = marked.length ? marked.text() : "";
+  if (markedText) {
+    return markedText;
+  }
+  let found = "";
+  $("p").each((_index, paragraph) => {
+    const text = found ? "" : $(paragraph).text().trim();
+    // Skip paragraphs that are empty or just a failing KS macro.
+    if (text && !text.includes("Redirect") && !text.startsWith("{{")) {
+      found = text;
+    }
+  });
+  return found;
+}
diff --git a/build/index.ts b/build/index.ts
index 1cba26a2897d..47721d252c23 100644
--- a/build/index.ts
+++ b/build/index.ts
@@ -13,11 +13,9 @@ import { CONTENT_ROOT, REPOSITORY_URLS } from "../libs/env";
 import * as kumascript from "../kumascript";
 
 import { FLAW_LEVELS } from "../libs/constants";
-import {
-  extractSections,
-  extractSidebar,
-  extractSummary,
-} from "./document-extractor";
+import { extractSidebar } from "./extract-sidebar";
+import { extractSections } from "./extract-sections";
+import { extractSummary } from "./extract-summary";
 export { default as SearchIndex } from "./search-index";
 import { addBreadcrumbData } from "./document-utils";
 import { fixFixableFlaws, injectFlaws, injectSectionFlaws } from "./flaws";
diff --git a/libs/types/document.ts b/libs/types/document.ts
index 9581a8b02702..9b525a1959fa 100644
--- a/libs/types/document.ts
+++ b/libs/types/document.ts
@@ -187,6 +187,11 @@ export interface ProseSection {
     titleAsText?: string;
   };
 }
+/** One entry of a "specifications" section: a spec URL plus the title resolved for it. */
+export interface Specification {
+  bcdSpecificationURL: any; // the spec URL this entry was built from
+  title: string; // "Unknown specification" when no title could be resolved
+}
 export interface SpecificationsSection {
   type: "specifications";
   value: {
@@ -194,10 +199,7 @@ export interface SpecificationsSection {
     title: string;
     isH3: boolean;
     query: string;
-    specifications: {
-      bcdSpecificationURL: any;
-      title: string;
-    }[];
+    specifications: Specification[];
   };
 }