From 17863ff741e5da5bad12af30d206f3a10bf1b0ed Mon Sep 17 00:00:00 2001 From: Francois Daoust Date: Thu, 17 Oct 2024 16:51:12 +0200 Subject: [PATCH] Make W3C script use the W3C API as source This is a complete re-write of the W3C update script to switch from the still-maintained-but-deprecated `tr.rdf` file to the more complete and current W3C API instead. What changes? Essentially nothing substantial in terms of data, but: - When an entry is updated, the `source` property will target the API endpoint from which the data was pulled, such as: `https://api.w3.org/specifications/_shortname_` - The W3C API has a few additional statuses that were not reported in `tr.rdf` such as `DNOTE`, `FPWD`, `LCWD`, and the registry statuses. - The script fills out properties more systematically for versions. - The order of the properties for each entry is not always exactly the same as the order generated by the previous script. The first time the script runs, it will: - Fix a few entries of very old specs in Specref, for which the title is not the title of the actual spec. - Add entries for the draft registries published by a couple of groups. - Complete a few entries with additional versions that did not exist in `tr.rdf` for some reason. - Create consistent `obsoletes` properties, as Specref contains a few `obsoletes` properties that don't have a matching `obsoletedBy` property. The script only updates recently published entries by default. That is, it does not attempt to refresh the whole list. That's needed because the W3C API follows the HAL convention: https://en.wikipedia.org/wiki/Hypertext_Application_Language One consequence is that each API request returns only a minimal amount of information, and re-generating the entire `w3c.json` file requires sending ~30000 requests, which would be at best impractical to do on an hourly basis, all the more so because the W3C API server has some rate limit rules in place (6000 requests every 10 minutes). 
More importantly, that would be a waste of resources as data essentially never changes once published. Thus the script takes an incremental approach instead and only refreshes: 1. Specifications recently published... where recently published means specifications published since the newest publication date known to Specref minus 2 weeks by default. The "minus 2 weeks" is meant to catch data fixes that are sometimes made shortly after publication. 2. Specifications for which the base info (title, URL) is not aligned with the W3C API. That's meant to fix the data in Specref during the transition, and to catch further updates that could be made to the W3C API once in a while. All in all, a typical update should send ~500 requests to the W3C API. The code throttles requests to 1 every 100ms. Running the script should take ~1-2 minutes. A more thorough refresh may be forced by calling the script with a date as a parameter (format YYYY-MM-DD, with month and day optional). The date gets interpreted as the synchronization point. For example, to refresh all specs published since 2023, run: ``` node scripts/w3c.js 2023 ``` To force a "full" refresh (1995 or any earlier year would work): ``` node scripts/w3c.js 1995 ``` A full refresh sends ~30k requests to the W3C API and may take >2h. I suggest running a full refresh manually once, shortly after this script starts being used, and then running it again every few months to capture potential fixes that might have been made to the data in the meantime. Running that full refresh will also be useful to fix the few `obsoletedBy` properties that are not fully correct, and to move a few `hasErrata` links to the right spec version, as some entries have these links at the root level of the entry in Specref, whereas the latest version is no longer the REC that linked to the errata. I worked with @deniak to fix and complete the data in the W3C API where Specref had more correct info. 
I also updated entries that contained incorrect info in Specref. The script contains a number of comments to explain the different cases that need to be handled to be able to fully map the data in the W3C API to the data in Specref. There will remain a few entries where the mapping is somewhat imperfect, notably when the shortname of a spec evolved from a level-less shortname to a shortname with a level, and sometimes back to a level-less shortname (examples include `user-timing`, `performance-timeline`). There are also a few entries for old specs that are flagged as retired in Specref (`isRetired: true`) but not in the W3C API. Mismatches are reported to the console as warnings. These should be addressed over time. The script preserves the information in Specref in any case. The script also preserves the information in Specref in case of transient network errors while fetching info from the W3C API. The new overwrite rules are needed during the transition (the changes need to be made at the same time as the data gets updated), but can be dropped afterwards. They affect specifications that switched from a shortname without a level to a shortname with a level. Longer term, these should be handled through the notion of specification series (see #811). 
--- overwrites/w3c.json | 24 +- scripts/get-shortname.js | 82 +--- scripts/helper.js | 1 + scripts/w3c.js | 919 ++++++++++++++++++++++++++------------- 4 files changed, 617 insertions(+), 409 deletions(-) diff --git a/overwrites/w3c.json b/overwrites/w3c.json index 6f67dd5fd..a466ea68a 100644 --- a/overwrites/w3c.json +++ b/overwrites/w3c.json @@ -1,18 +1,8 @@ [ - { "id": "WebIDL", "action": "delete" }, - { "id": "WebIDL-1", "action": "delete" }, - { "id": "WebIDL-1", "action": "createAlias", "aliasOf": "WEBIDL"}, - { "id": "websockets", "action": "delete" }, - { "id": "url", "action": "delete" }, - { "id": "fullscreen", "action": "delete" }, - { "id": "notifications", "action": "delete" }, - { "id": "dom", "action": "delete" }, - { "id": "encoding", "action": "delete" }, - { "id": "hr-time", "action": "createAlias", "aliasOf": "hr-time-3"}, - { "id": "resource-timing", "action": "deleteProp", "prop": "obsoletes"}, - { "id": "resource-timing-1", "action": "createAlias", "aliasOf": "resource-timing"}, - { "id": "PNG", "action": "renameTo", "newId": "PNG-1"}, - { "id": "PNG", "action": "createAlias", "aliasOf": "png-3"}, - { "id": "media-source", "action": "createAlias", "aliasOf": "media-source-2" }, - { "id": "encrypted-media", "action": "createAlias", "aliasOf": "encrypted-media-2" } -] + { "id": "vocab-dcat", "action": "delete" }, + { "id": "vocab-dcat", "action": "createAlias", "aliasOf": "vocab-dcat-1" }, + { "id": "wot-architecture", "action": "delete" }, + { "id": "wot-architecture", "action": "createAlias", "aliasOf": "wot-architecture10" }, + { "id": "wot-thing-description", "action": "delete" }, + { "id": "wot-thing-description", "action": "createAlias", "aliasOf": "wot-thing-description10" } +] \ No newline at end of file diff --git a/scripts/get-shortname.js b/scripts/get-shortname.js index 56a8f5337..fa0b51fdb 100644 --- a/scripts/get-shortname.js +++ b/scripts/get-shortname.js @@ -1,36 +1,9 @@ var TR_URL = /https?:\/\/www\.w3\.org\/TR\//; -var 
SPECIAL_CASES = { - 'https://www.w3.org/Search/9605-Indexing-Workshop/ReportOutcomes/S6Group2': "S6Group2", - 'https://www.w3.org/TR/1998/NOTE-P3P10-Protocols': "P3P10-Protocols", - 'https://www.w3.org/1999/05/WCA-terms/': "WCA-terms", - 'https://www.w3.org/TR/1998/WD-HTTP-NG-goals': "HTTP-NG-goals", - 'https://www.w3.org/TR/REC-html32': "HTML32", - 'https://www.w3.org/TR/2001/WD-xhtml1-20011004/': 'xhtml1' -}; - var SHORT_NAME_SPECIAL_CASES = { - "html": "w3c-html", - "NOTE-CSS-potential": "CSS-potential", - "NOTE-P3P10-principles": "P3P10-principles", - "NOTE-SYMM-modules": "SYMM-modules", - "NOTE-XML-FRAG-REQ": "XML-FRAG-REQ", - "NOTE-html-lan": "html-lan", - "NOTE-voice": "voice", - "NOTE-xh": "xh", - "NOTE-xlink-principles": "xlink-principles", - "NOTE-xml-canonical-req": "xml-canonical-req", - "NOTE-xml-infoset-req": "xml-infoset-req", - "NOTE-xml-schema-req": "xml-schema-req", - "NOTE-xptr-infoset-liaison": "xptr-infoset-liaison", - "NOTE-xptr-req": "xptr-req", - "NOTE-HTTP-NG-testbed": "HTTP-NG-testbed", - "NOTE-WCA": "WCA", - "NOTE-html40-mobile": "html40-mobile", - "NOTE-rdf-uml": "rdf-uml", - "NOTE-xlink-req": "xlink-req", "WD-DSIG-label-arch": "DSIG-label-arch", "WD-HTTP-NG-architecture": "HTTP-NG-architecture", + "WD-HTTP-NG-goals": "HTTP-NG-goals", "WD-HTTP-NG-interfaces": "HTTP-NG-interfaces", "WD-HTTP-NG-wire": "HTTP-NG-wire", "WD-P3P-arch": "P3P-arch", @@ -38,67 +11,19 @@ var SHORT_NAME_SPECIAL_CASES = { "WD-SVGReq": "SVGReq", "WD-XSLReq": "XSLReq", "WD-acss": "acss", - "WD-font": "font", "WD-http-pep": "http-pep", "WD-ilu-requestor": "ilu-requestor", "WD-jepi-uppflow": "jepi-uppflow", "WD-mux": "mux", "WD-positioning": "positioning", "WD-print": "print", - "NOTE-agent-attributes": "agent-attributes", - "NOTE-amaya": "amaya", - "NOTE-AS": "AS", - "NOTE-authentform": "authentform", - "NOTE-CCPPexchange": "CCPPexchange", - "NOTE-CSS-smil": "CSS-smil", - "NOTE-datetime": "datetime", - "NOTE-dcd": "dcd", - "NOTE-ddml": "ddml", - "NOTE-drawml": 
"drawml", - "NOTE-drp": "drp", - "NOTE-eu-conf": "eu-conf", - "NOTE-expamaya": "expamaya", - "NOTE-framework-970706": "framework-970706", - "NOTE-gdiff": "gdiff", - "NOTE-HGML": "HGML", - "NOTE-html": "html", - "NOTE-HTMLComponents": "HTMLComponents", - "NOTE-HTMLplusTIME": "HTMLplusTIME", - "NOTE-http-edit-dist-scenarios": "http-edit-dist-scenarios", - "NOTE-ice": "ice", - "NOTE-IPWG-Practices": "IPWG-Practices", - "NOTE-jepi": "jepi", - "NOTE-link": "link", - "NOTE-MCF-XML": "MCF-XML", - "NOTE-P3P-CACM": "P3P-CACM", - "NOTE-PICS-Cookie-extension": "PICS-Cookie-extension", - "NOTE-pics-ng-metadata": "pics-ng-metadata", - "NOTE-PICS-Statement": "PICS-Statement", - "NOTE-PIDL": "PIDL", - "NOTE-rdf-simple-intro": "rdf-simple-intro", - "NOTE-rdfarch": "rdfarch", - "NOTE-SDML": "SDML", - "NOTE-SOX": "SOX", - "NOTE-stts2": "stts2", - "NOTE-STTS3": "STTS3", - "NOTE-TPRC-970930": "TPRC-970930", - "NOTE-uclp": "uclp", - "NOTE-VML": "VML", - "NOTE-WAP": "WAP", - "NOTE-webarch-extlang": "webarch-extlang", - "NOTE-widl": "widl", - "NOTE-XFDL": "XFDL", - "NOTE-xml-ql": "xml-ql", - "NOTE-XSL": "XSL", - "NOTE-XSL-and-CSS": "XSL-and-CSS", - "REC-CSS1": "CSS1", "REC-DOM-Level-1": "DOM-Level-1", "REC-DSig-label": "DSig-label", + "REC-html32": "HTML32", "REC-MathML": "MathML", "REC-PICS-labels": "PICS-labels", "REC-PICS-services": "PICS-services", "REC-PICSRules": "PICSRules", - "REC-rdf-syntax": "rdf-syntax-grammar", "REC-smil": "smil", "REC-WebCGM": "WebCGM", "WD-logfile": "logfile", @@ -111,10 +36,9 @@ var SHORT_NAME_SPECIAL_CASES = { } function getShortName(url) { - if (SPECIAL_CASES[url]) return SPECIAL_CASES[url]; var parts = url.replace(TR_URL, "").split("/").filter(function(p) { return p != "/" && p != ''; }); if (parts.length > 1) throw new Error("Can't identify shortName from url " + url); - var part = parts[0] + var part = parts[0]; return SHORT_NAME_SPECIAL_CASES[part] || part; } diff --git a/scripts/helper.js b/scripts/helper.js index 63c5d84b8..c36945843 100644 --- 
a/scripts/helper.js +++ b/scripts/helper.js @@ -133,5 +133,6 @@ function tryOverwrite(f) { delete ref[action.prop]; } }); + references = sortRefs(references); writeBiblio(f, references); } diff --git a/scripts/w3c.js b/scripts/w3c.js index 17e1f58a7..e0f0df4c2 100755 --- a/scripts/w3c.js +++ b/scripts/w3c.js @@ -1,354 +1,647 @@ #!/usr/bin/env node -var request = require('request'), - userAgent =require("./user-agent"), - xml2js = require('xml2js'), - bibref = require('../lib/bibref'), - helper = require('./helper'), - getShortName = require('./get-shortname'), - leveled = require('./leveled'); - -var RDF_FILE = "https://www.w3.org/2002/01/tr-automation/tr.rdf"; -var FILENAME = "w3c.json"; -var current = helper.readBiblio(FILENAME); - -var STATUSES = { - 'NOTE': 'NOTE', - 'REC': 'REC', - 'CR': 'CR', - 'WD': 'WD', - 'LastCall': 'LCWD', - 'PER': 'PER', - 'PR': 'PR' -}; +const userAgent =require('./user-agent'); +const bibref = require('../lib/bibref'); +const helper = require('./helper'); +const getShortName = require('./get-shortname'); + +// Read CLI arguments +const args = process.argv.slice(2); +const verbose = !!args.find(arg => arg === '--verbose' || arg === '-v'); +let since = args.find(arg => arg.match(/^\d{4}(-\d\d(-\d\d)?)?$/)); +if (!since && args.find(arg => !arg.startsWith('-'))) { + console.log( +`Cannot interpret the provided parameter as a date. +Please provide a date following the format YYYY, YYYY-MM, or YYYY-MM-DD format. 
+Or run the script without parameter to update recently published specs`); + process.exit(1); +} + +const W3C_API = "https://api.w3.org/"; +const FILENAME = "w3c.json"; +const current = helper.readBiblio(FILENAME); -var TR_URLS = { - "https://www.w3.org/TR/REC-CSS1": "https://www.w3.org/TR/CSS1/", - "https://www.w3.org/TR/REC-CSS2": "https://www.w3.org/TR/CSS2/", - "https://www.w3.org/TR/REC-DOM-Level-1": "https://www.w3.org/TR/DOM-Level-1/", - "https://www.w3.org/TR/REC-DSig-label/": "https://www.w3.org/TR/DSig-label/", - "https://www.w3.org/TR/REC-MathML": "https://www.w3.org/TR/MathML/", - "https://www.w3.org/TR/REC-PICS-labels": "https://www.w3.org/TR/PICS-labels/", - "https://www.w3.org/TR/REC-PICS-services": "https://www.w3.org/TR/PICS-services/", - "https://www.w3.org/TR/REC-PICSRules": "https://www.w3.org/TR/PICSRules/", - "https://www.w3.org/TR/REC-WebCGM": "https://www.w3.org/TR/WebCGM/", - "https://www.w3.org/TR/REC-png": "https://www.w3.org/TR/PNG/", - "https://www.w3.org/TR/REC-rdf-syntax": "https://www.w3.org/TR/rdf-syntax-grammar/", - "https://www.w3.org/TR/REC-smil/": "https://www.w3.org/TR/SMIL/", - "https://www.w3.org/TR/REC-xml-names": "https://www.w3.org/TR/xml-names/", - "https://www.w3.org/TR/REC-xml": "https://www.w3.org/TR/xml/", - "https://www.w3.org/TR/xml-events": "https://www.w3.org/TR/xml-events2/", - "https://www.w3.org/TR/2001/WD-xhtml1-20011004/": "https://www.w3.org/TR/xhtml1/", +// Record the entries that used to be aliases and that get turned into concrete +// entries, so that we may reverse the alias link in the end. +const aliasesToInvert = {}; + +// Record the number of requests sent to the W3C API +let requestsToW3CApi = 0; + +/** + * Specref uses abbreviations for W3C statuses. + * + * TODO: It would probably be better to use longer forms throughout. That would + * require updating all statuses in refs/w3c.json at once, and making sure that + * consumers are aware of the change first... 
+ * + * Note that the W3C API considers that the status of a Retired spec is + * "Retired", while Specref records that information on the side and uses the + * status of the spec before it got retired. + */ +const STATUSES = { + 'First Public Working Draft': 'FPWD', + 'Working Draft': 'WD', + 'Last Call Working Draft': 'LCWD', + 'Candidate Recommendation': 'CR', + 'Candidate Recommendation Draft': 'CRD', + 'Candidate Recommendation Snapshot': 'CR', + 'Proposed Recommendation': 'PR', + 'Proposed Edited Recommendation': 'PER', + 'Recommendation': 'REC', + 'Draft Note': 'DNOTE', + 'Note': 'NOTE', + 'Draft Registry': 'DRY', + 'Candidate Registry Draft': 'CRYD', + 'Candidate Registry': 'CRY', + 'Registry': 'RY', + 'Statement': 'STMT' }; +function getStatus(version, versions) { + if (version.status in STATUSES) { + return STATUSES[version.status]; + } + if (version.status === 'Retired') { + // Let's look for something that looks like a W3C status in the URL of + // the retired spec and fallback to the status of the first previous + // version for which we can determine a non-retired status otherwise. + // Note: while that may seem strange, more than one "Retired" version + // of a spec may be published for a single spec. 
For an example, see: + // https://api.w3.org/specifications/cselection/versions?embed=1 + const reStatus = /^https:\/\/www\.w3\.org\/TR\/(?:\d{4}\/)?([^-]+)-.*-\d{6,8}/; + let seen = false; + for (let i = versions.length - 1; i >= 0; i--) { + const currVersion = versions[i]; + if (!seen) { + seen = currVersion === version; + } + if (!seen) { + continue; + } + if (currVersion.status in STATUSES) { + return STATUSES[currVersion.status]; + } + const match = currVersion.uri.match(reStatus); + if (match && Object.values(STATUSES).includes(match[1])) { + return match[1]; + } + } -function convertToHttps(url) { - if (url) { - url = url.replace(/^http:\/\/www\.w3\.org/, "https://www.w3.org"); - url = url.replace(/http:\/\/([a-zA-Z0-9_-]+)\.github\.io/, "https://$1.github.io"); + // And then, there's WCA-Terms, which just exists as a "Retired" spec + // published in 1999 in the W3C API (that's the only case): + // https://api.w3.org/specifications/WCA-terms + if (version.shortlink === 'https://www.w3.org/1999/05/WCA-terms/') { + return STATUSES['Working Draft']; + } } - return url; + console.error(`- No good status in ${version._links.self.href} for ${version.title}, using "WD"`); + return STATUSES['Working Draft']; } function makeKey(ref) { return ref.rawDate.replace(/\-/g, ''); } -function getKey(shortname) { - return shortname.replace(/^.*?(\d+)$/, "$1"); +/** + * Helper function to sleep for a specified number of milliseconds + */ +function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms, 'slept')); +} + +/** + * The W3C API has a rate limit of 6000 requests every 10 minutes. + * Hopefully, we won't need to send that many requests, but let's + * throttle requests to 10 per second to err on the safe side. 
+ */ +async function fetchW3CApi(url) { + const fetchParams = { + headers: [['User-Agent', userAgent()]] + }; + try { + const response = await fetch(url, fetchParams); + requestsToW3CApi++; + await sleep(100); + return response; + } + catch (err) { + console.error(`Network error while fetching ${url}`); + console.error(err); + return null; + } +} + +/** + * Most W3C API requests return responses that are paginated. This function + * retrieves and merges all pages for a particular endpoint, using embedded + * information if so requested. + */ +async function fetchW3CPages(endpoint, property, embed) { + let nbPages = 1; + let page = 1; + let baseUrl = (endpoint.startsWith('https') ? '' : W3C_API) + + endpoint + '?embed=' + (embed ? '1' : '0') + + '&page='; + let res = []; + while (page <= nbPages) { + const url = baseUrl + page; + if (verbose) { + console.log(`Fetching ${url}`); + } + const response = await fetchW3CApi(url); + if (!response) { + return null; + } + if (response.status !== 200) { + console.error(`W3C API server error while fetching ${url}: HTTP status ${response.status} received`); + return null; + } + const json = await response.json(); + nbPages = json.pages; + const pageRes = embed ? + json._embedded[property] : + json._links[property]; + if (pageRes) { + res = res.concat(pageRes); + } + page += 1; + } + return res; +} + +/** + * Return true if the entry in Specref contains outdated info about the spec + * compared to the W3C API. + * + * Note the comparison is only about generic spec info (title, ED) as more + * requests to the W3C API are needed to get more info about the spec. 
+ */ +function containsOutdatedInfo(curr, w3cSpec, w3cLatestDate) { + return makeKey(curr) <= w3cLatestDate && + (curr.href !== w3cSpec.shortlink || + curr.title !== w3cSpec.title || + curr.edDraft && w3cSpec['editor-draft'] && + curr.edDraft !== w3cSpec['editor-draft']); } -function isLegacyLevel(shortname) { - return leveled.getLevel(shortname).indexOf("-") < 0; +/** + * Return the latest publication date (format YYYYMMDD) of the given W3C spec + */ +function getLatestDate(w3cSpec) { + const latestDateUrl = w3cSpec._links['latest-version'].href; + return latestDateUrl.match(/versions\/(\d{8})$/)[1]; } -var parser = new xml2js.Parser(); -console.log("Updating W3C references..."); -request({ - url: RDF_FILE, - headers: { - 'User-Agent': userAgent() +/** + * Update the Specref entry from the data returned by the W3C API, fetching + * additional detailed information about versions as needed. + * + * Note: to avoid sending too many requests to the W3C API, we assume that old + * data about the list of editors and deliverers is correct. We will only fetch + * that information for recent versions. + */ +async function updateSpecrefFromW3CApi(curr, w3cSpec, fromDate) { + // Set base information returned by the W3C API + curr.href = w3cSpec.shortlink; + curr.title = w3cSpec.title; + if (w3cSpec['editor-draft']) { + curr.edDraft = w3cSpec['editor-draft']; + } + else if (curr.edDraft) { + console.warn(`- ${w3cSpec.shortname}: ED URL in Specref but not in the W3C API - ${curr.edDraft}`); + } + curr.publisher = 'W3C'; + curr.source = w3cSpec._links.self.href; + + // Fetch the list of specs that supsersede the current entry, if needed. 
+ if (w3cSpec._links['superseded-by']) { + curr.isSuperseded = true; + const supersededBy = await fetchW3CPages( + w3cSpec._links['superseded-by'].href, 'superseded', true); + if (supersededBy) { + curr.obsoletedBy = supersededBy.map(spec => spec.shortname); + } + else { + console.error(`- ${w3cSpec.shortname}: could not retrieve the list of superseding specs from the W3C API`); + } + } + else if (curr.isSuperseded || curr.obsoletedBy) { + console.warn(`- ${w3cSpec.shortname}: superseded in Specref but not in the W3C API.`); } -}, function(err, response, body) { - if (err || response.statusCode !== 200) { - console.log("Can't fetch", RDF_FILE + "..."); + + // Retrieve the list of versions from the W3C API. The list includes + // versions directly associated with the specification, as well as those + // that are relevant to the history of the spec but that are actually + // associated with another specification. Let's drop the latter one (they + // will be handled as part of that other specification). 
+ let versions = await fetchW3CPages(w3cSpec._links['version-history'].href, 'version-history', true); + if (!versions) { + console.error(`- ${w3cSpec.shortname}: could not retrieve versions info from the W3C API`); return; } - - console.log("Fetching", RDF_FILE + "...") - parser.parseString(body, function (err, result) { - var refs = result['rdf:RDF']; - var output = []; - var aliases = {}; - var levels = {}; - var superseders = {}; - - Object.keys(STATUSES).forEach(function(k) { - if (refs[k]) { - var clean = makeCleaner(STATUSES[k]); - refs[k].forEach(function(ref) { - output.push(clean(ref)); - }); - } - }); - - var clean; - if (refs.FirstEdition) { - clean = makeCleaner(void 0); - refs.FirstEdition.forEach(function(ref) { - output.push(clean(ref)); - }); - } - - if (refs.Retired) { - clean = makeCleaner(void 0, true); - refs.Retired.forEach(function(ref) { - output.push(clean(ref)); - }); - } - - if (refs.Superseded) { - clean = makeCleaner(void 0, void 0, true); - refs.Superseded.forEach(function(ref) { - output.push(clean(ref)); - }); - } - - if (refs["rdf:Description"]) { - refs["rdf:Description"].forEach(function(ref) { - var url = convertToHttps(walk(ref, "$", "rdf:about")); - var sn = getShortName(TR_URLS[url] || url); - - var former = walk(ref, "formerShortname"); - if (former) { - url = convertToHttps(walk(ref, "$", "rdf:about")); - sn = getShortName(TR_URLS[url] || url); - former.forEach(function(item) { - if (item == sn) return; - if (aliases[item] && aliases[item] !== sn) { - console.log("Want to alias [" + item + "] to [" + sn + "] but it's already aliased to [" + aliases[item] + "]." 
) ; - return; - } - aliases[item] = sn; - }); - return; - } - var supersedes = walk(ref, "supersedes"); - if (supersedes) { - url = convertToHttps(walk(ref, "$", "rdf:about")); - sn = getShortName(TR_URLS[url] || url); - superseders[sn] = supersedes.map(function(item) { - var url = convertToHttps(walk(item, "$", "rdf:resource")); - return getShortName(TR_URLS[url] || url); - }); - } - }); - } - - function isCircular(k) { - var keys = { k: true }; - while (k in aliases) { - if (k in keys) return true; - k = aliases[k]; - keys[k] = true; + versions = versions.filter(version => + version._links.specification.href === w3cSpec._links.self.href + ); + versions.sort((v1, v2) => v1.date.localeCompare(v2.date)); + + const latestVersion = versions[versions.length - 1]; + for (const version of versions) { + version.rawDate = version.date; + const key = makeKey(version); + if (key > fromDate || version === latestVersion) { + // Recent (or last) version, fetch editors and deliverers + // (If that yields an error, we will just preserve whatever info + // already exists in Specref until next time the script runs) + version.editors = await fetchW3CPages(version._links.editors.href, 'editors', false); + if (!version.editors) { + console.error(`- ${w3cSpec.shortname} (${key}): could not retrieve the list of editors from the W3C API`); } - return false; - } - - var circular = []; - Object.keys(aliases).forEach(function(k) { - if (isCircular(k)) { - console.log(k, "=>", aliases[k]); - circular.push(k) + version.deliverers = await fetchW3CPages(version._links.deliverers.href, 'deliverers', true); + if (version.deliverers) { + // Note: the W3C API associates very old specs with a fake + // group named "unknownwg". 
+ version.deliverers = version.deliverers.filter(g => + g.shortname !== 'unknownwg'); } - }); - - Object.keys(aliases).forEach(function(k) { - if (circular.indexOf(k) >= 0 || circular.indexOf(aliases[k]) >= 0) { - delete aliases[k]; + else { + console.error(`- ${w3cSpec.shortname} (${key}): could not retrieve the list of deliverers from the W3C API`); } - }); - - // Fill in missing specs - output.forEach(function(ref) { - var k = ref.shortName; - if (leveled.isLevel(k)) { - k = leveled.getRootShortname(k); - levels[ref.shortName] = k + "-" + makeKey(ref); - delete aliases[k]; + } + if (!curr.versions) { + curr.versions = {}; + } + let currVersion = curr.versions[key]; + if (!currVersion) { + // Unknown version in Specref, let's add it + curr.versions[key] = {}; + currVersion = curr.versions[key]; + } + else if (currVersion.aliasOf) { + // Version was known as being an alias in Specref, let's align with + // the W3C API instead and make it a concrete entry. We'll make the + // other entry an alias of this one at the end if needed + console.log(`- ${w3cSpec.shortname} (${key}): drop alias to ${currVersion.aliasOf} in Specref`); + aliasesToInvert[currVersion.aliasOf] = w3cSpec.shortname + '-' + key; + delete currVersion.aliasOf; + } + if (version.editors?.length > 0) { + currVersion.authors = version.editors + .map(editor => editor.title); + } + currVersion.href = version.uri; + currVersion.title = version.title; + currVersion.rawDate = version.rawDate; + currVersion.status = getStatus(version, versions); + currVersion.publisher = "W3C"; + if (version.deliverers?.length > 0) { + // Note: the W3C API associates very old specs with a fake group + // named "unknownwg". 
+ currVersion.deliveredBy = version.deliverers + .map(group => group._links.homepage.href); + } + if (version.status === 'Retired') { + currVersion.isRetired = true; + } + else if (currVersion.isRetired) { + console.warn(`- ${w3cSpec.shortname} (${version.date}): retired in Specref but not in the W3C API, see ${version._links.self.href}`); + } + if (version.errata) { + currVersion.hasErrata = version.errata; + } + else if (currVersion.hasErrata) { + console.warn(`- ${w3cSpec.shortname} (${version.date}): errata in Specref but not in the W3C API - ${currVersion.hasErrata}`); + } + currVersion.source = version._links.self.href; + } + if (curr.versions) { + curr.versions = helper.sortRefs(curr.versions); + } + + // Complete base info with the info from the latest version + curr.rawDate = latestVersion.date; + curr.status = getStatus(latestVersion, versions); + if (latestVersion.editors?.length > 0) { + curr.authors = latestVersion.editors + .map(editor => editor.title); + } + if (latestVersion.deliverers?.length > 0) { + curr.deliveredBy = latestVersion.deliverers + .map(group => group._links.homepage.href); + } + if (latestVersion.status === 'Retired') { + curr.isRetired = true; + } + else if (curr.isRetired) { + console.warn(`- ${w3cSpec.shortname}: retired in Specref but not in the W3C API.`); + } + if (latestVersion.errata) { + curr.hasErrata = latestVersion.errata; + } + else if (curr.hasErrata) { + if (versions.find(version => version.errata)) { + // The errata link was for a previous version, it shouldn't be + // kept as generic info since it's no longer current. + delete curr.hasErrata; + } + else { + console.warn(`- ${w3cSpec.shortname}: errata in Specref but not in the W3C API - ${curr.hasErrata}`); + } + } +} + + +/** + * Update W3C references + * + * Steps: + * 1. retrieve the list of >1600 specifications from the W3C API. + * 2. use the "latest-version" link under "_links" to determine the last time + * a spec got published. + * 3. 
If that last time is greater than the last known version in Specref, + * retrieve information about the new versions published since the last known + * version, and update Specref. + * + * Note: We try to limit the number of updates because each update requires + * sending a network requests to the W3C API, which takes time. + * + * Note: In this model, the publication date is used as a synonym of the last + * modification date for the data. That is correct in 99% of all cases. Once in + * a while, the initial data in the W3C API needs fixing, and fixes will be + * missed until the next time the spec gets published. Such errors are rare and + * are usually detected and fixed within a few days after publication. To + * be more resilient, the "sinceDate" parameter may be used to also consider + * versions published after that date as worthy of being updated. + */ +async function updateW3CRefs(sinceDate, verbose) { + // Entries use a compat format for dates + sinceDate = sinceDate.replace(/\-/g, ''); + + // Retrieve the list of W3C specifications from the W3C API. + const specs = await fetchW3CPages('specifications', 'specifications', true); + if (!specs) { + console.error('- Could not retrieve the list of W3C specifications from the W3C API'); + return; + } + const mapped = []; + for (const spec of specs) { + const latestDate = getLatestDate(spec); + + // Note: for historical reasons, the W3C API uses shortnames for a few + // very old specs that start with the specification status, for example + // "WD-mux" instead of "mux". The exact list is hardcoded in + // getShortName. + const shortname = getShortName(spec.shortname); + + // Let's look for a corresponding entry in Specref. If none is found, + // we'll add one. If one is found, we'll consider updating it, unless + // the spec has not been published after the treshold date. + // Note: for historical reasons, the W3C API lists ~30 legacy "specs" + // that are not real specs but rather team submissions. 
These specs + // had been removed from "tr.rdf" and have never been added to Specref. + // The check on the date skips them ("real" specs published before 2000 + // are already in Specref, and a `curr` will be found for them). + let curr = current[shortname] ?? current[shortname.toUpperCase()]; + if (!curr && !(shortname in bibref.get(shortname)) && + latestDate >= '2000') { + // Some W3C specs have shortnames that do not match the shortname + // that appears in the /TR URL. That is usually because the + // shortname evolved over time for some reason, usually because + // of a change in the way versions/levels were handled. Specref + // often knows these specs under the shortname that appears in the + // /TR URL. One such practical example is "rdf11-turtle", published + // at https://www.w3.org/TR/turtle/ and known as "turtle". + const altShortname = getShortName(spec.shortlink); + curr = current[altShortname] ?? current[altShortname.toUpperCase()]; + if (!curr && !(altShortname in bibref.get(altShortname))) { + console.log(`- ${spec.shortname} (${latestDate}): add the spec to Specref`); + curr = { rawDate: '' }; + current[shortname] = curr; } - var curr = current[k]; - if (curr) { - for (var prop in ref) { - if (typeof ref[prop] !== "undefined") curr[prop] = ref[prop]; + else { + const altSpec = specs.find(spec => spec.shortname === altShortname); + if (altSpec) { + // The alternate shortname also exists in the W3C API. + // That happens rarely, but signals that the spec went + // through multiple cycles in the W3C API, typically one as + // a non-leveled instance, and one as a leveled instance. + // Two known examples: + // - uievents-old / uievents + // - performance-timeline-1 / performance-timeline + // Specref typically merges both entries. Let's ignore the + // "old" W3C API entry when that happens. 
+ const altLatestDate = getLatestDate(altSpec); + if (altLatestDate > latestDate) { + continue; + } } - curr.href = curr.trURL; - delete curr.date; - delete curr.trURL; - delete curr.shortName; - delete curr.aliasOf; - } else { - var clone = _cloneJSON(ref); - clone.href = clone.trURL; - delete clone.trURL; - delete clone.shortName; - current[k] = clone; + console.log(`- ${spec.shortname} (${latestDate}): /TR shortname "${altShortname}" used in Specref`); } - }); - - // Fill in missing previous versions - output.forEach(function(ref) { - var sN = ref.shortName; - if (leveled.isLevel(sN)) { - sN = leveled.getRootShortname(sN); + } + if (curr && !curr.aliasOf && makeKey(curr) > latestDate) { + // Specref has more recent info about the spec than the W3C API. + // That happens when the W3C API uses multiple entries for what + // Specref considers to be the same spec, usually because of + // versioning (examples: performance-timeline, user-timing). + // TODO: Improve information in Specref once specification series + // are supported. + console.log(`- ${spec.shortname} (${latestDate}): more recent info (${makeKey(curr)}) in Specref`); + continue; + } + if (!curr) { + // Ignore the spec, it's either known in Specref from a different + // source or it's a very old spec that did not appear in tr.rdf for + // some reason and that no one raised as worthy of addition since + // Specref started listing W3C specs. 
+ if (latestDate >= '2000') { + console.log(`- ${spec.shortname} (${latestDate}): spec in another source in Specref`); } - var cur = current[sN]; - cur.versions = cur.versions || {}; - var key = makeKey(ref); - var prev = cur.versions[key]; - if (prev) { - if (prev.aliasOf) { - return; - } - for (var prop in ref) { - if (typeof ref[prop] !== "undefined") prev[prop] = ref[prop]; - } - delete prev.date; - delete prev.trURL; - delete prev.shortName; - delete prev.edDraft; - } else { - var clone = _cloneJSON(ref); - delete clone.trURL; - delete clone.shortName; - delete clone.edDraft; - cur.versions[key] = clone; + else { + console.log(`- ${spec.shortname} (${latestDate}): legacy spec not in Specref`); } - }); - - Object.keys(aliases).forEach(function(k) { - var aliasShortname = aliases[k]; - var alias = current[aliasShortname]; - try { - while (alias.aliasOf) { - aliasShortname = alias.aliasOf; - alias = current[aliasShortname]; - } - var old = current[k]; - if (old && old.versions) { - alias.versions = alias.versions || {}; - for (var prop in old.versions) { - if (!alias.versions[prop]) { - alias.versions[prop] = old.versions[prop]; - } - } + continue; + } + + // Record the fact that we found a mapping between the Specref entry + // and the W3C entry. + if (mapped.find(entry => entry === curr)) { + console.error(`- ${spec.shortname} (${latestDate}): second mapping for ${curr.href} in Specref, how come?`); + } + mapped.push(curr); + + // We could update all specs, but an update potentially requires + // sending lots of requests to the W3C API, so the goal is to do that + // only for a limited number of specs. We'll look at the latest version + // of the spec that Specref knows about. If a "more recent" version of + // the spec was published, the entry in Specref needs to be updated. To + // trap potential data fixes that may have been made shortly after + // publication, we'll also force an update if the last publication was + // recent enough. 
+ if (curr.aliasOf) { + // The handful of aliases caught here were typically added manually + // for some reason, usually because a new level of the spec was + // published. For example: "png-2" maps to "PNG" in Specref (that's + // the /TR shortname), and "PNG" is an alias of "png-3" in Specref. + // The aliases are for older versions of the spec in any case, they + // can be ignored. + console.log(`- ${spec.shortname} (${latestDate}): alias of ${curr.aliasOf} in Specref`); + } + else if (latestDate > makeKey(curr)) { + console.log(`- ${spec.shortname} (${latestDate}): add recently published versions to Specref`); + await updateSpecrefFromW3CApi(curr, spec, + makeKey(curr) > sinceDate ? sinceDate : makeKey(curr)); + } + else if (latestDate > sinceDate) { + console.log(`- ${spec.shortname} (${latestDate}): refresh recently published versions in Specref`); + await updateSpecrefFromW3CApi(curr, spec, + makeKey(curr) > sinceDate ? sinceDate : makeKey(curr)); + } + else if (containsOutdatedInfo(curr, spec, latestDate)) { + console.log(`- ${spec.shortname} (${latestDate}): update outdated info in Specref`); + await updateSpecrefFromW3CApi(curr, spec, ''); + } + } + + // In theory, all Specref entries (except aliases) should have a + // corresponding entry in the W3C API. If not, something is wrong! + for (const [shortname, entry] of Object.entries(current)) { + if (mapped.find(e => e === entry)) { + // All good, we already mapped the entry to a W3C entry + continue; + } + if (entry.aliasOf) { + // Aliases are essentially created once and for all. They need to + // be preserved forever, no need to worry too much about them. + continue; + } + if (shortname.match(/-\d{8}$/)) { + // These dated entries were typically added to Specref to preserve + // links that would otherwise have disappeared, following a change + // of shortname. Same as with aliases, we need to preserve them + // forever. 
+ continue; + } + const spec = specs.find(spec => + spec.shortname.toLowerCase() === shortname.toLowerCase() || + spec.shortlink === entry.href); + if (spec) { + continue; + } + if (!entry.href.match(/^https:\/\/www\.w3\.org\/TR\//)) { + console.error(`- ${shortname}: not a /TR spec, should be moved to refs/biblio.json (href: ${entry.href})`); + continue; + } + + // The W3C API listing does not include superseded specs by + // default. Let's check whether the spec is one of them. + const url = W3C_API + 'specifications/' + shortname; + if (verbose) { + console.log(`Fetching ${url}`); + } + const resp = await fetchW3CApi(url); + if (!resp) { + console.error(`- ${shortname}: could not retrieve info from the W3C API`); + } + else if (resp.status === 200) { + const json = await resp.json(); + if (json._links?.['superseded-by']) { + entry.isSuperseded = true; + const supersededBy = await fetchW3CPages( + json._links['superseded-by'].href, 'superseded', true); + if (supersededBy) { + entry.obsoletedBy = supersededBy.map(spec => spec.shortname); } - } catch(e) { - var root = current[leveled.getRootShortname(aliasShortname)]; - if (!root || !root.versions || !root.versions[getKey(aliasShortname)]) { - if (aliasShortname in bibref.get(aliasShortname)) { - return; - } - throw new Error("Missing data for spec " + aliasShortname); + else { + console.error(`- ${shortname}: could not retrieve the list of superseding specs from the W3C API`); } } - current[k] = { aliasOf: leveled.isLevel(aliasShortname) ? 
leveled.getRootShortname(aliasShortname) : aliasShortname }; - }); - - Object.keys(levels).forEach(function(k) { - current[k] = { aliasOf: levels[k] }; - }); - - Object.keys(superseders).forEach(function(id) { - var obsoletes = superseders[id].filter(function(k) { - return current[k]; - }); - if (!obsoletes.length) return; - current[id].obsoletes = obsoletes; - current[id].obsoletes.forEach(function(k) { - if (typeof current[k] == "object") { - current[k].obsoletedBy = [id]; - } - }); - }); - - console.log("Sorting references..."); - var sorted = {}, needUpdate = []; - Object.keys(current).sort().forEach(function(k) { - var ref = current[k]; - sorted[k] = current[k]; - delete ref.shortName; - if (isGeneratedByThisScript(ref)) { - needUpdate.push(ref) + else if (json.shortname !== shortname) { + console.log(`- ${shortname}: alias of ${json.shortname} in the W3C API`); } - }); - - console.log("updating existing refs.") - needUpdate.forEach(function(ref) { - var latest = bibref.findLatest(ref); - if (latest && !latest.aliasOf && latest.rawDate !== ref.rawDate) { - if (latest.title) ref.title = latest.title; - if (latest.rawDate) ref.rawDate = latest.rawDate; - if (latest.status) ref.status = latest.status; - if (latest.publisher) ref.publisher = latest.publisher; - if (latest.isRetired) ref.isRetired = latest.isRetired; - if (latest.isSuperseded) ref.isSuperseded = latest.isSuperseded; + else { + console.error(`- ${shortname}: exists in the W3C API but not returned in the default listing, why?`); } - }); - helper.writeBiblio(FILENAME, sorted); - helper.tryOverwrite(FILENAME); - }); -}); - -function makeCleaner(status, isRetired, isSuperseded) { - var AUTHORS_DICTIONARY = { - "Edward O'Connor": "Theresa O'Connor" + } + else if (resp.status === 404) { + console.error(`- ${shortname}: unknown to the W3C API, how come?`); + } + else { + console.error(`- ${shortname}: could not retrieve info from the W3C API, HTTP status ${resp.status} received`); + } } - return 
function(spec) { - var authors = walk(spec, "editor"); - authors = authors ? authors.map(function(e) { - return walk(e, "contact:fullName", 0) || walk(e, "org:name", 0); - }).map(function(a) { - return AUTHORS_DICTIONARY[a] || a; - }) : void 0; - var type = walk(spec, "rdf:type", 0, "$", "rdf:resource"); - var obj = { - authors: authors, - href: convertToHttps(walk(spec, "$", "rdf:about").trim()), - title: walk(spec, "dc:title", 0), - rawDate: walk(spec, "dc:date", 0), - status: status, - publisher: "W3C", - isRetired: isRetired || (type == "https://www.w3.org/2001/02pd/rec54#Retired") || (type == "http://www.w3.org/2001/02pd/rec54#Retired") || void 0, - isSuperseded: isSuperseded, - trURL: convertToHttps(walk(spec, "doc:versionOf", 0, "$", "rdf:resource")), - edDraft: convertToHttps(walk(spec, "ED", 0, "$", "rdf:resource")), - deliveredBy: walk(spec, "org:deliveredBy"), - hasErrata: convertToHttps(walk(spec, "mat:hasErrata", 0, "$", "rdf:resource")), - source: RDF_FILE - }; - obj.deliveredBy = obj.deliveredBy ? obj.deliveredBy.map(function(r) { return convertToHttps(walk(r, "contact:homePage", 0, "$", "rdf:resource")); }) : obj.deliveredBy; - obj.trURL = TR_URLS[obj.trURL] || obj.trURL; - obj.edDraft = convertToHttps(obj.edDraft); - obj.shortName = getShortName(obj.trURL); - return obj; + + console.log("Invert aliases that we turned into concrete entries..."); + for (const [name, alias] of Object.entries(aliasesToInvert)) { + // Look for the previously targeted entry. + // If it does not exist, that's fine, we'll just ignore it. + // Tests will catch any hiccup we may create in any case. 
+ const match = name.match(/^(.+)-([^-]+)/); + if (!match) { + continue; + } + entry = current[match[1]]; + if (!entry || !entry.versions) { + continue; + } + if (entry.versions[match[2]]) { + console.log(`- ${match[1]} (${match[2]}): make alias of ${alias}`); + entry.versions[match[2]] = { aliasOf: alias }; + } } -} -function walk(obj) { - for (var i=1; i < arguments.length; i++) { - var prop = arguments[i] - if (prop in obj) { - obj = obj[prop]; - } else { - return void 0; + console.log("Sorting references..."); + const sorted = helper.sortRefs(current); + + console.log("Updating obsoletes properties..."); + const superseders = {}; + for (const [shortname, entry] of Object.entries(current)) { + if (entry.obsoletes) { + delete entry.obsoletes; } + for (const superseder of (entry.obsoletedBy ?? [])) { + if (!superseders[superseder]) { + superseders[superseder] = []; + } + superseders[superseder].push(shortname); + } + } + for (const [shortname, obsoletes] of Object.entries(superseders)) { + const entry = current[shortname]; + if (!entry) { + continue; + } + obsoletes.sort(); + entry.obsoletes = obsoletes; } - return obj; + + console.log("Writing file..."); + helper.writeBiblio(FILENAME, sorted); + helper.tryOverwrite(FILENAME); } -function _cloneJSON(obj) { - return JSON.parse(JSON.stringify(obj)); +if (!since) { + // Compute the most recent W3C spec date in Specref, and subtract a + // few weeks to get a threshold date beyond which we consider that the + // data in Specref no longer needs to be updated. + const latestUpdate = Object.values(current).reduce((version, curr) => + curr.rawDate > version ? 
curr.rawDate : version, ''); + const twoWeeks = 2 * 7 * 24 * 60 * 60 * 1000; + const thresholdDate = new Date(latestUpdate); + thresholdDate.setTime(thresholdDate.getTime() - twoWeeks); + since = thresholdDate + .toISOString() + .substring(0, 10); } -function isGeneratedByThisScript(ref) { - return ref.source == "https://www.w3.org/2002/01/tr-automation/tr.rdf" || ref.source == "http://www.w3.org/2002/01/tr-automation/tr.rdf" || ref.source == RDF_FILE; +const start = Date.now(); +if (start - (new Date(since)).getTime() > 365 * 24 * 3600 * 1000) { + // A full update can take more than an hour. Let's alert the user + // (text on a red background). + console.warn("\x1b[41mBeware, update may take a while, possibly more than an hour!\x1b[0m"); } + +console.log(`Updating W3C references since ${since}...`); +updateW3CRefs(since) + .then(_ => { + const end = Date.now(); + const duration = Math.round((end - start) / 1000); + console.log(`Done updating W3C references in ${duration} seconds, ${requestsToW3CApi} requests sent to the W3C API`); + });