From 0e858c925976f2b1d64cfb027844ca657983cdb1 Mon Sep 17 00:00:00 2001 From: Loris Sigrist Date: Wed, 10 May 2023 16:41:16 +0200 Subject: [PATCH 01/12] Crawl social-image urls during prerender --- .changeset/thirty-garlics-tan.md | 5 + packages/kit/src/core/postbuild/crawl.js | 98 ++++++++++++------- .../core/postbuild/fixtures/meta/input.html | 12 +++ .../core/postbuild/fixtures/meta/output.json | 4 + 4 files changed, 81 insertions(+), 38 deletions(-) create mode 100644 .changeset/thirty-garlics-tan.md create mode 100644 packages/kit/src/core/postbuild/fixtures/meta/input.html create mode 100644 packages/kit/src/core/postbuild/fixtures/meta/output.json diff --git a/.changeset/thirty-garlics-tan.md b/.changeset/thirty-garlics-tan.md new file mode 100644 index 000000000000..01c215ca6b5e --- /dev/null +++ b/.changeset/thirty-garlics-tan.md @@ -0,0 +1,5 @@ +--- +'@sveltejs/kit': patch +--- + +Crawl social-image urls during prerender diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index 4d96d3cce7fc..f88a9d04c22c 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -13,6 +13,13 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/; const WHITESPACE = /[\s\n\r]/; +const CRAWLABLE_META_NAME_ATTRS = new Set([ + 'og:image', + 'og:image:url', + 'og:image:secure_url', + 'twitter:image' +]); + /** * @param {string} html * @param {string} base @@ -80,6 +87,8 @@ export function crawl(html, base) { } const tag = html.slice(start, i).toUpperCase(); + /** @type {Map} */ + const attributes = new Map(); if (tag === 'SCRIPT' || tag === 'STYLE') { while (i < html.length) { @@ -159,44 +168,7 @@ export function crawl(html, base) { } value = decode(value); - - if (name === 'href') { - if (tag === 'BASE') { - base = resolve(base, value); - } else { - href = resolve(base, value); - } - } else if (name === 'id') { - ids.push(value); - } else if (name === 'name') { - if (tag === 'A') ids.push(value); - 
} else if (name === 'rel') { - rel = value; - } else if (name === 'src') { - if (value) hrefs.push(resolve(base, value)); - } else if (name === 'srcset') { - const candidates = []; - let insideURL = true; - value = value.trim(); - for (let i = 0; i < value.length; i++) { - if ( - value[i] === ',' && - (!insideURL || (insideURL && WHITESPACE.test(value[i + 1]))) - ) { - candidates.push(value.slice(0, i)); - value = value.substring(i + 1).trim(); - i = 0; - insideURL = true; - } else if (WHITESPACE.test(value[i])) { - insideURL = false; - } - } - candidates.push(value); - for (const candidate of candidates) { - const src = candidate.split(WHITESPACE)[0]; - if (src) hrefs.push(resolve(base, src)); - } - } + attributes.set(name, value); } else { i -= 1; } @@ -205,6 +177,56 @@ export function crawl(html, base) { i += 1; } + const href_attr = attributes.get('href'); + const id_attr = attributes.get('id'); + const name_attr = attributes.get('name'); + const rel_attr = attributes.get('rel'); + const src_attr = attributes.get('src'); + const srcset_attr = attributes.get('srcset'); + const content_attr = attributes.get('content'); + + if (href_attr) { + if (tag === 'BASE') base = resolve(base, href_attr); + else href = resolve(base, href_attr); + } + if (id_attr) { + ids.push(id_attr); + } + if (name_attr && tag === 'A') { + ids.push(name_attr); + } + if (rel_attr) { + rel = rel_attr; + } + if (src_attr) { + hrefs.push(resolve(base, src_attr)); + } + if (srcset_attr) { + let value = srcset_attr; + const candidates = []; + let insideURL = true; + value = value.trim(); + for (let i = 0; i < value.length; i++) { + if (value[i] === ',' && (!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))) { + candidates.push(value.slice(0, i)); + value = value.substring(i + 1).trim(); + i = 0; + insideURL = true; + } else if (WHITESPACE.test(value[i])) { + insideURL = false; + } + } + candidates.push(value); + for (const candidate of candidates) { + const src = 
candidate.split(WHITESPACE)[0]; + if (src) hrefs.push(resolve(base, src)); + } + } + + if (tag === 'META' && content_attr && name_attr && CRAWLABLE_META_NAME_ATTRS.has(name_attr)) { + hrefs.push(resolve(base, content_attr)); + } + if (href && !/\bexternal\b/i.test(rel)) { hrefs.push(resolve(base, href)); } diff --git a/packages/kit/src/core/postbuild/fixtures/meta/input.html b/packages/kit/src/core/postbuild/fixtures/meta/input.html new file mode 100644 index 000000000000..c820f42b57ea --- /dev/null +++ b/packages/kit/src/core/postbuild/fixtures/meta/input.html @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/packages/kit/src/core/postbuild/fixtures/meta/output.json b/packages/kit/src/core/postbuild/fixtures/meta/output.json new file mode 100644 index 000000000000..802aa6c65020 --- /dev/null +++ b/packages/kit/src/core/postbuild/fixtures/meta/output.json @@ -0,0 +1,4 @@ +{ + "hrefs": ["https://external.com","/og-image.jpg"], + "ids": [] +} From 9af1ddc173db7cbca1e3a5507c919e5877685439 Mon Sep 17 00:00:00 2001 From: Loris Sigrist Date: Wed, 10 May 2023 16:46:14 +0200 Subject: [PATCH 02/12] Formatting & Linting --- packages/kit/src/core/postbuild/crawl.js | 9 +++++++-- .../kit/src/core/postbuild/fixtures/meta/output.json | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index f88a9d04c22c..9d21c731ef24 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -222,8 +222,13 @@ export function crawl(html, base) { if (src) hrefs.push(resolve(base, src)); } } - - if (tag === 'META' && content_attr && name_attr && CRAWLABLE_META_NAME_ATTRS.has(name_attr)) { + + if ( + tag === 'META' && + content_attr && + name_attr && + CRAWLABLE_META_NAME_ATTRS.has(name_attr) + ) { hrefs.push(resolve(base, content_attr)); } diff --git a/packages/kit/src/core/postbuild/fixtures/meta/output.json 
b/packages/kit/src/core/postbuild/fixtures/meta/output.json index 802aa6c65020..0b459bea951d 100644 --- a/packages/kit/src/core/postbuild/fixtures/meta/output.json +++ b/packages/kit/src/core/postbuild/fixtures/meta/output.json @@ -1,4 +1,4 @@ { - "hrefs": ["https://external.com","/og-image.jpg"], + "hrefs": ["https://external.com", "/og-image.jpg"], "ids": [] } From 494f1e9cda0b69df91f977265f8ce065fc58b4a3 Mon Sep 17 00:00:00 2001 From: Loris Sigrist Date: Wed, 10 May 2023 17:36:09 +0200 Subject: [PATCH 03/12] Format changeset & added exhaustive list of crawlable urls --- .changeset/thirty-garlics-tan.md | 2 +- packages/kit/src/core/postbuild/crawl.js | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.changeset/thirty-garlics-tan.md b/.changeset/thirty-garlics-tan.md index 01c215ca6b5e..88d80e334628 100644 --- a/.changeset/thirty-garlics-tan.md +++ b/.changeset/thirty-garlics-tan.md @@ -2,4 +2,4 @@ '@sveltejs/kit': patch --- -Crawl social-image urls during prerender +fix: Crawl social-image urls during prerender diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index 9d21c731ef24..c3684889a62b 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -14,9 +14,21 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/; const WHITESPACE = /[\s\n\r]/; const CRAWLABLE_META_NAME_ATTRS = new Set([ + 'url', + 'identifier-URL', + 'syndication-source', + 'original-source', + 'og:url', + 'msapplication-starturl', 'og:image', 'og:image:url', 'og:image:secure_url', + 'og:video', + 'og:video:url', + 'og:video:secure_url', + 'og:audio', + 'og:audio:url', + 'og:audio:secure_url', 'twitter:image' ]); From fc36b8250fe981f25b29d6309e548fc32b7b28c4 Mon Sep 17 00:00:00 2001 From: Loris Sigrist Date: Wed, 10 May 2023 18:44:36 +0200 Subject: [PATCH 04/12] Changed severity to minor as described in #5228 --- .changeset/thirty-garlics-tan.md | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/.changeset/thirty-garlics-tan.md b/.changeset/thirty-garlics-tan.md index 88d80e334628..cae3205c97b0 100644 --- a/.changeset/thirty-garlics-tan.md +++ b/.changeset/thirty-garlics-tan.md @@ -1,5 +1,5 @@ --- -'@sveltejs/kit': patch +'@sveltejs/kit': minor --- -fix: Crawl social-image urls during prerender +feat: Crawl social-image urls during prerender From eeaead07e89ebf85f1a9cb9334e6a3ebd05f8443 Mon Sep 17 00:00:00 2001 From: Loris Sigrist Date: Wed, 10 May 2023 18:56:54 +0200 Subject: [PATCH 05/12] Added support for `property` attribute & limited valid names to just social tags --- packages/kit/src/core/postbuild/crawl.js | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index c3684889a62b..dfa7ba763063 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -14,12 +14,7 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/; const WHITESPACE = /[\s\n\r]/; const CRAWLABLE_META_NAME_ATTRS = new Set([ - 'url', - 'identifier-URL', - 'syndication-source', - 'original-source', 'og:url', - 'msapplication-starturl', 'og:image', 'og:image:url', 'og:image:secure_url', @@ -192,6 +187,7 @@ export function crawl(html, base) { const href_attr = attributes.get('href'); const id_attr = attributes.get('id'); const name_attr = attributes.get('name'); + const property_attr = attributes.get('property'); const rel_attr = attributes.get('rel'); const src_attr = attributes.get('src'); const srcset_attr = attributes.get('srcset'); @@ -240,6 +236,15 @@ export function crawl(html, base) { content_attr && name_attr && CRAWLABLE_META_NAME_ATTRS.has(name_attr) + ) { + hrefs.push(resolve(base, content_attr.trim().toLowerCase())); + } + + if ( + tag === 'META' && + content_attr && + property_attr && + CRAWLABLE_META_NAME_ATTRS.has(property_attr.trim().toLowerCase()) ) { hrefs.push(resolve(base, 
content_attr)); } From fa08943eff4cbb9b5e2bf07d0db611b9e4a3f46a Mon Sep 17 00:00:00 2001 From: Loris Sigrist Date: Wed, 10 May 2023 18:57:02 +0200 Subject: [PATCH 06/12] More tests --- packages/kit/src/core/postbuild/fixtures/meta/input.html | 2 ++ packages/kit/src/core/postbuild/fixtures/meta/output.json | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/kit/src/core/postbuild/fixtures/meta/input.html b/packages/kit/src/core/postbuild/fixtures/meta/input.html index c820f42b57ea..72c02f90961a 100644 --- a/packages/kit/src/core/postbuild/fixtures/meta/input.html +++ b/packages/kit/src/core/postbuild/fixtures/meta/input.html @@ -7,6 +7,8 @@ + + diff --git a/packages/kit/src/core/postbuild/fixtures/meta/output.json b/packages/kit/src/core/postbuild/fixtures/meta/output.json index 0b459bea951d..76cfe72ada91 100644 --- a/packages/kit/src/core/postbuild/fixtures/meta/output.json +++ b/packages/kit/src/core/postbuild/fixtures/meta/output.json @@ -1,4 +1,4 @@ { - "hrefs": ["https://external.com", "/og-image.jpg"], + "hrefs": ["https://external.com", "/og-image.jpg", "https://example.com/audio.mp3", "/video.mp4"], "ids": [] } From 15ff1b6c4ba82b8e3d61d271daf9b17701fd11fa Mon Sep 17 00:00:00 2001 From: Loris Sigrist Date: Wed, 10 May 2023 19:08:08 +0200 Subject: [PATCH 07/12] Better changeset message - I'm indecisive --- .changeset/thirty-garlics-tan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changeset/thirty-garlics-tan.md b/.changeset/thirty-garlics-tan.md index cae3205c97b0..bee032c0fe2e 100644 --- a/.changeset/thirty-garlics-tan.md +++ b/.changeset/thirty-garlics-tan.md @@ -2,4 +2,4 @@ '@sveltejs/kit': minor --- -feat: Crawl social-image urls during prerender +feat: Crawl urls in `<meta>` tags From 9b72fe7d74ff33d8fe53532efff2fa61e9612711 Mon Sep 17 00:00:00 2001 From: Loris Sigrist <43482866+LorisSigrist@users.noreply.github.com> Date: Thu, 11 May 2023 19:37:36 +0200 Subject: [PATCH 08/12] Update
.changeset/thirty-garlics-tan.md Co-authored-by: Ben McCann <322311+benmccann@users.noreply.github.com> --- .changeset/thirty-garlics-tan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changeset/thirty-garlics-tan.md b/.changeset/thirty-garlics-tan.md index bee032c0fe2e..d31c3a55b005 100644 --- a/.changeset/thirty-garlics-tan.md +++ b/.changeset/thirty-garlics-tan.md @@ -2,4 +2,4 @@ '@sveltejs/kit': minor --- -feat: Crawl urls in `<meta>` tags +feat: crawl URLs in `<meta>` tags From 7df6561d196873d226cd4ef285d2cf84b53ef3a4 Mon Sep 17 00:00:00 2001 From: Rich Harris Date: Tue, 16 May 2023 17:48:28 -0400 Subject: [PATCH 09/12] simplify --- packages/kit/src/core/postbuild/crawl.js | 74 +++++++++++------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index dfa7ba763063..f784bd66e1c4 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -111,9 +111,6 @@ export function crawl(html, base) { } } - let href = ''; - let rel = ''; - while (i < html.length) { const start = i; @@ -184,33 +181,37 @@ export function crawl(html, base) { i += 1; } - const href_attr = attributes.get('href'); - const id_attr = attributes.get('id'); - const name_attr = attributes.get('name'); - const property_attr = attributes.get('property'); - const rel_attr = attributes.get('rel'); - const src_attr = attributes.get('src'); - const srcset_attr = attributes.get('srcset'); - const content_attr = attributes.get('content'); - - if (href_attr) { - if (tag === 'BASE') base = resolve(base, href_attr); - else href = resolve(base, href_attr); - } - if (id_attr) { - ids.push(id_attr); + const href = attributes.get('href'); + const id = attributes.get('id'); + const name = attributes.get('name'); + const property = attributes.get('property'); + const rel = attributes.get('rel'); + const src = attributes.get('src'); + const srcset = 
attributes.get('srcset'); + const content = attributes.get('content'); + + if (href) { + if (tag === 'BASE') { + base = resolve(base, href); + } else if (!rel || !/\bexternal\b/i.test(rel)) { + hrefs.push(resolve(base, href)); + } } - if (name_attr && tag === 'A') { - ids.push(name_attr); + + if (id) { + ids.push(id); } - if (rel_attr) { - rel = rel_attr; + + if (name && tag === 'A') { + ids.push(name); } - if (src_attr) { - hrefs.push(resolve(base, src_attr)); + + if (src) { + hrefs.push(resolve(base, src)); } - if (srcset_attr) { - let value = srcset_attr; + + if (srcset) { + let value = srcset; const candidates = []; let insideURL = true; value = value.trim(); @@ -231,26 +232,17 @@ export function crawl(html, base) { } } - if ( - tag === 'META' && - content_attr && - name_attr && - CRAWLABLE_META_NAME_ATTRS.has(name_attr) - ) { - hrefs.push(resolve(base, content_attr.trim().toLowerCase())); + if (tag === 'META' && content && name && CRAWLABLE_META_NAME_ATTRS.has(name)) { + hrefs.push(resolve(base, content.trim().toLowerCase())); } if ( tag === 'META' && - content_attr && - property_attr && - CRAWLABLE_META_NAME_ATTRS.has(property_attr.trim().toLowerCase()) + content && + property && + CRAWLABLE_META_NAME_ATTRS.has(property.trim().toLowerCase()) ) { - hrefs.push(resolve(base, content_attr)); - } - - if (href && !/\bexternal\b/i.test(rel)) { - hrefs.push(resolve(base, href)); + hrefs.push(resolve(base, content)); } } } From 3ac548de2d59a8552c3b7db4ae9a2e6a22c773e8 Mon Sep 17 00:00:00 2001 From: Rich Harris Date: Tue, 16 May 2023 17:54:17 -0400 Subject: [PATCH 10/12] simplify --- packages/kit/src/core/postbuild/crawl.js | 33 +++++++++--------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index f784bd66e1c4..a11e3e12dea3 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -94,8 +94,9 @@ export function 
crawl(html, base) { } const tag = html.slice(start, i).toUpperCase(); - /** @type {Map} */ - const attributes = new Map(); + + /** @type {Record} */ + const attributes = {}; if (tag === 'SCRIPT' || tag === 'STYLE') { while (i < html.length) { @@ -172,7 +173,7 @@ export function crawl(html, base) { } value = decode(value); - attributes.set(name, value); + attributes[name] = value; } else { i -= 1; } @@ -181,14 +182,7 @@ export function crawl(html, base) { i += 1; } - const href = attributes.get('href'); - const id = attributes.get('id'); - const name = attributes.get('name'); - const property = attributes.get('property'); - const rel = attributes.get('rel'); - const src = attributes.get('src'); - const srcset = attributes.get('srcset'); - const content = attributes.get('content'); + const { href, id, name, property, rel, src, srcset, content } = attributes; if (href) { if (tag === 'BASE') { @@ -232,17 +226,14 @@ export function crawl(html, base) { } } - if (tag === 'META' && content && name && CRAWLABLE_META_NAME_ATTRS.has(name)) { - hrefs.push(resolve(base, content.trim().toLowerCase())); - } + if (tag === 'META' && content) { + if (name && CRAWLABLE_META_NAME_ATTRS.has(name)) { + hrefs.push(resolve(base, content.trim().toLowerCase())); + } - if ( - tag === 'META' && - content && - property && - CRAWLABLE_META_NAME_ATTRS.has(property.trim().toLowerCase()) - ) { - hrefs.push(resolve(base, content)); + if (property && CRAWLABLE_META_NAME_ATTRS.has(property.trim().toLowerCase())) { + hrefs.push(resolve(base, content)); + } } } } From 28cd18ee0698ee4a1bdd4c1687b1d62805a8a86a Mon Sep 17 00:00:00 2001 From: Loris Sigrist <43482866+LorisSigrist@users.noreply.github.com> Date: Wed, 17 May 2023 11:09:37 +0200 Subject: [PATCH 11/12] Removed redundant data-sanitation --- packages/kit/src/core/postbuild/crawl.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index 
a11e3e12dea3..7d264b3cac93 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -228,10 +228,10 @@ export function crawl(html, base) { if (tag === 'META' && content) { if (name && CRAWLABLE_META_NAME_ATTRS.has(name)) { - hrefs.push(resolve(base, content.trim().toLowerCase())); + hrefs.push(resolve(base, content)); } - if (property && CRAWLABLE_META_NAME_ATTRS.has(property.trim().toLowerCase())) { + if (property && CRAWLABLE_META_NAME_ATTRS.has(property)) { hrefs.push(resolve(base, content)); } } From 774bc0489ac142f1dc17d5f9dbd43b988fa7989d Mon Sep 17 00:00:00 2001 From: Rich Harris Date: Wed, 17 May 2023 07:40:10 -0400 Subject: [PATCH 12/12] DRY out --- packages/kit/src/core/postbuild/crawl.js | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/packages/kit/src/core/postbuild/crawl.js b/packages/kit/src/core/postbuild/crawl.js index 7d264b3cac93..caffef583c55 100644 --- a/packages/kit/src/core/postbuild/crawl.js +++ b/packages/kit/src/core/postbuild/crawl.js @@ -227,11 +227,9 @@ export function crawl(html, base) { } if (tag === 'META' && content) { - if (name && CRAWLABLE_META_NAME_ATTRS.has(name)) { - hrefs.push(resolve(base, content)); - } + const attr = name ?? property; - if (property && CRAWLABLE_META_NAME_ATTRS.has(property)) { + if (attr && CRAWLABLE_META_NAME_ATTRS.has(attr)) { hrefs.push(resolve(base, content)); } }