Skip to content

Commit

Permalink
core(charset audit): loosen CHARSET_HTML_REGEX and CHARSET_HTTP_REGEX
Browse files Browse the repository at this point in the history
While it would be overkill to implement full-blown HTML/HTTP parsers, by
making the regular expressions case-insensitive can can reduce the amount
of false negatives for the charset audit.

This patch also applies some drive-by nits/simplifications.

Ref. GoogleChrome#10023, GoogleChrome#10284.
  • Loading branch information
mathiasbynens committed Feb 27, 2020
1 parent f150573 commit ee80522
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 13 deletions.
8 changes: 4 additions & 4 deletions lighthouse-core/audits/dobetterweb/charset.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

/**
* @fileoverview Audits a page to ensure charset it configured properly.
* It must be defined within the first 1024 bytes of the HTML document, defined in the HTTP header, or the document source starts with a BOM.
* It must be defined within the first 1024 bytes of the HTML document, defined in the HTTP header, or the document source must start with a BOM.
*
* @see: https://github.com/GoogleChrome/lighthouse/issues/10023
*/
Expand All @@ -32,8 +32,8 @@ const str_ = i18n.createMessageInstanceIdFn(__filename, UIStrings);
const CONTENT_TYPE_HEADER = 'content-type';
// /^[a-zA-Z0-9-_:.()]{2,}$/ matches all known IANA charset names (https://www.iana.org/assignments/character-sets/character-sets.xhtml)
const IANA_REGEX = /^[a-zA-Z0-9-_:.()]{2,}$/;
const CHARSET_HTML_REGEX = /<meta[^>]+charset[^<]+>/;
const CHARSET_HTTP_REGEX = /charset\s*=\s*[a-zA-Z0-9-_:.()]{2,}/;
const CHARSET_HTML_REGEX = /<meta[^>]+charset[^<]+>/i;
const CHARSET_HTTP_REGEX = /charset\s*=\s*[a-zA-Z0-9-_:.()]{2,}/i;

class CharsetDefined extends Audit {
/**
Expand Down Expand Up @@ -69,7 +69,7 @@ class CharsetDefined extends Audit {
}

// Check if there is a BOM byte marker
const BOM_FIRSTCHAR = 65279;
const BOM_FIRSTCHAR = 0xFEFF;
isCharsetSet = isCharsetSet || artifacts.MainDocumentContent.charCodeAt(0) === BOM_FIRSTCHAR;

// Check if charset-ish meta tag is defined within the first 1024 characters(~1024 bytes) of the HTML document
Expand Down
2 changes: 1 addition & 1 deletion lighthouse-core/lib/i18n/locales/en-US.json
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@
"message": "Avoids Application Cache"
},
"lighthouse-core/audits/dobetterweb/charset.js | description": {
"message": "A character encoding declaration is required. It can be done with a <meta> tagin the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations)."
"message": "A character encoding declaration is required. It can be done with a <meta> tag in the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations)."
},
"lighthouse-core/audits/dobetterweb/charset.js | failureTitle": {
"message": "Charset declaration is missing or occurs too late in the HTML"
Expand Down
2 changes: 1 addition & 1 deletion lighthouse-core/lib/i18n/locales/en-XL.json
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@
"message": "Âv́ôíd̂ś Âṕp̂ĺîćât́îón̂ Ćâćĥé"
},
"lighthouse-core/audits/dobetterweb/charset.js | description": {
"message": "Â ćĥár̂áĉt́êŕ êńĉód̂ín̂ǵ d̂éĉĺâŕât́îón̂ íŝ ŕêq́ûír̂éd̂. Ít̂ ćâń b̂é d̂ón̂é ŵít̂h́ â <ḿêt́â> t́âǵîń t̂h́ê f́îŕŝt́ 1024 b̂ýt̂éŝ óf̂ t́ĥé ĤT́M̂Ĺ ôŕ îń t̂h́ê Ćôńt̂én̂t́-T̂ýp̂é ĤT́T̂Ṕ r̂éŝṕôńŝé ĥéâd́êŕ. [L̂éâŕn̂ ḿôŕê](https://www.w3.org/International/questions/qa-html-encoding-declarations)."
"message": "Â ćĥár̂áĉt́êŕ êńĉód̂ín̂ǵ d̂éĉĺâŕât́îón̂ íŝ ŕêq́ûír̂éd̂. Ít̂ ćâń b̂é d̂ón̂é ŵít̂h́ â <ḿêt́â> t́âǵ îń t̂h́ê f́îŕŝt́ 1024 b̂ýt̂éŝ óf̂ t́ĥé ĤT́M̂Ĺ ôŕ îń t̂h́ê Ćôńt̂én̂t́-T̂ýp̂é ĤT́T̂Ṕ r̂éŝṕôńŝé ĥéâd́êŕ. [L̂éâŕn̂ ḿôŕê](https://www.w3.org/International/questions/qa-html-encoding-declarations)."
},
"lighthouse-core/audits/dobetterweb/charset.js | failureTitle": {
"message": "Ĉh́âŕŝét̂ d́êćl̂ár̂át̂íôń îś m̂íŝśîńĝ ór̂ óĉćûŕŝ t́ôó l̂át̂é îń t̂h́ê H́T̂ḾL̂"
Expand Down
6 changes: 3 additions & 3 deletions lighthouse-core/test/audits/dobetterweb/charset-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ describe('Charset defined audit', () => {
});

it('fails when the page has charset defined too late in the page', async () => {
const bigString = new Array(1024).fill(' ').join('');
const bigString = ' '.repeat(1024);
const htmlContent = HTML_PRE + bigString + '<meta charset="utf-8" />' + HTML_POST;
const [artifacts, context] = generateArtifacts(htmlContent);
artifacts.MetaElements = [{name: '', content: '', charset: 'utf-8'}];
Expand All @@ -86,7 +86,7 @@ describe('Charset defined audit', () => {
});

it('passes when the page has charset defined almost too late in the page', async () => {
const bigString = new Array(900).fill(' ').join('');
const bigString = ' '.repeat(900);
const htmlContent = HTML_PRE + bigString + '<meta charset="utf-8" />' + HTML_POST;
const [artifacts, context] = generateArtifacts(htmlContent);
artifacts.MetaElements = [{name: '', content: '', charset: 'utf-8'}];
Expand All @@ -97,7 +97,7 @@ describe('Charset defined audit', () => {
it('fails when charset only partially defined in the first 1024 bytes of the page', async () => {
const charsetHTML = '<meta charset="utf-8" />';
// 1024 bytes should be halfway through the meta tag
const bigString = new Array(1024 - HTML_PRE.length - charsetHTML.length / 2).fill(' ').join('');
const bigString = ' '.repeat(1024 - HTML_PRE.length - charsetHTML.length / 2);
const htmlContent = HTML_PRE + bigString + charsetHTML + HTML_POST;
const [artifacts, context] = generateArtifacts(htmlContent);
artifacts.MetaElements = [{name: '', content: '', charset: 'utf-8'}];
Expand Down
4 changes: 2 additions & 2 deletions lighthouse-core/test/results/sample_v2.json
Original file line number Diff line number Diff line change
Expand Up @@ -2847,7 +2847,7 @@
"charset": {
"id": "charset",
"title": "Charset declaration is missing or occurs too late in the HTML",
"description": "A character encoding declaration is required. It can be done with a <meta> tagin the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).",
"description": "A character encoding declaration is required. It can be done with a <meta> tag in the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).",
"score": 0,
"scoreDisplayMode": "binary"
},
Expand Down Expand Up @@ -6936,4 +6936,4 @@
}
}
]
}
}
4 changes: 2 additions & 2 deletions proto/sample_v2_round_trip.json
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@
"title": "Document has a valid `rel=canonical`"
},
"charset": {
"description": "A character encoding declaration is required. It can be done with a <meta> tagin the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).",
"description": "A character encoding declaration is required. It can be done with a <meta> tag in the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).",
"id": "charset",
"score": 0.0,
"scoreDisplayMode": "binary",
Expand Down Expand Up @@ -5404,4 +5404,4 @@
"total": 12345.6789
},
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3358.0 Safari/537.36"
}
}

0 comments on commit ee80522

Please sign in to comment.