Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new_audit: add charset declaration audit #10284

Merged
merged 23 commits into from
Feb 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions lighthouse-cli/test/cli/__snapshots__/index-test.js.snap
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ Object {
Object {
"path": "dobetterweb/doctype",
},
Object {
"path": "dobetterweb/charset",
},
Object {
"path": "dobetterweb/dom-size",
},
Expand Down Expand Up @@ -704,6 +707,10 @@ Object {
"id": "doctype",
"weight": 1,
},
Object {
"id": "charset",
"weight": 1,
},
Object {
"id": "no-vulnerable-libraries",
"weight": 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ const expectations = [
{
name: '',
content: '',
charset: 'utf-8',
},
{
name: 'viewport',
Expand Down
96 changes: 96 additions & 0 deletions lighthouse-core/audits/dobetterweb/charset.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/**
* @license Copyright 2020 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/

/**
* @fileoverview Audits a page to ensure charset is configured properly.
* It must be defined within the first 1024 bytes of the HTML document, defined in the HTTP header, or the document source starts with a BOM.
*
* @see: https://github.com/GoogleChrome/lighthouse/issues/10023
*/
Beytoven marked this conversation as resolved.
Show resolved Hide resolved
'use strict';

const Audit = require('../audit.js');
const i18n = require('../../lib/i18n/i18n.js');
const MainResource = require('../../computed/main-resource.js');

// User-facing strings for the charset audit. The /** */ comment above each entry describes
// where and when that string is shown.
const UIStrings = {
  /** Title of a Lighthouse audit that provides detail on if the charset is set properly for a page. This title is shown when the charset is defined correctly. Charset defines the character encoding (eg UTF-8) of the page content. */
  title: 'Properly defines charset',
  /** Title of a Lighthouse audit that provides detail on if the charset is set properly for a page. This title is shown when the charset meta tag is missing or defined too late in the page. */
  failureTitle: 'Charset declaration is missing or occurs too late in the HTML',
  /** Description of a Lighthouse audit that tells the user why the charset needs to be defined early on. */
  // Each fragment ends with a space so the concatenated sentence reads "... <meta> tag in the ...".
  description: 'A character encoding declaration is required. It can be done with a <meta> tag ' +
    'in the first 1024 bytes of the HTML or in the Content-Type HTTP response header. ' +
    '[Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).',
};

// Build the `str_` helper from this file's name and its UIStrings; `meta` passes each
// UIStrings entry through it before returning it.
const str_ = i18n.createMessageInstanceIdFn(__filename, UIStrings);

// Lower-cased name of the HTTP response header that can carry a charset parameter.
const CONTENT_TYPE_HEADER = 'content-type';
// /^[a-zA-Z0-9-_:.()]{2,}$/ matches all known IANA charset names (https://www.iana.org/assignments/character-sets/character-sets.xhtml)
const IANA_REGEX = /^[a-zA-Z0-9-_:.()]{2,}$/;
// Matches a charset-ish <meta> tag in raw HTML. HTML tag and attribute names are
// case-insensitive, so `<META CHARSET=...>` must be accepted as well.
const CHARSET_HTML_REGEX = /<meta[^>]+charset[^<]+>/i;
// Matches a `charset=<name>` parameter in a Content-Type value. Media-type parameter names
// are case-insensitive, so `Charset=UTF-8` must be accepted as well.
const CHARSET_HTTP_REGEX = /charset\s*=\s*[a-zA-Z0-9-_:.()]{2,}/i;

/**
 * Audit that scores 1 when the page declares a character encoding through the Content-Type
 * response header, a leading BOM, or a <meta> tag found in the first 1024 characters of the
 * document source, and 0 otherwise.
 */
class CharsetDefined extends Audit {
  /**
   * @return {LH.Audit.Meta}
   */
  static get meta() {
    const {title, failureTitle, description} = UIStrings;
    return {
      id: 'charset',
      title: str_(title),
      failureTitle: str_(failureTitle),
      description: str_(description),
      requiredArtifacts: ['MainDocumentContent', 'URL', 'devtoolsLogs', 'MetaElements'],
    };
  }

  /**
   * @param {LH.Artifacts} artifacts
   * @param {LH.Audit.Context} context
   * @return {Promise<LH.Audit.Product>}
   */
  static async audit(artifacts, context) {
    const devtoolsLog = artifacts.devtoolsLogs[Audit.DEFAULT_PASS];
    const mainResource = await MainResource.request({devtoolsLog, URL: artifacts.URL}, context);
    const documentSource = artifacts.MainDocumentContent;

    // 1. A charset parameter on the 'content-type' response header.
    const headers = mainResource.responseHeaders;
    const contentType = headers ?
      headers.find(({name}) => name.toLowerCase() === CONTENT_TYPE_HEADER) :
      undefined;
    let charsetFound = contentType ? CHARSET_HTTP_REGEX.test(contentType.value) : false;

    // 2. A byte order mark (U+FEFF, decimal 65279) as the very first character.
    const BYTE_ORDER_MARK = 0xFEFF;
    if (!charsetFound) {
      charsetFound = documentSource.charCodeAt(0) === BYTE_ORDER_MARK;
    }

    // 3. A charset-ish meta tag within the first 1024 characters (~1024 bytes) of the HTML.
    //    The raw-text match only establishes position; the gathered DOM attributes decide
    //    whether the declaration is usable, for both the html5 `charset` attribute and the
    //    legacy `http-equiv` form.
    const metaTagIsEarly = CHARSET_HTML_REGEX.test(documentSource.slice(0, 1024));
    if (metaTagIsEarly && !charsetFound) {
      charsetFound = artifacts.MetaElements.some(meta => {
        if (meta.charset && IANA_REGEX.test(meta.charset)) {
          return true;
        }
        return meta.httpEquiv === 'content-type' &&
          meta.content &&
          CHARSET_HTTP_REGEX.test(meta.content);
      });
    }

    return {score: Number(charsetFound)};
  }
}

module.exports = CharsetDefined;
module.exports.UIStrings = UIStrings;
module.exports.CHARSET_HTML_REGEX = CHARSET_HTML_REGEX;
module.exports.CHARSET_HTTP_REGEX = CHARSET_HTTP_REGEX;
module.exports.IANA_REGEX = IANA_REGEX;
2 changes: 2 additions & 0 deletions lighthouse-core/config/default-config.js
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ const defaultConfig = {
'byte-efficiency/efficient-animated-content',
'dobetterweb/appcache-manifest',
'dobetterweb/doctype',
'dobetterweb/charset',
'dobetterweb/dom-size',
'dobetterweb/external-anchors-use-rel-noopener',
'dobetterweb/geolocation-on-start',
Expand Down Expand Up @@ -505,6 +506,7 @@ const defaultConfig = {
{id: 'external-anchors-use-rel-noopener', weight: 1},
{id: 'geolocation-on-start', weight: 1},
{id: 'doctype', weight: 1},
{id: 'charset', weight: 1},
{id: 'no-vulnerable-libraries', weight: 1},
{id: 'js-libraries', weight: 0},
{id: 'notification-on-start', weight: 1},
Expand Down
2 changes: 2 additions & 0 deletions lighthouse-core/gather/gatherers/meta-elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class MetaElements extends Gatherer {
name: meta.name.toLowerCase(),
content: meta.content,
property: meta.attributes.property ? meta.attributes.property.value : undefined,
httpEquiv: meta.httpEquiv ? meta.httpEquiv.toLowerCase() : undefined,
charset: meta.attributes.charset ? meta.attributes.charset.value : undefined,
};
});
})()`, {useIsolation: true});
Expand Down
9 changes: 9 additions & 0 deletions lighthouse-core/lib/i18n/locales/en-US.json
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,15 @@
"lighthouse-core/audits/dobetterweb/appcache-manifest.js | title": {
"message": "Avoids Application Cache"
},
"lighthouse-core/audits/dobetterweb/charset.js | description": {
"message": "A character encoding declaration is required. It can be done with a <meta> tag in the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations)."
},
"lighthouse-core/audits/dobetterweb/charset.js | failureTitle": {
"message": "Charset declaration is missing or occurs too late in the HTML"
},
"lighthouse-core/audits/dobetterweb/charset.js | title": {
"message": "Properly defines charset"
},
"lighthouse-core/audits/dobetterweb/doctype.js | description": {
"message": "Specifying a doctype prevents the browser from switching to quirks-mode. [Learn more](https://web.dev/doctype)."
},
Expand Down
9 changes: 9 additions & 0 deletions lighthouse-core/lib/i18n/locales/en-XL.json
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,15 @@
"lighthouse-core/audits/dobetterweb/appcache-manifest.js | title": {
"message": "Âv́ôíd̂ś Âṕp̂ĺîćât́îón̂ Ćâćĥé"
},
"lighthouse-core/audits/dobetterweb/charset.js | description": {
"message": "Â ćĥár̂áĉt́êŕ êńĉód̂ín̂ǵ d̂éĉĺâŕât́îón̂ íŝ ŕêq́ûír̂éd̂. Ít̂ ćâń b̂é d̂ón̂é ŵít̂h́ â <ḿêt́â> t́âǵîń t̂h́ê f́îŕŝt́ 1024 b̂ýt̂éŝ óf̂ t́ĥé ĤT́M̂Ĺ ôŕ îń t̂h́ê Ćôńt̂én̂t́-T̂ýp̂é ĤT́T̂Ṕ r̂éŝṕôńŝé ĥéâd́êŕ. [L̂éâŕn̂ ḿôŕê](https://www.w3.org/International/questions/qa-html-encoding-declarations)."
},
"lighthouse-core/audits/dobetterweb/charset.js | failureTitle": {
"message": "Ĉh́âŕŝét̂ d́êćl̂ár̂át̂íôń îś m̂íŝśîńĝ ór̂ óĉćûŕŝ t́ôó l̂át̂é îń t̂h́ê H́T̂ḾL̂"
},
"lighthouse-core/audits/dobetterweb/charset.js | title": {
"message": "P̂ŕôṕêŕl̂ý d̂éf̂ín̂éŝ ćĥár̂śêt́"
},
"lighthouse-core/audits/dobetterweb/doctype.js | description": {
"message": "Ŝṕêćîf́ŷín̂ǵ â d́ôćt̂ýp̂é p̂ŕêv́êńt̂ś t̂h́ê b́r̂óŵśêŕ f̂ŕôḿ ŝẃît́ĉh́îńĝ t́ô q́ûír̂ḱŝ-ḿôd́ê. [Ĺêár̂ń m̂ór̂é](https://web.dev/doctype)."
},
Expand Down
165 changes: 165 additions & 0 deletions lighthouse-core/test/audits/dobetterweb/charset-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/**
* @license Copyright 2020 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/
'use strict';

const CharsetDefinedAudit = require('../../../audits/dobetterweb/charset.js');
const assert = require('assert');
const networkRecordsToDevtoolsLog = require('../../network-records-to-devtools-log.js');

/* eslint-env jest */

// Markup the tests below place before (and usually after) the tag under test to form a
// minimal HTML document.
const HTML_PRE = '<!doctype html><head>';
const HTML_POST = '</head><body><h1>hello';

/**
 * Builds the `[artifacts, context]` pair passed to `CharsetDefinedAudit.audit`.
 * The main resource is served from https://example.com/ with a single `content-type`
 * response header; `MetaElements` starts empty so each test can fill it in.
 * @param {string} htmlContent Used as the `MainDocumentContent` artifact.
 * @param {string=} contentTypeValue Value of the `content-type` response header.
 */
function generateArtifacts(htmlContent, contentTypeValue = 'text/html') {
  const finalUrl = 'https://example.com/';
  const responseHeaders = [{name: 'content-type', value: contentTypeValue}];
  const devtoolsLog = networkRecordsToDevtoolsLog([{url: finalUrl, responseHeaders}]);

  const context = {computedCache: new Map()};
  const artifacts = {
    devtoolsLogs: {[CharsetDefinedAudit.DEFAULT_PASS]: devtoolsLog},
    URL: {finalUrl},
    MainDocumentContent: htmlContent,
    MetaElements: [],
  };

  return [artifacts, context];
}

describe('Charset defined audit', () => {
  // Meta tag without a charset, reused by the header, BOM and failure cases.
  const CONTENT_TYPE_META_NO_CHARSET = '<meta http-equiv="Content-type" content="text/html" />';

  /** @return {Array<object>} A fresh MetaElements artifact holding one `<meta charset="utf-8">`. */
  const charsetMetaElements = () => [{name: '', content: '', charset: 'utf-8'}];

  /**
   * Runs the audit and resolves to its score.
   * @param {string} htmlContent
   * @param {Array<object>=} metaElements Overrides the default empty MetaElements when given.
   * @param {string=} contentTypeValue Overrides the default content-type header when given.
   */
  async function scoreFor(htmlContent, metaElements, contentTypeValue) {
    const [artifacts, context] = generateArtifacts(htmlContent, contentTypeValue);
    if (metaElements !== undefined) {
      artifacts.MetaElements = metaElements;
    }
    const auditResult = await CharsetDefinedAudit.audit(artifacts, context);
    return auditResult.score;
  }

  it('succeeds where the page contains the charset meta tag', async () => {
    const html = `${HTML_PRE}<meta charset="utf-8" >${HTML_POST}`;
    assert.equal(await scoreFor(html, charsetMetaElements()), 1);
  });

  it('succeeds when the page has the charset defined in the content-type meta tag', async () => {
    const metaTag = '<meta http-equiv="Content-type" content="text/html; charset=utf-8" />';
    const metaElements = [
      {name: '', content: 'text/html; charset=utf-8', httpEquiv: 'content-type'},
    ];
    assert.equal(await scoreFor(`${HTML_PRE}${metaTag}${HTML_POST}`, metaElements), 1);
  });

  it('succeeds when the page has the charset defined in the content-type http header', async () => {
    const html = `${HTML_PRE}${CONTENT_TYPE_META_NO_CHARSET}${HTML_POST}`;
    assert.equal(await scoreFor(html, undefined, 'text/html; charset=UTF-8'), 1);
  });

  it('succeeds when the page has the charset defined via BOM', async () => {
    const html = `\ufeff${HTML_PRE}${CONTENT_TYPE_META_NO_CHARSET}${HTML_POST}`;
    assert.equal(await scoreFor(html), 1);
  });

  it('fails when the page does not have charset defined', async () => {
    const html = `${HTML_PRE}${CONTENT_TYPE_META_NO_CHARSET}`;
    assert.equal(await scoreFor(html), 0);
  });

  it('fails when the page has charset defined too late in the page', async () => {
    const padding = ' '.repeat(1024);
    const html = `${HTML_PRE}${padding}<meta charset="utf-8" />${HTML_POST}`;
    assert.equal(await scoreFor(html, charsetMetaElements()), 0);
  });

  it('passes when the page has charset defined almost too late in the page', async () => {
    const padding = ' '.repeat(900);
    const html = `${HTML_PRE}${padding}<meta charset="utf-8" />${HTML_POST}`;
    assert.equal(await scoreFor(html, charsetMetaElements()), 1);
  });

  it('fails when charset only partially defined in the first 1024 bytes of the page', async () => {
    const charsetHTML = '<meta charset="utf-8" />';
    // 1024 bytes should be halfway through the meta tag
    const padding = ' '.repeat(1024 - HTML_PRE.length - charsetHTML.length / 2);
    const html = `${HTML_PRE}${padding}${charsetHTML}${HTML_POST}`;
    assert.equal(await scoreFor(html, charsetMetaElements()), 0);
  });
});

describe('Charset regex check', () => {
  const {
    CHARSET_HTML_REGEX: HTML_REGEX,
    CHARSET_HTTP_REGEX: HTTP_REGEX,
    IANA_REGEX,
  } = CharsetDefinedAudit;

  // One call per input keeps a failing case identifiable by its line in the stack trace.
  const expectMatch = (regex, input) => assert.equal(regex.test(input), true);
  const expectNoMatch = (regex, input) => assert.equal(regex.test(input), false);

  it('handles html correctly', () => {
    // Positive cases
    expectMatch(HTML_REGEX, '<meta charset=utf-8 />');
    expectMatch(HTML_REGEX, `<!doctype html><meta charset=utf-8 /><body>`);
    expectMatch(HTML_REGEX, `<!doctype html><meta charset=utf-8 /><body>`);
    expectMatch(HTML_REGEX, `<!doctype html><meta charset=utf-8><body>`);
    expectMatch(HTML_REGEX, `<!doctype html><meta charset=UTF-8><body>`);
    expectMatch(HTML_REGEX,
      `<!doctype html><meta http-equiv="Content-type" content="text/html; charset=utf-8"/><body>'`);
    expectMatch(HTML_REGEX,
      `<!doctype html><meta content="text/html; charset=utf-8" http-equiv="Content-type"/><body>'`);

    // Negative cases
    expectNoMatch(HTML_REGEX, `<!doctype html><meta description=hello><body>`);
    expectNoMatch(HTML_REGEX, `<!doctype html><meta charset=utf-8<body>`);
    expectNoMatch(HTML_REGEX,
      `<!doctype html><meta http-equiv="Content-type" content="text/html; nope-tf8" /><body>'`);
    expectNoMatch(HTML_REGEX,
      `<!doctype html><meta http-equiv="Content-type" content="text/html; charset=utf-8" <body>'`);
  });

  it('handles http header correctly', () => {
    // Positive cases
    expectMatch(HTTP_REGEX, 'text/html; charset=UTF-8');
    expectMatch(HTTP_REGEX, 'text/html; charset = UTF-8');

    // Negative cases
    expectNoMatch(HTTP_REGEX, 'text/html; charset=');
    expectNoMatch(HTTP_REGEX, 'text/html; charset=x');
    expectNoMatch(HTTP_REGEX, 'text/html; charset= ');
  });

  it('handles charset name validation correctly', () => {
    // Positive cases
    expectMatch(IANA_REGEX, 'utf-8');
    expectMatch(IANA_REGEX, 'utf-16');
    expectMatch(IANA_REGEX, 'IT');
    expectMatch(IANA_REGEX, 'NS_4551-1');
    expectMatch(IANA_REGEX, 'ISO_646.basic:1983');
    expectMatch(IANA_REGEX, 'NF_Z_62-010_(1973)');

    // Negative cases
    expectNoMatch(IANA_REGEX, 'a');
    expectNoMatch(IANA_REGEX, '');
    expectNoMatch(IANA_REGEX, 'utf+8');
    expectNoMatch(IANA_REGEX, 'utf-16*');
  });
});
23 changes: 23 additions & 0 deletions lighthouse-core/test/results/sample_v2.json
Original file line number Diff line number Diff line change
Expand Up @@ -2782,6 +2782,13 @@
"score": 1,
"scoreDisplayMode": "binary"
},
"charset": {
"id": "charset",
"title": "Charset declaration is missing or occurs too late in the HTML",
"description": "A character encoding declaration is required. It can be done with a <meta> tag in the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).",
"score": 0,
"scoreDisplayMode": "binary"
},
"dom-size": {
"id": "dom-size",
"title": "Avoids an excessive DOM size",
Expand Down Expand Up @@ -4042,6 +4049,10 @@
"id": "doctype",
"weight": 1
},
{
"id": "charset",
"weight": 1
},
{
"id": "no-vulnerable-libraries",
"weight": 1
Expand Down Expand Up @@ -5223,6 +5234,12 @@
"duration": 100,
"entryType": "measure"
},
{
"startTime": 0,
"name": "lh:audit:charset",
"duration": 100,
"entryType": "measure"
},
{
"startTime": 0,
"name": "lh:audit:dom-size",
Expand Down Expand Up @@ -6349,6 +6366,12 @@
"lighthouse-core/audits/dobetterweb/doctype.js | description": [
"audits.doctype.description"
],
"lighthouse-core/audits/dobetterweb/charset.js | failureTitle": [
"audits.charset.title"
],
"lighthouse-core/audits/dobetterweb/charset.js | description": [
"audits.charset.description"
],
"lighthouse-core/audits/dobetterweb/dom-size.js | title": [
"audits[dom-size].title"
],
Expand Down
Loading