From a3cc01b13e7b098304815ac1d329375e34a59935 Mon Sep 17 00:00:00 2001 From: Andreu Botella Date: Wed, 30 Dec 2020 09:46:29 +0100 Subject: [PATCH] Make `document.characterSet` not reflecting byte order marks. The process of decoding the network byte stream to Unicode is backed by an instance of `encoding_rs::Decoder`, which will switch the encoding it uses if it finds a BOM in the byte stream. However, this change in encoding is not communicated back to the caller and so `document.characterSet` gives the wrong result. This change fixes that. See whatwg/html#5359 and whatwg/encoding#203 for the spec-level backing for this change. --- components/script/dom/domparser.rs | 4 +- components/script/dom/servoparser/mod.rs | 62 +++++++++++++++++-- components/script/dom/xmlhttprequest.rs | 4 +- components/script/script_thread.rs | 6 +- .../encoding/bom-handling.html.ini | 4 -- .../metadata/encoding/bom-handling.html.ini | 4 -- .../encoding/utf-32-from-win1252.html.ini | 6 -- tests/wpt/metadata/encoding/utf-32.html.ini | 26 -------- 8 files changed, 64 insertions(+), 52 deletions(-) delete mode 100644 tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini delete mode 100644 tests/wpt/metadata/encoding/bom-handling.html.ini delete mode 100644 tests/wpt/metadata/encoding/utf-32.html.ini diff --git a/components/script/dom/domparser.rs b/components/script/dom/domparser.rs index 6a0328fc43840..876d87eef25fb 100644 --- a/components/script/dom/domparser.rs +++ b/components/script/dom/domparser.rs @@ -77,7 +77,7 @@ impl DOMParserMethods for DOMParser { None, Default::default(), ); - ServoParser::parse_html_document(&document, s, url); + ServoParser::parse_html_document(&document, Some(s), url); document.set_ready_state(DocumentReadyState::Complete); Ok(document) }, @@ -97,7 +97,7 @@ impl DOMParserMethods for DOMParser { None, Default::default(), ); - ServoParser::parse_xml_document(&document, s, url); + ServoParser::parse_xml_document(&document, Some(s), url); document.set_ready_state(DocumentReadyState::Complete); Ok(document) }, diff --git a/components/script/dom/servoparser/mod.rs b/components/script/dom/servoparser/mod.rs index 099f6ef809cef..7f685cc99481d 100644 --- a/components/script/dom/servoparser/mod.rs +++ b/components/script/dom/servoparser/mod.rs @@ -83,6 +83,12 @@ pub struct ServoParser { reflector: Reflector, /// The document associated with this parser. document: Dom, + /// The BOM sniffing state. + /// + /// `None` means we've found the BOM, we've found there isn't one, or + /// we're not parsing from a byte stream. `Some` contains the BOM bytes + /// found so far. + bom_sniff: DomRefCell>>, /// The decoder used for the network input. network_decoder: DomRefCell>, /// Input received from network. @@ -142,7 +148,7 @@ impl ServoParser { self.can_write() || self.tokenizer.try_borrow_mut().is_ok() } - pub fn parse_html_document(document: &Document, input: DOMString, url: ServoUrl) { + pub fn parse_html_document(document: &Document, input: Option, url: ServoUrl) { let parser = if pref!(dom.servoparser.async_html_tokenizer.enabled) { ServoParser::new( document, @@ -163,7 +169,13 @@ impl ServoParser { ParserKind::Normal, ) }; - parser.parse_string_chunk(String::from(input)); + + // Set as the document's current parser and initialize with `input`, if given. + if let Some(input) = input { + parser.parse_string_chunk(String::from(input)); + } else { + parser.document.set_current_parser(Some(&parser)); + } } // https://html.spec.whatwg.org/multipage/#parsing-html-fragments @@ -242,17 +254,24 @@ impl ServoParser { LastChunkState::NotReceived, ParserKind::ScriptCreated, ); + *parser.bom_sniff.borrow_mut() = None; document.set_current_parser(Some(&parser)); } - pub fn parse_xml_document(document: &Document, input: DOMString, url: ServoUrl) { + pub fn parse_xml_document(document: &Document, input: Option, url: ServoUrl) { let parser = ServoParser::new( document, Tokenizer::Xml(self::xml::Tokenizer::new(document, url)), LastChunkState::NotReceived, ParserKind::Normal, ); - parser.parse_string_chunk(String::from(input)); + + // Set as the document's current parser and initialize with `input`, if given. + if let Some(input) = input { + parser.parse_string_chunk(String::from(input)); + } else { + parser.document.set_current_parser(Some(&parser)); + } } pub fn script_nesting_level(&self) -> usize { @@ -402,6 +421,7 @@ impl ServoParser { ServoParser { reflector: Reflector::new(), document: Dom::from_ref(document), + bom_sniff: DomRefCell::new(Some(Vec::with_capacity(3))), network_decoder: DomRefCell::new(Some(NetworkDecoder::new(document.encoding()))), network_input: DomRefCell::new(BufferQueue::new()), script_input: DomRefCell::new(BufferQueue::new()), @@ -463,6 +483,35 @@ impl ServoParser { } fn push_bytes_input_chunk(&self, chunk: Vec) { + // BOM sniff. This is needed because NetworkDecoder will switch the + // encoding based on the BOM, but it won't change + // `self.document.encoding` in the process. + { + let set_bom_to_none = if let Some(partial_bom) = self.bom_sniff.borrow_mut().as_mut() { + if partial_bom.len() + chunk.len() >= 3 { + partial_bom.extend(chunk.iter().take(3 - partial_bom.len()).copied()); + //println!("Full BOM: {:?}", partial_bom); + if let Some((encoding, _)) = Encoding::for_bom(&partial_bom) { + //println!("Encoding: {}", encoding.name()); + self.document.set_encoding(encoding); + } else { + //println!("Bytes are not a BOM."); + }; + true + } else { + partial_bom.extend(chunk.iter().copied()); + //println!("Partial BOM: {:?}", partial_bom); + false + } + } else { + //println!("partial_bom is None"); + false + }; + if set_bom_to_none { + *self.bom_sniff.borrow_mut() = None; + } + } + // For byte input, we convert it to text using the network decoder. let chunk = self .network_decoder @@ -474,6 +523,11 @@ impl ServoParser { } fn push_string_input_chunk(&self, chunk: String) { + // If the input is a string, we don't have a BOM. + if self.bom_sniff.borrow().is_some() { + *self.bom_sniff.borrow_mut() = None; + } + // The input has already been decoded as a string, so doesn't need // to be decoded by the network decoder again. let chunk = StrTendril::from(chunk); diff --git a/components/script/dom/xmlhttprequest.rs b/components/script/dom/xmlhttprequest.rs index 626f370316e5a..772e00982bec9 100644 --- a/components/script/dom/xmlhttprequest.rs +++ b/components/script/dom/xmlhttprequest.rs @@ -1488,7 +1488,7 @@ impl XMLHttpRequest { let (decoded, _, _) = charset.decode(&response); let document = self.new_doc(IsHTMLDocument::HTMLDocument); // TODO: Disable scripting while parsing - ServoParser::parse_html_document(&document, DOMString::from(decoded), wr.get_url()); + ServoParser::parse_html_document(&document, Some(DOMString::from(decoded)), wr.get_url()); document } @@ -1499,7 +1499,7 @@ impl XMLHttpRequest { let (decoded, _, _) = charset.decode(&response); let document = self.new_doc(IsHTMLDocument::NonHTMLDocument); // TODO: Disable scripting while parsing - ServoParser::parse_xml_document(&document, DOMString::from(decoded), wr.get_url()); + ServoParser::parse_xml_document(&document, Some(DOMString::from(decoded)), wr.get_url()); document } diff --git a/components/script/script_thread.rs b/components/script/script_thread.rs index b41c5885c8330..68bbc5247640b 100644 --- a/components/script/script_thread.rs +++ b/components/script/script_thread.rs @@ -3415,15 +3415,13 @@ impl ScriptThread { (incomplete.browsing_context_id, incomplete.pipeline_id, None), ); - let parse_input = DOMString::new(); - document.set_https_state(metadata.https_state); document.set_navigation_start(incomplete.navigation_start_precise); if is_html_document == IsHTMLDocument::NonHTMLDocument { - ServoParser::parse_xml_document(&document, parse_input, final_url); + ServoParser::parse_xml_document(&document, None, final_url); } else { - ServoParser::parse_html_document(&document, parse_input, final_url); + ServoParser::parse_html_document(&document, None, final_url); } if incomplete.activity == DocumentActivity::FullyActive { diff --git a/tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini b/tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini deleted file mode 100644 index 813b55eb6ecf2..0000000000000 --- a/tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini +++ /dev/null @@ -1,4 +0,0 @@ -[bom-handling.html] - [document.characterSet should match the BOM] - expected: FAIL - diff --git a/tests/wpt/metadata/encoding/bom-handling.html.ini b/tests/wpt/metadata/encoding/bom-handling.html.ini deleted file mode 100644 index 813b55eb6ecf2..0000000000000 --- a/tests/wpt/metadata/encoding/bom-handling.html.ini +++ /dev/null @@ -1,4 +0,0 @@ -[bom-handling.html] - [document.characterSet should match the BOM] - expected: FAIL - diff --git a/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini b/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini index e4de65f4aa25b..267c2e752e7e7 100644 --- a/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini +++ b/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini @@ -1,10 +1,4 @@ [utf-32-from-win1252.html] - [Expect resources/utf-32-little-endian-bom.xml to parse as UTF-16LE] - expected: FAIL - - [Expect resources/utf-32-little-endian-bom.html to parse as UTF-16LE] - expected: FAIL - [Expect resources/utf-32-big-endian-bom.html to parse as windows-1252] expected: FAIL diff --git a/tests/wpt/metadata/encoding/utf-32.html.ini b/tests/wpt/metadata/encoding/utf-32.html.ini deleted file mode 100644 index e765bc39ad075..0000000000000 --- a/tests/wpt/metadata/encoding/utf-32.html.ini +++ /dev/null @@ -1,26 +0,0 @@ -[utf-32.html] - type: testharness - [Expect resources/utf-32-big-endian-bom.html to parse as windows-1252] - expected: FAIL - - [Expect resources/utf-32-big-endian-bom.xml to parse as windows-1252] - expected: FAIL - - [Expect resources/utf-32-big-endian-nobom.html to parse as windows-1252] - expected: FAIL - - [Expect resources/utf-32-big-endian-nobom.xml to parse as windows-1252] - expected: FAIL - - [Expect resources/utf-32-little-endian-bom.html to parse as UTF-16LE] - expected: FAIL - - [Expect resources/utf-32-little-endian-bom.xml to parse as UTF-16LE] - expected: FAIL - - [Expect resources/utf-32-little-endian-nobom.html to parse as windows-1252] - expected: FAIL - - [Expect resources/utf-32-little-endian-nobom.xml to parse as windows-1252] - expected: FAIL -