From a3cc01b13e7b098304815ac1d329375e34a59935 Mon Sep 17 00:00:00 2001
From: Andreu Botella <abb@randomunok.com>
Date: Wed, 30 Dec 2020 09:46:29 +0100
Subject: [PATCH] Make `document.characterSet` not reflecting byte order marks.

The process of decoding the network byte stream to Unicode is backed by
an instance of `encoding_rs::Decoder`, which will switch the encoding it
uses if it finds a BOM in the byte stream. However, this change in
encoding is not communicated back to the caller and so
`document.characterSet` gives the wrong result. This change fixes that.

See whatwg/html#5359 and whatwg/encoding#203 for the spec-level backing
for this change.
---
 components/script/dom/domparser.rs            |  4 +-
 components/script/dom/servoparser/mod.rs      | 62 +++++++++++++++++--
 components/script/dom/xmlhttprequest.rs       |  4 +-
 components/script/script_thread.rs            |  6 +-
 .../encoding/bom-handling.html.ini            |  4 --
 .../metadata/encoding/bom-handling.html.ini   |  4 --
 .../encoding/utf-32-from-win1252.html.ini     |  6 --
 tests/wpt/metadata/encoding/utf-32.html.ini   | 26 --------
 8 files changed, 64 insertions(+), 52 deletions(-)
 delete mode 100644 tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini
 delete mode 100644 tests/wpt/metadata/encoding/bom-handling.html.ini
 delete mode 100644 tests/wpt/metadata/encoding/utf-32.html.ini
diff --git a/components/script/dom/domparser.rs b/components/script/dom/domparser.rs
index 6a0328fc43840..876d87eef25fb 100644
--- a/components/script/dom/domparser.rs
+++ b/components/script/dom/domparser.rs
@@ -77,7 +77,7 @@ impl DOMParserMethods for DOMParser {
                     None,
                     Default::default(),
                 );
-                ServoParser::parse_html_document(&document, s, url);
+                ServoParser::parse_html_document(&document, Some(s), url);
                 document.set_ready_state(DocumentReadyState::Complete);
                 Ok(document)
             },
@@ -97,7 +97,7 @@ impl DOMParserMethods for DOMParser {
                     None,
                     Default::default(),
                 );
-                ServoParser::parse_xml_document(&document, s, url);
+                ServoParser::parse_xml_document(&document, Some(s), url);
                 document.set_ready_state(DocumentReadyState::Complete);
                 Ok(document)
             },
diff --git a/components/script/dom/servoparser/mod.rs b/components/script/dom/servoparser/mod.rs
index 099f6ef809cef..7f685cc99481d 100644
--- a/components/script/dom/servoparser/mod.rs
+++ b/components/script/dom/servoparser/mod.rs
@@ -83,6 +83,12 @@ pub struct ServoParser {
     reflector: Reflector,
     /// The document associated with this parser.
     document: Dom<Document>,
+    /// The BOM sniffing state.
+    ///
+    /// `None` means we've found the BOM, we've found there isn't one, or
+    /// we're not parsing from a byte stream. `Some` contains the BOM bytes
+    /// found so far.
+    bom_sniff: DomRefCell<Option<Vec<u8>>>,
     /// The decoder used for the network input.
     network_decoder: DomRefCell<Option<NetworkDecoder>>,
     /// Input received from network.
@@ -142,7 +148,7 @@ impl ServoParser {
         self.can_write() || self.tokenizer.try_borrow_mut().is_ok()
     }
 
-    pub fn parse_html_document(document: &Document, input: DOMString, url: ServoUrl) {
+    pub fn parse_html_document(document: &Document, input: Option<DOMString>, url: ServoUrl) {
         let parser = if pref!(dom.servoparser.async_html_tokenizer.enabled) {
             ServoParser::new(
                 document,
@@ -163,7 +169,13 @@ impl ServoParser {
                 ParserKind::Normal,
             )
         };
-        parser.parse_string_chunk(String::from(input));
+
+        // Set as the document's current parser and initialize with `input`, if given.
+        if let Some(input) = input {
+            parser.parse_string_chunk(String::from(input));
+        } else {
+            parser.document.set_current_parser(Some(&parser));
+        }
     }
 
     // https://html.spec.whatwg.org/multipage/#parsing-html-fragments
@@ -242,17 +254,24 @@ impl ServoParser {
             LastChunkState::NotReceived,
             ParserKind::ScriptCreated,
         );
+        *parser.bom_sniff.borrow_mut() = None;
         document.set_current_parser(Some(&parser));
     }
 
-    pub fn parse_xml_document(document: &Document, input: DOMString, url: ServoUrl) {
+    pub fn parse_xml_document(document: &Document, input: Option<DOMString>, url: ServoUrl) {
         let parser = ServoParser::new(
             document,
             Tokenizer::Xml(self::xml::Tokenizer::new(document, url)),
             LastChunkState::NotReceived,
             ParserKind::Normal,
         );
-        parser.parse_string_chunk(String::from(input));
+
+        // Set as the document's current parser and initialize with `input`, if given.
+        if let Some(input) = input {
+            parser.parse_string_chunk(String::from(input));
+        } else {
+            parser.document.set_current_parser(Some(&parser));
+        }
     }
 
     pub fn script_nesting_level(&self) -> usize {
@@ -402,6 +421,7 @@ impl ServoParser {
         ServoParser {
             reflector: Reflector::new(),
             document: Dom::from_ref(document),
+            bom_sniff: DomRefCell::new(Some(Vec::with_capacity(3))),
             network_decoder: DomRefCell::new(Some(NetworkDecoder::new(document.encoding()))),
             network_input: DomRefCell::new(BufferQueue::new()),
             script_input: DomRefCell::new(BufferQueue::new()),
@@ -463,6 +483,35 @@ impl ServoParser {
     }
 
     fn push_bytes_input_chunk(&self, chunk: Vec<u8>) {
+        // BOM sniff. This is needed because NetworkDecoder will switch the
+        // encoding based on the BOM, but it won't change
+        // `self.document.encoding` in the process.
+        {
+            let set_bom_to_none = if let Some(partial_bom) = self.bom_sniff.borrow_mut().as_mut() {
+                if partial_bom.len() + chunk.len() >= 3 {
+                    partial_bom.extend(chunk.iter().take(3 - partial_bom.len()).copied());
+                    //println!("Full BOM: {:?}", partial_bom);
+                    if let Some((encoding, _)) = Encoding::for_bom(&partial_bom) {
+                        //println!("Encoding: {}", encoding.name());
+                        self.document.set_encoding(encoding);
+                    } else {
+                        //println!("Bytes are not a BOM.");
+                    };
+                    true
+                } else {
+                    partial_bom.extend(chunk.iter().copied());
+                    //println!("Partial BOM: {:?}", partial_bom);
+                    false
+                }
+            } else {
+                //println!("partial_bom is None");
+                false
+            };
+            if set_bom_to_none {
+                *self.bom_sniff.borrow_mut() = None;
+            }
+        }
+
         // For byte input, we convert it to text using the network decoder.
         let chunk = self
             .network_decoder
@@ -474,6 +523,11 @@ impl ServoParser {
     }
 
     fn push_string_input_chunk(&self, chunk: String) {
+        // If the input is a string, we don't have a BOM.
+        if self.bom_sniff.borrow().is_some() {
+            *self.bom_sniff.borrow_mut() = None;
+        }
+
         // The input has already been decoded as a string, so doesn't need
         // to be decoded by the network decoder again.
         let chunk = StrTendril::from(chunk);
diff --git a/components/script/dom/xmlhttprequest.rs b/components/script/dom/xmlhttprequest.rs
index 626f370316e5a..772e00982bec9 100644
--- a/components/script/dom/xmlhttprequest.rs
+++ b/components/script/dom/xmlhttprequest.rs
@@ -1488,7 +1488,7 @@ impl XMLHttpRequest {
         let (decoded, _, _) = charset.decode(&response);
         let document = self.new_doc(IsHTMLDocument::HTMLDocument);
         // TODO: Disable scripting while parsing
-        ServoParser::parse_html_document(&document, DOMString::from(decoded), wr.get_url());
+        ServoParser::parse_html_document(&document, Some(DOMString::from(decoded)), wr.get_url());
         document
     }
 
@@ -1499,7 +1499,7 @@ impl XMLHttpRequest {
         let (decoded, _, _) = charset.decode(&response);
         let document = self.new_doc(IsHTMLDocument::NonHTMLDocument);
         // TODO: Disable scripting while parsing
-        ServoParser::parse_xml_document(&document, DOMString::from(decoded), wr.get_url());
+        ServoParser::parse_xml_document(&document, Some(DOMString::from(decoded)), wr.get_url());
         document
     }
 
diff --git a/components/script/script_thread.rs b/components/script/script_thread.rs
index b41c5885c8330..68bbc5247640b 100644
--- a/components/script/script_thread.rs
+++ b/components/script/script_thread.rs
@@ -3415,15 +3415,13 @@ impl ScriptThread {
             (incomplete.browsing_context_id, incomplete.pipeline_id, None),
         );
 
-        let parse_input = DOMString::new();
-
         document.set_https_state(metadata.https_state);
         document.set_navigation_start(incomplete.navigation_start_precise);
 
         if is_html_document == IsHTMLDocument::NonHTMLDocument {
-            ServoParser::parse_xml_document(&document, parse_input, final_url);
+            ServoParser::parse_xml_document(&document, None, final_url);
         } else {
-            ServoParser::parse_html_document(&document, parse_input, final_url);
+            ServoParser::parse_html_document(&document, None, final_url);
         }
 
         if incomplete.activity == DocumentActivity::FullyActive {
diff --git a/tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini b/tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini
deleted file mode 100644
index 813b55eb6ecf2..0000000000000
--- a/tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-[bom-handling.html]
-  [document.characterSet should match the BOM]
-    expected: FAIL
-
diff --git a/tests/wpt/metadata/encoding/bom-handling.html.ini b/tests/wpt/metadata/encoding/bom-handling.html.ini
deleted file mode 100644
index 813b55eb6ecf2..0000000000000
--- a/tests/wpt/metadata/encoding/bom-handling.html.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-[bom-handling.html]
-  [document.characterSet should match the BOM]
-    expected: FAIL
-
diff --git a/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini b/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini
index e4de65f4aa25b..267c2e752e7e7 100644
--- a/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini
+++ b/tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini
@@ -1,10 +1,4 @@
 [utf-32-from-win1252.html]
-  [Expect resources/utf-32-little-endian-bom.xml to parse as UTF-16LE]
-    expected: FAIL
-
-  [Expect resources/utf-32-little-endian-bom.html to parse as UTF-16LE]
-    expected: FAIL
-
   [Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
     expected: FAIL
 
diff --git a/tests/wpt/metadata/encoding/utf-32.html.ini b/tests/wpt/metadata/encoding/utf-32.html.ini
deleted file mode 100644
index e765bc39ad075..0000000000000
--- a/tests/wpt/metadata/encoding/utf-32.html.ini
+++ /dev/null
@@ -1,26 +0,0 @@
-[utf-32.html]
-  type: testharness
-  [Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
-    expected: FAIL
-
-  [Expect resources/utf-32-big-endian-bom.xml to parse as windows-1252]
-    expected: FAIL
-
-  [Expect resources/utf-32-big-endian-nobom.html to parse as windows-1252]
-    expected: FAIL
-
-  [Expect resources/utf-32-big-endian-nobom.xml to parse as windows-1252]
-    expected: FAIL
-
-  [Expect resources/utf-32-little-endian-bom.html to parse as UTF-16LE]
-    expected: FAIL
-
-  [Expect resources/utf-32-little-endian-bom.xml to parse as UTF-16LE]
-    expected: FAIL
-
-  [Expect resources/utf-32-little-endian-nobom.html to parse as windows-1252]
-    expected: FAIL
-
-  [Expect resources/utf-32-little-endian-nobom.xml to parse as windows-1252]
-    expected: FAIL
-