Skip to content

Commit

Permalink
Make document.characterSet not reflecting byte order marks.
Browse files Browse the repository at this point in the history
The process of decoding the network byte stream to Unicode is backed by
an instance of `encoding_rs::Decoder`, which will switch the encoding it
uses if it finds a BOM in the byte stream. However, this change in
encoding is not communicated back to the caller and so
`document.characterSet` gives the wrong result. This change fixes that.

See whatwg/html#5359 and whatwg/encoding#203 for the spec-level backing
for this change.
  • Loading branch information
Andreu Botella committed Dec 30, 2020
1 parent 388bf93 commit a3cc01b
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 52 deletions.
4 changes: 2 additions & 2 deletions components/script/dom/domparser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ impl DOMParserMethods for DOMParser {
None,
Default::default(),
);
ServoParser::parse_html_document(&document, s, url);
ServoParser::parse_html_document(&document, Some(s), url);
document.set_ready_state(DocumentReadyState::Complete);
Ok(document)
},
Expand All @@ -97,7 +97,7 @@ impl DOMParserMethods for DOMParser {
None,
Default::default(),
);
ServoParser::parse_xml_document(&document, s, url);
ServoParser::parse_xml_document(&document, Some(s), url);
document.set_ready_state(DocumentReadyState::Complete);
Ok(document)
},
Expand Down
62 changes: 58 additions & 4 deletions components/script/dom/servoparser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ pub struct ServoParser {
reflector: Reflector,
/// The document associated with this parser.
document: Dom<Document>,
/// The BOM sniffing state.
///
/// `None` means we've found the BOM, we've found there isn't one, or
/// we're not parsing from a byte stream. `Some` contains the BOM bytes
/// found so far.
bom_sniff: DomRefCell<Option<Vec<u8>>>,
/// The decoder used for the network input.
network_decoder: DomRefCell<Option<NetworkDecoder>>,
/// Input received from network.
Expand Down Expand Up @@ -142,7 +148,7 @@ impl ServoParser {
self.can_write() || self.tokenizer.try_borrow_mut().is_ok()
}

pub fn parse_html_document(document: &Document, input: DOMString, url: ServoUrl) {
pub fn parse_html_document(document: &Document, input: Option<DOMString>, url: ServoUrl) {
let parser = if pref!(dom.servoparser.async_html_tokenizer.enabled) {
ServoParser::new(
document,
Expand All @@ -163,7 +169,13 @@ impl ServoParser {
ParserKind::Normal,
)
};
parser.parse_string_chunk(String::from(input));

// Set as the document's current parser and initialize with `input`, if given.
if let Some(input) = input {
parser.parse_string_chunk(String::from(input));
} else {
parser.document.set_current_parser(Some(&parser));
}
}

// https://html.spec.whatwg.org/multipage/#parsing-html-fragments
Expand Down Expand Up @@ -242,17 +254,24 @@ impl ServoParser {
LastChunkState::NotReceived,
ParserKind::ScriptCreated,
);
*parser.bom_sniff.borrow_mut() = None;
document.set_current_parser(Some(&parser));
}

pub fn parse_xml_document(document: &Document, input: DOMString, url: ServoUrl) {
pub fn parse_xml_document(document: &Document, input: Option<DOMString>, url: ServoUrl) {
let parser = ServoParser::new(
document,
Tokenizer::Xml(self::xml::Tokenizer::new(document, url)),
LastChunkState::NotReceived,
ParserKind::Normal,
);
parser.parse_string_chunk(String::from(input));

// Set as the document's current parser and initialize with `input`, if given.
if let Some(input) = input {
parser.parse_string_chunk(String::from(input));
} else {
parser.document.set_current_parser(Some(&parser));
}
}

pub fn script_nesting_level(&self) -> usize {
Expand Down Expand Up @@ -402,6 +421,7 @@ impl ServoParser {
ServoParser {
reflector: Reflector::new(),
document: Dom::from_ref(document),
bom_sniff: DomRefCell::new(Some(Vec::with_capacity(3))),
network_decoder: DomRefCell::new(Some(NetworkDecoder::new(document.encoding()))),
network_input: DomRefCell::new(BufferQueue::new()),
script_input: DomRefCell::new(BufferQueue::new()),
Expand Down Expand Up @@ -463,6 +483,35 @@ impl ServoParser {
}

fn push_bytes_input_chunk(&self, chunk: Vec<u8>) {
// BOM sniff. This is needed because NetworkDecoder will switch the
// encoding based on the BOM, but it won't change
// `self.document.encoding` in the process.
{
let set_bom_to_none = if let Some(partial_bom) = self.bom_sniff.borrow_mut().as_mut() {
if partial_bom.len() + chunk.len() >= 3 {
partial_bom.extend(chunk.iter().take(3 - partial_bom.len()).copied());
//println!("Full BOM: {:?}", partial_bom);
if let Some((encoding, _)) = Encoding::for_bom(&partial_bom) {
//println!("Encoding: {}", encoding.name());
self.document.set_encoding(encoding);
} else {
//println!("Bytes are not a BOM.");
};
true
} else {
partial_bom.extend(chunk.iter().copied());
//println!("Partial BOM: {:?}", partial_bom);
false
}
} else {
//println!("partial_bom is None");
false
};
if set_bom_to_none {
*self.bom_sniff.borrow_mut() = None;
}
}

// For byte input, we convert it to text using the network decoder.
let chunk = self
.network_decoder
Expand All @@ -474,6 +523,11 @@ impl ServoParser {
}

fn push_string_input_chunk(&self, chunk: String) {
// If the input is a string, we don't have a BOM.
if self.bom_sniff.borrow().is_some() {
*self.bom_sniff.borrow_mut() = None;
}

// The input has already been decoded as a string, so doesn't need
// to be decoded by the network decoder again.
let chunk = StrTendril::from(chunk);
Expand Down
4 changes: 2 additions & 2 deletions components/script/dom/xmlhttprequest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1488,7 +1488,7 @@ impl XMLHttpRequest {
let (decoded, _, _) = charset.decode(&response);
let document = self.new_doc(IsHTMLDocument::HTMLDocument);
// TODO: Disable scripting while parsing
ServoParser::parse_html_document(&document, DOMString::from(decoded), wr.get_url());
ServoParser::parse_html_document(&document, Some(DOMString::from(decoded)), wr.get_url());
document
}

Expand All @@ -1499,7 +1499,7 @@ impl XMLHttpRequest {
let (decoded, _, _) = charset.decode(&response);
let document = self.new_doc(IsHTMLDocument::NonHTMLDocument);
// TODO: Disable scripting while parsing
ServoParser::parse_xml_document(&document, DOMString::from(decoded), wr.get_url());
ServoParser::parse_xml_document(&document, Some(DOMString::from(decoded)), wr.get_url());
document
}

Expand Down
6 changes: 2 additions & 4 deletions components/script/script_thread.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3415,15 +3415,13 @@ impl ScriptThread {
(incomplete.browsing_context_id, incomplete.pipeline_id, None),
);

let parse_input = DOMString::new();

document.set_https_state(metadata.https_state);
document.set_navigation_start(incomplete.navigation_start_precise);

if is_html_document == IsHTMLDocument::NonHTMLDocument {
ServoParser::parse_xml_document(&document, parse_input, final_url);
ServoParser::parse_xml_document(&document, None, final_url);
} else {
ServoParser::parse_html_document(&document, parse_input, final_url);
ServoParser::parse_html_document(&document, None, final_url);
}

if incomplete.activity == DocumentActivity::FullyActive {
Expand Down
4 changes: 0 additions & 4 deletions tests/wpt/metadata-layout-2020/encoding/bom-handling.html.ini

This file was deleted.

4 changes: 0 additions & 4 deletions tests/wpt/metadata/encoding/bom-handling.html.ini

This file was deleted.

6 changes: 0 additions & 6 deletions tests/wpt/metadata/encoding/utf-32-from-win1252.html.ini
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
[utf-32-from-win1252.html]
[Expect resources/utf-32-little-endian-bom.xml to parse as UTF-16LE]
expected: FAIL

[Expect resources/utf-32-little-endian-bom.html to parse as UTF-16LE]
expected: FAIL

[Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
expected: FAIL

Expand Down
26 changes: 0 additions & 26 deletions tests/wpt/metadata/encoding/utf-32.html.ini

This file was deleted.

0 comments on commit a3cc01b

Please sign in to comment.