diff --git a/Sources/Tokenizer/CharRefTokenizer.swift b/Sources/Tokenizer/CharRefTokenizer.swift
index bebfa29..9ab5f36 100644
--- a/Sources/Tokenizer/CharRefTokenizer.swift
+++ b/Sources/Tokenizer/CharRefTokenizer.swift
@@ -1,3 +1,166 @@
+/// States of the character-reference state machine run by `CharRefTokenizer`.
+private enum CharRefState {
+    case initial
+    case named
+    case ambiguousAmpersand
+    case numeric
+    case hexadecimalStart(uppercase: Bool)
+    case decimalStart
+    case hexadecimal
+    case decimal
+    case numericEnd
+}
+
+/// The characters produced for one character reference.
+typealias CharRef = [Character]
+
+/// Outcome of a single `CharRefTokenizer.step`.
+private enum CharRefProcessResult: ~Copyable {
+    /// Tokenizing is finished; the payload is what `tokenize` returns.
+    case done(CharRef?)
+    /// The state machine has to be stepped again.
+    case progress
+}
+
+/// State machine for the text after an `&`; the `&` itself is not read here.
 struct CharRefTokenizer {
-    mutating func tokenize(_ input: inout String.Iterator) {}
+    /// First value above the Unicode code space, which ends at U+10FFFF.
+    private static let firstOutOfRange = 0x10FFFF + 1
+
+    private var state: CharRefState = .initial
+    // TODO: Consider type
+    /// Code accumulated from the digits read so far; never exceeds `firstOutOfRange`.
+    private var num: Int = 0
+
+    /// Steps the state machine until it reports `.done`.
+    ///
+    /// - Returns: The characters of the reference, or `nil`, for which
+    ///   `Tokenizer.processCharRef` substitutes a literal `&`.
+    mutating func tokenize(tokenizer: inout Tokenizer, input: inout String.Iterator) -> CharRef? {
+        while true {
+            switch self.step(tokenizer: &tokenizer, input: &input) {
+            case .done(let charRef): return charRef
+            case .progress: break
+            }
+        }
+    }
+
+    /// Performs one transition of the state machine.
+    private mutating func step(tokenizer: inout Tokenizer, input: inout String.Iterator) -> CharRefProcessResult {
+        switch self.state {
+        case .initial:
+            switch tokenizer.peek(input) {
+            case let c? where c.isASCII && (c.isLetter || c.isWholeNumber):
+                self.state = .named
+                return .progress
+            case "#":
+                tokenizer.discardChar(&input)
+                self.state = .numeric
+                return .progress
+            case _?: return .done(["&"])
+            case nil: return .done(nil)
+            }
+        case .named: fatalError("not implemented")
+        case .ambiguousAmpersand:
+            // NOTE(review): nothing sets this state until `.named` is implemented. `.done(nil)` makes
+            // the caller add an `&`, and digits are not handled here - confirm both against the spec.
+            switch tokenizer.peek(input) {
+            case let c? where c.isASCII && c.isLetter:
+                tokenizer.discardChar(&input)
+                tokenizer.processCharRef([c])
+                return .progress
+            case ";":
+                tokenizer.emitError(.unknownNamedCharRef)
+                return .done(nil)
+            case _?: return .done(nil)
+            case nil: return .done(nil)
+            }
+        case .numeric:
+            switch tokenizer.peek(input) {
+            case "X":
+                tokenizer.discardChar(&input)
+                self.state = .hexadecimalStart(uppercase: true)
+                return .progress
+            case "x":
+                tokenizer.discardChar(&input)
+                self.state = .hexadecimalStart(uppercase: false)
+                return .progress
+            case _:
+                // Also taken at end of input, so that `.decimalStart` flushes the consumed `&#`.
+                self.state = .decimalStart
+                return .progress
+            }
+        case .hexadecimalStart(let uppercase):
+            // `isHexDigit` also accepts fullwidth forms, hence the ASCII check.
+            // End of input counts as "no digit" so that the consumed `&#x` is not lost.
+            guard let c = tokenizer.peek(input), c.isASCII && c.isHexDigit else {
+                tokenizer.emitError(.absenceDigits)
+                return .done(["&", "#", uppercase ? "X" : "x"])
+            }
+            self.state = .hexadecimal
+            return .progress
+        case .decimalStart:
+            guard let c = tokenizer.peek(input), c.isASCII && c.isWholeNumber else {
+                tokenizer.emitError(.absenceDigits)
+                return .done(["&", "#"])
+            }
+            self.state = .decimal
+            return .progress
+        case .hexadecimal: return self.stepDigits(radix: 16, tokenizer: &tokenizer, input: &input)
+        case .decimal: return self.stepDigits(radix: 10, tokenizer: &tokenizer, input: &input)
+        case .numericEnd:
+            switch self.num {
+            case 0:
+                tokenizer.emitError(.nullCharRef)
+                return .done(["\u{FFFD}"])
+            case Self.firstOutOfRange...:
+                tokenizer.emitError(.charRefOutOfRange)
+                return .done(["\u{FFFD}"])
+            case 0xD800...0xDBFF, 0xDC00...0xDFFF:
+                tokenizer.emitError(.surrogateCharRef)
+                return .done(["\u{FFFD}"])
+            case 0xFDD0...0xFDEF, 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF,
+                0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF,
+                0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
+                0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF,
+                0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF:
+                tokenizer.emitError(.noncharacterCharRef)
+                guard let scalar = Unicode.Scalar(self.num) else { preconditionFailure("unreachable") }
+                return .done([Character(scalar)])
+            case 0x01...0x08, 0x0B, 0x0D...0x1F, 0x7F...0x9F:
+                // Controls other than the whitespace TAB (0x09), LF (0x0A) and FF (0x0C); CR (0x0D) is included.
+                tokenizer.emitError(.controlCharRef)
+                // TODO: If the number is one of the numbers in the first column of the following table, then find the row with that number in the first column, and set the character reference code to the number in the second column of that row.
+                guard let scalar = Unicode.Scalar(self.num) else { preconditionFailure("unreachable") }
+                return .done([Character(scalar)])
+            case let n:
+                guard let scalar = Unicode.Scalar(n) else { preconditionFailure("unreachable") }
+                return .done([Character(scalar)])
+            }
+        }
+    }
+
+    /// Shared body of the `.hexadecimal` (`radix` 16) and `.decimal` (`radix` 10) states.
+    private mutating func stepDigits(
+        radix: Int,
+        tokenizer: inout Tokenizer,
+        input: inout String.Iterator
+    ) -> CharRefProcessResult {
+        let next = tokenizer.peek(input)
+        // `hexDigitValue` serves both radixes; the ASCII check rejects fullwidth digits.
+        if let c = next, c.isASCII, let digit = c.hexDigitValue, digit < radix {
+            tokenizer.discardChar(&input)
+            // Saturate: the value is already out of range, and unbounded growth would trap on overflow.
+            self.num = min(self.num * radix + digit, Self.firstOutOfRange)
+            return .progress
+        }
+        if case ";"? = next {
+            tokenizer.discardChar(&input)
+        } else {
+            // Any other character, or end of input, ends the digits without a `;`.
+            tokenizer.emitError(.missingSemicolon)
+        }
+        self.state = .numericEnd
+        return .progress
+    }
 }
diff --git a/Sources/Tokenizer/Tokenizer.swift b/Sources/Tokenizer/Tokenizer.swift
index 1261156..71245b1 100644
--- a/Sources/Tokenizer/Tokenizer.swift
+++ b/Sources/Tokenizer/Tokenizer.swift
@@ -88,7 +88,10 @@ public struct Tokenizer: ~Copyable {
 
     // swift-format-ignore
     private mutating func step(_ input: inout String.Iterator) -> ProcessResult {
-        self.charRefTokenizer?.tokenize(&input)
+        if var charRefTokenizer {
+            self.processCharRef(charRefTokenizer.tokenize(tokenizer: &self, input: &input))
+            self.charRefTokenizer = nil
+        }
         switch self.state {
         case .data:
             while true {
@@ -844,6 +847,22 @@ public struct Tokenizer: ~Copyable {
         }
     }
 
+    /// Delivers the outcome of character-reference tokenizing for the current state.
+    ///
+    /// The characters of `charRef` (a single `&` when it is `nil`) go to `#go(emit:)` in
+    /// the data/RCDATA states and to `#go(appendAttrValue:)` in the attribute-value
+    /// states. Any other state traps via `preconditionFailure`.
+    @inline(__always)
+    mutating func processCharRef(_ charRef: consuming CharRef?) {
+        let chars = charRef ?? ["&"]
+        switch self.state {
+        case .data, .rcdata: for c in chars { #go(emit: c) }
+        case .attributeValueDoubleQuoted, .attributeValueSingleQuoted, .attributeValueUnquoted:
+            for c in chars { #go(appendAttrValue: c) }
+        case _: preconditionFailure("unreachable")
+        }
+    }
+
     @inline(__always)
     private mutating func getChar(from input: inout String.Iterator) -> Character? {
         guard let reconsumeChar else {
@@ -867,6 +886,27 @@ public struct Tokenizer: ~Copyable {
         return reconsumeChar
     }
 
+    /// Returns the next character without consuming it: the pending reconsume
+    /// character if there is one, otherwise the next element of a copy of `input`.
+    @inline(__always)
+    func peek(_ input: borrowing String.Iterator) -> Character? {
+        guard let reconsumeChar else {
+            var input = copy input
+            return input.next()
+        }
+        return reconsumeChar
+    }
+
+    /// Consumes the character `peek` would return: clears the pending reconsume
+    /// character if there is one, otherwise advances `input` by one element.
+    @inline(__always)
+    mutating func discardChar(_ input: inout String.Iterator) {
+        switch self.reconsumeChar {
+        case .some: self.reconsumeChar = nil
+        case .none: _ = input.next()
+        }
+    }
+
     @inline(__always)
     private mutating func startsExact(
         _ input: inout String.Iterator,
@@ -933,7 +973,7 @@ public struct Tokenizer: ~Copyable {
     }
 
     @inline(__always)
-    private mutating func emitError(_ error: consuming ParseError) {
+    mutating func emitError(_ error: consuming ParseError) {
         self.sink.process(.error(consume error))
     }
 