Skip to content

Commit

Permalink
feat: implement CharRefTokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
kkebo committed Nov 20, 2023
1 parent bfcc87a commit b78e52c
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 3 deletions.
151 changes: 150 additions & 1 deletion Sources/Tokenizer/CharRefTokenizer.swift
Original file line number Diff line number Diff line change
@@ -1,3 +1,152 @@
/// States of the character-reference sub-tokenizer.
///
/// The tokenizer starts in `initial` and moves through either the named or the
/// numeric family of states until a result is produced.
private enum CharRefState {
    // Entry point and named-reference handling.
    case initial, named, ambiguousAmpersand

    // Numeric-reference handling ("&#…").
    case numeric
    /// Seen "&#x" or "&#X"; `uppercase` records which spelling was consumed so it
    /// can be reproduced if no digits follow.
    case hexadecimalStart(uppercase: Bool)
    case decimalStart, hexadecimal, decimal
    /// All digits consumed; the accumulated number is validated and converted.
    case numericEnd
}

/// The characters a character reference resolves to (or the literal text to
/// flush back when the reference turns out to be invalid).
typealias CharRef = [Character]

/// Outcome of a single `CharRefTokenizer.step` call.
private enum CharRefProcessResult: ~Copyable {
    /// The state machine needs at least one more step.
    case progress
    /// Tokenizing finished; the payload is the resulting characters, or `nil`
    /// when no reference was recognized.
    case done(CharRef?)
}

struct CharRefTokenizer {
mutating func tokenize(_ input: inout String.Iterator) {}
/// Current state of the character-reference state machine; starts at `.initial`.
private var state: CharRefState = .initial
/// Numeric value accumulated from the digits consumed in the `.hexadecimal` and
/// `.decimal` states; validated in `.numericEnd`.
// TODO: Consider type
private var num: Int = 0

/// Drives the state machine until it produces a result.
///
/// - Parameters:
///   - tokenizer: The owning tokenizer, handed through to every `step`.
///   - input: The remaining input characters.
/// - Returns: The payload of the first `.done` result reported by `step`.
mutating func tokenize(tokenizer: inout Tokenizer<some TokenSink>, input: inout String.Iterator) -> CharRef? {
    while true {
        switch self.step(tokenizer: &tokenizer, input: &input) {
        case .progress:
            // Not finished yet; run another transition.
            continue
        case .done(let result):
            return result
        }
    }
}

/// Runs one transition of the character-reference state machine.
///
/// Peeks at (and, where appropriate, consumes) at most one character of `input`
/// through `tokenizer`, updates `self.state` / `self.num`, and reports parse
/// errors via `tokenizer.emitError`.
///
/// End of input is handled like any other non-matching character in the numeric
/// states, so "&#", "&#x" and unterminated digit runs are flushed or resolved
/// (with the matching parse error) instead of collapsing to a lone "&".
///
/// - Parameters:
///   - tokenizer: The owning tokenizer, used for lookahead, consumption and error reporting.
///   - input: The remaining input characters.
/// - Returns: `.progress` when another step is required, or `.done` carrying the
///   characters to emit (`nil` makes the caller emit a bare "&").
private mutating func step(tokenizer: inout Tokenizer<some TokenSink>, input: inout String.Iterator) -> CharRefProcessResult {
    switch self.state {
    case .initial:
        switch tokenizer.peek(input) {
        case let c? where c.isASCII && (c.isLetter || c.isWholeNumber):
            self.state = .named
            return .progress
        case "#":
            tokenizer.discardChar(&input)
            self.state = .numeric
            return .progress
        case _?: return .done(["&"])
        case nil: return .done(nil)
        }
    case .named: fatalError("not implemented")
    case .ambiguousAmpersand:
        // NOTE(review): every `.done(nil)` below makes the caller emit "&";
        // confirm that is intended once `.named` is implemented.
        switch tokenizer.peek(input) {
        // ASCII alphanumerics (letters and digits) are passed through verbatim.
        case let c? where c.isASCII && (c.isLetter || c.isWholeNumber):
            tokenizer.discardChar(&input)
            tokenizer.processCharRef([c])
            return .progress
        case ";":
            tokenizer.emitError(.unknownNamedCharRef)
            return .done(nil)
        case _?: return .done(nil)
        case nil: return .done(nil)
        }
    case .numeric:
        switch tokenizer.peek(input) {
        case "X":
            tokenizer.discardChar(&input)
            self.state = .hexadecimalStart(uppercase: true)
            return .progress
        case "x":
            tokenizer.discardChar(&input)
            self.state = .hexadecimalStart(uppercase: false)
            return .progress
        case _:
            // Anything else, including end of input, is decided by the
            // decimal-start state so the consumed "#" is not lost.
            self.state = .decimalStart
            return .progress
        }
    case .hexadecimalStart(let uppercase):
        // `isHexDigit` also accepts fullwidth forms, hence the ASCII check.
        if let c = tokenizer.peek(input), c.isASCII, c.isHexDigit {
            self.state = .hexadecimal
            return .progress
        }
        // No digit (or end of input): flush what was consumed so far.
        tokenizer.emitError(.absenceDigits)
        return .done(["&", "#", uppercase ? "X" : "x"])
    case .decimalStart:
        if let c = tokenizer.peek(input), c.isASCII, c.isWholeNumber {
            self.state = .decimal
            return .progress
        }
        // No digit (or end of input): flush what was consumed so far.
        tokenizer.emitError(.absenceDigits)
        return .done(["&", "#"])
    case .hexadecimal:
        let next = tokenizer.peek(input)
        if let c = next, c.isASCII, let n = c.hexDigitValue {
            tokenizer.discardChar(&input)
            // Stop accumulating once the value is already out of range; this keeps
            // `num` bounded so arbitrarily long digit runs cannot overflow `Int`.
            if self.num <= 0x10FFFF { self.num = self.num * 16 + n }
            return .progress
        }
        if next == ";" {
            tokenizer.discardChar(&input)
        } else {
            // Any other character, or end of input, terminates the digits.
            tokenizer.emitError(.missingSemicolon)
        }
        self.state = .numericEnd
        return .progress
    case .decimal:
        let next = tokenizer.peek(input)
        if let c = next, c.isASCII, let n = c.wholeNumberValue {
            tokenizer.discardChar(&input)
            // Same saturation as in the hexadecimal state.
            if self.num <= 0x10FFFF { self.num = self.num * 10 + n }
            return .progress
        }
        if next == ";" {
            tokenizer.discardChar(&input)
        } else {
            // Any other character, or end of input, terminates the digits.
            tokenizer.emitError(.missingSemicolon)
        }
        self.state = .numericEnd
        return .progress
    case .numericEnd:
        let code: Int
        switch self.num {
        case 0:
            tokenizer.emitError(.nullCharRef)
            return .done(["\u{FFFD}"])
        case (0x10FFFF + 1)...:
            tokenizer.emitError(.charRefOutOfRange)
            return .done(["\u{FFFD}"])
        case 0xD800...0xDFFF:
            tokenizer.emitError(.surrogateCharRef)
            return .done(["\u{FFFD}"])
        case 0xFDD0...0xFDEF, 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF,
            0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF,
            0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
            0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF,
            0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF:
            // Noncharacters are reported but still emitted as-is.
            tokenizer.emitError(.noncharacterCharRef)
            code = self.num
        case 0x0D, 0x01...0x08, 0x0B, 0x0E...0x1F, 0x7F...0x9F:
            // CR, or a control that is not ASCII whitespace
            // (TAB 0x09, LF 0x0A and FF 0x0C are deliberately excluded).
            tokenizer.emitError(.controlCharRef)
            code = Self.remappedControlCode(self.num)
        case let n:
            code = n
        }
        guard let scalar = Unicode.Scalar(code) else { preconditionFailure("unreachable") }
        return .done([Character(scalar)])
    }
}

/// Applies the replacement table of the HTML "numeric character reference end
/// state": selected C1 code points map to their Windows-1252 counterparts.
/// Every other value is returned unchanged.
private static func remappedControlCode(_ code: Int) -> Int {
    switch code {
    case 0x80: return 0x20AC
    case 0x82: return 0x201A
    case 0x83: return 0x0192
    case 0x84: return 0x201E
    case 0x85: return 0x2026
    case 0x86: return 0x2020
    case 0x87: return 0x2021
    case 0x88: return 0x02C6
    case 0x89: return 0x2030
    case 0x8A: return 0x0160
    case 0x8B: return 0x2039
    case 0x8C: return 0x0152
    case 0x8E: return 0x017D
    case 0x91: return 0x2018
    case 0x92: return 0x2019
    case 0x93: return 0x201C
    case 0x94: return 0x201D
    case 0x95: return 0x2022
    case 0x96: return 0x2013
    case 0x97: return 0x2014
    case 0x98: return 0x02DC
    case 0x99: return 0x2122
    case 0x9A: return 0x0161
    case 0x9B: return 0x203A
    case 0x9C: return 0x0153
    case 0x9E: return 0x017E
    case 0x9F: return 0x0178
    default: return code
    }
}
}
43 changes: 41 additions & 2 deletions Sources/Tokenizer/Tokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {

// swift-format-ignore
private mutating func step(_ input: inout String.Iterator) -> ProcessResult {
self.charRefTokenizer?.tokenize(&input)
if var charRefTokenizer {
self.processCharRef(charRefTokenizer.tokenize(tokenizer: &self, input: &input))
self.charRefTokenizer = nil
}

switch self.state {
case .data: while true {
Expand Down Expand Up @@ -844,6 +847,25 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
}
}

/// Delivers the outcome of character-reference tokenizing to the current context.
///
/// In `.data` / `.rcdata` the characters are emitted; in the three attribute-value
/// states they are appended to the attribute value. A `nil` result stands for a
/// bare "&". Calling this in any other state is a programming error.
///
/// - Parameter charRef: The resolved characters, or `nil` if no reference was recognized.
@inline(__always)
mutating func processCharRef(_ charRef: consuming CharRef?) {
    switch self.state {
    case .data, .rcdata:
        guard let charRef else {
            #go(emit: "&")
            return
        }
        for c in charRef { #go(emit: c) }
    case .attributeValueDoubleQuoted, .attributeValueSingleQuoted, .attributeValueUnquoted:
        guard let charRef else {
            #go(appendAttrValue: "&")
            return
        }
        for c in charRef { #go(appendAttrValue: c) }
    case _: preconditionFailure("unreachable")
    }
}

@inline(__always)
private mutating func getChar(from input: inout String.Iterator) -> Character? {
guard let reconsumeChar else {
Expand All @@ -867,6 +889,23 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
return reconsumeChar
}

/// Returns the next character without consuming it.
///
/// A pending `reconsumeChar` takes precedence; otherwise the lookahead is done on
/// a copy of `input`, so the caller's iterator is left untouched.
///
/// - Parameter input: The remaining input characters.
/// - Returns: The upcoming character, or `nil` at end of input.
@inline(__always)
func peek(_ input: borrowing String.Iterator) -> Character? {
    if let pending = self.reconsumeChar {
        return pending
    }
    var lookahead = copy input
    return lookahead.next()
}

/// Consumes one character and throws it away.
///
/// A pending `reconsumeChar` is dropped first; only when there is none does the
/// iterator itself advance.
///
/// - Parameter input: The remaining input characters.
@inline(__always)
mutating func discardChar(_ input: inout String.Iterator) {
    if self.reconsumeChar != nil {
        self.reconsumeChar = nil
    } else {
        _ = input.next()
    }
}

@inline(__always)
private mutating func startsExact(
_ input: inout String.Iterator,
Expand Down Expand Up @@ -933,7 +972,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
}

@inline(__always)
private mutating func emitError(_ error: consuming ParseError) {
/// Forwards `error` to the sink wrapped in an `.error` token.
mutating func emitError(_ error: consuming ParseError) {
self.sink.process(.error(consume error))
}

Expand Down

0 comments on commit b78e52c

Please sign in to comment.