-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
191 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,152 @@ | ||
/// The states the character-reference state machine can be in.
///
/// `CharRefTokenizer` stores one of these values and switches on it each time
/// it advances.
private enum CharRefState {
    // States without a payload.
    case initial, named, ambiguousAmpersand, numeric

    /// Entered after the `x`/`X` marker of a hexadecimal reference;
    /// `uppercase` records which spelling of the marker was seen.
    case hexadecimalStart(uppercase: Bool)

    // Digit handling for the two numeric notations, and the shared final state.
    case decimalStart, hexadecimal, decimal, numericEnd
}
|
||
/// The characters a character reference resolves to.
typealias CharRef = [Character]

/// Outcome of advancing the character-reference state machine by one step.
private enum CharRefProcessResult: ~Copyable {
    /// The machine has not finished; it should be stepped again.
    case progress
    /// The machine has finished, carrying the resolved characters (or `nil`).
    case done(CharRef?)
}
|
||
/// State machine that resolves the text following `&` into a character reference.
///
/// `tokenize(tokenizer:input:)` repeatedly calls `step` until a step reports
/// `.done`; each step inspects at most one upcoming character via the
/// enclosing tokenizer.
struct CharRefTokenizer {
    /// Largest code point a numeric reference may denote. Larger values are
    /// reported as out of range.
    private static let maxCodePoint = 0x10FFFF

    /// Replacements the HTML standard prescribes for numeric references in the
    /// C1 control range (the Windows-1252 interpretation of 0x80...0x9F).
    /// Code points in that range that are absent from the table are kept as-is.
    private static let c1Replacements: [Int: Character] = [
        0x80: "\u{20AC}", 0x82: "\u{201A}", 0x83: "\u{0192}", 0x84: "\u{201E}",
        0x85: "\u{2026}", 0x86: "\u{2020}", 0x87: "\u{2021}", 0x88: "\u{02C6}",
        0x89: "\u{2030}", 0x8A: "\u{0160}", 0x8B: "\u{2039}", 0x8C: "\u{0152}",
        0x8E: "\u{017D}", 0x91: "\u{2018}", 0x92: "\u{2019}", 0x93: "\u{201C}",
        0x94: "\u{201D}", 0x95: "\u{2022}", 0x96: "\u{2013}", 0x97: "\u{2014}",
        0x98: "\u{02DC}", 0x99: "\u{2122}", 0x9A: "\u{0161}", 0x9B: "\u{203A}",
        0x9C: "\u{0153}", 0x9E: "\u{017E}", 0x9F: "\u{0178}",
    ]

    /// No-op overload: the body is empty, so it consumes nothing and changes no state.
    mutating func tokenize(_ input: inout String.Iterator) {}

    /// Current state of the state machine.
    private var state: CharRefState = .initial

    /// Value of the numeric reference accumulated so far. It saturates just
    /// above `maxCodePoint` (see `accumulate(digit:radix:)`), so it never overflows.
    private var num: Int = 0

    /// Runs the state machine until it finishes.
    ///
    /// - Parameters:
    ///   - tokenizer: Enclosing tokenizer, used to peek at and discard input
    ///     characters and to report parse errors.
    ///   - input: Iterator over the remaining input.
    /// - Returns: The characters carried by the final `.done` step, or `nil`
    ///   when that step carries none.
    mutating func tokenize(tokenizer: inout Tokenizer<some TokenSink>, input: inout String.Iterator) -> CharRef? {
        while true {
            switch self.step(tokenizer: &tokenizer, input: &input) {
            case .done(let charRef): return charRef
            case .progress: continue
            }
        }
    }

    /// Advances the state machine by one transition.
    ///
    /// NOTE(review): every numeric state returns `.done(nil)` when the input is
    /// exhausted, dropping any digits accumulated so far. The HTML standard
    /// treats end of input like any other non-digit character — confirm that
    /// the current behaviour is intended.
    private mutating func step(tokenizer: inout Tokenizer<some TokenSink>, input: inout String.Iterator) -> CharRefProcessResult {
        switch self.state {
        case .initial:
            switch tokenizer.peek(input) {
            case let c? where c.isASCII && (c.isLetter || c.isWholeNumber):
                self.state = .named
                return .progress
            case "#":
                tokenizer.discardChar(&input)
                self.state = .numeric
                return .progress
            case _?: return .done(["&"])
            case nil: return .done(nil)
            }

        case .named: fatalError("not implemented")

        case .ambiguousAmpersand:
            switch tokenizer.peek(input) {
            case let c? where c.isASCII && c.isLetter:
                tokenizer.discardChar(&input)
                tokenizer.processCharRef([c])
                return .progress
            case ";":
                tokenizer.emitError(.unknownNamedCharRef)
                return .done(nil)
            case _?: return .done(nil)
            case nil: return .done(nil)
            }

        case .numeric:
            guard let c = tokenizer.peek(input) else { return .done(nil) }
            switch c {
            case "X":
                tokenizer.discardChar(&input)
                self.state = .hexadecimalStart(uppercase: true)
            case "x":
                tokenizer.discardChar(&input)
                self.state = .hexadecimalStart(uppercase: false)
            default:
                // Not consumed: the decimal-start state re-examines this character.
                self.state = .decimalStart
            }
            return .progress

        case .hexadecimalStart(let uppercase):
            guard let c = tokenizer.peek(input) else { return .done(nil) }
            // `isHexDigit` alone also accepts full-width forms; only ASCII
            // hex digits may start a hexadecimal reference.
            guard c.isASCII && c.isHexDigit else {
                tokenizer.emitError(.absenceDigits)
                return .done(["&", "#", uppercase ? "X" : "x"])
            }
            self.state = .hexadecimal
            return .progress

        case .decimalStart:
            guard let c = tokenizer.peek(input) else { return .done(nil) }
            guard c.isASCII && c.isWholeNumber else {
                tokenizer.emitError(.absenceDigits)
                return .done(["&", "#"])
            }
            self.state = .decimal
            return .progress

        case .hexadecimal:
            guard let c = tokenizer.peek(input) else { return .done(nil) }
            if c.isASCII, let digit = c.hexDigitValue {
                tokenizer.discardChar(&input)
                self.accumulate(digit: digit, radix: 16)
                return .progress
            }
            return self.finishDigits(at: c, tokenizer: &tokenizer, input: &input)

        case .decimal:
            guard let c = tokenizer.peek(input) else { return .done(nil) }
            if c.isASCII, let digit = c.wholeNumberValue {
                tokenizer.discardChar(&input)
                self.accumulate(digit: digit, radix: 10)
                return .progress
            }
            return self.finishDigits(at: c, tokenizer: &tokenizer, input: &input)

        case .numericEnd:
            return .done(self.resolvedNumericReference(tokenizer: &tokenizer))
        }
    }

    /// Folds one more digit into `num`.
    ///
    /// Swift integer arithmetic traps on overflow, so an arbitrarily long run
    /// of digits must not be multiplied in unchecked. Once the value exceeds
    /// `maxCodePoint` the outcome is already decided (out of range), so
    /// further digits are ignored and the value stays above the limit.
    private mutating func accumulate(digit: Int, radix: Int) {
        guard self.num <= Self.maxCodePoint else { return }
        self.num = self.num * radix + digit
    }

    /// Handles the first non-digit character `c` after a run of digits:
    /// consumes a terminating `;`, or reports a missing semicolon and leaves
    /// `c` in the input. Either way the machine moves to `.numericEnd`.
    private mutating func finishDigits(at c: Character, tokenizer: inout Tokenizer<some TokenSink>, input: inout String.Iterator) -> CharRefProcessResult {
        if c == ";" {
            tokenizer.discardChar(&input)
        } else {
            tokenizer.emitError(.missingSemicolon)
        }
        self.state = .numericEnd
        return .progress
    }

    /// Maps the accumulated number to the characters it denotes, reporting a
    /// parse error for null, out-of-range, surrogate, noncharacter and control
    /// code points.
    private func resolvedNumericReference(tokenizer: inout Tokenizer<some TokenSink>) -> CharRef {
        let replacementCharacter: Character = "\u{FFFD}"
        switch self.num {
        case 0:
            tokenizer.emitError(.nullCharRef)
            return [replacementCharacter]
        case (Self.maxCodePoint + 1)...:
            tokenizer.emitError(.charRefOutOfRange)
            return [replacementCharacter]
        case 0xD800...0xDFFF:
            tokenizer.emitError(.surrogateCharRef)
            return [replacementCharacter]
        case let n where (0xFDD0...0xFDEF).contains(n) || (n & 0xFFFE) == 0xFFFE:
            // Noncharacters: U+FDD0...U+FDEF plus the last two code points of
            // every plane (U+xFFFE and U+xFFFF).
            tokenizer.emitError(.noncharacterCharRef)
            return [Self.character(forCodePoint: n)]
        case let n where (0x01...0x08).contains(n) || n == 0x0B
            || (0x0D...0x1F).contains(n) || (0x7F...0x9F).contains(n):
            // Control characters other than ASCII whitespace (TAB, LF, FF),
            // plus CR, are parse errors. C1 controls are additionally
            // remapped through the replacement table.
            tokenizer.emitError(.controlCharRef)
            if let replacement = Self.c1Replacements[n] {
                return [replacement]
            }
            return [Self.character(forCodePoint: n)]
        case let n:
            return [Self.character(forCodePoint: n)]
        }
    }

    /// Converts a code point already known to be a valid Unicode scalar value
    /// into a `Character`.
    private static func character(forCodePoint codePoint: Int) -> Character {
        guard let scalar = Unicode.Scalar(codePoint) else { preconditionFailure("unreachable") }
        return Character(scalar)
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters