Skip to content

Commit

Permalink
fix: #114 #102 repair unescaped quotes in a string
Browse files Browse the repository at this point in the history
  • Loading branch information
josdejong authored Feb 13, 2024
2 parents bc46250 + 16883bc commit 647326c
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 95 deletions.
26 changes: 19 additions & 7 deletions src/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ describe.each(implementations)('jsonrepair [$name]', ({ jsonrepair }) => {
expect(jsonrepair("'abc")).toBe('"abc"')
expect(jsonrepair('\u2018abc')).toBe('"abc"')
expect(jsonrepair('"it\'s working')).toBe('"it\'s working"')
expect(jsonrepair('["abc+/*comment*/"def"]')).toBe('["abcdef"]')
expect(jsonrepair('["abc/*comment*/+"def"]')).toBe('["abcdef"]')
expect(jsonrepair('["abc,/*comment*/"def"]')).toBe('["abc","def"]')
})

test('should repair truncated JSON', () => {
Expand Down Expand Up @@ -225,6 +228,15 @@ describe.each(implementations)('jsonrepair [$name]', ({ jsonrepair }) => {
expect(jsonrepair('["hello\nworld"\n]')).toBe('["hello\\nworld"\n]')
})

test('should escape unescaped double quotes', () => {
  // Each entry is a pair of [input text, expected repaired output].
  const cases: Array<[string, string]> = [
    // a quote in the middle of a string gets a backslash in front of it
    ['"The TV has a 24" screen"', '"The TV has a 24\\" screen"'],
    ['{"key": "apple "bee" carrot"}', '{"key": "apple \\"bee\\" carrot"}'],

    // strings that consist of a delimiter character must be left as is
    ['[",",":"]', '[",",":"]'],
    ['["a" 2', '["a", 2]'],
    ['["," 2', '[""," 2"]'] // Ideally it would repair as [",", 2]
  ]

  for (const [broken, repaired] of cases) {
    expect(jsonrepair(broken)).toBe(repaired)
  }
})

test('should replace special white space characters', () => {
expect(jsonrepair('{"a":\u00a0"foo\u00a0bar"}')).toBe('{"a": "foo\u00a0bar"}')
expect(jsonrepair('{"a":\u202F"foo"}')).toBe('{"a": "foo"}')
Expand Down Expand Up @@ -440,15 +452,15 @@ describe.each(implementations)('jsonrepair [$name]', ({ jsonrepair }) => {
})

test('should repair missing comma between array items', () => {
  // A comma must be inserted between two adjacent array items. When the items
  // are separated by whitespace, the expected output has the comma placed
  // before that whitespace.
  expect(jsonrepair('{"array": [{}{}]}')).toBe('{"array": [{},{}]}')
  // NOTE: this assertion used to be written as
  //   expect(jsonrepair(...), '... ).toBe( ...')
  // which passes the matcher as part of a string argument and asserts nothing.
  expect(jsonrepair('{"array": [{} {}]}')).toBe('{"array": [{}, {}]}')
  expect(jsonrepair('{"array": [{}\n{}]}')).toBe('{"array": [{},\n{}]}')
  expect(jsonrepair('{"array": [\n{}\n{}\n]}')).toBe('{"array": [\n{},\n{}\n]}')
  expect(jsonrepair('{"array": [\n1\n2\n]}')).toBe('{"array": [\n1,\n2\n]}')
  expect(jsonrepair('{"array": [\n"a"\n"b"\n]}')).toBe('{"array": [\n"a",\n"b"\n]}')

  // should leave normal array as is
  expect(jsonrepair('[\n{},\n{}\n]')).toBe('[\n{},\n{}\n]')
})

test('should repair missing comma between object properties', () => {
Expand Down
119 changes: 76 additions & 43 deletions src/regular/jsonrepair.ts
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,70 @@ export function jsonrepair(text: string): string {
let str = '"'
i++

const isEndOfString = stopAtDelimiter
? (i: number) => isDelimiter(text[i]) || isEndQuote(text.charCodeAt(i))
: (i: number) => isEndQuote(text.charCodeAt(i))
while (true) {
if (i >= text.length) {
// end of text, we are missing an end quote
if (!stopAtDelimiter) {
// retry parsing the string, stopping at the first next delimiter
i = iBefore
output = output.substring(0, oBefore)

return parseString(true)
}

// repair missing quote
str = insertBeforeLastWhitespace(str, '"')
output += str

return true
} else if (isEndQuote(text.charCodeAt(i))) {
// end quote
// let us check what is before and after the quote to verify whether this is a legit end quote
const iQuote = i
const oQuote = str.length
str += '"'
i++
output += str

parseWhitespaceAndSkipComments()

if (stopAtDelimiter || i >= text.length || isDelimiter(text.charAt(i)) || isQuote(text.charCodeAt(i))) {
// The quote is followed by a delimiter or the end of the text,
// so the quote is indeed the end of the string
parseConcatenatedString()

return true
}

if (isDelimiter(text.charAt(prevNonWhitespaceIndex(iQuote - 1)))) {
// This is not the right end quote: it is preceded by a delimiter,
// and NOT followed by a delimiter. So, there is an end quote missing
// parse the string again and then stop at the first next delimiter
i = iBefore
output = output.substring(0, oBefore)

while (i < text.length && !isEndOfString(i)) {
if (text.charCodeAt(i) === codeBackslash) {
return parseString(true)
}

// revert to right after the quote but before any whitespace, and continue parsing the string
output = output.substring(0, oBefore)
i = iQuote + 1

// repair unescaped quote
str = str.substring(0, oQuote) + '\\' + str.substring(oQuote)
} else if (stopAtDelimiter && isDelimiter(text[i])) {
// we're in the mode to stop the string at the first delimiter
// because there is an end quote missing

// repair missing quote
str = insertBeforeLastWhitespace(str, '"')
output += str

parseConcatenatedString()

return true
} else if (text.charCodeAt(i) === codeBackslash) {
// handle escaped content like \n or \u2605
const char = text.charAt(i + 1)
const escapeChar = escapeCharacters[char]
if (escapeChar !== undefined) {
Expand Down Expand Up @@ -438,6 +496,7 @@ export function jsonrepair(text: string): string {
i += 2
}
} else {
// handle regular characters
const char = text.charAt(i)
const code = text.charCodeAt(i)

Expand All @@ -459,46 +518,10 @@ export function jsonrepair(text: string): string {
}

if (skipEscapeChars) {
const processed = skipEscapeCharacter()
if (processed) {
// repair: skipped escape character (nothing to do)
}
// repair: skipped escape character (nothing to do)
skipEscapeCharacter()
}
}

const hasEndQuote = isQuote(text.charCodeAt(i))
if (hasEndQuote) {
str += '"'
i++
} else {
// repair missing quote
str = insertBeforeLastWhitespace(str, '"')
}

output += str

parseWhitespaceAndSkipComments()

// See whether we have:
// (a) An end quote which is not followed by a valid delimiter
// (b) No end quote and reached the end of the input
// If so, revert parsing this string and try again, running in a more
// conservative mode, stopping at the first next delimiter
const isAtEnd = i >= text.length
const nextIsDelimiter = isDelimiter(text.charAt(i))
if (
!stopAtDelimiter &&
((hasEndQuote && !isAtEnd && !nextIsDelimiter) || (!hasEndQuote && isAtEnd))
) {
i = iBefore
output = output.substring(0, oBefore)

return parseString(true)
}

parseConcatenatedString()

return true
}

return false
Expand Down Expand Up @@ -665,6 +688,16 @@ export function jsonrepair(text: string): string {
}
}

/**
 * Walk backwards through `text`, starting at index `start`, and return the
 * index of the first character that is not whitespace.
 *
 * The walk never goes below index 0: when everything from `start` down to
 * index 1 is whitespace, 0 is returned without inspecting the character at 0.
 * A `start` of 0 or less is returned unchanged.
 */
function prevNonWhitespaceIndex(start: number): number {
  let index = start

  for (; index > 0; index--) {
    if (!isWhitespace(text.charCodeAt(index))) {
      break
    }
  }

  return index
}

function expectDigit(start: number) {
if (!isDigit(text.charCodeAt(i))) {
const numSoFar = text.slice(start, i)
Expand Down
10 changes: 10 additions & 0 deletions src/streaming/buffer/OutputBuffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export interface OutputBuffer {
push: (text: string) => void
unshift: (text: string) => void
remove: (start: number, end?: number) => void
insertAt: (index: number, text: string) => void
length: () => number
flush: () => void

Expand Down Expand Up @@ -71,6 +72,14 @@ export function createOutputBuffer({
}
}

/**
 * Insert `text` into the buffered output at the given index.
 *
 * `index` is compared against `offset` and translated to a position inside
 * `buffer` by subtracting `offset`.
 *
 * @param index Position at which to insert; must not be smaller than `offset`
 * @param text  The text to insert
 * @throws Error when `index` is smaller than `offset`
 */
function insertAt (index: number, text: string) {
  if (index < offset) {
    throw new Error(`Cannot insert: ${flushedMessage}`)
  }

  // split the buffer at the local position and glue the new text in between
  const position = index - offset
  const head = buffer.substring(0, position)
  const tail = buffer.substring(position)

  buffer = head + text + tail
}

/**
 * Return the sum of `offset` and the number of characters currently held
 * in `buffer`.
 */
function length(): number {
  const buffered = buffer.length

  return buffered + offset
}
Expand Down Expand Up @@ -131,6 +140,7 @@ export function createOutputBuffer({
push,
unshift,
remove,
insertAt,
length,
flush,

Expand Down
121 changes: 77 additions & 44 deletions src/streaming/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -535,10 +535,6 @@ export function jsonrepairCore({
* and fixing the string by inserting a quote there.
*/
function parseString(stopAtDelimiter = false): boolean {
// we may need to revert
const iBefore = i
const oBefore = output.length()

let skipEscapeChars = input.charCodeAt(i) === codeBackslash
if (skipEscapeChars) {
// repair: remove the first escape character
Expand All @@ -554,20 +550,77 @@ export function jsonrepairCore({
const isEndQuote = isDoubleQuote(input.charCodeAt(i))
? isDoubleQuote
: isSingleQuote(input.charCodeAt(i))
? isSingleQuote // eslint-disable-line indent
: isSingleQuoteLike(input.charCodeAt(i)) // eslint-disable-line indent
? isSingleQuoteLike // eslint-disable-line indent
: isDoubleQuoteLike // eslint-disable-line indent
? isSingleQuote
: isSingleQuoteLike(input.charCodeAt(i))
? isSingleQuoteLike
: isDoubleQuoteLike

const iBefore = i
const oBefore = output.length()

output.push('"')
i++

const isEndOfString = stopAtDelimiter
? (i: number) => isDelimiter(input.charAt(i)) || isEndQuote(input.charCodeAt(i))
: (i: number) => isEndQuote(input.charCodeAt(i))
while (true) {
if (input.isEnd(i)) {
// end of text, we have a missing quote somewhere
if (!stopAtDelimiter) {
i = iBefore
output.remove(oBefore)

return parseString(true)
}

// repair missing quote
output.insertBeforeLastWhitespace('"')

return stack.update(Caret.afterValue)
} else if (isEndQuote(input.charCodeAt(i))) {
// end quote
// let us check what is before and after the quote to verify whether this is a legit end quote
const iQuote = i
const oQuote = output.length()
output.push('"')
i++

parseWhitespaceAndSkipComments()

if (stopAtDelimiter || input.isEnd(i) || isDelimiter(input.charAt(i)) || isQuote(input.charCodeAt(i))) {
// The quote is followed by a delimiter or the end of the text,
// so the quote is indeed the end of the string
parseConcatenatedString()

return stack.update(Caret.afterValue)
}

if (isDelimiter(input.charAt(prevNonWhitespaceIndex(iQuote - 1)))) {
// This is not the right end quote: it is preceded by a delimiter,
// and NOT followed by a delimiter. So, there is an end quote missing
// parse the string again and then stop at the first next delimiter
i = iBefore
output.remove(oBefore)

return parseString(true)
}

// revert to right after the quote but before any whitespace, and continue parsing the string
output.remove(oQuote + 1)
i = iQuote + 1

// repair unescaped quote
output.insertAt(oQuote, '\\')
} else if (stopAtDelimiter && isDelimiter(input.charAt(i))) {
// we're in the mode to stop the string at the first delimiter
// because there is an end quote missing

// repair missing quote
output.insertBeforeLastWhitespace('"')

parseConcatenatedString()

while (!input.isEnd(i) && !isEndOfString(i)) {
if (input.charCodeAt(i) === codeBackslash) {
return stack.update(Caret.afterValue)
} else if (input.charCodeAt(i) === codeBackslash) {
// handle escaped content like \n or \u2605
const char = input.charAt(i + 1)
const escapeChar = escapeCharacters[char]
if (escapeChar !== undefined) {
Expand Down Expand Up @@ -595,6 +648,7 @@ export function jsonrepairCore({
i += 2
}
} else {
// handle regular characters
const char = input.charAt(i)
const code = char.charCodeAt(0)

Expand All @@ -620,37 +674,6 @@ export function jsonrepairCore({
skipEscapeCharacter()
}
}

const hasEndQuote = isQuote(input.charCodeAt(i))
if (hasEndQuote) {
output.push('"')
i++
} else {
// repair missing quote
output.insertBeforeLastWhitespace('"')
}

parseWhitespaceAndSkipComments()

// See whether we have:
// (a) An end quote which is not followed by a valid delimiter
// (b) No end quote and reached the end of the input
// If so, revert parsing this string and try again, running in a more
// conservative mode, stopping at the first next delimiter
const isAtEnd = input.isEnd(i)
const nextIsDelimiter = isDelimiter(input.charAt(i))
if (
!stopAtDelimiter &&
((hasEndQuote && !isAtEnd && !nextIsDelimiter) || (!hasEndQuote && isAtEnd))
) {
i = iBefore
output.remove(oBefore)
return parseString(true)
}

parseConcatenatedString()

return stack.update(Caret.afterValue)
}

return false
Expand Down Expand Up @@ -799,6 +822,16 @@ export function jsonrepairCore({
return j > i ? j : null
}

/**
 * Search backwards through `input`, beginning at index `start`, for the
 * nearest character that is not whitespace and return its index.
 *
 * The search stops at index 0 without inspecting that character, so 0 is
 * returned when only whitespace was seen. A `start` of 0 or less is returned
 * unchanged.
 */
function prevNonWhitespaceIndex(start: number): number {
  let cursor = start

  while (cursor > 0) {
    if (!isWhitespace(input.charCodeAt(cursor))) {
      return cursor
    }

    cursor -= 1
  }

  return cursor
}

function expectDigit(start: number) {
if (!isDigit(input.charCodeAt(i))) {
const numSoFar = input.substring(start, i)
Expand Down
2 changes: 1 addition & 1 deletion src/utils/stringUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export function isDelimiter(char: string): boolean {
return regexDelimiter.test(char)
}

const regexDelimiter = /^[,:[\]{}()\n+]$/
const regexDelimiter = /^[,:[\]/{}()\n+]$/

export function isStartOfValue(char: string): boolean {
return regexStartOfValue.test(char) || (char && isQuote(char.charCodeAt(0)))
Expand Down

0 comments on commit 647326c

Please sign in to comment.