Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(internal): cleanup and restructure diffstr() #4703

Merged
merged 2 commits into from
May 9, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 98 additions & 82 deletions internal/diff_str.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,95 +3,111 @@
import { _internals } from "./diff.ts";

/**
* Renders the differences between the actual and expected strings
* Partially inspired by https://github.com/kpdecker/jsdiff
* @param A Actual string
* @param B Expected string
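* Replaces invisible characters with their visible escape sequences. `\n` and
* `\r\n` line breaks are kept after their escape sequence.
*
* @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
*
* @param string String whose invisible characters should be made visible.
*
* @returns String with invisible characters shown as escape sequences.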
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
function unescape(string: string): string {
  // Make invisible characters visible by spelling out their escape sequence.
  // ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences
  const simpleEscapes: Array<[string, string]> = [
    ["\b", "\\b"],
    ["\f", "\\f"],
    ["\t", "\\t"],
    ["\v", "\\v"],
  ];
  let visible = string;
  for (const [raw, shown] of simpleEscapes) {
    visible = visible.split(raw).join(shown);
  }
  // Line breaks are handled last: "\n" and "\r\n" stay in the output after
  // their escape sequence, a lone "\r" is only replaced by its escape sequence.
  return visible.replace(/\r\n|\r|\n/g, (lineBreak) => {
    switch (lineBreak) {
      case "\r":
        return "\\r";
      case "\n":
        return "\\n\n";
      default:
        return "\\r\\n\r\n";
    }
  });
}

function tokenize(string: string, { wordDiff = false } = {}): string[] {
if (wordDiff) {
// Split string on whitespace symbols
const tokens = string.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
// Extended Latin character set
const words =
/^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;

// Join boundary splits that we do not consider to be boundaries and merge empty strings surrounded by word chars
for (let i = 0; i < tokens.length - 1; i++) {
const token = tokens[i];
const tokenPlusTwo = tokens[i + 2];
if (
!tokens[i + 1] &&
token &&
tokenPlusTwo &&
words.test(token) &&
words.test(tokenPlusTwo)
) {
tokens[i] += tokenPlusTwo;
tokens.splice(i + 1, 2);
i--;
}
}
return tokens.filter((token) => token);
} else {
// Split string on new lines symbols
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/);
/**
 * Replaces invisible characters with their visible escape sequence.
 *
 * `\n` and `\r\n` are kept in the output right after their escape sequence, so
 * the result still contains those line breaks; a lone `\r` is only replaced.
 *
 * @param string String to process.
 *
 * @returns The string with backspace, form feed, tab, vertical tab and line
 * break characters spelled out.
 */
function unescape(string: string): string {
  const visibleForms: Record<string, string> = {
    "\b": "\\b",
    "\f": "\\f",
    "\t": "\\t",
    "\v": "\\v",
    "\r": "\\r",
    // This does not remove line breaks
    "\n": "\\n\n",
    "\r\n": "\\r\\n\r\n",
  };
  // `\r\n` is listed first so the pair is matched as one unit; inside the
  // character class `\b` denotes the backspace character.
  return string.replace(
    /\r\n|[\b\f\t\v\r\n]/g,
    (match) => visibleForms[match] ?? match,
  );
}

// Ignore final empty token when text ends with a newline
if (!lines[lines.length - 1]) {
lines.pop();
}
/**
 * Separator pattern used by `tokenize` to split a string for word diffs.
 *
 * It matches a run of whitespace that is not a line break, a single bracket,
 * quote, `\r` or `\n` character, or a word boundary. The whole pattern is one
 * capture group, so `String.prototype.split` keeps the matched separators in
 * the resulting array.
 */
const WHITESPACE_SYMBOLS = /([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/;
/**
 * Matches a string that consists entirely of ASCII letters and characters from
 * the listed Latin code point ranges. `tokenize` uses it to decide whether two
 * tokens around an empty split result should be merged back together.
 *
 * NOTE(review): `\u{D8}-\u{F6}` is already covered by `\u{C0}-\u{FF}` — confirm
 * whether the first range was meant to be narrower.
 */
const EXT_LATIN_CHARS =
/^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;

// Merge the content and line separators into single tokens
for (const [i, line] of lines.entries()) {
if (i % 2) {
tokens[tokens.length - 1] += line;
} else {
tokens.push(line);
}
/**
* Tokenizes a string into an array of tokens.
*
* @param string The string to tokenize.
* @param wordDiff If true, performs word-based tokenization. Default is false.
*
* @returns An array of tokens.
*/
function tokenize(string: string, wordDiff = false): string[] {
if (wordDiff) {
const tokens = string.split(WHITESPACE_SYMBOLS).filter((token) => token);
for (let i = 0; i < tokens.length - 1; i++) {
const token = tokens[i];
const tokenPlusTwo = tokens[i + 2];
if (

Check warning on line 45 in internal/diff_str.ts

View check run for this annotation

Codecov / codecov/patch

internal/diff_str.ts#L45

Added line #L45 was not covered by tests
!tokens[i + 1] &&
token &&
tokenPlusTwo &&
EXT_LATIN_CHARS.test(token) &&
EXT_LATIN_CHARS.test(tokenPlusTwo)

Check warning on line 50 in internal/diff_str.ts

View check run for this annotation

Codecov / codecov/patch

internal/diff_str.ts#L50

Added line #L50 was not covered by tests
) {
tokens[i] += tokenPlusTwo;
tokens.splice(i + 1, 2);
i--;

Check warning on line 54 in internal/diff_str.ts

View check run for this annotation

Codecov / codecov/patch

internal/diff_str.ts#L52-L54

Added lines #L52 - L54 were not covered by tests
}
return tokens;
}
return tokens;
}
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/).filter((line) => line);

// Create details by filtering relevant word-diff for current line
// and merge "space-diff" if surrounded by word-diff for cleaner displays
function createDetails(
line: DiffResult<string>,
tokens: Array<DiffResult<string>>,
) {
return tokens.filter(({ type }) => type === line.type || type === "common")
.map((result, i, t) => {
const token = t[i - 1];
if (
(result.type === "common") && token &&
(token.type === t[i + 1]?.type) && /\s+/.test(result.value)
) {
return {
...result,
type: token.type,
};
}
return result;
});
for (const [i, line] of lines.entries()) {
if (i % 2) {
tokens[tokens.length - 1] += line;
} else {
tokens.push(line);
}
}
return tokens;
}

/**
 * Builds the word-level details for one diff line.
 *
 * Only tokens that share the line's type, or are "common", are kept. A
 * "common" token containing whitespace that sits between two kept tokens of
 * the same type takes over that type, which gives a cleaner display.
 *
 * @param line Current line
 * @param tokens Word-diff tokens
 *
 * @returns Array of diff results.
 */
function createDetails(
  line: DiffResult<string>,
  tokens: Array<DiffResult<string>>,
) {
  const relevant = tokens.filter((candidate) =>
    candidate.type === line.type || candidate.type === "common"
  );
  return relevant.map((current, index) => {
    if (current.type !== "common") return current;
    // Neighbours are looked up in the filtered list, not in `tokens`.
    const previous = relevant[index - 1];
    if (!previous) return current;
    const next = relevant[index + 1];
    if (previous.type !== next?.type) return current;
    if (!/\s+/.test(current.value)) return current;
    return {
      ...current,
      type: previous.type,
    };
  });
}

/**
* Renders the differences between the actual and expected strings. Partially
* inspired by {@link https://github.com/kpdecker/jsdiff}.
*
* @param A Actual string
* @param B Expected string
*
* @returns Array of diff results.
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
// Compute multi-line diff
const diffResult = _internals.diff(
tokenize(`${unescape(A)}\n`),
Expand Down Expand Up @@ -120,8 +136,8 @@
while (bLines.length) {
b = bLines.shift();
const tokenized = [
tokenize(a.value, { wordDiff: true }),
tokenize(b?.value ?? "", { wordDiff: true }),
tokenize(a.value, true),
tokenize(b?.value ?? "", true),
] as [string[], string[]];
if (hasMoreRemovedLines) tokenized.reverse();
tokens = _internals.diff(tokenized[0], tokenized[1]);
Expand Down