Skip to content

Commit

Permalink
refactor(internal): cleanup and restructure diffstr() (denoland#4703)
Browse files Browse the repository at this point in the history
* refactor(internal): cleanup and restructure `diffstr()`

* work
  • Loading branch information
iuioiua authored May 9, 2024
1 parent 54f93b8 commit c53ca87
Showing 1 changed file with 98 additions and 82 deletions.
180 changes: 98 additions & 82 deletions internal/diff_str.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,95 +3,111 @@ import type { DiffResult } from "./_types.ts";
import { diff } from "./diff.ts";

/**
* Renders the differences between the actual and expected strings
* Partially inspired from https://github.com/kpdecker/jsdiff
* @param A Actual string
* @param B Expected string
* Unescape invisible characters.
*
* @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
*
* @param string String to unescape.
*
* @returns Unescaped string.
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
/**
 * Makes invisible characters visible by spelling them as escape sequences.
 *
 * Backspace, form feed, tab and vertical tab are replaced by `\b`, `\f`, `\t`
 * and `\v` written out literally. A `\n` or `\r\n` line break is prefixed with
 * its visible spelling and the real line break is kept; a lone `\r` is
 * replaced by the two characters `\r`.
 *
 * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
 *
 * @param string Text that may contain invisible characters.
 *
 * @returns The text with those characters made visible.
 */
function unescape(string: string): string {
  // Spell out a single matched line break; "\n" and "\r\n" stay in the output.
  const showLineBreak = (lineBreak: string): string => {
    if (lineBreak === "\r") return "\\r";
    if (lineBreak === "\n") return "\\n\n";
    return "\\r\\n\r\n";
  };

  let visible = string;
  visible = visible.replaceAll("\b", "\\b");
  visible = visible.replaceAll("\f", "\\f");
  visible = visible.replaceAll("\t", "\\t");
  visible = visible.replaceAll("\v", "\\v");
  return visible.replaceAll(/\r\n|\r|\n/g, showLineBreak);
}

function tokenize(string: string, { wordDiff = false } = {}): string[] {
if (wordDiff) {
// Split string on whitespace symbols
const tokens = string.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
// Extended Latin character set
const words =
/^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;

// Join boundary splits that we do not consider to be boundaries and merge empty strings surrounded by word chars
for (let i = 0; i < tokens.length - 1; i++) {
const token = tokens[i];
const tokenPlusTwo = tokens[i + 2];
if (
!tokens[i + 1] &&
token &&
tokenPlusTwo &&
words.test(token) &&
words.test(tokenPlusTwo)
) {
tokens[i] += tokenPlusTwo;
tokens.splice(i + 1, 2);
i--;
}
}
return tokens.filter((token) => token);
} else {
// Split string on new lines symbols
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/);
/**
 * Replaces invisible characters with a visible escape-sequence spelling.
 *
 * `\b`, `\f`, `\t` and `\v` become their two-character spellings. For line
 * breaks, `\n` and `\r\n` are prefixed with their spelling while the actual
 * break is preserved, and a lone `\r` is replaced by its spelling only.
 *
 * @param string String to process.
 *
 * @returns The string with invisible characters made visible.
 */
function unescape(string: string): string {
  // Control characters paired with the text shown in their place.
  const controlEscapes: ReadonlyArray<readonly [string, string]> = [
    ["\b", "\\b"],
    ["\f", "\\f"],
    ["\t", "\\t"],
    ["\v", "\\v"],
  ];

  let output = string;
  for (const [raw, shown] of controlEscapes) {
    output = output.replaceAll(raw, shown);
  }

  // Line feeds (with or without a preceding CR) are kept after their spelling.
  return output.replaceAll(/\r\n|\r|\n/g, (lineBreak) => {
    switch (lineBreak) {
      case "\r":
        return "\\r";
      case "\n":
        return "\\n\n";
      default:
        return "\\r\\n\r\n";
    }
  });
}

// Ignore final empty token when text ends with a newline
if (!lines[lines.length - 1]) {
lines.pop();
}
// Separator pattern for word-level tokenization. It matches a run of
// whitespace other than CR/LF, a single bracket, quote, CR or LF character,
// or a zero-width word boundary. The whole pattern is one capture group, so
// `String.prototype.split` keeps the matched separators in its result.
const WHITESPACE_SYMBOLS = /([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/;
// Matches a token made up entirely of ASCII letters and the listed extended
// Latin code point ranges.
// NOTE(review): \u{C0}-\u{FF} already contains \u{D8}-\u{F6}, so U+00D7 (×)
// and U+00F7 (÷) are accepted as word characters — confirm this is intended.
const EXT_LATIN_CHARS =
/^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;

// Merge the content and line separators into single tokens
for (const [i, line] of lines.entries()) {
if (i % 2) {
tokens[tokens.length - 1] += line;
} else {
tokens.push(line);
}
/**
* Tokenizes a string into an array of tokens.
*
* @param string The string to tokenize.
* @param wordDiff If true, performs word-based tokenization. Default is false.
*
* @returns An array of tokens.
*/
function tokenize(string: string, wordDiff = false): string[] {
if (wordDiff) {
const tokens = string.split(WHITESPACE_SYMBOLS).filter((token) => token);
for (let i = 0; i < tokens.length - 1; i++) {
const token = tokens[i];
const tokenPlusTwo = tokens[i + 2];
if (
!tokens[i + 1] &&
token &&
tokenPlusTwo &&
EXT_LATIN_CHARS.test(token) &&
EXT_LATIN_CHARS.test(tokenPlusTwo)
) {
tokens[i] += tokenPlusTwo;
tokens.splice(i + 1, 2);
i--;
}
return tokens;
}
return tokens;
}
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/).filter((line) => line);

// Create details by filtering relevant word-diff for current line
// and merge "space-diff" if surrounded by word-diff for cleaner displays
function createDetails(
line: DiffResult<string>,
tokens: Array<DiffResult<string>>,
) {
return tokens.filter(({ type }) => type === line.type || type === "common")
.map((result, i, t) => {
const token = t[i - 1];
if (
(result.type === "common") && token &&
(token.type === t[i + 1]?.type) && /\s+/.test(result.value)
) {
return {
...result,
type: token.type,
};
}
return result;
});
for (const [i, line] of lines.entries()) {
if (i % 2) {
tokens[tokens.length - 1] += line;
} else {
tokens.push(line);
}
}
return tokens;
}

/**
 * Builds the word-level details for one diff line.
 *
 * Keeps only the word-diff tokens whose type matches the line's type or is
 * "common". A kept "common" token whose value contains whitespace and whose
 * neighbours (within the kept tokens) share the same type takes on that type,
 * which gives cleaner displays.
 *
 * @param line Current line
 * @param tokens Word-diff tokens
 *
 * @returns Array of diff results.
 */
function createDetails(
  line: DiffResult<string>,
  tokens: Array<DiffResult<string>>,
) {
  const relevant = tokens.filter((candidate) =>
    candidate.type === line.type || candidate.type === "common"
  );

  return relevant.map((current, index) => {
    const previous = relevant[index - 1];
    // Only "common" tokens that have a predecessor can be merged.
    if (current.type !== "common" || !previous) return current;
    // Both neighbours must be of the same type.
    if (previous.type !== relevant[index + 1]?.type) return current;
    if (!/\s+/.test(current.value)) return current;
    return {
      ...current,
      type: previous.type,
    };
  });
}

/**
* Renders the differences between the actual and expected strings. Partially
* inspired by {@link https://github.com/kpdecker/jsdiff}.
*
* @param A Actual string
* @param B Expected string
*
* @returns Array of diff results.
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
// Compute multi-line diff
const diffResult = diff(
tokenize(`${unescape(A)}\n`),
Expand Down Expand Up @@ -120,8 +136,8 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
while (bLines.length) {
b = bLines.shift();
const tokenized = [
tokenize(a.value, { wordDiff: true }),
tokenize(b?.value ?? "", { wordDiff: true }),
tokenize(a.value, true),
tokenize(b?.value ?? "", true),
] as [string[], string[]];
if (hasMoreRemovedLines) tokenized.reverse();
tokens = diff(tokenized[0], tokenized[1]);
Expand Down

0 comments on commit c53ca87

Please sign in to comment.