Skip to content

Commit

Permalink
fix: add pairs group check
Browse files (browse the repository at this point in the history)
  • Loading branch information
makamekm committed Dec 2, 2024
1 parent 9d46a10 commit 59dbbf6
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 37 deletions.
6 changes: 3 additions & 3 deletions src/constants/pairs.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
 * A pair of RegExp factories. Each factory evaluates a regex literal when it
 * is called, so every call returns a RegExp object of its own.
 */
export type RegExpPair = [() => RegExp, () => RegExp];

/**
 * Marker patterns for `**`, `_` and `~~`.
 *
 * In every pair the first pattern matches the marker followed by one
 * character, and the second pattern matches one character followed by the
 * marker. All patterns carry the `g`, `i` and `u` flags.
 *
 * NOTE(review): this listing comes from a diff view — the first three entries
 * (using `[\p{L}\W]`) appear to be the pre-change patterns and the last three
 * (using `[\p{L}_~*]`) their replacements; confirm against the actual file.
 * NOTE(review): presumably factories are used so that no `g`-flagged RegExp
 * instance is shared between callers — confirm.
 */
export const REGEXP_PAIRS: RegExpPair[] = [
[() => /\*\*[\p{L}\W]/giu, () => /[\p{L}\W]\*\*/giu],
[() => /_[\p{L}\W]/giu, () => /[\p{L}\W]_/giu],
[() => /~~[\p{L}\W]/giu, () => /[\p{L}\W]~~/giu],
[() => /\*\*[\p{L}_~*]/giu, () => /[\p{L}_~*]\*\*/giu],
[() => /_[\p{L}_~*]/giu, () => /[\p{L}_~*]_/giu],
[() => /~~[\p{L}_~*]/giu, () => /[\p{L}_~*]~~/giu],
];
35 changes: 1 addition & 34 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import {anyPass, call, compose, zipWith} from 'ramda';

import {fstChars, lstChars, sentences} from './parsers';
import {fstChars, groupPairs, lstChars, sentences} from './parsers';
import {
leftAbbreviation,
leftEndsWithHardbreak,
Expand All @@ -18,7 +18,6 @@ import {
rightStartsWithLowercase,
spaceBothSides,
} from './rules';
import {REGEXP_PAIRS, RegExpPair} from './constants';

// sides preprocessing before evaluation
const leftPreprocessor = lstChars(20);
Expand Down Expand Up @@ -52,38 +51,6 @@ const join = compose(joinCondition, zipWith<any, any, any>(call, sidesPreprocess
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const breaks = compose(breakCondition, zipWith<any, any, any>(call, sidesPreprocessors));

/**
 * Tells whether `str` contains a different number of matches for the first
 * pattern of `regExp` than for the second one.
 *
 * A nullish `str` counts as zero matches on both sides, so it is reported as
 * paired (`false`).
 */
function isUnpairedStr(str: string, regExp: RegExpPair) {
    // The factory is only invoked when `str` is non-nullish, and a `null`
    // result from `match` (no hits) is counted as zero.
    const countMatches = (makePattern: () => RegExp): number =>
        (str?.match(makePattern()) ?? []).length;

    const firstCount = countMatches(regExp[0]);
    const secondCount = countMatches(regExp[1]);

    return firstCount !== secondCount;
}

/**
 * Returns `true` as soon as any pair from `REGEXP_PAIRS` is unbalanced in
 * `str` according to `isUnpairedStr`; returns `false` when every pair balances.
 */
function isUnpaired(str: string) {
    return REGEXP_PAIRS.some((pair) => isUnpairedStr(str, pair));
}

/**
 * Glues neighbouring chunks together while the chunk at the current position
 * is reported as unpaired by `isUnpaired`.
 *
 * The `parsed` array is modified in place and the same array is returned.
 * The last chunk is never checked on its own because there is nothing left
 * to merge it with.
 */
function groupPairs(parsed: string[]) {
    let position = 0;

    while (position < parsed.length - 1) {
        const head = parsed[position];

        if (!isUnpaired(head)) {
            position += 1;
            continue;
        }

        // Replace the two chunks with their concatenation and re-check the
        // merged chunk on the next pass (position is left untouched).
        const tail = parsed[position + 1] || '';
        parsed.splice(position, 2, head + tail);
    }

    return parsed;
}

// sentences processing
export function sentenize(text: string): string[] {
const parts = text.split(/((?:\n\s*){2,})/);
Expand Down
19 changes: 19 additions & 0 deletions src/parsers/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
fstChars,
fstToken,
fstWord,
groupPairs,
lstChars,
lstToken,
lstWord,
Expand Down Expand Up @@ -76,6 +77,24 @@ describe('sentences', () => {
expect(actual).toStrictEqual(expected[i]);
}
});
it('should split text on delimiter(s) with groupPairs special symbols **', () => {
    // Each case is [raw text, sentences expected after regrouping]. The `**`
    // span in every input stretches across several sentence delimiters.
    const cases: Array<[string, string[]]> = [
        [
            'Предложение **один.Предложение два.Предложение** три.Предложение четыре.',
            ['Предложение **один.Предложение два.Предложение** три.', 'Предложение четыре.'],
        ],
        [
            'Предложение **один?Предложение два?Предложение** три?Предложение четыре?',
            ['Предложение **один?Предложение два?Предложение** три?', 'Предложение четыре?'],
        ],
        [
            'Предложение **один!Предложение два!Предложение** три!Предложение четыре!',
            ['Предложение **один!Предложение два!Предложение** три!', 'Предложение четыре!'],
        ],
        [
            'Предложение **один…Предложение два…Предложение** три…Предложение четыре…',
            ['Предложение **один…Предложение два…Предложение** три…', 'Предложение четыре…'],
        ],
    ];

    for (const [text, wanted] of cases) {
        const actual = groupPairs(sentences(text));
        expect(actual).toStrictEqual(wanted);
    }
});
it('should default to array of single element of original text in case of one sentence', () => {
const input = 'Одно длинное предложение без разделителя';
const expected = ['Одно длинное предложение без разделителя'];
Expand Down
35 changes: 35 additions & 0 deletions src/parsers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import {
BRACKETS_CLOSE_MARKERS,
QUOTATION_CLOSE_MARKERS,
QUOTATION_GENERIC_MARKERS,
REGEXP_PAIRS,
RegExpPair,
SENTENCE_END_MARKERS,
WINDOW_WIDTH,
} from '../constants';
Expand Down Expand Up @@ -130,8 +132,41 @@ const dotSuffixRegExp = new RegExp(dotSuffixPattern, dotSuffixFlags);

const dotSuffix = compose(defaultTo(''), snd, match(dotSuffixRegExp));

/**
 * Checks whether the two patterns of `regExp` match `str` a different number
 * of times.
 *
 * When `str` is nullish neither factory is called and both counts are zero,
 * so the result is `false`.
 */
function isUnpairedStr(str: string, regExp: RegExpPair) {
    const [makeOpening, makeClosing] = regExp;

    // `match` yields `null` when nothing is found; treat that as zero hits.
    const openingCount = (str?.match(makeOpening()) ?? []).length;
    const closingCount = (str?.match(makeClosing()) ?? []).length;

    return openingCount !== closingCount;
}

/**
 * Reports whether at least one entry of `REGEXP_PAIRS` is unbalanced in `str`
 * (as decided by `isUnpairedStr`). Evaluation stops at the first unbalanced
 * pair.
 */
function isUnpaired(str: string) {
    return REGEXP_PAIRS.some((pair) => isUnpairedStr(str, pair));
}

/**
 * Merges each chunk that `isUnpaired` flags with the chunk that follows it,
 * repeating until the merged chunk is no longer flagged or no following chunk
 * remains.
 *
 * Works directly on `parsed` (the array is spliced in place) and returns that
 * same array.
 */
function groupPairs(parsed: string[]) {
    for (let cursor = 0; cursor < parsed.length - 1; ) {
        const chunk = parsed[cursor];

        if (isUnpaired(chunk)) {
            // Collapse the chunk and its successor into one element; the
            // cursor stays put so the merged chunk is examined again.
            const successor = parsed[cursor + 1] || '';
            parsed.splice(cursor, 2, chunk + successor);
        } else {
            cursor += 1;
        }
    }

    return parsed;
}

export {
sentences,
groupPairs,
words,
delimiters,
fst,
Expand Down

0 comments on commit 59dbbf6

Please sign in to comment.