Skip to content

Commit

Permalink
fix: add pairs group check
Browse files Browse the repository at this point in the history
  • Loading branch information
makamekm committed Dec 2, 2024
1 parent 77a297f commit 1d4bd52
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 2 deletions.
3 changes: 3 additions & 0 deletions src/constants/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ export {
export {WINDOW_WIDTH} from './parameters';

export {INITIALS, HEAD, TAIL, OTHER, HEAD_PAIR, TAIL_PAIR, OTHER_PAIR} from './abbreviations';

export {REGEXP_PAIRS} from './pairs';
export type {RegExpPair} from './pairs';
7 changes: 7 additions & 0 deletions src/constants/pairs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/**
 * Two RegExp factories describing one kind of paired marker.
 *
 * - element 0 builds a pattern for the marker immediately followed by a letter
 *   (or another marker character);
 * - element 1 builds a pattern for the marker immediately preceded by one.
 *
 * Each factory evaluates a regex literal, so every call yields a new RegExp
 * object (presumably so that the stateful `g` flag is never shared).
 */
export type RegExpPair = [() => RegExp, () => RegExp];

// Marker `**`.
const DOUBLE_ASTERISK_PAIR: RegExpPair = [
    () => /\*\*[\p{L}_~*]/giu,
    () => /[\p{L}_~*]\*\*/giu,
];

// Marker `_`.
const UNDERSCORE_PAIR: RegExpPair = [
    () => /_[\p{L}_~*]/giu,
    () => /[\p{L}_~*]_/giu,
];

// Marker `~~`.
const DOUBLE_TILDE_PAIR: RegExpPair = [
    () => /~~[\p{L}_~*]/giu,
    () => /[\p{L}_~*]~~/giu,
];

/** All marker pairs that are checked, in evaluation order. */
export const REGEXP_PAIRS: RegExpPair[] = [
    DOUBLE_ASTERISK_PAIR,
    UNDERSCORE_PAIR,
    DOUBLE_TILDE_PAIR,
];
4 changes: 2 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import {anyPass, call, compose, zipWith} from 'ramda';

import {fstChars, lstChars, sentences} from './parsers';
import {fstChars, groupPairs, lstChars, sentences} from './parsers';
import {
leftAbbreviation,
leftEndsWithHardbreak,
Expand Down Expand Up @@ -80,5 +80,5 @@ export function sentenize(text: string): string[] {
}
}

return parsed;
return groupPairs(parsed);
}
19 changes: 19 additions & 0 deletions src/parsers/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
fstChars,
fstToken,
fstWord,
groupPairs,
lstChars,
lstToken,
lstWord,
Expand Down Expand Up @@ -76,6 +77,24 @@ describe('sentences', () => {
expect(actual).toStrictEqual(expected[i]);
}
});
// Sentence boundaries that fall between an opening `**` and its closing `**`
// are expected to be glued back together by groupPairs.
it('should split text on delimiter(s) with groupPairs special symbols **', () => {
    // Each case: [raw text, expected sentences after grouping].
    const cases: Array<[string, string[]]> = [
        [
            'Предложение **один.Предложение два.Предложение** три.Предложение четыре.',
            ['Предложение **один.Предложение два.Предложение** три.', 'Предложение четыре.'],
        ],
        [
            'Предложение **один?Предложение два?Предложение** три?Предложение четыре?',
            ['Предложение **один?Предложение два?Предложение** три?', 'Предложение четыре?'],
        ],
        [
            'Предложение **один!Предложение два!Предложение** три!Предложение четыре!',
            ['Предложение **один!Предложение два!Предложение** три!', 'Предложение четыре!'],
        ],
        [
            'Предложение **один…Предложение два…Предложение** три…Предложение четыре…',
            ['Предложение **один…Предложение два…Предложение** три…', 'Предложение четыре…'],
        ],
    ];

    cases.forEach(([text, wanted]) => {
        const grouped = groupPairs(sentences(text));
        expect(grouped).toStrictEqual(wanted);
    });
});
it('should default to array of single element of original text in case of one sentence', () => {
const input = 'Одно длинное предложение без разделителя';
const expected = ['Одно длинное предложение без разделителя'];
Expand Down
35 changes: 35 additions & 0 deletions src/parsers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import {
BRACKETS_CLOSE_MARKERS,
QUOTATION_CLOSE_MARKERS,
QUOTATION_GENERIC_MARKERS,
REGEXP_PAIRS,
RegExpPair,
SENTENCE_END_MARKERS,
WINDOW_WIDTH,
} from '../constants';
Expand Down Expand Up @@ -130,8 +132,41 @@ const dotSuffixRegExp = new RegExp(dotSuffixPattern, dotSuffixFlags);

const dotSuffix = compose(defaultTo(''), snd, match(dotSuffixRegExp));

/**
 * Reports whether `str` contains a different number of matches for the two
 * patterns produced by `regExp`.
 *
 * @param str - text to inspect; a nullish value is treated as having no matches.
 * @param regExp - pair of RegExp factories; each one is invoked once per call.
 * @returns `true` when the two match counts differ, `false` otherwise.
 */
function isUnpairedStr(str: string, regExp: RegExpPair) {
    const [makeFirst, makeSecond] = regExp;

    // Optional chaining skips the factory call entirely when `str` is nullish;
    // a failed match (null) also counts as zero.
    const countMatches = (makePattern: () => RegExp): number =>
        str?.match(makePattern())?.length ?? 0;

    const firstCount = countMatches(makeFirst);
    const secondCount = countMatches(makeSecond);

    return firstCount !== secondCount;
}

/**
 * Checks `str` against every entry of REGEXP_PAIRS.
 *
 * @param str - text to inspect.
 * @returns `true` as soon as one pair reports unequal match counts
 *          (later pairs are then not evaluated), `false` if none does.
 */
function isUnpaired(str: string) {
    return REGEXP_PAIRS.some((markerPair) => isUnpairedStr(str, markerPair));
}

/**
 * Glues neighbouring chunks back together while the text accumulated so far
 * is reported as unpaired by `isUnpaired`.
 *
 * Walking left to right, a chunk is appended to the previous result element
 * when that element is unpaired; otherwise it starts a new element. The
 * merged text is re-checked before the next chunk, so one element may absorb
 * several chunks (all remaining ones, if it never becomes paired). The final
 * element is never checked itself because nothing follows it.
 *
 * The input array is left untouched; a new array is returned.
 *
 * @param parsed - chunks in document order.
 * @returns a new array with the same text, regrouped.
 */
function groupPairs(parsed: string[]): string[] {
    const grouped: string[] = [];
    // Element currently being built; undefined only before the first chunk.
    let pending: string | undefined;

    for (const chunk of parsed) {
        if (pending === undefined) {
            pending = chunk;
        } else if (isUnpaired(pending)) {
            // Still unbalanced: keep extending the same element.
            pending += chunk;
        } else {
            grouped.push(pending);
            pending = chunk;
        }
    }

    if (pending !== undefined) {
        grouped.push(pending);
    }

    return grouped;
}

export {
sentences,
groupPairs,
words,
delimiters,
fst,
Expand Down

0 comments on commit 1d4bd52

Please sign in to comment.