Skip to content

Commit

Permalink
Merge pull request #131 from jfkthame/isolates-opt
Browse files Browse the repository at this point in the history
Use a simplified codepath if no bidi isolation controls are present.
  • Loading branch information
Manishearth authored Mar 5, 2024
2 parents ea27a79 + aa06343 commit ca612da
Show file tree
Hide file tree
Showing 4 changed files with 295 additions and 103 deletions.
31 changes: 30 additions & 1 deletion src/explicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,25 @@ use super::char_data::{
BidiClass::{self, *},
};
use super::level::Level;
use super::prepare::removed_by_x9;
use super::LevelRunVec;
use super::TextSource;

/// Compute explicit embedding levels for one paragraph of text (X1-X8).
/// Compute explicit embedding levels for one paragraph of text (X1-X8), and identify
/// level runs (BD7) for use when determining Isolating Run Sequences (X10).
///
/// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`,
/// for each char in `text`.
///
/// `runs` returns the list of level runs (BD7) of the text.
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn compute<'a, T: TextSource<'a> + ?Sized>(
text: &'a T,
para_level: Level,
original_classes: &[BidiClass],
levels: &mut [Level],
processing_classes: &mut [BidiClass],
runs: &mut LevelRunVec,
) {
assert_eq!(text.len(), original_classes.len());

Expand All @@ -51,6 +57,9 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>(
let mut overflow_embedding_count = 0u32;
let mut valid_isolate_count = 0u32;

let mut current_run_level = Level::ltr();
let mut current_run_start = 0;

for (i, len) in text.indices_lengths() {
let last = stack.last().unwrap();

Expand Down Expand Up @@ -182,6 +191,26 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>(
levels[i + j] = levels[i];
processing_classes[i + j] = processing_classes[i];
}

// Identify level runs to be passed to prepare::isolating_run_sequences().
if i == 0 {
// Initialize for the first (or only) run.
current_run_level = levels[i];
} else {
// Check if we need to start a new level run.
// <https://www.unicode.org/reports/tr9/#BD7>
if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
// End the last run and start a new one.
runs.push(current_run_start..i);
current_run_level = levels[i];
current_run_start = i;
}
}
}

// Append the trailing level run, if non-empty.
if levels.len() > current_run_start {
runs.push(current_run_start..levels.len());
}
}

Expand Down
104 changes: 83 additions & 21 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ mod prepare;
pub use crate::char_data::{BidiClass, UNICODE_VERSION};
pub use crate::data_source::BidiDataSource;
pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL};
pub use crate::prepare::LevelRun;
pub use crate::prepare::{LevelRun, LevelRunVec};

#[cfg(feature = "hardcoded-data")]
pub use crate::char_data::{bidi_class, HardcodedBidiData};
Expand Down Expand Up @@ -248,8 +248,14 @@ struct InitialInfoExt<'text> {

/// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
/// requires no further bidi processing (i.e. there are no RTL characters or bidi
/// control codes present).
pure_ltr: Vec<bool>,
/// control codes present), and whether any bidi isolation controls are present.
flags: Vec<ParagraphInfoFlags>,
}

#[derive(PartialEq, Debug)]
struct ParagraphInfoFlags {
is_pure_ltr: bool,
has_isolate_controls: bool,
}

impl<'text> InitialInfoExt<'text> {
Expand All @@ -269,12 +275,12 @@ impl<'text> InitialInfoExt<'text> {
default_para_level: Option<Level>,
) -> InitialInfoExt<'a> {
let mut paragraphs = Vec::<ParagraphInfo>::new();
let mut pure_ltr = Vec::<bool>::new();
let (original_classes, _, _) = compute_initial_info(
let mut flags = Vec::<ParagraphInfoFlags>::new();
let (original_classes, _, _, _) = compute_initial_info(
data_source,
text,
default_para_level,
Some((&mut paragraphs, &mut pure_ltr)),
Some((&mut paragraphs, &mut flags)),
);

InitialInfoExt {
Expand All @@ -283,7 +289,7 @@ impl<'text> InitialInfoExt<'text> {
original_classes,
paragraphs,
},
pure_ltr,
flags,
}
}
}
Expand All @@ -299,8 +305,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
data_source: &D,
text: &'a T,
default_para_level: Option<Level>,
mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<bool>)>,
) -> (Vec<BidiClass>, Level, bool) {
mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<ParagraphInfoFlags>)>,
) -> (Vec<BidiClass>, Level, bool, bool) {
let mut original_classes = Vec::with_capacity(text.len());

// The stack contains the starting code unit index for each nested isolate we're inside.
Expand All @@ -310,8 +316,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
let mut isolate_stack = Vec::new();

debug_assert!(
if let Some((ref paragraphs, ref pure_ltr)) = split_paragraphs {
paragraphs.is_empty() && pure_ltr.is_empty()
if let Some((ref paragraphs, ref flags)) = split_paragraphs {
paragraphs.is_empty() && flags.is_empty()
} else {
true
}
Expand All @@ -323,6 +329,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
// Per-paragraph flag: can subsequent processing be skipped? Set to false if any
// RTL characters or bidi control characters are encountered in the paragraph.
let mut is_pure_ltr = true;
// Set to true if any bidi isolation controls are present in the paragraph.
let mut has_isolate_controls = false;

#[cfg(feature = "flame_it")]
flame::start("compute_initial_info(): iter text.char_indices()");
Expand All @@ -341,7 +349,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(

match class {
B => {
if let Some((ref mut paragraphs, ref mut pure_ltr)) = split_paragraphs {
if let Some((ref mut paragraphs, ref mut flags)) = split_paragraphs {
// P1. Split the text into separate paragraphs. The paragraph separator is kept
// with the previous paragraph.
let para_end = i + len;
Expand All @@ -350,14 +358,18 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
// P3. If no character is found in p2, set the paragraph level to zero.
level: para_level.unwrap_or(LTR_LEVEL),
});
pure_ltr.push(is_pure_ltr);
flags.push(ParagraphInfoFlags {
is_pure_ltr,
has_isolate_controls,
});
// Reset state for the start of the next paragraph.
para_start = para_end;
// TODO: Support defaulting to direction of previous paragraph
//
// <http://www.unicode.org/reports/tr9/#HL1>
para_level = default_para_level;
is_pure_ltr = true;
has_isolate_controls = false;
isolate_stack.clear();
}
}
Expand Down Expand Up @@ -394,6 +406,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(

RLI | LRI | FSI => {
is_pure_ltr = false;
has_isolate_controls = true;
isolate_stack.push(i);
}

Expand All @@ -405,15 +418,18 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
}
}

if let Some((paragraphs, pure_ltr)) = split_paragraphs {
if let Some((paragraphs, flags)) = split_paragraphs {
if para_start < text.len() {
paragraphs.push(ParagraphInfo {
range: para_start..text.len(),
level: para_level.unwrap_or(LTR_LEVEL),
});
pure_ltr.push(is_pure_ltr);
flags.push(ParagraphInfoFlags {
is_pure_ltr,
has_isolate_controls,
});
}
debug_assert_eq!(paragraphs.len(), pure_ltr.len());
debug_assert_eq!(paragraphs.len(), flags.len());
}
debug_assert_eq!(original_classes.len(), text.len());

Expand All @@ -424,6 +440,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
original_classes,
para_level.unwrap_or(LTR_LEVEL),
is_pure_ltr,
has_isolate_controls,
)
}

Expand Down Expand Up @@ -482,20 +499,21 @@ impl<'text> BidiInfo<'text> {
text: &'a str,
default_para_level: Option<Level>,
) -> BidiInfo<'a> {
let InitialInfoExt { base, pure_ltr, .. } =
let InitialInfoExt { base, flags, .. } =
InitialInfoExt::new_with_data_source(data_source, text, default_para_level);

let mut levels = Vec::<Level>::with_capacity(text.len());
let mut processing_classes = base.original_classes.clone();

for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) {
for (para, flags) in base.paragraphs.iter().zip(flags.iter()) {
let text = &text[para.range.clone()];
let original_classes = &base.original_classes[para.range.clone()];

compute_bidi_info_for_para(
data_source,
para,
*is_pure_ltr,
flags.is_pure_ltr,
flags.has_isolate_controls,
text,
original_classes,
&mut processing_classes,
Expand Down Expand Up @@ -720,7 +738,7 @@ impl<'text> ParagraphBidiInfo<'text> {
) -> ParagraphBidiInfo<'a> {
// Here we could create a ParagraphInitialInfo struct to parallel the one
// used by BidiInfo, but there doesn't seem any compelling reason for it.
let (original_classes, paragraph_level, is_pure_ltr) =
let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) =
compute_initial_info(data_source, text, default_para_level, None);

let mut levels = Vec::<Level>::with_capacity(text.len());
Expand All @@ -738,6 +756,7 @@ impl<'text> ParagraphBidiInfo<'text> {
data_source,
&para_info,
is_pure_ltr,
has_isolate_controls,
text,
&original_classes,
&mut processing_classes,
Expand Down Expand Up @@ -1066,6 +1085,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>
data_source: &D,
para: &ParagraphInfo,
is_pure_ltr: bool,
has_isolate_controls: bool,
text: &'a T,
original_classes: &[BidiClass],
processing_classes: &mut [BidiClass],
Expand All @@ -1079,16 +1099,26 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>

let processing_classes = &mut processing_classes[para.range.clone()];
let levels = &mut levels[para.range.clone()];
let mut level_runs = LevelRunVec::new();

explicit::compute(
text,
para.level,
original_classes,
levels,
processing_classes,
&mut level_runs,
);

let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
let mut sequences = prepare::IsolatingRunSequenceVec::new();
prepare::isolating_run_sequences(
para.level,
original_classes,
levels,
level_runs,
has_isolate_controls,
&mut sequences,
);
for sequence in &sequences {
implicit::resolve_weak(text, sequence, processing_classes);
implicit::resolve_neutral(
Expand All @@ -1100,6 +1130,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>
processing_classes,
);
}

implicit::resolve_levels(processing_classes, levels);

assign_levels_to_removed_chars(para.level, original_classes, levels);
Expand Down Expand Up @@ -1549,6 +1580,24 @@ mod tests {
#[cfg(feature = "hardcoded-data")]
fn test_process_text() {
let tests = vec![
(
// text
"",
// base level
Some(RTL_LEVEL),
// levels
Level::vec(&[]),
// original_classes
vec![],
// paragraphs
vec![],
// levels_u16
Level::vec(&[]),
// original_classes_u16
vec![],
// paragraphs_u16
vec![],
),
(
// text
"abc123",
Expand Down Expand Up @@ -1710,6 +1759,19 @@ mod tests {
paragraphs: t.4.clone(),
}
);
// If it was empty, also test that ParagraphBidiInfo handles it safely.
if t.4.len() == 0 {
assert_eq!(
ParagraphBidiInfo::new(t.0, t.1),
ParagraphBidiInfo {
text: t.0,
original_classes: t.3.clone(),
levels: t.2.clone(),
paragraph_level: RTL_LEVEL,
is_pure_ltr: true,
}
)
}
// If it was a single paragraph, also test ParagraphBidiInfo.
if t.4.len() == 1 {
assert_eq!(
Expand Down
Loading

0 comments on commit ca612da

Please sign in to comment.