Skip to content

Commit

Permalink
[lipi] Support ISO : and improve runtime
Browse files Browse the repository at this point in the history
Runtime improvement is approximately 2x:

    Benchmark: `make profile-time-osx`
    Before: 22.52s
    After : 11.21s

The main improvements are:
- using `unicode_normalization` instead of our hand-rolled function.
- fixing a bug in the `len_longest_key` calculation that made the value
  ~3x larger than it should be. The algorithm itself is still slow, but
  this change quickly puts out the fire.

In addition, this commit adds `Ord`/`PartialOrd` to a variety of types
and improves various comments.
  • Loading branch information
akprasad committed Nov 20, 2024
1 parent 6022952 commit c426cdf
Show file tree
Hide file tree
Showing 14 changed files with 202 additions and 43 deletions.
4 changes: 2 additions & 2 deletions vidyut-chandas/src/akshara.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::sounds;

/// The weight of an akshara.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum Weight {
/// A *guru* or heavy syllable.
G,
Expand All @@ -18,7 +18,7 @@ pub enum Weight {
/// - It must not start with an anusvara or visarga.
///
/// Together, these three rules mean that an input string has exactly one division into aksharas.
#[derive(Debug, Clone, Eq, PartialEq)]
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct Akshara {
pub(crate) text: String,
pub(crate) weight: Weight,
Expand Down
6 changes: 3 additions & 3 deletions vidyut-chandas/src/padya.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pub enum PatternWeight {
}

/// Describes how a vrtta matches some input.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd)]
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum MatchType {
/// No match.
None,
Expand All @@ -26,7 +26,7 @@ pub enum MatchType {
}

/// A traditional shorthand for vrtta weights.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum Gana {
/// *ya* (L G G)
Ya,
Expand Down Expand Up @@ -257,7 +257,7 @@ impl TryFrom<&str> for Vrtta {
}
}

#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub(crate) enum JatiKind {
/// A default jati.
Basic,
Expand Down
2 changes: 1 addition & 1 deletion vidyut-lipi/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ clap = { version = "4.0.12", features = ["derive"] }
wasm-bindgen = "0.2"
serde-wasm-bindgen = "0.4"
console_error_panic_hook = "0.1.7"
unicode-normalization = "0.1.22"

[lib]
crate-type = ["cdylib", "rlib"]

[dev-dependencies]
codes-iso-15924 = { version = "0.1.3", default-features = false }
unicode-normalization = "0.1.22"
4 changes: 2 additions & 2 deletions vidyut-lipi/examples/sample.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ fn main() {
}

let mut lipika = Lipika::new();
let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Tibetan);
println!("{output}");
let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Devanagari);
_ = lipika.transliterate(output, Scheme::Devanagari, Scheme::Slp1);
}
22 changes: 19 additions & 3 deletions vidyut-lipi/scripts/create_schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

CRATE_DIR = Path(__file__).parent.parent

# Scripts to use from `common_maps.git`
ALLOWED = {
"AHOM",
"ASSAMESE",
Expand Down Expand Up @@ -71,6 +72,7 @@
}


# Human-readable names for Unicode combos
KEY_NAMES = {
"\u0905": "A",
"\u0906": "AA",
Expand Down Expand Up @@ -216,6 +218,7 @@ def __init__(self, d):
for k, v in d.items():
setattr(self, k, v)

# Mapping from names to Unicode sequences
C = AttributeDict({v: k for k, v in KEY_NAMES.items()})


Expand All @@ -238,15 +241,16 @@ def __init__(self, d):
}


# Tweaks to the defaults in common_map
OVERRIDES = {
"BARAHA":
# Existing accent marks seem to be mostly wrong -- delete so that we
# can redefine them elsewhere.
{
"\u1ce1": None,
"\ua8e1": None,
"\ua8e2": None,
"\ua8e3": None,
C.COMBINING_DIGIT_1: None,
C.COMBINING_DIGIT_2: None,
C.COMBINING_DIGIT_3: None,
},
"GRANTHA": {
# vowel sign AU
Expand Down Expand Up @@ -447,6 +451,7 @@ def __init__(self, d):
}


# Additional characters not present in common_map (or deleted in OVERRIDES)
EXTENSIONS = {
"ASSAMESE": [
(C.CANDRABINDU_VIRAMA, "\u09fc"),
Expand Down Expand Up @@ -541,6 +546,16 @@ def __init__(self, d):
(C.QA, "q"),
(C.JIHVAMULIYA, "ẖ"),
(C.UPADHMANIYA, "ḫ"),
(C.KA + C.VIRAMA + C.HA, "k:h"),
(C.GA + C.VIRAMA + C.HA, "g:h"),
(C.CA + C.VIRAMA + C.HA, "c:h"),
(C.JA + C.VIRAMA + C.HA, "j:h"),
(C.TTA + C.VIRAMA + C.HA, "ṭ:h"),
(C.DDA + C.VIRAMA + C.HA, "ḍ:h"),
(C.TA + C.VIRAMA + C.HA, "t:h"),
(C.DA + C.VIRAMA + C.HA, "d:h"),
(C.PA + C.VIRAMA + C.HA, "p:h"),
(C.BA + C.VIRAMA + C.HA, "b:h"),
],
"KANNADA": [
(C.JIHVAMULIYA, "\u0cf1"),
Expand Down Expand Up @@ -640,6 +655,7 @@ def _sanitize(s: str) -> str:


def to_unique(xs: list) -> list:
"""Remove duplicates from `xs`."""
seen = set()
ret = []
for x in xs:
Expand Down
10 changes: 10 additions & 0 deletions vidyut-lipi/src/autogen_schemes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5096,6 +5096,16 @@ pub const ISO_15919: &[(&str, &str)] = &[
(TAMIL_AYTHAM, "ḳ"),
(QA, "q"),
(UPADHMANIYA, "ḫ"),
("क्ह", "k:h"),
("ग्ह", "g:h"),
("च्ह", "c:h"),
("ज्ह", "j:h"),
("ट्ह", "ṭ:h"),
("ड्ह", "ḍ:h"),
("त्ह", "t:h"),
("द्ह", "d:h"),
("प्ह", "p:h"),
("ब्ह", "b:h"),
(E, "e"),
(O, "o"),
(SIGN_E, "e"),
Expand Down
5 changes: 4 additions & 1 deletion vidyut-lipi/src/mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,9 @@ impl Mapping {
for (k, v) in &b_map.numeral_to_int {
int_to_numeral.insert(*v, k.to_string());
}
let len_longest_key = all.keys().map(|a| a.len()).max().unwrap_or(0);
// Take length in *chars*, not in *bytes*.
// (Using chars over bytes offers a ~3x speedup in the core transliterate loop.)
let len_longest_key = all.keys().map(|a| a.chars().count()).max().unwrap_or(0);
let numeral_to_int = a_map.numeral_to_int.clone();

Self {
Expand Down Expand Up @@ -409,6 +411,7 @@ impl Mapping {
self.all.get(key)
}

/// Dumps this mapping's data to stdout.
#[allow(unused)]
pub(crate) fn dump(&self) {
let mut items: Vec<_> = self.all.iter().collect();
Expand Down
1 change: 1 addition & 0 deletions vidyut-lipi/src/scheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,7 @@ impl Scheme {
}
}

#[cfg(target_arch = "wasm32")]
pub(crate) fn unicode_composition_exclusions(&self) -> &[&str] {
use crate::unicode_norm as u;
use Scheme::*;
Expand Down
37 changes: 37 additions & 0 deletions vidyut-lipi/src/transliterate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,29 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String {
}
}

// Special case: from ISO-15919 separator logic for a:i, a:u
//
// (consonants are handled in the mapping. We can't do the same for a:i and a:u because the
// implicit 'a' vowel causes problems.)
// TODO: is there a better place to put this?
if mapping.from == Scheme::Iso15919
&& (input[i..].starts_with("a:i") || input[i..].starts_with("a:u"))
{
if is_to_abugida && had_virama {
// 'a' means we should pop virama.
output.pop();
had_virama = false;
} else {
// Otherwise, add independent 'a' vowel.
if let Some(x) = mapping.get("a") {
output.push_str(x.text());
}
}
// Increment past "a:"
i += 2;
continue;
}

// 1. Find the largest prefix of `input[i..]` that is defined in `mapping`.
//
// We must check for the *largest* match to distinguish between `b` and `bh`, `R` and `RR`,
Expand Down Expand Up @@ -141,6 +164,20 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String {
}
}
}

// Special case: to ISO-15919 separator logic for a:i, a:u
//
// (consonants are handled in the mapping. We can't do the same for a:i and a:u because
// the implicit 'a' vowel causes problems.)
// TODO: is there a better place to put this?
if mapping.to == Scheme::Iso15919
&& (output.ends_with("ai") || output.ends_with("au"))
&& matches!(token.text(), "i" | "u")
{
output.pop();
output.push(':');
output.push_str(token.text());
}
} else {
// ITRANS: `\` skips the next character.
if is_from_itrans {
Expand Down
18 changes: 16 additions & 2 deletions vidyut-lipi/src/unicode_norm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
use crate::scheme::Scheme;
use rustc_hash::FxHashMap;
use unicode_normalization::UnicodeNormalization;

type Table = &'static [(&'static str, &'static str)];

Expand Down Expand Up @@ -117,6 +118,7 @@ pub const DEVANAGARI_NFD: Table = &[
];

/// Characters that should not be created during NFD --> NFC.
#[cfg(target_arch = "wasm32")]
pub const DEVANAGARI_COMPOSITION_EXCLUSIONS: &[&str] = &[
"\u{0958}", // ka
"\u{0959}", // kha
Expand All @@ -139,6 +141,7 @@ pub const BENGALI_NFD: Table = &[
];

/// Characters that should not be created during NFD --> NFC.
#[cfg(target_arch = "wasm32")]
pub const BENGALI_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{09dc}", "\u{09dd}", "\u{09df}"];

/// Spec: <https://unicode.org/charts/PDF/U1000.pdf>
Expand Down Expand Up @@ -178,6 +181,7 @@ pub const GURMUKHI_NFD: Table = &[
];

/// Spec: <https://unicode.org/charts/PDF/U0A00.pdf>
#[cfg(target_arch = "wasm32")]
pub const GURMUKHI_COMPOSITION_EXCLUSIONS: &[&str] = &[
"\u{0a33}", "\u{0a36}", "\u{0a59}", "\u{0a5a}", "\u{0a5b}", "\u{0a5e}",
];
Expand Down Expand Up @@ -207,6 +211,7 @@ pub const ORIYA_NFD: Table = &[
("\u{0b5d}", "\u{0b22}\u{0b3c}"), // letter rha
];

#[cfg(target_arch = "wasm32")]
pub const ORIYA_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{0b5c}", "\u{0b5d}"];

/// Spec: <https://unicode.org/charts/PDF/U11580.pdf>
Expand Down Expand Up @@ -253,9 +258,17 @@ pub const TIRHUTA_NFD: Table = &[
///
/// Only characters that appear in one of our `Scheme`s will be converted. All other characters
/// will be left as-is.
#[cfg(not(target_arch = "wasm32"))]
pub(crate) fn to_nfc(s: &str) -> String {
s.nfc().collect()
}

/// WASM-only version of `to_nfc`.
///
/// TODO: consider using `unicode_normalization` in non-WASM with conditional compilation. Leaning
/// against due to having to reason about two different systems.
/// The `unicode_normalization` implementation of this logic is substantially faster (which
/// motivates using it in non-WASM builds) but also much larger (which motivates avoiding it in
/// WASM builds).
#[cfg(target_arch = "wasm32")]
pub(crate) fn to_nfc(s: &str) -> String {
let mut map = FxHashMap::default();
let mut len_longest_key = 0;
Expand Down Expand Up @@ -305,6 +318,7 @@ pub(crate) fn to_nfc(s: &str) -> String {
///
/// Our version of `to_nfd` supports only those characters that are part of a `Scheme`. All other
/// characters are left unchanged.
#[allow(unused)]
pub(crate) fn to_nfd(s: &str) -> String {
let mut map: FxHashMap<String, String> = FxHashMap::default();

Expand Down
50 changes: 50 additions & 0 deletions vidyut-lipi/tests/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,56 @@ fn iso_15919_bug_no_greedy_match_on_nfd() {
);
}

#[test]
fn iso_15919_colon_separator() {
// Consonants
assert_two_way_pairwise(&[
(
Iso15919,
"k:ha g:ha c:ha j:ha ṭ:ha ḍ:ha t:ha d:ha p:ha b:ha",
),
(Slp1, "kha gha cha jha wha qha tha dha pha bha"),
(Devanagari, "क्ह ग्ह च्ह ज्ह ट्ह ड्ह त्ह द्ह प्ह ब्ह"),
(Kannada, "ಕ್ಹ ಗ್ಹ ಚ್ಹ ಜ್ಹ ಟ್ಹ ಡ್ಹ ತ್ಹ ದ್ಹ ಪ್ಹ ಬ್ಹ"),
]);

// Consonants with marks
assert_two_way_pairwise(&[
(
Iso15919,
"k:hā g:hā c:hā j:hā ṭ:hā ḍ:hā t:hā d:hā p:hā b:hā",
),
(Slp1, "khA ghA chA jhA whA qhA thA dhA phA bhA"),
(Devanagari, "क्हा ग्हा च्हा ज्हा ट्हा ड्हा त्हा द्हा प्हा ब्हा"),
(Kannada, "ಕ್ಹಾ ಗ್ಹಾ ಚ್ಹಾ ಜ್ಹಾ ಟ್ಹಾ ಡ್ಹಾ ತ್ಹಾ ದ್ಹಾ ಪ್ಹಾ ಬ್ಹಾ"),
]);

// Consonants with viramas
assert_two_way_pairwise(&[
(Iso15919, "k:h g:h c:h j:h ṭ:h ḍ:h t:h d:h p:h b:h"),
(Slp1, "kh gh ch jh wh qh th dh ph bh"),
(Devanagari, "क्ह् ग्ह् च्ह् ज्ह् ट्ह् ड्ह् त्ह् द्ह् प्ह् ब्ह्"),
(Kannada, "ಕ್ಹ್ ಗ್ಹ್ ಚ್ಹ್ ಜ್ಹ್ ಟ್ಹ್ ಡ್ಹ್ ತ್ಹ್ ದ್ಹ್ ಪ್ಹ್ ಬ್ಹ್"),
]);

// Vowels
assert_two_way_pairwise(&[
(Iso15919, "a:i a:u ka:i ka:u"),
(Slp1, "ai au kai kau"),
(Devanagari, "अइ अउ कइ कउ"),
(Kannada, "ಅಇ ಅಉ ಕಇ ಕಉ"),
]);

// Regular colons -- ignore
// TODO: what's the best policy for handling these?
assert_two_way_pairwise(&[
(Iso15919, "a: ka: k: a:ā k:ta"),
(Slp1, "a: ka: k: a:A k:ta"),
(Devanagari, "अ: क: क्: अ:आ क्:त"),
(Kannada, "ಅ: ಕ: ಕ್: ಅ:ಆ ಕ್:ತ"),
]);
}

#[test]
fn iso_15919_tamil_aytam() {
assert_transliterate("ஃ", Tamil, Iso15919, "ḳ");
Expand Down
Loading

0 comments on commit c426cdf

Please sign in to comment.