diff --git a/vidyut-chandas/src/akshara.rs b/vidyut-chandas/src/akshara.rs
index 2e819c2..f0f42b6 100644
--- a/vidyut-chandas/src/akshara.rs
+++ b/vidyut-chandas/src/akshara.rs
@@ -1,7 +1,7 @@
 use crate::sounds;
 
 /// The weight of an akshara.
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub enum Weight {
     /// A *guru* or heavy syllable.
     G,
@@ -18,7 +18,7 @@ pub enum Weight {
 /// - It must not start with an anusvara or visarga.
 ///
 /// Together, these three rurles mean that an input string has exactly one division into aksharas.
-#[derive(Debug, Clone, Eq, PartialEq)]
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub struct Akshara {
     pub(crate) text: String,
     pub(crate) weight: Weight,
diff --git a/vidyut-chandas/src/padya.rs b/vidyut-chandas/src/padya.rs
index f5198c1..8539819 100644
--- a/vidyut-chandas/src/padya.rs
+++ b/vidyut-chandas/src/padya.rs
@@ -13,7 +13,7 @@ pub enum PatternWeight {
 }
 
 /// Describes how a vrtta matches some input.
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub enum MatchType {
     /// No match.
     None,
@@ -26,7 +26,7 @@ pub enum MatchType {
 }
 
 /// A traditional shorthand for vrtta weights.
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub enum Gana {
     /// *ya* (L G G)
     Ya,
@@ -257,7 +257,7 @@ impl TryFrom<&str> for Vrtta {
     }
 }
 
-#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub(crate) enum JatiKind {
     /// A default jati.
     Basic,
diff --git a/vidyut-lipi/Cargo.toml b/vidyut-lipi/Cargo.toml
index 57220f3..9fb9d65 100644
--- a/vidyut-lipi/Cargo.toml
+++ b/vidyut-lipi/Cargo.toml
@@ -18,10 +18,10 @@ clap = { version = "4.0.12", features = ["derive"] }
 wasm-bindgen = "0.2"
 serde-wasm-bindgen = "0.4"
 console_error_panic_hook = "0.1.7"
+unicode-normalization = "0.1.22"
 
 [lib]
 crate-type = ["cdylib", "rlib"]
 
 [dev-dependencies]
 codes-iso-15924 = { version = "0.1.3", default-features = false }
-unicode-normalization = "0.1.22"
diff --git a/vidyut-lipi/examples/sample.rs b/vidyut-lipi/examples/sample.rs
index f1d79ed..cda1635 100644
--- a/vidyut-lipi/examples/sample.rs
+++ b/vidyut-lipi/examples/sample.rs
@@ -10,6 +10,6 @@ fn main() {
     }
 
     let mut lipika = Lipika::new();
-    let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Tibetan);
-    println!("{output}");
+    let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Devanagari);
+    _ = lipika.transliterate(output, Scheme::Devanagari, Scheme::Slp1);
 }
diff --git a/vidyut-lipi/scripts/create_schemes.py b/vidyut-lipi/scripts/create_schemes.py
index 8d54232..5f1b9f6 100755
--- a/vidyut-lipi/scripts/create_schemes.py
+++ b/vidyut-lipi/scripts/create_schemes.py
@@ -14,6 +14,7 @@
 CRATE_DIR = Path(__file__).parent.parent
 
 
+# Scripts to use from `common_maps.git`
 ALLOWED = {
     "AHOM",
     "ASSAMESE",
@@ -71,6 +72,7 @@
 }
 
 
+# Human-readable names for Unicode combos
 KEY_NAMES = {
     "\u0905": "A",
     "\u0906": "AA",
@@ -216,6 +218,7 @@ def __init__(self, d):
         for k, v in d.items():
             setattr(self, k, v)
 
 
+# Mapping from names to Unicode sequences
 C = AttributeDict({v: k for k, v in KEY_NAMES.items()})
 
@@ -238,15 +241,16 @@ def __init__(self, d):
 }
 
 
+# Tweaks to the defaults in common_map
 OVERRIDES = {
     "BARAHA":
     # Existing accent marks seem to be mostly wrong -- delete so that we
     # can redefine them elsewhere.
     {
         "\u1ce1": None,
-        "\ua8e1": None,
-        "\ua8e2": None,
-        "\ua8e3": None,
+        C.COMBINING_DIGIT_1: None,
+        C.COMBINING_DIGIT_2: None,
+        C.COMBINING_DIGIT_3: None,
     },
     "GRANTHA": {
         # vowel sign AU
@@ -447,6 +451,7 @@ def __init__(self, d):
 }
 
 
+# Additional characters not present in common_map (or deleted in OVERRIDES)
 EXTENSIONS = {
     "ASSAMESE": [
         (C.CANDRABINDU_VIRAMA, "\u09fc"),
@@ -541,6 +546,16 @@ def __init__(self, d):
         (C.QA, "q"),
         (C.JIHVAMULIYA, "ẖ"),
         (C.UPADHMANIYA, "ḫ"),
+        (C.KA + C.VIRAMA + C.HA, "k:h"),
+        (C.GA + C.VIRAMA + C.HA, "g:h"),
+        (C.CA + C.VIRAMA + C.HA, "c:h"),
+        (C.JA + C.VIRAMA + C.HA, "j:h"),
+        (C.TTA + C.VIRAMA + C.HA, "ṭ:h"),
+        (C.DDA + C.VIRAMA + C.HA, "ḍ:h"),
+        (C.TA + C.VIRAMA + C.HA, "t:h"),
+        (C.DA + C.VIRAMA + C.HA, "d:h"),
+        (C.PA + C.VIRAMA + C.HA, "p:h"),
+        (C.BA + C.VIRAMA + C.HA, "b:h"),
     ],
     "KANNADA": [
         (C.JIHVAMULIYA, "\u0cf1"),
@@ -640,6 +655,7 @@ def _sanitize(s: str) -> str:
 
 
 def to_unique(xs: list) -> list:
+    """Remove duplicates from `xs`."""
     seen = set()
     ret = []
     for x in xs:
diff --git a/vidyut-lipi/src/autogen_schemes.rs b/vidyut-lipi/src/autogen_schemes.rs
index 89fc60e..5906ee3 100644
--- a/vidyut-lipi/src/autogen_schemes.rs
+++ b/vidyut-lipi/src/autogen_schemes.rs
@@ -5096,6 +5096,16 @@ pub const ISO_15919: &[(&str, &str)] = &[
     (TAMIL_AYTHAM, "ḳ"),
     (QA, "q"),
     (UPADHMANIYA, "ḫ"),
+    ("क्ह", "k:h"),
+    ("ग्ह", "g:h"),
+    ("च्ह", "c:h"),
+    ("ज्ह", "j:h"),
+    ("ट्ह", "ṭ:h"),
+    ("ड्ह", "ḍ:h"),
+    ("त्ह", "t:h"),
+    ("द्ह", "d:h"),
+    ("प्ह", "p:h"),
+    ("ब्ह", "b:h"),
     (E, "e"),
     (O, "o"),
     (SIGN_E, "e"),
diff --git a/vidyut-lipi/src/mapping.rs b/vidyut-lipi/src/mapping.rs
index bcdd9d4..88e3eb2 100644
--- a/vidyut-lipi/src/mapping.rs
+++ b/vidyut-lipi/src/mapping.rs
@@ -379,7 +379,9 @@ impl Mapping {
         for (k, v) in &b_map.numeral_to_int {
             int_to_numeral.insert(*v, k.to_string());
         }
-        let len_longest_key = all.keys().map(|a| a.len()).max().unwrap_or(0);
+        // Take length in *chars*, not in *bytes*.
+        // (Using chars over bytes offers a ~3x speedup in the core transliterate loop.)
+        let len_longest_key = all.keys().map(|a| a.chars().count()).max().unwrap_or(0);
         let numeral_to_int = a_map.numeral_to_int.clone();
 
         Self {
@@ -409,6 +411,7 @@ impl Mapping {
         self.all.get(key)
     }
 
+    /// Dumps this mapping's data to stdout.
     #[allow(unused)]
     pub(crate) fn dump(&self) {
         let mut items: Vec<_> = self.all.iter().collect();
diff --git a/vidyut-lipi/src/scheme.rs b/vidyut-lipi/src/scheme.rs
index 07b44d5..19e307e 100644
--- a/vidyut-lipi/src/scheme.rs
+++ b/vidyut-lipi/src/scheme.rs
@@ -613,6 +613,7 @@ impl Scheme {
         }
     }
 
+    #[cfg(target_arch = "wasm32")]
     pub(crate) fn unicode_composition_exclusions(&self) -> &[&str] {
         use crate::unicode_norm as u;
         use Scheme::*;
diff --git a/vidyut-lipi/src/transliterate.rs b/vidyut-lipi/src/transliterate.rs
index 23ab6b8..8c97a31 100644
--- a/vidyut-lipi/src/transliterate.rs
+++ b/vidyut-lipi/src/transliterate.rs
@@ -70,6 +70,29 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String {
             }
         }
 
+        // Special case: "from ISO 15919" separator logic for a:i, a:u
+        //
+        // (Consonants are handled in the mapping. We can't do the same for a:i and a:u because the
+        // implicit 'a' vowel causes problems.)
+        // TODO: is there a better place to put this?
+        if mapping.from == Scheme::Iso15919
+            && (input[i..].starts_with("a:i") || input[i..].starts_with("a:u"))
+        {
+            if is_to_abugida && had_virama {
+                // 'a' means we should pop virama.
+                output.pop();
+                had_virama = false;
+            } else {
+                // Otherwise, add independent 'a' vowel.
+                if let Some(x) = mapping.get("a") {
+                    output.push_str(x.text());
+                }
+            }
+            // Increment past "a:"
+            i += 2;
+            continue;
+        }
+
         // 1. Find the largest prefix of `input[i..]` that is defined in `mapping`.
         //
         // We must check for the *largest* match to distinguish between `b` and `bh`, `R` and `RR`,
@@ -141,6 +164,20 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String {
                     }
                 }
             }
+
+            // Special case: "to ISO 15919" separator logic for a:i, a:u
+            //
+            // (Consonants are handled in the mapping. We can't do the same for a:i and a:u because
+            // the implicit 'a' vowel causes problems.)
+            // TODO: is there a better place to put this?
+            if mapping.to == Scheme::Iso15919
+                && (output.ends_with("ai") || output.ends_with("au"))
+                && matches!(token.text(), "i" | "u")
+            {
+                output.pop();
+                output.push(':');
+                output.push_str(token.text());
+            }
         } else {
             // ITRANS: `\` skips the next character.
             if is_from_itrans {
diff --git a/vidyut-lipi/src/unicode_norm.rs b/vidyut-lipi/src/unicode_norm.rs
index ff0fcb7..4363354 100644
--- a/vidyut-lipi/src/unicode_norm.rs
+++ b/vidyut-lipi/src/unicode_norm.rs
@@ -27,6 +27,7 @@ use crate::scheme::Scheme;
 use rustc_hash::FxHashMap;
+use unicode_normalization::UnicodeNormalization;
 
 type Table = &'static [(&'static str, &'static str)];
 
@@ -117,6 +118,7 @@ pub const DEVANAGARI_NFD: Table = &[
 ];
 
 /// Characters that should not be created during NFD --> NFC.
+#[cfg(target_arch = "wasm32")]
 pub const DEVANAGARI_COMPOSITION_EXCLUSIONS: &[&str] = &[
     "\u{0958}", // ka
     "\u{0959}", // kha
@@ -139,6 +141,7 @@ pub const BENGALI_NFD: Table = &[
 ];
 
 /// Characters that should not be created during NFD --> NFC.
+#[cfg(target_arch = "wasm32")]
 pub const BENGALI_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{09dc}", "\u{09dd}", "\u{09df}"];
 
 /// Spec:
@@ -178,6 +181,7 @@ pub const GURMUKHI_NFD: Table = &[
 ];
 
 /// Spec:
+#[cfg(target_arch = "wasm32")]
 pub const GURMUKHI_COMPOSITION_EXCLUSIONS: &[&str] = &[
     "\u{0a33}", "\u{0a36}", "\u{0a59}", "\u{0a5a}", "\u{0a5b}", "\u{0a5e}",
 ];
@@ -207,6 +211,7 @@ pub const ORIYA_NFD: Table = &[
     ("\u{0b5d}", "\u{0b22}\u{0b3c}"), // letter rha
 ];
 
+#[cfg(target_arch = "wasm32")]
 pub const ORIYA_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{0b5c}", "\u{0b5d}"];
 
 /// Spec:
@@ -253,9 +258,17 @@ pub const TIRHUTA_NFD: Table = &[
 ///
 /// Only characters that appear in one of our `Scheme`s will be converted. All other characters
 /// will be left as-is.
+#[cfg(not(target_arch = "wasm32"))]
+pub(crate) fn to_nfc(s: &str) -> String {
+    s.nfc().collect()
+}
+
+/// WASM-only version of `to_nfc`.
 ///
-/// TODO: consider using `unicode_normalization` in non-WASM with conditional compilation. Leaning
-/// against due to having to reason about two different systems.
+/// The `unicode_normalization` implementation of this logic is substantially faster (which
+/// motivates using it in non-WASM builds) but also much larger (which motivates avoiding it in
+/// WASM builds).
+#[cfg(target_arch = "wasm32")]
 pub(crate) fn to_nfc(s: &str) -> String {
     let mut map = FxHashMap::default();
     let mut len_longest_key = 0;
@@ -305,6 +318,7 @@ pub(crate) fn to_nfc(s: &str) -> String {
 ///
 /// Our version of `to_nfd` supports only those characters that are part of a `Scheme`. All other
 /// characters are left unchanged.
+#[allow(unused)]
 pub(crate) fn to_nfd(s: &str) -> String {
     let mut map: FxHashMap = FxHashMap::default();
diff --git a/vidyut-lipi/tests/basic.rs b/vidyut-lipi/tests/basic.rs
index 070621b..ee1aef2 100644
--- a/vidyut-lipi/tests/basic.rs
+++ b/vidyut-lipi/tests/basic.rs
@@ -982,6 +982,56 @@ fn iso_15919_bug_no_greedy_match_on_nfd() {
     );
 }
 
+#[test]
+fn iso_15919_colon_separator() {
+    // Consonants
+    assert_two_way_pairwise(&[
+        (
+            Iso15919,
+            "k:ha g:ha c:ha j:ha ṭ:ha ḍ:ha t:ha d:ha p:ha b:ha",
+        ),
+        (Slp1, "kha gha cha jha wha qha tha dha pha bha"),
+        (Devanagari, "क्ह ग्ह च्ह ज्ह ट्ह ड्ह त्ह द्ह प्ह ब्ह"),
+        (Kannada, "ಕ್ಹ ಗ್ಹ ಚ್ಹ ಜ್ಹ ಟ್ಹ ಡ್ಹ ತ್ಹ ದ್ಹ ಪ್ಹ ಬ್ಹ"),
+    ]);
+
+    // Consonants with marks
+    assert_two_way_pairwise(&[
+        (
+            Iso15919,
+            "k:hā g:hā c:hā j:hā ṭ:hā ḍ:hā t:hā d:hā p:hā b:hā",
+        ),
+        (Slp1, "khA ghA chA jhA whA qhA thA dhA phA bhA"),
+        (Devanagari, "क्हा ग्हा च्हा ज्हा ट्हा ड्हा त्हा द्हा प्हा ब्हा"),
+        (Kannada, "ಕ್ಹಾ ಗ್ಹಾ ಚ್ಹಾ ಜ್ಹಾ ಟ್ಹಾ ಡ್ಹಾ ತ್ಹಾ ದ್ಹಾ ಪ್ಹಾ ಬ್ಹಾ"),
+    ]);
+
+    // Consonants with viramas
+    assert_two_way_pairwise(&[
+        (Iso15919, "k:h g:h c:h j:h ṭ:h ḍ:h t:h d:h p:h b:h"),
+        (Slp1, "kh gh ch jh wh qh th dh ph bh"),
+        (Devanagari, "क्ह् ग्ह् च्ह् ज्ह् ट्ह् ड्ह् त्ह् द्ह् प्ह् ब्ह्"),
+        (Kannada, "ಕ್ಹ್ ಗ್ಹ್ ಚ್ಹ್ ಜ್ಹ್ ಟ್ಹ್ ಡ್ಹ್ ತ್ಹ್ ದ್ಹ್ ಪ್ಹ್ ಬ್ಹ್"),
+    ]);
+
+    // Vowels
+    assert_two_way_pairwise(&[
+        (Iso15919, "a:i a:u ka:i ka:u"),
+        (Slp1, "ai au kai kau"),
+        (Devanagari, "अइ अउ कइ कउ"),
+        (Kannada, "ಅಇ ಅಉ ಕಇ ಕಉ"),
+    ]);
+
+    // Regular colons -- ignore
+    // TODO: what's the best policy for handling these?
+    assert_two_way_pairwise(&[
+        (Iso15919, "a: ka: k: a:ā k:ta"),
+        (Slp1, "a: ka: k: a:A k:ta"),
+        (Devanagari, "अ: क: क्: अ:आ क्:त"),
+        (Kannada, "ಅ: ಕ: ಕ್: ಅ:ಆ ಕ್:ತ"),
+    ]);
+}
+
 #[test]
 fn iso_15919_tamil_aytam() {
     assert_transliterate("ஃ", Tamil, Iso15919, "ḳ");
diff --git a/vidyut-prakriya/src/args/pratipadika.rs b/vidyut-prakriya/src/args/pratipadika.rs
index 73dfc28..44ec5ef 100644
--- a/vidyut-prakriya/src/args/pratipadika.rs
+++ b/vidyut-prakriya/src/args/pratipadika.rs
@@ -3,6 +3,22 @@ use crate::args::{Krdanta, Samasa, Taddhitanta};
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 
+/// Models a basic *prātipadika* that is not created with any other *pratyaya*s.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+pub struct BasicPratipadika {
+    pub(crate) text: String,
+    pub(crate) is_avyaya: bool,
+    pub(crate) is_nyap: bool,
+}
+
+impl BasicPratipadika {
+    /// Returns the text that constitutes this pratipadika.
+    pub fn text(&self) -> &str {
+        &self.text
+    }
+}
+
 /// A nominal stem.
 ///
 /// Rules 1.2.45 and 1.2.46 define a pratipadika as either:
@@ -25,15 +41,6 @@ pub enum Pratipadika {
     Samasa(Box<Samasa>),
 }
 
-/// Models a basic *prātipadika* that is not created with any other *pratyaya*s.
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
-#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
-pub struct BasicPratipadika {
-    pub(crate) text: String,
-    pub(crate) is_avyaya: bool,
-    pub(crate) is_nyap: bool,
-}
-
 impl Pratipadika {
     /// (unstable) A simple constructor for `Pratipadika::Basic`.
     pub fn basic(text: impl AsRef<str>) -> Self {
diff --git a/vidyut-prakriya/src/args/taddhita.rs b/vidyut-prakriya/src/args/taddhita.rs
index 86ef95f..adaa5f2 100644
--- a/vidyut-prakriya/src/args/taddhita.rs
+++ b/vidyut-prakriya/src/args/taddhita.rs
@@ -847,6 +847,17 @@ pub struct Taddhitanta {
 }
 
 impl Taddhitanta {
+    /// Defines a simple `Taddhitanta`.
+    ///
+    /// For more options, use `Taddhitanta::builder()` instead.
+    pub fn new(pratipadika: Pratipadika, taddhita: Taddhita) -> Self {
+        Self {
+            pratipadika,
+            taddhita,
+            artha: None,
+            require: None,
+        }
+    }
     /// Returns a new builder for this struct.
     pub fn builder() -> TaddhitantaBuilder {
         TaddhitantaBuilder::default()
diff --git a/vidyut-prakriya/src/core/prakriya.rs b/vidyut-prakriya/src/core/prakriya.rs
index f8ec2cb..c8579fa 100644
--- a/vidyut-prakriya/src/core/prakriya.rs
+++ b/vidyut-prakriya/src/core/prakriya.rs
@@ -17,29 +17,39 @@ pub type Code = &'static str;
 ///
 /// Most of a derivation's rules come directly from the Ashtadhyayi. But, some derivations use
 /// rules from other sources. We use this model to clearly define where different rules come from.
-#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)]
+#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
 pub enum Rule {
-    /// A sutra from the Ashtadhyayi. The string data here is an adhyaya-pada-sutra string, e.g.
-    /// "3.1.68".
+    /// A sutra from the Ashtadhyayi.
+    ///
+    /// Format: "<adhyaya>.<pada>.<sutra>"
     Ashtadhyayi(&'static str),
-    /// A varttika on the Ashtadhyayi. The first string is an adhyaya-pada-sutra string, e.g.
-    /// "3.1.68",a nd the second string is an integer corresponding to the vArttika's position on
-    /// the sutra, e.g. "2" for the second vArttika on some sUtra.
+    /// A varttika on the Ashtadhyayi.
+    ///
+    /// Format: "<adhyaya>.<pada>.<sutra>.<varttika>"
     Varttika(&'static str),
-    /// A sutra from the Dhatupatha. The string data here is a gana-sutra string, e.g. "10.0493".
+    /// A sutra from the Dhatupatha.
+    ///
+    /// Format: "<gana>.<sutra>"
     Dhatupatha(&'static str),
-    /// A sutra from the Unadipatha. The string here is a gana-sutra string, e.g. "1.1".
+    /// A sutra from the Unadipatha.
+    ///
+    /// Format: "<gana>.<sutra>"
     Unadipatha(&'static str),
-    /// A sutra from the Paniniya-Linganushasanam. The string here is the sutra's position in the
-    /// text, e.g. "40".
+    /// A sutra from the Paniniya-Linganushasanam.
+    ///
+    /// Format: "<sutra>"
     Linganushasana(&'static str),
-    /// A sutra from the Phit Sutras. The string here is a gana-sutra string, e.g. "1.1".
+    /// A sutra from the Phit Sutras.
+    ///
+    /// Format: "<gana>.<sutra>"
     Phit(&'static str),
-    /// A comment in the Kashika-vrtti on a specific sutra. The string data here is an
-    /// adhyaya-pada-sutra string that describes the sutra being commented on.
+    /// A comment in the Kashika-vrtti on a specific sutra.
+    ///
+    /// Format: "<adhyaya>.<pada>.<sutra>"
     Kashika(&'static str),
-    /// A quotation from the Vaiyakarana-siddhanta-kaumudi. The string here is the position of the
-    /// sutra being commented on in Kaumudi order, e.g. "446".
+    /// A quotation from the Vaiyakarana-siddhanta-kaumudi.
+    ///
+    /// Format: "<sutra>"
     Kaumudi(&'static str),
 }
 
@@ -74,7 +84,7 @@ impl From<&'static str> for Rule {
 /// structure with more information about the specific change. For example, we might explicitly
 /// indicate which term in the result was changed, which kind of rule was replied, and whether this
 /// rule was optional.
-#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+#[derive(Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
 pub struct Step {
     rule: Rule,
     result: Vec<StepTerm>,
@@ -93,7 +103,7 @@ impl Step {
 }
 
 /// One of the terms in the derivation.
-#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+#[derive(Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
 pub struct StepTerm {
     text: String,
     // NOTE: keep `tags` private.
@@ -114,7 +124,7 @@ impl StepTerm {
 }
 
 /// Records whether an optional rule was accepted or declined.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
 pub enum RuleChoice {
     /// Indicates that a rule was accepted during the derivation.
     Accept(Rule),
@@ -123,7 +133,7 @@ pub enum RuleChoice {
 }
 
 /// Configuration options that affect how a `Prakriya` behaves during the derivation.
-#[derive(Clone, Default, Debug)]
+#[derive(Clone, Default, Debug, Eq, PartialEq)]
 pub(crate) struct Config {
     pub rule_choices: Vec<RuleChoice>,
     pub log_steps: bool,
@@ -167,7 +177,7 @@ impl Config {
 /// For example, we might want the derivation to use *chandasi* rules, or we might wish to block
 /// such rules. Or, we might want to skip history logging so that we can generate words more
 /// quickly.
-#[derive(Default, Debug)]
+#[derive(Clone, Default, Debug, Eq, PartialEq)]
 pub struct Prakriya {
     terms: Vec<Term>,
     tags: EnumSet<Tag>,
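
Illustrative usage of the new ISO 15919 ":" separator handling (not part of the patch above): a minimal sketch built on the `Lipika` API shown in examples/sample.rs, with expected outputs taken from the `iso_15919_colon_separator` test. The `use vidyut_lipi::{Lipika, Scheme};` import is an assumption and is not shown in the diff.

// Hypothetical usage sketch; not part of the patch.
use vidyut_lipi::{Lipika, Scheme};

fn main() {
    let mut lipika = Lipika::new();

    // "k:h" is k + virama + h (क्ह), which is distinct from the aspirate "kh" (ख),
    // and "a:i" / "a:u" are separate vowels rather than the diphthongs "ai" / "au".
    let input = String::from("k:ha g:ha a:i a:u");
    let deva = lipika.transliterate(input, Scheme::Iso15919, Scheme::Devanagari);
    assert_eq!(deva, "क्ह ग्ह अइ अउ");

    // Converting back to ISO 15919 should reinsert the ":" separators.
    let iso = lipika.transliterate(deva, Scheme::Devanagari, Scheme::Iso15919);
    assert_eq!(iso, "k:ha g:ha a:i a:u");
}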