From 863c667e056e20583c701f3a7f7d99c575f77f7f Mon Sep 17 00:00:00 2001 From: Arun Prasad Date: Tue, 24 Dec 2024 23:21:20 -0800 Subject: [PATCH] [kosha] Simplify and optimize `create_kosha` This commit aims to improve the ergonomics, build time, and disk usage of `Kosha` and its associated classes. It is likely the first of several such commits, but it is an important checkpoint as we continue to lean more and more on data from `vidyut-prakriya`. --- Cargo.lock | 44 +- Makefile | 8 +- scripts/create_all_data.sh | 2 +- src/bin/create_kosha.rs | 1183 +++++++---------- src/bin/eval_cheda.rs | 29 +- src/bin/test_kosha.rs | 31 +- src/bin/train_cheda.rs | 4 +- todo.md | 2 + vidyut-chandas/src/akshara.rs | 6 + vidyut-chandas/src/chandas.rs | 1 - vidyut-chandas/src/error.rs | 22 +- vidyut-chandas/src/lib.rs | 3 +- vidyut-chandas/src/macros.rs | 47 + vidyut-chandas/src/padya.rs | 86 +- vidyut-cheda/src/bin/cheda.rs | 75 +- .../src/{segmenting.rs => chedaka.rs} | 35 +- vidyut-cheda/src/dcs.rs | 166 ++- vidyut-cheda/src/lib.rs | 4 +- vidyut-cheda/src/scoring.rs | 39 +- vidyut-cheda/src/strict_mode.rs | 39 +- vidyut-cheda/tests/error_messages.rs | 6 +- vidyut-kosha/Cargo.toml | 5 +- vidyut-kosha/bcd.md | 9 + vidyut-kosha/benches/kosha.rs | 4 +- vidyut-kosha/src/entries.rs | 217 +++ vidyut-kosha/src/errors.rs | 11 + vidyut-kosha/src/kosha.rs | 164 ++- vidyut-kosha/src/lib.rs | 2 +- vidyut-kosha/src/morph.rs | 835 ------------ vidyut-kosha/src/packing.rs | 659 ++++----- vidyut-kosha/tests/error_messages.rs | 10 +- vidyut-prakriya/Cargo.toml | 8 +- vidyut-prakriya/examples/create_pada.rs | 91 ++ vidyut-prakriya/src/angasya/subanta.rs | 2 +- vidyut-prakriya/src/args/krt.rs | 8 +- vidyut-prakriya/src/args/pada.rs | 4 +- vidyut-prakriya/src/args/pratipadika.rs | 9 + vidyut-prakriya/src/args/slp1_string.rs | 7 + vidyut-prakriya/src/ashtadhyayi.rs | 11 +- vidyut-prakriya/src/dhatu_karya.rs | 5 +- vidyut-prakriya/src/pratipadika_karya.rs | 2 +- vidyut-prakriya/src/samasa.rs | 32 +- 
vidyut-prakriya/src/samjna.rs | 12 +- vidyut-prakriya/src/stem_gana.rs | 6 +- vidyut-prakriya/src/stritva.rs | 5 + vidyut-prakriya/src/tripadi/pada_8_2.rs | 2 +- vidyut-prakriya/src/tripadi/pada_8_3.rs | 2 +- .../tests/integration/kashika_6_4.rs | 2 + .../tests/integration/kaumudi_13.rs | 1 + .../tests/integration/regressions.rs | 23 +- vidyut-sandhi/src/generator.rs | 8 +- vidyut-sandhi/src/sounds.rs | 1 + vidyut-sandhi/src/splitter.rs | 9 +- 53 files changed, 1719 insertions(+), 2279 deletions(-) create mode 100644 todo.md create mode 100644 vidyut-chandas/src/macros.rs rename vidyut-cheda/src/{segmenting.rs => chedaka.rs} (93%) create mode 100644 vidyut-kosha/bcd.md create mode 100644 vidyut-kosha/src/entries.rs delete mode 100644 vidyut-kosha/src/morph.rs create mode 100644 vidyut-prakriya/examples/create_pada.rs diff --git a/Cargo.lock b/Cargo.lock index 6f5bf1f..18194e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -433,7 +433,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", ] [[package]] @@ -444,7 +444,7 @@ checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core", "quote", - "syn 2.0.39", + "syn 2.0.87", ] [[package]] @@ -497,7 +497,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", ] [[package]] @@ -971,7 +971,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", ] [[package]] @@ -1099,18 +1099,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -1260,9 +1260,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "serde" -version = "1.0.193" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" dependencies = [ "serde_derive", ] @@ -1280,22 +1280,23 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.193" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", ] [[package]] name = "serde_json" -version = "1.0.109" +version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0652c533506ad7a2e353cce269330d6afd8bdfb6d75e0ace5b35aacbd7b9e9" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -1375,9 +1376,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.39" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -1452,7 +1453,7 @@ checksum = "268026685b2be38d7103e9e507c938a1fcb3d7e6eb15e87870b617bf37b6d581" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", ] [[package]] @@ -1509,7 +1510,7 @@ checksum = 
"34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", ] [[package]] @@ -1673,6 +1674,7 @@ dependencies = [ "rand", "rustc-hash 1.1.0", "serde", + "serde_json", "tempfile", "vidyut-prakriya", ] @@ -1762,7 +1764,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", "wasm-bindgen-shared", ] @@ -1784,7 +1786,7 @@ checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.87", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/Makefile b/Makefile index b7ffe87..67f9f6c 100644 --- a/Makefile +++ b/Makefile @@ -34,12 +34,13 @@ create_sandhi_rules: RUST_LOG=info cargo run --release --bin create_sandhi_rules -- \ --data-dir data/build/vidyut-latest -# Creates a koshas and write it to disk. +# Creates a kosha and write it to disk. create_kosha: RUST_LOG=info cargo run --release --bin create_kosha -- \ --input-dir data/raw/lex \ --dhatupatha vidyut-prakriya/data/dhatupatha.tsv \ - --output-dir data/build/vidyut-latest + --output-dir data/build/vidyut-latest/kosha + # Trains a padaccheda model and saves important features to disk. # NOTE: when training, exclude the file paths used in `make eval`. @@ -57,7 +58,8 @@ train_cheda: # Runs basic end-to-end tests against the given kosha. test_kosha: - RUST_LOG=info cargo run --release --bin test_kosha -- --data-dir data/build/vidyut-latest/kosha + RUST_LOG=info cargo run --release --bin test_kosha -- \ + --data-dir data/build/vidyut-latest/kosha # Evaluate our parsing quality on a large sample of text. 
diff --git a/scripts/create_all_data.sh b/scripts/create_all_data.sh index 84d872b..7e7ed0a 100755 --- a/scripts/create_all_data.sh +++ b/scripts/create_all_data.sh @@ -43,7 +43,7 @@ echo "=========================" echo "vidyut-chandas" echo "=========================" mkdir -p "${OUTPUT_DIR}/chandas" -cp -r vidyut-chandas/data "${OUTPUT_DIR}/chandas" +cp -r vidyut-chandas/data/* "${OUTPUT_DIR}/chandas" echo "Copied files to output dir." echo echo "=========================" diff --git a/src/bin/create_kosha.rs b/src/bin/create_kosha.rs index 3bb476f..3606bcd 100644 --- a/src/bin/create_kosha.rs +++ b/src/bin/create_kosha.rs @@ -1,91 +1,61 @@ -//! Creates an FST kosha using our raw linguistic data. -//! -//! This binary is computationally intensive and may take several minutes. -//! -//! TODO: -//! - prefixes -//! - sya-Satf, sya-SAnac, ya-SAnac -//! - upasarga + tvA (upAsitvA, etc.) -//! - pada variants -//! - dedupe krdantas with existing nominals -//! - update `morph` encoding for krdantas +//! Creates a kosha of Sanskrit words and writes the results to disk. use clap::Parser; -use lazy_static::lazy_static; use log::info; use rayon::prelude::*; use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::process; -use vidyut_cheda::sounds::{is_ac, is_ghosha, is_hal}; -use vidyut_cheda::Config; -use vidyut_kosha::morph::*; +use vidyut_kosha::entries::{DhatuEntry, PadaEntry, PratipadikaEntry, SubantaEntry}; +use vidyut_kosha::packing::PackedEntry; use vidyut_kosha::Builder; -use vidyut_prakriya::args as vp; -use vidyut_prakriya::dhatupatha::Entry as DhatuEntry; +use vidyut_prakriya::args::*; use vidyut_prakriya::{Dhatupatha, Vyakarana}; type Result = std::result::Result>; type UpasargaDhatuMap = HashMap>>; - -/// A list of pratipadikas. -type StemVec = Vec<(String, Pratipadika)>; - -/// A list of complete padas. -type PadaVec = Vec<(String, Pada)>; - -/// A list of sup pratyayas. 
-type SupVec = Vec<(String, String, Pada)>; +type Pratipadikas = Vec<(String, PratipadikaEntry)>; +type Padas = Vec<(String, PackedEntry)>; #[derive(Parser, Debug)] #[command(author, version, about)] struct Args { /// Path to the underlying raw data. - #[arg(short, long)] + #[arg(long)] input_dir: PathBuf, /// Path to a dhatupatha file (e.g. the one used by vidyut-prakriya) - #[arg(short, long)] + #[arg(long)] dhatupatha: PathBuf, /// Path to the Vidyut output directory. - #[arg(short, long)] + #[arg(long)] output_dir: PathBuf, } /// Defines all of the input data paths we need to construct the FST. pub struct DataPaths { - pub indeclinables: PathBuf, - pub nominal_endings_compounded: PathBuf, - pub nominal_endings_inflected: PathBuf, - pub basic_pratipadikas: PathBuf, - pub irregular_subantas: PathBuf, - pub prefix_groups: PathBuf, - pub prefixed_roots: PathBuf, + pub avyayas: PathBuf, + pub pratipadikas: PathBuf, + pub gati: PathBuf, pub upasarga_dhatus: PathBuf, } impl DataPaths { - pub fn new(base: impl AsRef) -> Self { - let base = base.as_ref(); + fn new(base: &Path) -> Self { DataPaths { - indeclinables: base.join("indeclinables.csv"), - nominal_endings_compounded: base.join("nominal-endings-compounded.csv"), - nominal_endings_inflected: base.join("nominal-endings-inflected.csv"), - basic_pratipadikas: base.join("nominal-stems.csv"), - irregular_subantas: base.join("nominals-irregular.csv"), - prefix_groups: base.join("prefix-groups.csv"), - prefixed_roots: base.join("prefixed-roots.csv"), + avyayas: base.join("indeclinables.csv"), + pratipadikas: base.join("nominal-stems.csv"), + gati: base.join("prefix-groups.csv"), upasarga_dhatus: base.join("upasarga-dhatus.csv"), } } } -/// Creates a collection of (linga, vibhakti, vacana) combinations. 
-fn linga_vibhakti_vacana_options() -> Vec<(vp::Linga, vp::Vibhakti, vp::Vacana)> { +fn sup_options() -> Vec<(Linga, Vibhakti, Vacana)> { let mut ret = Vec::new(); - for linga in vp::Linga::iter() { - for vibhakti in vp::Vibhakti::iter() { - for vacana in vp::Vacana::iter() { + for linga in Linga::iter() { + for vibhakti in Vibhakti::iter() { + for vacana in Vacana::iter() { ret.push((linga, vibhakti, vacana)) } } @@ -93,43 +63,21 @@ fn linga_vibhakti_vacana_options() -> Vec<(vp::Linga, vp::Vibhakti, vp::Vacana)> ret } -/// Creates a collection of common sanAdi combinations. -fn sanadi_options() -> Vec> { - use vp::Sanadi::*; - vec![ - vec![], - vec![Ric], - vec![san], - vec![yaN], - vec![yaNluk], - vec![Ric, san], - vec![san, Ric], - ] -} - -fn tinanta_options() -> Vec<( - vp::Prayoga, - vp::DhatuPada, - vp::Lakara, - vp::Purusha, - vp::Vacana, -)> { +fn tin_options() -> Vec<(Prayoga, Lakara, Purusha, Vacana)> { let mut ret = Vec::new(); - for prayoga in vp::Prayoga::iter() { - for pada in vp::DhatuPada::iter() { - if prayoga == vp::Prayoga::Bhave { - // Duplicates karmani -- skip + for prayoga in Prayoga::iter() { + if prayoga == Prayoga::Bhave { + // Duplicates karmani -- skip + continue; + } + for lakara in Lakara::iter() { + if lakara == Lakara::Let { + // Noisy -- skip continue; } - for lakara in vp::Lakara::iter() { - if lakara == vp::Lakara::Let { - // Experimental -- skip - continue; - } - for purusha in vp::Purusha::iter() { - for vacana in vp::Vacana::iter() { - ret.push((prayoga, pada, lakara, purusha, vacana)); - } + for purusha in Purusha::iter() { + for vacana in Vacana::iter() { + ret.push((prayoga, lakara, purusha, vacana)); } } } @@ -137,366 +85,119 @@ fn tinanta_options() -> Vec<( ret } -fn parse_stem_linga(code: &str) -> &[Linga] { - use Linga::*; - match code { - "m" => &[Pum], - "f" => &[Stri], - "n" => &[Napumsaka], - "mf" => &[Pum, Stri], - "fn" => &[Stri, Napumsaka], - "mn" => &[Pum, Napumsaka], - "mfn" => &[Pum, Stri, Napumsaka], - "none" 
=> &[], - &_ => panic!("Unknown type {}", code), - } -} +fn create_subanta_endings() -> HashMap> { + let mut ret = HashMap::new(); -/// Adds avyayas scraped from the MW dictionary. -fn add_avyayas(path: &Path, padas: &mut PadaVec) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - let pada = r[0].to_string(); - padas.push(( - pada.clone(), - Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Basic { - text: pada, - lingas: Vec::new(), - }, - }), - )); + fn safe(s: &str) -> Slp1String { + Slp1String::from(s).expect("static") } - Ok(()) -} -// Adds irregular subantas specified manually. -// -// TODO: can we deprecate this given vidyut-prakriya? -fn add_irregular_subantas(path: &Path, padas: &mut PadaVec) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - let pratipadika = r[0].to_string(); - let stem_lingas = parse_stem_linga(&r[1]); - let pada = r[2].to_string(); - let linga = r[3].parse().ok(); - let vibhakti = r[4].parse().ok(); - let vacana = r[5].parse().ok(); - - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: pratipadika.clone(), - lingas: stem_lingas.to_vec(), - }, - linga, - vibhakti, - vacana, - is_purvapada: false, - }); - - padas.push((pada.clone(), semantics)); + fn mula(s: &str, gana: Gana) -> Dhatu { + Dhatu::mula(safe(s), gana) } - // `mahA` is common but missing upstream, so add it specially. - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: "mahat".to_string(), - lingas: vec![Linga::Pum, Linga::Stri, Linga::Napumsaka], - }, - linga: None, - vibhakti: None, - vacana: None, - is_purvapada: true, - }); - padas.push(("mahA".to_string(), semantics)); - - Ok(()) -} - -/// Add simple pratipadikas scraped from the MW dictionary. -/// -/// TODO: deduplicate with our krdantas, etc. 
-fn add_basic_pratipadikas(path: &Path, stems: &mut StemVec) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - let stem = r[0].to_string(); - let lingas = parse_stem_linga(&r[1]); - let semantics = Pratipadika::Basic { - text: stem.clone(), - lingas: lingas.to_vec(), - }; - stems.push((stem, semantics)); - } - Ok(()) -} - -/// Adds various common prefix groups. -/// -/// TODO: this doesn't make sense. We aren't storing the split prefixes anywhere ... -fn add_prefix_groups(path: &Path, padas: &mut PadaVec) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - let value = &r[0]; - // FIXME: consider deleting this logic. - let semantics = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Basic { - text: value.to_string(), - lingas: Vec::new(), - }, - }); - padas.push((value.to_string(), semantics)); - } - Ok(()) -} - -/// TODO: delete this after migrating to vidyut-prakriya for everything. 
-fn read_sup_endings(paths: &DataPaths) -> Result { - let mut endings = SupVec::new(); - - let mut rdr = csv::Reader::from_path(&paths.nominal_endings_compounded)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - let stem = r[0].to_string(); - let stem_lingas = parse_stem_linga(&r[1]); - let ending = r[2].to_string(); - let ending_linga = r[3].parse()?; - - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: stem.clone(), - lingas: stem_lingas.to_vec(), - }, - linga: Some(ending_linga), - vibhakti: None, - vacana: None, - is_purvapada: true, - }); - endings.push((ending, stem, semantics)); - } - - let mut rdr = csv::Reader::from_path(&paths.nominal_endings_inflected)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - - let stem = r[0].to_string(); - let ending = r[2].to_string(); - let linga = r[3].parse()?; - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: stem.clone(), - lingas: vec![linga], - }, - linga: Some(linga), - vibhakti: r[4].parse().ok(), - vacana: r[5].parse().ok(), - is_purvapada: false, - }); - endings.push((ending, stem, semantics)); - } - - Ok(endings) -} - -/// Create spelling variants for the given stems. -fn get_variants(text: &str) -> Vec { - lazy_static! 
{ - static ref PREFIXES: Vec<(String, String)> = vec![ - ("saMp".to_string(), "samp".to_string()), - ("saMb".to_string(), "samb".to_string()), - ("saMB".to_string(), "samB".to_string()), - ("samp".to_string(), "saMp".to_string()), - ("samb".to_string(), "saMb".to_string()), - ("samB".to_string(), "saMB".to_string()), - ]; - } - - let mut variants = vec![]; - for (old, new) in PREFIXES.iter() { - if text.starts_with(old) { - variants.push(text.replace(old, new)); - } - } - - if text.contains("ttr") { - variants.push(text.replace("ttr", "tr")); - } - variants -} - -fn read_stems(paths: &DataPaths) -> Result { - let mut stems = StemVec::new(); - add_basic_pratipadikas(&paths.basic_pratipadikas, &mut stems)?; - - // Add simple support for variants. - let mut variants = StemVec::new(); - for (k, v) in &stems { - for k_variant in get_variants(&k) { - variants.push((k_variant.clone(), v.clone())); - } - } - stems.extend(variants); - - Ok(stems) -} - -fn read_padas(paths: &DataPaths) -> Result { - let mut padas = PadaVec::with_capacity(20_000_000); - add_avyayas(&paths.indeclinables, &mut padas).expect("Could not find indeclinables"); - add_prefix_groups(&paths.prefix_groups, &mut padas).expect("Could not find prefix groups"); - add_irregular_subantas(&paths.irregular_subantas, &mut padas) - .expect("Could not find irregular subantas"); - - let mut variants = PadaVec::new(); - for (k, v) in &padas { - for k_variant in get_variants(&k) { - variants.push((k_variant.clone(), v.clone())); + let havis = Krdanta::new(mula("hu\\", Gana::Juhotyadi), Unadi::isi); + let dhanus = Krdanta::new(mula("Dana~\\", Gana::Juhotyadi), Unadi::usi); + let bhagavat = Taddhitanta::new(Pratipadika::basic(safe("Baga")), Taddhita::matup); + let hanumat = Taddhitanta::new(Pratipadika::basic(safe("hanu")), Taddhita::matup); + let mahat = Krdanta::new(mula("maha~", Gana::Bhvadi), Unadi::ati).with_require("mahat"); + + let pratipadikas: &[(&str, &str, Pratipadika)] = &[ + ("is", "havis", havis.into()), 
+ ("us", "Danus", dhanus.into()), + ("vat", "Bagavat", bhagavat.into()), + ("mat", "hanumat", hanumat.into()), + ("mahat", "mahat", mahat.into()), + ]; + let lvv = sup_options(); + + let v = create_vyakarana(); + for (ending_type, sample, phit) in pratipadikas { + let mut endings = Vec::new(); + for (linga, vibhakti, vacana) in lvv.iter().copied() { + let sup = Subanta::new(phit.clone(), linga, vibhakti, vacana); + let prakriyas = v.derive_subantas(&sup); + for p in prakriyas { + let text = p.text(); + let offset = sample.len() - ending_type.len(); + let ending = &text[offset..]; + endings.push((ending.to_string(), linga, vibhakti, vacana)); + } } + ret.insert(ending_type.to_string(), endings); } - padas.extend(variants); - Ok(padas) + ret } -fn inflect_halanta_stem(stem: &str, sup: &str) -> String { - if sup.starts_with(is_ac) { - String::from(stem) + sup - } else { - let n = stem.len(); - let prefix = &stem[..n - 1]; - let stem_ending = &stem[n - 1..n]; - - let stem_ending = match stem_ending { - "k" | "K" | "g" | "G" => "k", - "c" | "C" | "j" | "J" => "k", - "w" | "W" | "q" | "Q" => "w", - "t" | "T" | "d" | "D" => "t", - "p" | "P" | "b" | "B" => "p", - _ => stem_ending, - }; - let stem_ending = if sup.starts_with(is_ghosha) { - match stem_ending { - "k" => "g", - "w" => "q", - "t" => "d", - "p" => "b", - _ => stem_ending, - } - } else { - stem_ending - }; - - String::from(prefix) + stem_ending + sup - } +/// Creates a preconfigured Vyakarana instance. +fn create_vyakarana() -> Vyakarana { + Vyakarana::builder() + .log_steps(false) + .nlp_mode(true) + .is_chandasi(true) + .build() } -// Generates all nominal padas and adds them to the pada map. 
-fn add_nominals(stems: &StemVec, endings: &SupVec, padas: &mut PadaVec) { - let mut stem_to_endings = HashMap::new(); - for (ending, stem, semantics) in endings { - if !stem_to_endings.contains_key(stem) { - let stem = stem.clone(); - stem_to_endings.insert(stem, vec![]); +/// Creates all standard combinations of (upasarga x dhatu x sanadi) +fn create_all_dhatus( + dhatupatha_path: &Path, + upasarga_dhatu_path: &Path, +) -> Result> { + let sanadis = { + use Sanadi::*; + vec![ + vec![], + vec![Ric], + vec![san], + vec![yaN], + vec![yaNluk], + vec![Ric, san], + vec![san, Ric], + ] + }; + + // Load mula dhatus and the upasarga combinations they support. + let dhatupatha = Dhatupatha::from_path(&dhatupatha_path)?; + let mut upasarga_dhatus: UpasargaDhatuMap = HashMap::new(); + { + let mut rdr = csv::Reader::from_path(upasarga_dhatu_path)?; + for maybe_row in rdr.records() { + let r = maybe_row?; + let upasargas: Vec<_> = r[0].split("-").map(|x| x.to_string()).collect(); + let code = r[2].to_string(); + // the empty Vec is for the default case (no prefixes). + upasarga_dhatus + .entry(code) + .or_insert(vec![Vec::new()]) + .push(upasargas); } - stem_to_endings - .get_mut(stem) - .unwrap() - .push((ending.clone(), semantics.clone())); } - // For all stems, ... - for (stem_text, stem_semantics) in stems { - let mut was_inserted = false; - - // And all stem endings ... - for (stem_ending, sup_pratyayas) in stem_to_endings.iter() { - // If the stem ends in this ending ... - if let Some(prefix) = stem_text.strip_suffix(stem_ending) { - // Then for all pratyayas that the ending allows, ... - for (sup_text, sup_semantics) in sup_pratyayas { - let pada_text = prefix.to_string() + sup_text; - - if let Pada::Subanta(sup_semantics) = sup_semantics { - // Create and insert the corresponding pada. 
- let pada_semantics = Pada::Subanta(Subanta { - pratipadika: stem_semantics.clone(), - ..sup_semantics.clone() - }); - padas.push((pada_text.clone(), pada_semantics)); - } - } - was_inserted = true; - } - } - - if !was_inserted { - // If the stem is a special consonant ending ... - if is_hal(stem_text.chars().last().unwrap()) { - let pratyayas = stem_to_endings - .get("_") - .expect("`_` ending should be defined"); - for (sup_text, sup_semantics) in pratyayas { - let pada_text = inflect_halanta_stem(stem_text, sup_text); - - if let Pada::Subanta(sup_semantics) = sup_semantics { - // Create and insert the corresponding pada. - let pada_semantics = Pada::Subanta(Subanta { - pratipadika: stem_semantics.clone(), - ..sup_semantics.clone() - }); - padas.push((pada_text.clone(), pada_semantics)); - } + // Create the final list of dhatus. + let v = Vyakarana::new(); + let mut ret = Vec::new(); + let new = Vec::new(); + for entry in &dhatupatha { + let upasarga_groups = upasarga_dhatus.get(entry.code()).unwrap_or(&new); + + for sanadi in &sanadis { + for prefixes in upasarga_groups { + let dhatu = entry + .dhatu() + .clone() + .with_sanadi(&sanadi) + .with_prefixes(prefixes); + let prakriyas = v.derive_dhatus(&dhatu); + if let Some(p) = prakriyas.iter().next() { + // Add valid dhatus only. + ret.push(DhatuEntry::new(dhatu, p.text())); } } } } -} - -fn create_sarvanamas(padas: &mut PadaVec) { - // Data copied from vidyut-prakriya. - const SARVANAMA: &[&str] = &[ - // qatara, qatama - // TODO: actually detect qatarac/qatamac in vidyut-prakriya. 
- "katara", "yatara", "tatara", "ekatara", "katama", "yatama", "tatama", "ekatama", - // sarvAdi - "sarva", "viSva", "uBa", "uBaya", "qatara", "qatama", "anya", "anyatara", "itara", "tvat", - "tva", "nema", "sama", "sima", "pUrva", "para", "avara", "dakziRa", "uttara", "apara", - "aDara", "sva", "antara", "tyad", "tad", "yad", "etad", "idam", "adas", "eka", "dvi", - "yuzmad", "asmad", "Bavatu~", "kim", - ]; - - let linga_vibhakti_vacana = linga_vibhakti_vacana_options(); - let v = Vyakarana::builder() - .log_steps(false) - .is_chandasi(true) - .build(); - for stem in SARVANAMA { - let prati = vp::Pratipadika::basic(stem); - let lingas = vec![Linga::Pum, Linga::Stri, Linga::Napumsaka]; - - for (linga, vibhakti, vacana) in linga_vibhakti_vacana.iter().copied() { - let args = vp::Subanta::new(prati.clone(), linga, vibhakti, vacana); - let prakriyas = v.derive_subantas(&args); - for p in prakriyas { - let morph = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: stem.to_string(), - lingas: lingas.clone(), - }, - linga: Some(Linga::from(linga)), - vibhakti: Some(Vibhakti::from(vibhakti)), - vacana: Some(Vacana::from(vacana)), - is_purvapada: false, - }); - let text = p.text(); - padas.push((text, morph)); - } - } - } + Ok(ret) } /// Creates all tinantas. 
@@ -509,75 +210,33 @@ fn create_sarvanamas(padas: &mut PadaVec) { /// - `dhatu` comes from the Dhatupatha on ashtadhyayi.com /// /// TODO: gati, cvi -fn create_tinantas( - entries: &Vec, - upasarga_dhatus: &UpasargaDhatuMap, - padas: &mut PadaVec, -) { - let all_sanadis = sanadi_options(); - let args = tinanta_options(); - - let v = Vyakarana::builder() - .log_steps(false) - .is_chandasi(true) - .build(); +fn create_all_tinantas(builder: &Builder, all_dhatus: &[DhatuEntry]) -> Padas { + let plpv = tin_options(); + let v = create_vyakarana(); - let results: Vec<_> = entries - .par_iter() - .flat_map(|entry| { - let new = Vec::new(); - let upasarga_groups = upasarga_dhatus.get(entry.code()).unwrap_or(&new); + all_dhatus + .par_chunks(all_dhatus.len() / 100) + .flat_map(|chunk| { let mut ret = Vec::new(); - for group in upasarga_groups { - for sanadi in &all_sanadis { - let dhatu = entry - .dhatu() - .clone() - .with_sanadi(sanadi) - .with_prefixes(group); - - for (prayoga, dhatu_pada, lakara, purusha, vacana) in &args { - let args = vp::Tinanta::builder() - .dhatu(dhatu.clone()) - .prayoga(*prayoga) - .pada(*dhatu_pada) - .lakara(*lakara) - .purusha(*purusha) - .vacana(*vacana) - .build() - .expect("ok"); - - let pada_prayoga = match (dhatu_pada, prayoga) { - (vp::DhatuPada::Parasmai, _) => PadaPrayoga::Parasmaipada, - (vp::DhatuPada::Atmane, vp::Prayoga::Kartari) => { - PadaPrayoga::AtmanepadaKartari - } - (vp::DhatuPada::Atmane, _) => PadaPrayoga::AtmanepadaNotKartari, - }; - - let prakriyas = v.derive_tinantas(&args); - ret.extend(prakriyas.iter().map(|prakriya| { - let text = prakriya.text(); - let semantics = Pada::Tinanta(Tinanta { - dhatu: dhatu.clone().into(), - purusha: Purusha::from(*purusha), - vacana: Vacana::from(*vacana), - lakara: Lakara::from(*lakara), - pada: PadaPrayoga::from(pada_prayoga), - }); - - (text, semantics) - })); + for entry in chunk { + let dhatu = entry.dhatu().clone(); + for (prayoga, lakara, purusha, vacana) in plpv.iter().copied() { 
+ let args = Tinanta::new(dhatu.clone(), prayoga, lakara, purusha, vacana); + + let prakriyas = v.derive_tinantas(&args); + for prakriya in prakriyas { + let text = prakriya.text(); + let semantics = PadaEntry::Tinanta(args.clone()); + let packed_semantics = builder.pack(&semantics).expect("ok"); + ret.push((text, packed_semantics)) } } } ret.into_par_iter() }) - .collect(); - - padas.extend(results); + .collect() } /// Creates all krdantas that form nominals. @@ -590,256 +249,418 @@ fn create_tinantas( /// - `dhatu` comes from the Dhatupatha on ashtadhyayi.com /// /// TODO: gati, cvi -fn create_inflected_krdantas( - entries: &Vec, - upasarga_dhatus: &UpasargaDhatuMap, - padas: &mut PadaVec, -) { - use vp::BaseKrt as VKrt; - - let linga_vibhakti_vacana = linga_vibhakti_vacana_options(); - let all_sanadis = sanadi_options(); - let all_krts = &[ +fn create_inflected_krt_subantas(builder: &mut Builder, all_dhatus: &[DhatuEntry]) -> Padas { + use BaseKrt as K; + const ALL_KRTS: &[K] = &[ // Lit - VKrt::kvasu, - VKrt::kAnac, + K::kvasu, + K::kAnac, // nistha - VKrt::kta, - VKrt::ktavatu, + K::kta, + K::ktavatu, // Lat - VKrt::Satf, - VKrt::SAnac, + K::Satf, + K::SAnac, // krtya - VKrt::yat, - VKrt::Ryat, - VKrt::kyap, - VKrt::tavya, - VKrt::anIyar, + K::yat, + K::Ryat, + K::kyap, + K::tavya, + K::anIyar, // Common - VKrt::Rvul, - VKrt::lyuw, - VKrt::tfc, - // TODO: all all of the others, including unadis. + K::Rvul, + K::lyuw, + K::tfc, + // TODO: add all of the others, including unadis. 
]; - let sat_pratyayas = &[VKrt::Satf, VKrt::SAnac]; + let v = create_vyakarana(); + let mut all_krdantas: Vec = Vec::new(); + { + for entry in all_dhatus { + let dhatu = entry.dhatu(); + for krt in ALL_KRTS.iter().copied() { + let prayoga_lakara: &[(Option, Option)] = match krt { + K::Satf | K::SAnac => &[ + (Some(Prayoga::Kartari), Some(Lakara::Lat)), + (Some(Prayoga::Kartari), Some(Lakara::Lrt)), + (Some(Prayoga::Karmani), Some(Lakara::Lat)), + (Some(Prayoga::Karmani), Some(Lakara::Lrt)), + ], + _ => &[(None, None)], + }; + + for (prayoga, lakara) in prayoga_lakara.iter().copied() { + let mut builder = Krdanta::builder().dhatu(dhatu.clone()).krt(krt); + if let (Some(prayoga), Some(lakara)) = (prayoga, lakara) { + builder = builder.prayoga(prayoga).lakara(lakara); + } + let krdanta = builder.build().expect("ok"); - let v = Vyakarana::builder() - .log_steps(false) - .is_chandasi(true) - .build(); + // Keep only krdantas that are morphologically valid to avoid filling + // `register_pratipadikas` with junk. 
+ let prakriyas = v.derive_krdantas(&krdanta); + if !prakriyas.is_empty() { + all_krdantas.push(PratipadikaEntry::new(krdanta.into(), vec![])); + } + } + } + } + builder.register_pratipadikas(&all_krdantas); + } + info!("- Created {} krdanta pratipadikas.", all_krdantas.len()); - let results: Vec<_> = entries - .par_iter() - .flat_map(|entry| { - let new = Vec::new(); - let upasarga_groups = upasarga_dhatus.get(entry.code()).unwrap_or(&new); + let lvv = sup_options(); + all_krdantas + .par_chunks(all_krdantas.len() / 100) + .flat_map(|chunk| { let mut ret = Vec::new(); - for group in upasarga_groups { - for sanadi in &all_sanadis { - let dhatu = entry - .dhatu() - .clone() - .with_sanadi(sanadi) - .with_prefixes(group); - - for krt in all_krts { - let krdanta = vp::Krdanta::builder() - .dhatu(dhatu.clone()) - .krt(*krt) - .build() - .expect("ok"); - - for (linga, vibhakti, vacana) in &linga_vibhakti_vacana { - let args = - vp::Subanta::new(krdanta.clone(), *linga, *vibhakti, *vacana); - - let prakriyas = v.derive_subantas(&args); - ret.extend(prakriyas.iter().map(|p| { - let text = p.text(); - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Krdanta { - dhatu: dhatu.clone().into(), - krt: Krt::new(*krt), - }, - linga: Some(Linga::from(*linga)), - vibhakti: Some(Vibhakti::from(*vibhakti)), - vacana: Some(Vacana::from(*vacana)), - is_purvapada: false, - }); - - (text, semantics) - })); - } - } + for entry in chunk { + let krdanta = entry.pratipadika(); + for (linga, vibhakti, vacana) in lvv.iter().copied() { + let args = Subanta::new(krdanta.clone(), linga, vibhakti, vacana); - // lrt-sat (karizyan, karizyamARaH, ...) 
- for krt in sat_pratyayas { - for (linga, vibhakti, vacana) in &linga_vibhakti_vacana { - let krdanta = vp::Krdanta::builder() - .dhatu(dhatu.clone()) - .lakara(vp::Lakara::Lrt) - .krt(VKrt::Satf) - .build() - .expect("ok"); - - let args = - vp::Subanta::new(krdanta.clone(), *linga, *vibhakti, *vacana); - - let prakriyas = v.derive_subantas(&args); - ret.extend(prakriyas.iter().map(|p| { - let text = p.text(); - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Krdanta { - dhatu: dhatu.clone().into(), - krt: Krt::new(*krt), - }, - linga: Some(Linga::from(*linga)), - vibhakti: Some(Vibhakti::from(*vibhakti)), - vacana: Some(Vacana::from(*vacana)), - is_purvapada: false, - }); - - (text, semantics) - })); - } + let prakriyas = v.derive_subantas(&args); + for prakriya in &prakriyas { + let text = prakriya.text(); + + let subanta = Subanta::new(krdanta.clone(), linga, vibhakti, vacana); + let pada = PadaEntry::Subanta(subanta.into()); + let packed_semantics = builder.pack(&pada).expect("valid"); + ret.push((text, packed_semantics)) } } } ret.into_par_iter() }) - .collect(); + .collect() +} + +fn create_avyaya_krt_subantas(builder: &mut Builder, all_dhatus: &[DhatuEntry]) -> Padas { + use BaseKrt as K; + const AVYAYA_KRTS: &[K] = &[K::ktvA, K::tumun]; + + let mut all_krdantas: Vec = Vec::new(); + { + for entry in all_dhatus { + let dhatu = entry.dhatu(); + for krt in AVYAYA_KRTS.iter().copied() { + let krdanta = Krdanta::new(dhatu.clone(), krt); + let entry = PratipadikaEntry::new(krdanta.into(), vec![]); + all_krdantas.push(entry); + } + } + builder.register_pratipadikas(&all_krdantas); + } + info!( + "- Created {} krdanta avyaya pratipadikas.", + all_krdantas.len() + ); + + let v = create_vyakarana(); + all_krdantas + .par_chunks(all_krdantas.len() / 100) + .flat_map(|chunk| { + let mut ret = Vec::new(); - padas.extend(results); + for entry in chunk { + let krdanta = entry.pratipadika(); + let args = Subanta::avyaya(krdanta.clone()); + + let 
prakriyas = v.derive_subantas(&args); + for prakriya in &prakriyas { + let text = prakriya.text(); + let semantics = PadaEntry::Avyaya(SubantaEntry::new( + Subanta::avyaya(krdanta.clone()), + PratipadikaEntry::new(krdanta.clone(), vec![]), + )); + let packed_semantics = builder.pack(&semantics).expect("valid"); + ret.push((text, packed_semantics)) + } + } + + ret.into_par_iter() + }) + .collect() } -/// Creates all krdantas that form avyayas. -/// -/// This function generates the following combinations: -/// -/// (upasarga, dhatu, sanadi, krt) -/// -/// - `upasarga` comes from the Upasargartha-candrika. -/// - `dhatu` comes from the Dhatupatha on ashtadhyayi.com -/// -/// TODO: gati, cvi -fn create_avyaya_krdantas( - entries: &Vec, - upasarga_dhatus: &UpasargaDhatuMap, - padas: &mut PadaVec, -) { - let all_sanadis = sanadi_options(); - let all_krts = &[vp::BaseKrt::ktvA, vp::BaseKrt::tumun]; - - let v = Vyakarana::builder() - .log_steps(false) - .is_chandasi(true) - .build(); +fn read_basic_pratipadikas(builder: &mut Builder, path: &Path) -> Result { + let mut ret = Pratipadikas::new(); + + fn parse_stem_linga(code: &str) -> &[Linga] { + use Linga::*; + match code { + "m" => &[Pum], + "f" => &[Stri], + "n" => &[Napumsaka], + "mf" => &[Pum, Stri], + "fn" => &[Stri, Napumsaka], + "mn" => &[Pum, Napumsaka], + "mfn" => &[Pum, Stri, Napumsaka], + "none" => &[], + &_ => panic!("Unknown type {}", code), + } + } - let results: Vec<_> = entries - .par_iter() - .flat_map(|entry| { - let new = Vec::new(); - let upasarga_groups = upasarga_dhatus.get(entry.code()).unwrap_or(&new); + let mut rdr = csv::Reader::from_path(path)?; + for maybe_row in rdr.records() { + let r = maybe_row?; + // Weird '|' characters in input, all seem to be SLP1 `Q`. + let stem = r[0].to_string().replace('|', "Q"); + // Skip `L` for now. 
+ if stem.contains('L') { + continue; + } + + let stem = Slp1String::from(stem).expect("ok"); + let lingas = parse_stem_linga(&r[1]); + let pratipadika = + if lingas == &[Linga::Stri] && (stem.ends_with('A') || stem.ends_with('I')) { + // senA, devI, ... + Pratipadika::nyap(stem) + } else { + Pratipadika::basic(stem) + }; + let entry = PratipadikaEntry::new(pratipadika, lingas.to_vec()); + ret.push((r[0].to_string(), entry)); + } + + let sankhyas = &[ + "saptan", + "azwan", + "navan", + "daSan", + "ekAdaSan", + "dvAdaSan", + "trayodaSan", + "caturdaSan", + "paYcadaSan", + "zoqaSan", + "saptadaSan", + "azwAdaSan", + ]; + for s in sankhyas { + let safe = Slp1String::from(s).expect("ok"); + let entry = PratipadikaEntry::new(Pratipadika::basic(safe), Vec::new()); + ret.push((s.to_string(), entry)); + } + + for (_, entry) in &ret { + builder.register_pratipadikas(&[entry.clone()]); + } + + Ok(ret) +} + +fn create_basic_subantas(builder: &Builder, all_pratipadikas: &Pratipadikas) -> Padas { + let v = create_vyakarana(); + let lvv = sup_options(); + + // For pratipadikas that are tedious to construct, e.g. havis, Danus, ... 
+ let ending_table = create_subanta_endings(); + + all_pratipadikas + .par_chunks(all_pratipadikas.len() / 100) + .flat_map(|chunk| { let mut ret = Vec::new(); - for group in upasarga_groups { - for sanadi in &all_sanadis { - let dhatu = entry - .dhatu() - .clone() - .with_sanadi(sanadi) - .with_prefixes(group); - for krt in all_krts { - let args = vp::Krdanta::builder() - .dhatu(dhatu.clone().with_sanadi(sanadi)) - .krt(*krt) - .build() - .expect("ok"); - - let prakriyas = v.derive_krdantas(&args); - ret.extend(prakriyas.iter().map(|p| { - let text = p.text(); - let semantics = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Krdanta { - dhatu: dhatu.clone().into(), - krt: Krt::new(*krt), - }, - }); - - (text, semantics) - })); + for (_, entry) in chunk { + let prati = entry.pratipadika(); + let b = match prati { + Pratipadika::Basic(b) => b, + _ => continue, + }; + + // Shortcut logic for tricky pratipadikas (aunadika, taddhitanta, ...) + let mut used_shortcut = false; + for (ending_type, endings) in ending_table.iter() { + if b.text().ends_with(ending_type) { + let offset = b.text().len() - ending_type.len(); + // For tricky pratipadikas that aren't fully supported yet. + let prefix = &b.text()[..offset]; + + for (ending, linga, vibhakti, vacana) in endings { + let args = Subanta::new(prati.clone(), *linga, *vibhakti, *vacana); + let entry = PadaEntry::Subanta(args.into()); + let packed_semantics = builder.pack(&entry).expect("valid"); + + let key = prefix.to_string() + ending; + ret.push((key, packed_semantics)); + } + used_shortcut = true; + break; } } - } + if used_shortcut { + continue; + } + // For all other pratipadikas. Ideally, this should be the branch we use for + // all subantas. 
+ for (linga, vibhakti, vacana) in lvv.iter().copied() { + let args = Subanta::new(prati.clone(), linga, vibhakti, vacana); + + let prakriyas = v.derive_subantas(&args); + for prakriya in &prakriyas { + let text = prakriya.text(); + let entry = PadaEntry::Subanta(args.clone().into()); + let packed_semantics = builder.pack(&entry).expect("valid"); + ret.push((text, packed_semantics)) + } + } + } ret.into_par_iter() }) - .collect(); - - padas.extend(results); + .collect() } -/// Maps a dhatu code (e.g. "01.0001") to all lists of prefixes it might take. -fn parse_upasarga_dhatus(path: &Path) -> Result { +fn read_basic_avyayas(builder: &mut Builder, path: &Path) -> Result { + let mut ret = Pratipadikas::new(); + let mut rdr = csv::Reader::from_path(path)?; - let mut ret: UpasargaDhatuMap = HashMap::new(); for maybe_row in rdr.records() { let r = maybe_row?; - let upasargas: Vec<_> = r[0].split("-").map(|x| x.to_string()).collect(); - let code = r[2].to_string(); - // the empty Vec is for the default case (no prefixes). - ret.entry(code).or_insert(vec![Vec::new()]).push(upasargas); + // Weird '|' characters in input, all seem to be SLP1 `Q`. + let stem = r[0].to_string().replace('|', "Q"); + // Skip `L` for now. + if stem.contains('L') { + continue; + } + + let stem = Slp1String::from(stem).expect("ok"); + let pratipadika = Pratipadika::basic(stem.clone()); + let entry = PratipadikaEntry::new(pratipadika, vec![]); + ret.push((r[0].to_string(), entry)); + } + + for (_, value) in &ret { + builder.register_pratipadikas(&[value.clone()]); } Ok(ret) } -fn run(args: Args) -> Result<()> { - info!("Reading linguistic data ..."); +/// Adds avyayas scraped from the MW dictionary. 
+fn create_avyayas(builder: &Builder, avyaya_pratipadikas: &Pratipadikas) -> Padas { + let v = create_vyakarana(); + avyaya_pratipadikas + .par_chunks(avyaya_pratipadikas.len() / 100) + .flat_map(|chunk| { + let mut ret = Vec::new(); - let data_paths = DataPaths::new(Path::new(&args.input_dir)); - let dhatupatha = Dhatupatha::from_path(&args.dhatupatha)?; - // let dhatu_entries: Vec = dhatupatha.into_iter().take(200).collect(); - let dhatu_entries: Vec = dhatupatha.into_iter().collect(); + for (_, entry) in chunk { + let prati = entry.pratipadika(); + let args = Subanta::new(prati.clone(), Linga::Pum, Vibhakti::Prathama, Vacana::Eka); - let mut padas = read_padas(&data_paths)?; + let prakriyas = v.derive_subantas(&args); + for prakriya in &prakriyas { + let text = prakriya.text(); + let entry = PadaEntry::Avyaya(args.clone().into()); + let packed_semantics = builder.pack(&entry).expect("valid"); + ret.push((text, packed_semantics)) + } + } + ret.into_par_iter() + }) + .collect() +} - info!("Creating tinantas ..."); - let upasarga_dhatus = parse_upasarga_dhatus(&data_paths.upasarga_dhatus)?; - create_tinantas(&dhatu_entries, &upasarga_dhatus, &mut padas); +fn read_gatis(builder: &mut Builder, path: &Path) -> Result { + let mut ret = Pratipadikas::new(); - info!("Creating krdantas (inflected) ..."); - create_inflected_krdantas(&dhatu_entries, &upasarga_dhatus, &mut padas); + let mut rdr = csv::Reader::from_path(path)?; + for maybe_row in rdr.records() { + let r = maybe_row?; + // Weird '|' characters in input, all seem to be SLP1 `Q`. + let stem = r[0].to_string().replace('|', "Q"); + // Skip `L` for now. + if stem.contains('L') { + continue; + } - info!("Creating krdantas (avyaya) ..."); - create_avyaya_krdantas(&dhatu_entries, &upasarga_dhatus, &mut padas); + // Keep only gati. + if !matches!(stem.chars().last().expect("present"), 'A' | 'I' | 'U') { + continue; + } + // Avoid prefix chains and upasargas. 
+ if r[1].contains('-') || &r[0] == "A" { + continue; + } - info!("Creating plain subantas ..."); - create_sarvanamas(&mut padas); + let stem = Slp1String::from(stem).expect("ok"); + let pratipadika = Pratipadika::basic(stem.clone()); + let entry = PratipadikaEntry::new(pratipadika, vec![]); + ret.push((r[0].to_string(), entry)); + } + for (_, value) in &ret { + builder.register_pratipadikas(&[value.clone()]); + } - let stems = read_stems(&data_paths)?; - let endings = read_sup_endings(&data_paths)?; - add_nominals(&stems, &endings, &mut padas); + Ok(ret) +} + +fn run(args: Args) -> Result<()> { + let paths = DataPaths::new(&args.input_dir); + + info!("Generating words."); + let mut padas = Vec::new(); + let mut builder = Builder::new(&args.output_dir)?; + { + let all_dhatus = create_all_dhatus(&args.dhatupatha, &paths.upasarga_dhatus)?; + builder.register_dhatus(&all_dhatus); + info!("- Created {} dhatus.", all_dhatus.len()); + + let all_tinantas = create_all_tinantas(&builder, &all_dhatus); + info!("- Created {} tinantas.", all_tinantas.len()); + padas.extend(all_tinantas); + + let all_krt_subantas = create_inflected_krt_subantas(&mut builder, &all_dhatus); + info!( + "- Created {} inflected krdanta padas.", + all_krt_subantas.len() + ); + padas.extend(all_krt_subantas); + + let all_krt_avyayas = create_avyaya_krt_subantas(&mut builder, &all_dhatus); + info!("- Created {} avyaya krdanta padas.", all_krt_avyayas.len()); + padas.extend(all_krt_avyayas); + + let basic_pratipadikas = read_basic_pratipadikas(&mut builder, &paths.pratipadikas)?; + info!("- Loaded {} basic pratipadikas.", basic_pratipadikas.len()); + + let basic_subantas = create_basic_subantas(&builder, &basic_pratipadikas); + info!("- Created {} basic subantas.", basic_subantas.len()); + padas.extend(basic_subantas); + + let basic_avyayas = read_basic_avyayas(&mut builder, &paths.avyayas)?; + info!("- Loaded {} basic avyayas.", basic_avyayas.len()); + let basic_avyayas = create_avyayas(&builder, 
&basic_avyayas); + info!("- Created {} basic avyayas.", basic_avyayas.len()); + padas.extend(basic_avyayas); + + let gati = read_gatis(&mut builder, &paths.gati)?; + info!("- Loaded {} gati prefixes.", gati.len()); + let gati = create_avyayas(&builder, &gati); + info!("- Created {} gati prefixes.", gati.len()); + padas.extend(gati); + } - info!("Sorting keys ..."); + info!("Sorting keys."); padas.par_sort(); - info!("Inserting entries ..."); - let config = Config::new(&args.output_dir); - let mut builder = Builder::new(config.kosha())?; - let mut num_words = 0; - for (key, pada) in padas { - builder.insert(&key, &pada)?; - num_words += 1; + info!("Inserting entries."); + let mut num_entries = 0; + for (key, packed_pada) in padas { + builder.insert_packed(&key, &packed_pada)?; + num_entries += 1; } - info!("Finishing build ..."); + info!("Finishing build."); builder.finish()?; - info!("Complete. (Inserted {num_words} entries.)"); + info!("Complete. (Inserted {num_entries} entries.)"); Ok(()) } diff --git a/src/bin/eval_cheda.rs b/src/bin/eval_cheda.rs index 029e096..075e774 100644 --- a/src/bin/eval_cheda.rs +++ b/src/bin/eval_cheda.rs @@ -9,9 +9,10 @@ use vidyut_cheda::conllu::Reader; use vidyut_cheda::dcs; use vidyut_cheda::Result; use vidyut_cheda::{Chedaka, Config, Token}; -use vidyut_kosha::morph::*; +use vidyut_kosha::entries::*; use vidyut_lipi::{transliterate, Mapping, Scheme}; use vidyut_prakriya::args as vp; +use vidyut_prakriya::args::Pratipadika; #[derive(Parser, Debug)] #[command(author, version, about)] @@ -71,26 +72,24 @@ fn to_slp1(text: &str) -> String { /// Vidyut semantics and DCS semantics into a coarser space. 
fn as_code(w: &Token) -> String { match &w.info { - Pada::Subanta(s) => { - format!( - "n-{}-{}-{}", - s.linga.map_or("", |x| x.as_str()), - s.vibhakti.map_or("", |x| x.as_str()), - s.vacana.map_or("", |x| x.as_str()) - ) + PadaEntry::Subanta(s) => { + let s = s.subanta(); + format!("n-{}-{}-{}", s.linga(), s.vibhakti(), s.vacana(),) } - Pada::Tinanta(s) => { - format!("v-{}-{}", s.purusha.as_str(), s.vacana.as_str()) + PadaEntry::Tinanta(s) => { + format!("v-{}-{}", s.purusha().as_str(), s.vacana().as_str()) } - Pada::Unknown => "_".to_string(), - Pada::Avyaya(a) => { - let val = match &a.pratipadika { - Pratipadika::Basic { .. } => "i", - Pratipadika::Krdanta { krt, .. } => match krt.value() { + PadaEntry::Unknown => "_".to_string(), + PadaEntry::Avyaya(a) => { + let a = a.subanta(); + let val = match &a.pratipadika() { + Pratipadika::Basic(_) => "i", + Pratipadika::Krdanta(k) => match k.krt() { vp::Krt::Base(vp::BaseKrt::ktvA) => "ktva", vp::Krt::Base(vp::BaseKrt::tumun) => "tumun", _ => "_", }, + _ => "i", }; val.to_string() } diff --git a/src/bin/test_kosha.rs b/src/bin/test_kosha.rs index 6e9cbbf..e57e5ba 100644 --- a/src/bin/test_kosha.rs +++ b/src/bin/test_kosha.rs @@ -3,7 +3,7 @@ use clap::Parser; use std::path::PathBuf; use vidyut_cheda::Result; -use vidyut_kosha::morph::Pada; +use vidyut_kosha::entries::PadaEntry; use vidyut_kosha::Kosha; #[derive(Parser, Debug)] @@ -87,7 +87,7 @@ fn test_tinantas(k: &Kosha) -> Result<()> { // Other tricky tinantas "saMskaroti", "saYcaskAra", - "saYcaskrire", + "saYcaskarire", ]; let mut i = 0; @@ -108,7 +108,7 @@ fn test_tinantas(k: &Kosha) -> Result<()> { fn test_krdantas(k: &Kosha) -> Result<()> { let keys = vec![ // kta, ktavat - "BUtaH", + "BUtas", "BUtam", "BUtA", "BUtavAn", @@ -116,17 +116,17 @@ fn test_krdantas(k: &Kosha) -> Result<()> { "BUtavatI", // Satf "Bavan", - "BavantaH", + "Bavantas", "BavantI", "Bavizyan", - "BavizyantaH", + "Bavizyantas", "BavizyantI", // krtya "Bavyam", "Bavitavyam", "BavanIyam", // 
Other - "BAvakaH", + "BAvakas", "Bavanam", // With prefixes "aBiBUtam", @@ -144,7 +144,7 @@ fn test_krdantas(k: &Kosha) -> Result<()> { } } let n = keys.len(); - println!("{i} / {n} tinanta tests passed."); + println!("{i} / {n} krdanta tests passed."); Ok(()) } @@ -159,7 +159,7 @@ fn test_subantas(k: &Kosha) -> Result<()> { ("vaDUs", "vaDU"), ("kartA", "kartf"), ("rAs", "rE"), - // ("dyOs", "div"), + ("dyOs", "div"), ("nOs", "nO"), ("AtmA", "Atman"), ("manasA", "manas"), @@ -167,17 +167,16 @@ fn test_subantas(k: &Kosha) -> Result<()> { ("DanurByAm", "Danus"), ("hanumAn", "hanumat"), ("Bagavantam", "Bagavat"), - ("jagmivAn", "jagmivas"), // Consonant stems ("vAk", "vAc"), ("vit", "vid"), - // ("kakuB", "kakup"), - + ("kakup", "kakuB"), // Irregular subantas ("mahAn", "mahat"), ("trayas", "tri"), ("zaRRAm", "zaz"), ("sapta", "saptan"), + ("azwa", "azwan"), ("daSa", "daSan"), ("pitaras", "pitf"), ("mAtaras", "mAtf"), @@ -189,20 +188,20 @@ fn test_subantas(k: &Kosha) -> Result<()> { let mut i = 0; for (key, lemma) in &keys { - let present = k.contains_key(key); - let entries: std::result::Result, _> = + let found = k.contains_key(key); + let entries: std::result::Result, _> = k.get_all(key).iter().map(|x| k.unpack(x)).collect(); let entries = entries?; let has_lemma = entries.iter().any(|x| &x.lemma() == lemma); - if present && has_lemma { + if found && has_lemma { i += 1; } else { - println!("FAILED: key {key} is missing (present={present}, has_lemma={has_lemma})"); + println!("FAILED: {key} (found = {found}, has_lemma={has_lemma})"); } } let n = keys.len(); - println!("{i} / {n} tinanta tests passed."); + println!("{i} / {n} subanta tests passed."); Ok(()) } diff --git a/src/bin/train_cheda.rs b/src/bin/train_cheda.rs index d16a438..8f77b17 100644 --- a/src/bin/train_cheda.rs +++ b/src/bin/train_cheda.rs @@ -9,7 +9,7 @@ use vidyut_cheda::dcs; use vidyut_cheda::model::State; use vidyut_cheda::Result; use vidyut_cheda::{Config, Token}; -use vidyut_kosha::morph::*; 
+use vidyut_kosha::entries::*; use vidyut_lipi::{transliterate, Mapping, Scheme}; #[derive(Parser, Debug)] @@ -86,7 +86,7 @@ fn process_sentence(tokens: &[Token], s: &mut Statistics) { .or_insert(0); *c += 1; - let tag = token.info.part_of_speech_tag(); + let tag = token.info.pos_tag(); let c = s.lemma_counts.entry((lemma.to_string(), tag)).or_insert(0); *c += 1; diff --git a/todo.md b/todo.md new file mode 100644 index 0000000..f03c2ae --- /dev/null +++ b/todo.md @@ -0,0 +1,2 @@ +- many kosha inputs are wrong (hanumantam, e.g.) +- kosha output is very large, suspiciously so. diff --git a/vidyut-chandas/src/akshara.rs b/vidyut-chandas/src/akshara.rs index f0f42b6..ef5d333 100644 --- a/vidyut-chandas/src/akshara.rs +++ b/vidyut-chandas/src/akshara.rs @@ -1,3 +1,4 @@ +use crate::enum_boilerplate; use crate::sounds; /// The weight of an akshara. @@ -9,6 +10,11 @@ pub enum Weight { L, } +enum_boilerplate!(Weight, { + G => "G", + L => "L", +}); + /// A Sanskrit syllable. /// /// An akshara follows the following rules: diff --git a/vidyut-chandas/src/chandas.rs b/vidyut-chandas/src/chandas.rs index ead7963..69fea19 100644 --- a/vidyut-chandas/src/chandas.rs +++ b/vidyut-chandas/src/chandas.rs @@ -306,7 +306,6 @@ mod tests { snigDacCAyAtaruzu vasatiM rAmagiryASramezu .. 
1 ..", "mandAkrAntA", ); - assert!(c.classify("mo mo go go vidyunmAlA").padya().is_none()); } #[test] diff --git a/vidyut-chandas/src/error.rs b/vidyut-chandas/src/error.rs index 879be98..cf033f5 100644 --- a/vidyut-chandas/src/error.rs +++ b/vidyut-chandas/src/error.rs @@ -1,28 +1,36 @@ use std::fmt; #[allow(unused)] -pub(crate) type Result = std::result::Result; +pub(crate) type Result = std::result::Result; #[allow(unused)] #[derive(Debug)] -pub enum ChandasError { +pub enum Error { ParseError, + EnumParseError(String), IoError(std::io::Error), } -impl From for ChandasError { +impl Error { + pub(crate) fn enum_parse_error(value: &str) -> Self { + Error::EnumParseError(value.to_string()) + } +} + +impl From for Error { #[inline] - fn from(err: std::io::Error) -> ChandasError { - ChandasError::IoError(err) + fn from(err: std::io::Error) -> Error { + Error::IoError(err) } } -impl fmt::Display for ChandasError { +impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use ChandasError::*; + use Error::*; match self { ParseError => write!(f, "Could not parse meter."), + EnumParseError(e) => write!(f, "Could not parse enum value {e}."), IoError(_) => write!(f, "Could not open input file."), } } diff --git a/vidyut-chandas/src/lib.rs b/vidyut-chandas/src/lib.rs index d82756b..569eaaa 100644 --- a/vidyut-chandas/src/lib.rs +++ b/vidyut-chandas/src/lib.rs @@ -5,6 +5,7 @@ mod akshara; mod chandas; mod error; +mod macros; mod padya; mod sounds; @@ -12,4 +13,4 @@ mod wasm; pub use akshara::{Akshara, Weight}; pub use chandas::{Chandas, Match, Matches}; -pub use padya::{Jati, MatchType, Pada, PatternWeight, Vrtta}; +pub use padya::{Jati, MatchType, Vrtta, VrttaPada, VrttaWeight}; diff --git a/vidyut-chandas/src/macros.rs b/vidyut-chandas/src/macros.rs new file mode 100644 index 0000000..6b510ae --- /dev/null +++ b/vidyut-chandas/src/macros.rs @@ -0,0 +1,47 @@ +/// Implements various boilerplate for our enums. +#[macro_export] +macro_rules! 
enum_boilerplate { + ($Enum:ident, { $( $variant:ident => $str:literal ),* $(,)? }) => { + impl $Enum { + /// Returns a simple human-readable string that represents this enum's value. + pub fn as_str(&self) -> &'static str { + match self { + $( + $Enum::$variant => $str, + )* + } + } + + /// Iterates over all values of this enum in order. + #[allow(dead_code)] + pub fn iter() -> impl Iterator { + /// In Rust, `const` items are created at compile time. + const ITEMS: &[$Enum] = &[ + $( + $Enum::$variant, + )* + ]; + ITEMS.iter() + } + } + + impl std::str::FromStr for $Enum { + type Err = $crate::error::Error; + fn from_str(value: &str) -> $crate::error::Result { + let ret = match value { + $( + $str => $Enum::$variant, + )* + _ => return Err($crate::error::Error::enum_parse_error(value)) + }; + Ok(ret) + } + } + + impl core::fmt::Display for $Enum { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.as_str()) + } + } + } +} diff --git a/vidyut-chandas/src/padya.rs b/vidyut-chandas/src/padya.rs index 8539819..503973a 100644 --- a/vidyut-chandas/src/padya.rs +++ b/vidyut-chandas/src/padya.rs @@ -1,9 +1,10 @@ use crate::akshara::{Akshara, Weight}; -use crate::error::{ChandasError, Result}; +use crate::enum_boilerplate; +use crate::error::{Error, Result}; /// Models the weights that a vrtta can accept. #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] -pub enum PatternWeight { +pub enum VrttaWeight { /// A heavy syllable. G, /// A light syllable. @@ -12,6 +13,12 @@ pub enum PatternWeight { Any, } +enum_boilerplate!(VrttaWeight, { + G => "G", + L => "L", + Any => ".", +}); + /// Describes how a vrtta matches some input. #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub enum MatchType { @@ -25,6 +32,13 @@ pub enum MatchType { Full, } +enum_boilerplate!(MatchType, { + None => "None", + Prefix => "Prefix", + Pada => "Pada", + Full => "Full", +}); + /// A traditional shorthand for vrtta weights. 
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub enum Gana { @@ -50,6 +64,19 @@ pub enum Gana { Ga, } +enum_boilerplate!(Gana, { + Ya => "Ya", + Ma => "Ma", + Ta => "Ta", + Ra => "Ra", + Ja => "Ja", + Bha => "Bha", + Na => "Na", + Sa => "Sa", + La => "La", + Ga => "Ga", +}); + impl Gana { /// Returns the weights associated with each gana. #[allow(unused)] @@ -78,18 +105,18 @@ impl Gana { /// A *pāda* defines a specific pattern of light and heavy syllables and /// might also define one or more *yati*s (caesuras). #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] -pub struct Pada { - weights: Vec, +pub struct VrttaPada { + weights: Vec, yati: Vec, } -impl Pada { - fn new(weights: Vec, yati: Vec) -> Self { - Pada { weights, yati } +impl VrttaPada { + fn new(weights: Vec, yati: Vec) -> Self { + VrttaPada { weights, yati } } /// Returns the weights that this pada uses. - pub fn weights(&self) -> &[PatternWeight] { + pub fn weights(&self) -> &[VrttaWeight] { &self.weights } @@ -101,7 +128,7 @@ impl Pada { /// Returns the ganas that define this pada. pub fn ganas(&self) -> Vec { use Gana::*; - use PatternWeight::*; + use VrttaWeight::*; let mut ganas = Vec::new(); for chunk in self.weights.chunks(3) { @@ -132,12 +159,12 @@ impl Pada { #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct Vrtta { name: String, - padas: Vec, + padas: Vec, } impl Vrtta { /// Creates a new `Vrtta` with the given name and weight pattern. - pub fn new(name: impl AsRef, padas: Vec) -> Self { + pub fn new(name: impl AsRef, padas: Vec) -> Self { Self { name: name.as_ref().to_string(), padas, @@ -152,12 +179,12 @@ impl Vrtta { } /// Returns the padas that constitute this vrtta. 
- pub fn padas(&self) -> &[Pada] { + pub fn padas(&self) -> &[VrttaPada] { &self.padas } pub(crate) fn try_match(&self, aksharas: &[Vec]) -> MatchType { - use PatternWeight::*; + use VrttaWeight::*; for row in aksharas { let mut s = Vec::new(); @@ -182,7 +209,7 @@ impl Vrtta { *last = Any; } - let pattern_flat: Vec = full.iter().flat_map(|x| x.to_owned()).collect(); + let pattern_flat: Vec = full.iter().flat_map(|x| x.to_owned()).collect(); let aksharas_flat: Vec<&Akshara> = aksharas.iter().flatten().collect(); let contains_aksharas = if pattern_flat.len() >= aksharas_flat.len() { @@ -219,16 +246,16 @@ impl Vrtta { } } -impl TryFrom<&str> for Pada { - type Error = ChandasError; +impl TryFrom<&str> for VrttaPada { + type Error = Error; fn try_from(text: &str) -> Result { - let weights: Vec = text + let weights: Vec = text .chars() .filter_map(|c| match c { - '.' => Some(PatternWeight::Any), - 'L' => Some(PatternWeight::L), - 'G' => Some(PatternWeight::G), + '.' => Some(VrttaWeight::Any), + 'L' => Some(VrttaWeight::L), + 'G' => Some(VrttaWeight::G), _ => None, }) .collect(); @@ -237,23 +264,22 @@ impl TryFrom<&str> for Pada { .enumerate() .map(|(i, (offset, _))| offset - i) .collect(); - Ok(Pada::new(weights, yati)) + Ok(VrttaPada::new(weights, yati)) } } impl TryFrom<&str> for Vrtta { - type Error = ChandasError; + type Error = Error; fn try_from(text: &str) -> Result { let fields: Vec<_> = text.split('\t').collect(); - debug_assert_eq!(fields.len(), 3); - - let name = fields[0]; - let _ = fields[1]; - let pattern_str = fields[2]; - let padas: Result> = pattern_str.split('/').map(|x| x.try_into()).collect(); - let padas = padas?; - Ok(Vrtta::new(name, padas)) + if let &[name, _kind, pattern] = &fields[..] 
{ + let padas: Result> = pattern.split('/').map(|x| x.try_into()).collect(); + let padas = padas?; + Ok(Vrtta::new(name, padas)) + } else { + Err(Error::ParseError) + } } } diff --git a/vidyut-cheda/src/bin/cheda.rs b/vidyut-cheda/src/bin/cheda.rs index 43ad5cf..e4de944 100644 --- a/vidyut-cheda/src/bin/cheda.rs +++ b/vidyut-cheda/src/bin/cheda.rs @@ -9,37 +9,51 @@ use vidyut_cheda::{Chedaka, Config, Result}; #[derive(Parser)] #[command(author, version, about)] struct Args { - text: String, + #[arg(long, default_value = "")] + word: String, + #[arg(long, default_value = "")] + phrase: String, #[arg(long)] data_dir: PathBuf, } -/* -fn parse_text(text: &str, segmenter: &Segmenter) { - info!("Beginning parse: \"{}\"", text); - let padas = segmenter.segment(text); - if padas.is_empty() { - println!("No solutions found for \"{}\".", text); - } else { - for (i, pada) in padas.iter().enumerate() { - println!( - "[{}] {} : {}, {:?}", - i, - pada.text, - pada.lemma(), - pada.semantics - ); +trait Debugger { + fn debug_word(&self, text: &str) -> Result<()>; + fn debug_phrase(&self, text: &str) -> Result<()>; +} + +impl Debugger for Chedaka { + /// Prints all interpretations of a word. + fn debug_word(&self, text: &str) -> Result<()> { + let lex = self.kosha(); + println!("word={text}:"); + for packed_pada in lex.get_all(text) { + let pada = lex.unpack(&packed_pada)?; + println!("- `{}`, {:?}", pada.lemma(), pada); } + Ok(()) + } + + /// Prints all interpretations of a phrase. 
+ fn debug_phrase(&self, text: &str) -> Result<()> { + let ret = self.run(text)?; + println!("phrase={text}:"); + println!("{ret:?}"); + Ok(()) } } -*/ -fn debug_word(text: &str, segmenter: &Chedaka) -> Result<()> { - let lex = segmenter.kosha(); - println!("{text}:"); - for packed_pada in lex.get_all(text) { - let pada = lex.unpack(&packed_pada)?; - println!("- `{}`, {:?}", pada.lemma(), pada); +fn run(args: Args) -> Result<()> { + info!("Loading raw data from disk."); + let config = Config::new(&args.data_dir); + let c = Chedaka::new(config).unwrap(); + + if !args.word.is_empty() { + c.debug_word(&args.word)?; + } + + if !args.phrase.is_empty() { + c.debug_phrase(&args.phrase)?; } Ok(()) @@ -47,22 +61,9 @@ fn debug_word(text: &str, segmenter: &Chedaka) -> Result<()> { fn main() { env_logger::init(); - let args = Args::parse(); - info!("Loading raw data from disk."); - let config = Config::new(&args.data_dir); - let segmenter = Chedaka::new(config); - - let segmenter = match segmenter { - Ok(data) => data, - Err(err) => { - println!("{}", err); - process::exit(1); - } - }; - - match debug_word(&args.text, &segmenter) { + match run(args) { Ok(()) => (), Err(err) => { println!("{}", err); diff --git a/vidyut-cheda/src/segmenting.rs b/vidyut-cheda/src/chedaka.rs similarity index 93% rename from vidyut-cheda/src/segmenting.rs rename to vidyut-cheda/src/chedaka.rs index 8c5f645..82a4ea8 100644 --- a/vidyut-cheda/src/segmenting.rs +++ b/vidyut-cheda/src/chedaka.rs @@ -11,7 +11,7 @@ use crate::sounds; use crate::strict_mode; use crate::Error; use compact_str::CompactString; -use vidyut_kosha::morph::Pada; +use vidyut_kosha::entries::PadaEntry; use vidyut_kosha::Kosha; use vidyut_sandhi::{Split, Splitter}; @@ -21,7 +21,7 @@ pub struct Token { /// The underlying text of the given word. pub text: CompactString, /// The data associated with this word. 
- pub info: Pada, + pub info: PadaEntry, } impl Token { @@ -31,7 +31,7 @@ impl Token { } /// The information we have about this word. - pub fn info(&self) -> &Pada { + pub fn info(&self) -> &PadaEntry { &self.info } @@ -66,7 +66,7 @@ impl TokenPool { } /// Represents an in-progress segment of a phrase. -#[derive(Clone, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct Phrase { /// The words that we've recognized so far. pub tokens: Vec, @@ -137,10 +137,10 @@ fn analyze_pada( text: &str, split: &Split, chedaka: &Chedaka, - cache: &mut HashMap>, + cache: &mut HashMap>, ) -> Result<()> { if !cache.contains_key(text) { - let res: std::result::Result, _> = chedaka + let res: std::result::Result, _> = chedaka .kosha .get_all(text) .iter() @@ -150,7 +150,7 @@ fn analyze_pada( // Add the option to skip an entire chunk. (For typos, junk, etc.) if split.is_end_of_chunk() || text.starts_with(|c| !sounds::is_sanskrit(c)) { - res.push(Pada::Unknown); + res.push(PadaEntry::Unknown); } cache.insert(text.to_string(), res); @@ -158,12 +158,13 @@ fn analyze_pada( Ok(()) } +// Needs to be refactored. /* #[allow(dead_code)] fn debug_print_phrase(p: &Phrase) { if log_enabled!(Level::Debug) { - for word in &p.tokens { - debug!("- {} {:?}", word.text, word.info); + for token in &p.tokens { + debug!("- {} {:?}", token.text, token.info); } debug!("score={}", p.score); } @@ -211,7 +212,7 @@ fn debug_print_viterbi(v: &HashMap>) { fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { let text = normalize(raw_text); let mut pq = PriorityQueue::new(); - let mut word_cache: HashMap> = HashMap::new(); + let mut word_cache: HashMap> = HashMap::new(); let mut token_pool = TokenPool::new(); @@ -224,9 +225,6 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { pq.push(initial_state, score); while !pq.is_empty() { - // debug_print_stack(&pq); - // debug_print_viterbi(&viterbi_cache); - // Pop the best solution remaining. 
let (cur, cur_score) = pq.pop().expect("always defined"); @@ -264,7 +262,7 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { }; let i = token_pool.insert(Token { text: CompactString::from(first), - info: Pada::Unknown, + info: PadaEntry::Unknown, }); new.tokens.push(i); new @@ -278,7 +276,7 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { }; let i = token_pool.insert(Token { text: CompactString::from(cur.remaining), - info: Pada::Unknown, + info: PadaEntry::Unknown, }); new.tokens.push(i); new @@ -308,8 +306,8 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { let second = split.second(); analyze_pada(first, &split, ctx, &mut word_cache)?; - for semantics in word_cache.get(first).unwrap_or(&no_results) { - if !strict_mode::is_valid_word(&cur, &token_pool, &split, semantics) { + for artha in word_cache.get(first).unwrap_or(&no_results) { + if !strict_mode::is_valid_word(&cur, &token_pool, &split, artha) { continue; } @@ -321,7 +319,7 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { }; let i = token_pool.insert(Token { text: CompactString::from(first), - info: semantics.clone(), + info: artha.clone(), }); new.tokens.push(i); new.score = ctx.model.score(&new, &token_pool); @@ -341,6 +339,7 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { .entry(new.remaining.clone()) .or_default() .insert("STATE".to_string(), new.clone()); + println!("Pushing: {new:?} {new_score}"); pq.push(new, new_score); } } diff --git a/vidyut-cheda/src/dcs.rs b/vidyut-cheda/src/dcs.rs index ddc38ed..8da00c4 100644 --- a/vidyut-cheda/src/dcs.rs +++ b/vidyut-cheda/src/dcs.rs @@ -1,11 +1,12 @@ //! Utility functions for reading DCS data. 
+use crate::chedaka::Token; use crate::conllu::{Token as EvalToken, TokenFeatures}; use crate::errors::{Error, Result}; -use crate::segmenting::Token; use compact_str::CompactString; -use vidyut_kosha::morph::*; +use vidyut_kosha::entries::*; use vidyut_lipi::{transliterate, Mapping, Scheme}; -use vidyut_prakriya::args::BaseKrt; +use vidyut_prakriya::args as vp; +use vidyut_prakriya::args::{Linga, Pratipadika, Purusha, Slp1String, Subanta, Vacana, Vibhakti}; fn to_slp1(text: &str) -> String { let mapping = Mapping::new(Scheme::Iast, Scheme::Slp1); @@ -14,16 +15,18 @@ fn to_slp1(text: &str) -> String { /// Convert DCS semantics to Vidyut semantics. pub fn standardize(t: &EvalToken) -> Result { - let slp1_lemma = standardize_lemma(&t.lemma); + let lemma = standardize_lemma(&t.lemma); + let lemma = Slp1String::from(lemma).expect("ok"); let semantics = match t.upos.as_str() { "NOUN" | "PRON" | "ADJ" | "NUM" => parse_subanta(t)?, - "CONJ" | "CCONJ" | "SCONJ" | "ADV" | "PART" | "INTJ" | "ADP" => Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Basic { - text: slp1_lemma.clone(), - lingas: Vec::new(), - }, - }), + "CONJ" | "CCONJ" | "SCONJ" | "ADV" | "PART" | "INTJ" | "ADP" => { + let prati = Pratipadika::basic(lemma.clone()); + + let subanta = Subanta::avyaya(prati.clone()); + let p_entry = PratipadikaEntry::new(prati, vec![]); + PadaEntry::Avyaya(SubantaEntry::new(subanta, p_entry)) + } "VERB" => { if t.features.contains_key("VerbForm") { parse_krdanta(t)? @@ -33,13 +36,13 @@ pub fn standardize(t: &EvalToken) -> Result { parse_verb(t)? } } - "MANTRA" => Pada::Unknown, + "MANTRA" => PadaEntry::Unknown, _ => panic!("Unknown upos `{}`", t.upos), }; Ok(Token { // The original form is not consistently present in the DCS data, so just use the lemma. - text: CompactString::from(slp1_lemma), + text: CompactString::from(lemma.as_str()), info: semantics, }) } @@ -90,40 +93,35 @@ fn standardize_lemma(raw_lemma: &str) -> String { } /// Reshapes a DCS nominal into a Vidyut subanta. 
-fn parse_subanta(t: &EvalToken) -> Result { - let stem = parse_stem(t); - let linga = parse_linga(&t.features)?; - let vibhakti = parse_vibhakti(&t.features)?; - let vacana = parse_vacana(&t.features)?; - let is_purvapada = parse_is_purvapada(&t.features); +fn parse_subanta(t: &EvalToken) -> Result { + let pratipadika = parse_pratipadika(t); + let linga = parse_linga(&t.features)?.unwrap_or(Linga::Pum); + let vibhakti = parse_vibhakti(&t.features)?.unwrap_or(Vibhakti::Prathama); + let vacana = parse_vacana(&t.features)?.unwrap_or(Vacana::Eka); + let _is_purvapada = parse_is_purvapada(&t.features); - Ok(Pada::Subanta(Subanta { - pratipadika: stem, - linga, - vacana, - vibhakti, - is_purvapada, - })) + let subanta = vp::Subanta::new(pratipadika.clone(), linga, vibhakti, vacana); + let p_entry = PratipadikaEntry::new(pratipadika, vec![]); + Ok(PadaEntry::Subanta(SubantaEntry::new(subanta, p_entry))) } /// Reshapes a DCS verb into a Vidyut tinanta. -fn parse_verb(t: &EvalToken) -> Result { +fn parse_verb(t: &EvalToken) -> Result { let root = standardize_lemma(&t.lemma); let purusha = parse_purusha(&t.features)?; let vacana = parse_vacana(&t.features)?.unwrap_or(Vacana::Eka); - let lakara = parse_lakara(&t.features)?.unwrap_or(Lakara::Lat); - let pada = parse_verb_pada(&t.features); - Ok(Pada::Tinanta(Tinanta { - dhatu: Dhatu::mula(root), + let (lakara, _skip_at_agama) = parse_lakara(&t.features)?.unwrap_or((vp::Lakara::Lat, false)); + Ok(PadaEntry::Tinanta(vp::Tinanta::new( + vp::Dhatu::mula(Slp1String::from(root).expect("ok"), vp::Gana::Bhvadi), + vp::Prayoga::Kartari, + lakara, purusha, vacana, - lakara, - pada, - })) + ))) } /// Reshapes a DCS krdanta. -fn parse_krdanta(t: &EvalToken) -> Result { +fn parse_krdanta(t: &EvalToken) -> Result { match t .features .get("VerbForm") @@ -136,52 +134,51 @@ fn parse_krdanta(t: &EvalToken) -> Result { } /// Reshapes a DCS krdanta subanta. 
-fn parse_krdanta_subanta(t: &EvalToken) -> Result { - let stem = Pratipadika::Krdanta { - dhatu: Dhatu::mula(standardize_lemma(&t.lemma)), - krt: parse_krt_pratyaya(&t.features)?.unwrap_or(Krt::new(BaseKrt::kta)), - }; - let linga = parse_linga(&t.features)?; - let vibhakti = parse_vibhakti(&t.features)?; - let vacana = parse_vacana(&t.features)?; - let is_purvapada = parse_is_purvapada(&t.features); +fn parse_krdanta_subanta(t: &EvalToken) -> Result { + let lemma = standardize_lemma(&t.lemma); + let pratipadika = vp::Krdanta::new( + vp::Dhatu::mula(Slp1String::from(lemma).expect("ok"), vp::Gana::Bhvadi), + parse_krt_pratyaya(&t.features)?.unwrap_or(vp::BaseKrt::kta.into()), + ); + let linga = parse_linga(&t.features)?.unwrap_or(Linga::Pum); + let vibhakti = parse_vibhakti(&t.features)?.unwrap_or(Vibhakti::Prathama); + let vacana = parse_vacana(&t.features)?.unwrap_or(Vacana::Eka); + let _is_purvapada = parse_is_purvapada(&t.features); - Ok(Pada::Subanta(Subanta { - pratipadika: stem, - linga, - vacana, - vibhakti, - is_purvapada, - })) + let pratipadika: vp::Pratipadika = pratipadika.into(); + let subanta = vp::Subanta::new(pratipadika.clone(), linga, vibhakti, vacana); + let dummy_entry = PratipadikaEntry::new(pratipadika, vec![]); + Ok(PadaEntry::Subanta(SubantaEntry::new(subanta, dummy_entry))) } /// Reshapes a DCS krdanta avyaya. -fn parse_krdanta_avyaya(t: &EvalToken) -> Result { - let stem = Pratipadika::Krdanta { - dhatu: Dhatu::mula(standardize_lemma(&t.lemma)), +fn parse_krdanta_avyaya(t: &EvalToken) -> Result { + let lemma = standardize_lemma(&t.lemma); + let krdanta = vp::Krdanta::new( + vp::Dhatu::mula(Slp1String::from(lemma).expect("ok"), vp::Gana::Bhvadi), // Use an arbitrary default. 
- krt: parse_krt_pratyaya(&t.features)?.unwrap_or(Krt::new(BaseKrt::kta)), - }; + parse_krt_pratyaya(&t.features)?.unwrap_or(vp::BaseKrt::kta.into()), + ); - Ok(Pada::Avyaya(Avyaya { pratipadika: stem })) + let prati: Pratipadika = krdanta.into(); + let p_entry = PratipadikaEntry::new(prati.clone(), vec![]); + let subanta = vp::Subanta::avyaya(prati); + Ok(PadaEntry::Avyaya(SubantaEntry::new(subanta, p_entry))) } /// Reshapes a DCS stem into a Vidyut stem. -fn parse_stem(t: &EvalToken) -> Pratipadika { - Pratipadika::Basic { - text: standardize_lemma(&t.lemma), - lingas: Vec::new(), - } +fn parse_pratipadika(t: &EvalToken) -> Pratipadika { + Pratipadika::basic(Slp1String::from(&t.lemma).expect("ok")) } /// Reshapes a DCS tense into a Vidyut tense. -fn parse_krt_pratyaya(f: &TokenFeatures) -> Result> { +fn parse_krt_pratyaya(f: &TokenFeatures) -> Result> { let val = match f.get("Tense") { Some(s) => match s.as_str() { // FIXME: not enough information to reconstruct. - "Pres" => Some(Krt::new(BaseKrt::Satf)), - "Past" => Some(Krt::new(BaseKrt::kta)), - "Fut" => Some(Krt::new(BaseKrt::Satf)), + "Pres" => Some(vp::BaseKrt::Satf.into()), + "Past" => Some(vp::BaseKrt::kta.into()), + "Fut" => Some(vp::BaseKrt::Satf.into()), &_ => return Err(Error::parse_dcs("Tense", s)), }, None => None, @@ -237,11 +234,12 @@ fn parse_is_purvapada(f: &TokenFeatures) -> bool { /// Reshapes a DCS person into a Vidyut purusha. fn parse_purusha(f: &TokenFeatures) -> Result { + use Purusha::*; let val = match f.get("Person") { Some(s) => match s.as_str() { - "3" => Purusha::Prathama, - "2" => Purusha::Madhyama, - "1" => Purusha::Uttama, + "3" => Prathama, + "2" => Madhyama, + "1" => Uttama, &_ => return Err(Error::parse_dcs("Person", s)), }, None => return Err(Error::dcs_undefined("Person")), @@ -264,7 +262,8 @@ fn parse_vacana(f: &TokenFeatures) -> Result> { } /// Reshapes a DCS tense/mood into a Vidyut lakara. 
-fn parse_lakara(f: &TokenFeatures) -> Result> { +fn parse_lakara(f: &TokenFeatures) -> Result> { + use vp::Lakara::*; let tense = match f.get("Tense") { Some(s) => s, None => return Err(Error::dcs_undefined("Tense")), @@ -275,18 +274,18 @@ fn parse_lakara(f: &TokenFeatures) -> Result> { }; let val = match (tense.as_str(), mood.as_str()) { - ("Aor", "Ind") => Lakara::Lun, - ("Aor", "Jus") => Lakara::LunNoAgama, - ("Aor", "Prec") => Lakara::AshirLin, - ("Fut", "Cond") => Lakara::Lrn, - ("Fut", "Pot") => Lakara::Lrn, - ("Fut", "Ind") => Lakara::Lrt, - ("Impf", "Ind") => Lakara::Lan, - ("Perf", "Ind") => Lakara::Lit, - ("Pres", "Imp") => Lakara::Lot, - ("Pres", "Ind") => Lakara::Lat, - ("Pres", "Opt") => Lakara::VidhiLin, - ("Pres", "Sub") => Lakara::Lot, + ("Aor", "Ind") => (Lun, false), + ("Aor", "Jus") => (Lun, true), + ("Aor", "Prec") => (AshirLin, false), + ("Fut", "Cond") => (Lrn, false), + ("Fut", "Pot") => (Lrn, false), + ("Fut", "Ind") => (Lrt, false), + ("Impf", "Ind") => (Lan, false), + ("Perf", "Ind") => (Lit, false), + ("Pres", "Imp") => (Lot, false), + ("Pres", "Ind") => (Lat, false), + ("Pres", "Opt") => (VidhiLin, false), + ("Pres", "Sub") => (Lot, false), ("Aor", "Imp") => return Ok(None), ("Past", "Ind") => return Ok(None), ("Past", "Imp") => return Ok(None), @@ -303,8 +302,3 @@ fn parse_lakara(f: &TokenFeatures) -> Result> { }; Ok(Some(val)) } - -fn parse_verb_pada(_f: &TokenFeatures) -> PadaPrayoga { - // FIXME: unsupported in DCS? 
- PadaPrayoga::Parasmaipada -} diff --git a/vidyut-cheda/src/lib.rs b/vidyut-cheda/src/lib.rs index edc34cb..a88563a 100644 --- a/vidyut-cheda/src/lib.rs +++ b/vidyut-cheda/src/lib.rs @@ -2,9 +2,9 @@ #![deny(missing_docs)] #![deny(clippy::unwrap_used)] +pub use crate::chedaka::{Chedaka, Token}; pub use crate::config::Config; pub use crate::errors::{Error, Result}; -pub use crate::segmenting::{Chedaka, Token}; mod errors; mod scoring; @@ -23,7 +23,7 @@ pub mod sounds; pub mod conllu; pub mod dcs; +mod chedaka; mod config; mod normalize_text; -mod segmenting; mod strict_mode; diff --git a/vidyut-cheda/src/scoring.rs b/vidyut-cheda/src/scoring.rs index 641ea29..2d18373 100644 --- a/vidyut-cheda/src/scoring.rs +++ b/vidyut-cheda/src/scoring.rs @@ -5,14 +5,14 @@ // allowed. So instead, just allow dead code in this module. #![allow(dead_code)] +use crate::chedaka::{Phrase, TokenPool}; use crate::errors::{Error, Result}; -use crate::segmenting::{Phrase, TokenPool}; use core::str::FromStr; use modular_bitfield::prelude::*; use rustc_hash::FxHashMap; use std::path::Path; -use vidyut_kosha::morph::*; -use vidyut_kosha::packing::{PackedLinga, PackedVacana, PackedVibhakti}; +use vidyut_kosha::entries::*; +use vidyut_kosha::packing::{PackedLinga, PackedPurusha, PackedVacana, PackedVibhakti}; /// Models a Markov transition state. #[derive(Copy, Clone, Default, Eq, PartialEq, Hash)] @@ -25,7 +25,7 @@ impl State { } /// Initializes a transition state from the given pada. - pub fn from_pada(s: &Pada) -> Self { + pub fn from_pada(s: &PadaEntry) -> Self { PadaState::from_pada(s).into_state() } } @@ -68,9 +68,9 @@ pub struct TinantaState { #[skip(getters)] unused: B10, #[skip(getters)] - purusha: Purusha, + purusha: PackedPurusha, #[skip(getters)] - vacana: Vacana, + vacana: PackedVacana, } /// Models the transition state for some *pada*. @@ -89,27 +89,28 @@ impl PadaState { } /// Creates a state label for the given pada. 
- pub fn from_pada(p: &Pada) -> Self { + pub fn from_pada(p: &PadaEntry) -> Self { let zero = [0_u8; 2]; let (pos_tag, payload) = match p { - Pada::Unknown => (POSTag::Unknown, zero), - Pada::Subanta(s) => { + PadaEntry::Unknown => (POSTag::Unknown, zero), + PadaEntry::Subanta(s) => { + let s = s.subanta(); let bytes = SubantaState::new() - .with_linga(s.linga.into()) - .with_vacana(s.vacana.into()) - .with_vibhakti(s.vibhakti.into()) - .with_is_purvapada(s.is_purvapada) + .with_linga(s.linga().into()) + .with_vacana(s.vacana().into()) + .with_vibhakti(s.vibhakti().into()) + .with_is_purvapada(false) .into_bytes(); (POSTag::Subanta, bytes) } - Pada::Tinanta(s) => { + PadaEntry::Tinanta(s) => { let bytes = TinantaState::new() - .with_purusha(s.purusha) - .with_vacana(s.vacana) + .with_purusha(PackedPurusha::pack(s.purusha())) + .with_vacana(PackedVacana::pack(s.vacana())) .into_bytes(); (POSTag::Tinanta, bytes) } - Pada::Avyaya(_) => (POSTag::Avyaya, zero), + PadaEntry::Avyaya(_) => (POSTag::Avyaya, zero), }; PadaState::new() .with_pos(pos_tag) @@ -275,9 +276,7 @@ impl Model { let cur_state = State::from_pada(&last.info); let pada = &last.info; - let lemma_log_prob = self - .lemmas - .log_prob(pada.lemma(), pada.part_of_speech_tag()); + let lemma_log_prob = self.lemmas.log_prob(pada.lemma(), pada.pos_tag()); let transition_log_prob = self.transitions.log_prob(&prev_state, &cur_state); lemma_log_prob + transition_log_prob } else { diff --git a/vidyut-cheda/src/strict_mode.rs b/vidyut-cheda/src/strict_mode.rs index 4aca339..fbafb0d 100644 --- a/vidyut-cheda/src/strict_mode.rs +++ b/vidyut-cheda/src/strict_mode.rs @@ -1,8 +1,9 @@ //! Heuristics for validating segmented candidates. -use crate::segmenting::{Phrase, TokenPool}; +use crate::chedaka::{Phrase, TokenPool}; use crate::sounds; -use vidyut_kosha::morph::*; +use vidyut_kosha::entries::*; +use vidyut_prakriya::args::Subanta; /// Simple hand-coded rules to avoid overgenerating. 
use vidyut_sandhi::Split; @@ -11,28 +12,34 @@ pub(crate) fn is_valid_word( cur: &Phrase, pool: &TokenPool, split: &Split, - semantics: &Pada, + semantics: &PadaEntry, ) -> bool { - if let Pada::Subanta(s) = &semantics { + /* + if let Entry::Subanta(s) = &semantics { if_purvapada_then_not_chunk_end(split, s) - && if_ac_pada_then_not_hal(split, s.is_purvapada) + && if_ac_pada_then_not_hal(split, s.is_purvapada()) && if_not_in_compound_then_linga_match(cur, pool, s) - } else if let Pada::Tinanta(_) = &semantics { + } else if let Entry::Tinanta(_) = &semantics { if_ac_pada_then_not_hal(split, false) } else { true // TODO: extend if_ac_pada... to verbs } + */ + true } /// Avoid compounds with whitespace. /// (`Darmakzetre` vs. `Darma kzetre`) fn if_purvapada_then_not_chunk_end(split: &Split, s: &Subanta) -> bool { - if s.is_purvapada { + /* + if s.is_purvapada() { !split.is_end_of_chunk() } else { true } + */ + true } // Require that vowel-final words are not immediately followed by consonants. @@ -51,10 +58,12 @@ fn if_ac_pada_then_not_hal(split: &Split, is_purvapada: bool) -> bool { // Require that subantas use the endings that match their declared linga. // Exception: words in a compound, since these might be bahuvrihi compounds. fn if_not_in_compound_then_linga_match(cur: &Phrase, pool: &TokenPool, s: &Subanta) -> bool { + true + /* let in_compound = match cur.tokens.last() { Some(i) => match pool.get(*i) { Some(t) => match &t.info { - Pada::Subanta(s) => s.is_purvapada, + Pada::Subanta(s) => s.is_purvapada(), _ => false, }, None => false, @@ -65,12 +74,13 @@ fn if_not_in_compound_then_linga_match(cur: &Phrase, pool: &TokenPool, s: &Suban if in_compound { true } else { - match (&s.linga, &s.pratipadika) { - (Some(x), Pratipadika::Basic { text: _, lingas }) => lingas.contains(x), + match (&s.linga(), &s.pratipadika()) { + (Some(x), Pratipadika::basic(text)) => lingas.contains(x), // Otherwise, any linga is allowed. 
_ => true, } } + */ } #[cfg(test)] @@ -89,11 +99,8 @@ mod tests { Location::EndOfChunk, Kind::Prefix, ); - let info = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Basic { - text: "grAma".to_string(), - lingas: Vec::new(), - }, + let info = PadaEntry::Avyaya(Avyaya { + pratipadika: Pratipadika::basic("grAma"), }); let mut token_pool = TokenPool::new(); @@ -113,7 +120,7 @@ mod tests { Location::WithinChunk, Kind::Prefix, ); - let info = Pada::Subanta(Subanta { + let info = PadaEntry::Subanta(Subanta { pratipadika: Pratipadika::Basic { text: "grAma".to_string(), lingas: vec![Linga::Pum], diff --git a/vidyut-cheda/tests/error_messages.rs b/vidyut-cheda/tests/error_messages.rs index fe3b8d5..1fcba90 100644 --- a/vidyut-cheda/tests/error_messages.rs +++ b/vidyut-cheda/tests/error_messages.rs @@ -4,7 +4,7 @@ use std::io::BufWriter; use std::io::Write; use tempfile::tempdir; use vidyut_cheda::{Chedaka, Config, Error}; -use vidyut_kosha::morph::Pada; +use vidyut_kosha::entries::PadaEntry; use vidyut_kosha::Builder; #[test] @@ -30,8 +30,8 @@ fn create_ok() -> Result<(), Box> { let path = dir.path().join("kosha"); let mut b = Builder::new(path)?; - b.insert("arjunas", &Pada::Unknown)?; - b.insert("gacCati", &Pada::Unknown)?; + b.insert("arjunas", &PadaEntry::Unknown)?; + b.insert("gacCati", &PadaEntry::Unknown)?; b.finish()?; let sandhi = dir.path().join("sandhi.csv"); diff --git a/vidyut-kosha/Cargo.toml b/vidyut-kosha/Cargo.toml index f29e2a6..4d93595 100644 --- a/vidyut-kosha/Cargo.toml +++ b/vidyut-kosha/Cargo.toml @@ -18,8 +18,9 @@ log = "0.4.17" fst = "0.4.7" modular-bitfield = "0.11.2" rustc-hash = "1.1.0" -serde = { version = "1.0.152", optional = true, features = ["derive"] } -vidyut-prakriya = { path = "../vidyut-prakriya" } +serde = { version = "1.0.152", features = ["derive"] } +vidyut-prakriya = { path = "../vidyut-prakriya", features = ["serde"] } +serde_json = "1.0.132" [dev-dependencies] bencher = "0.1.5" diff --git a/vidyut-kosha/bcd.md 
b/vidyut-kosha/bcd.md new file mode 100644 index 0000000..6b114d5 --- /dev/null +++ b/vidyut-kosha/bcd.md @@ -0,0 +1,9 @@ +- dhatu + - aupadeshika + - gana + - antargana + - prefixes + - sanadi + + - clean_text + diff --git a/vidyut-kosha/benches/kosha.rs b/vidyut-kosha/benches/kosha.rs index 8061539..28b2bdf 100644 --- a/vidyut-kosha/benches/kosha.rs +++ b/vidyut-kosha/benches/kosha.rs @@ -13,7 +13,7 @@ use vidyut_kosha::packing::*; use vidyut_kosha::Kosha; type Result = std::result::Result>; -type NaiveKosha = MultiMap; +type NaiveKosha = MultiMap; #[derive(Parser, Debug)] #[command(author, version, about)] @@ -31,7 +31,7 @@ fn create_naive_kosha(fst_lex: &Kosha) -> Result { let mut stream = fst_lex.stream(); while let Some((key, value)) = stream.next() { let key = std::str::from_utf8(key)?; - let value = PackedPada::from_u32(value as u32); + let value = PackedEntry::from_u32(value as u32); ret.insert(key.to_string(), value); } Ok(ret) diff --git a/vidyut-kosha/src/entries.rs b/vidyut-kosha/src/entries.rs new file mode 100644 index 0000000..a4e8385 --- /dev/null +++ b/vidyut-kosha/src/entries.rs @@ -0,0 +1,217 @@ +//! Models the morphology of Sanskrit words, including their bases and endings. +use crate::errors::*; +use modular_bitfield::prelude::*; +use std::fmt::{Display, Formatter, Result as FmtResult}; +use std::str::FromStr; +use vidyut_prakriya::args as vp; + +use serde::{Deserialize, Serialize}; + +/// Implements various boilerplate for our enums: +/// +/// - `as_str` +/// - `iter` +/// - `FromStr` +/// - `Display` +macro_rules! enum_boilerplate { + ($Enum:ident, { $( $variant:ident => $str:literal ),* $(,)? }) => { + impl $Enum { + /// Returns a string representation of this enum. + pub fn as_str(&self) -> &'static str { + match self { + $( + $Enum::$variant => $str, + )* + } + } + + /// Iterates over the values of this enum in order. 
+ pub fn iter() -> impl Iterator { + const ITEMS: &[$Enum] = &[ + $( + $Enum::$variant, + )* + ]; + ITEMS.iter() + } + } + + impl FromStr for $Enum { + type Err = Error; + fn from_str(s: &str) -> Result { + let val = match s { + $( + $str => $Enum::$variant, + )* + _ => return Err(Error::ParseEnum(stringify!($Enum), s.to_string())), + }; + Ok(val) + } + } + + impl Display for $Enum { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}", self.as_str()) + } + } + } +} + +/// A short part-of-speech tag for some `Pada`. +/// +/// We use this tag when calculating lemma counts. For example, *eva* is a common *avyaya* but +/// not a common *subanta*, and our statistics should reflect that distinction. Coarser +/// distinctions that include linga, vacana, etc. are interesting but less useful given our +/// limited training data. +#[derive(Clone, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[bits = 2] +pub enum POSTag { + /// A token with missing, unknown, or undefined semantics. + Unknown, + /// A nominal. + Subanta, + /// A verb. + Tinanta, + /// An indeclinable. + Avyaya, +} + +enum_boilerplate!(POSTag, { + Unknown => "_", + Subanta => "s", + Tinanta => "t", + Avyaya => "a", +}); + +/// A dhatu with its metadata. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct DhatuEntry { + dhatu: vp::Dhatu, + readable: String, +} + +impl DhatuEntry { + /// Creates a new `DhatuEntry`. + /// + /// `readable` should be the text obtained by calling `Vyakarana::derive_dhatus` on `dhatu`. + pub fn new(dhatu: vp::Dhatu, readable: String) -> Self { + Self { dhatu, readable } + } + + /// Returns the dhatu that generates this entry. + pub fn dhatu(&self) -> &vp::Dhatu { + &self.dhatu + } + + /// Returns the human-readable text representation of this dhatu. 
+ /// + /// Examples: + /// + /// - `qukf\\Y` --> `kf` + /// - `vidi~` --> `vind` + pub fn text(&self) -> &str { + &self.readable + } +} + +/// A pratipadika with its metadata. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct PratipadikaEntry { + pratipadika: vp::Pratipadika, + lingas: Vec, +} + +impl PratipadikaEntry { + /// Creates a new `PratipadikaEntry`. + pub fn new(pratipadika: vp::Pratipadika, lingas: Vec) -> Self { + Self { + pratipadika, + lingas, + } + } + + /// Returns the pratipadika that generates this entry. + pub fn pratipadika(&self) -> &vp::Pratipadika { + &self.pratipadika + } + + /// Returns the lingas that this pratipadika is allowed to use. + /// + /// If empty, the pratipadika has no specific linga. + pub fn lingas(&self) -> &[vp::Linga] { + &self.lingas + } +} + +/// Wraps a `Subanta` with metadata about the pratipadika. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct SubantaEntry { + subanta: vp::Subanta, + entry: PratipadikaEntry, +} + +impl SubantaEntry { + /// Creates a new SubantaEntry. + pub fn new(subanta: vp::Subanta, entry: PratipadikaEntry) -> Self { + Self { subanta, entry } + } + + /// The underlying subanta. + pub fn subanta(&self) -> &vp::Subanta { + &self.subanta + } + + /// The pratipadika metadata. + pub fn entry(&self) -> &PratipadikaEntry { + &self.entry + } +} + +impl From for SubantaEntry { + fn from(val: vp::Subanta) -> Self { + let p_entry = PratipadikaEntry::new(val.pratipadika().clone(), vec![]); + Self::new(val, p_entry) + } +} + +/// Models the semantics of a Sanskrit *pada* (word). +/// +/// This enum can be packed into an unsigned integer via the `packing` module. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub enum PadaEntry { + /// Unknown data. + Unknown, + /// A *subanta* that is not an *avyaya*. + Subanta(SubantaEntry), + /// A *subanta* that is also an *avyaya*. 
+ Avyaya(SubantaEntry), + /// A *tiṅanta* (verb). + Tinanta(vp::Tinanta), +} + +impl PadaEntry { + /// Returns a placeholder lemma. + pub fn lemma(&self) -> &str { + match self { + Self::Subanta(e) => { + use vp::Pratipadika::*; + match e.subanta().pratipadika() { + Basic(b) => b.text(), + _ => "", + } + } + _ => "", + } + } + + /// Returns a part-of-speech tag. + pub fn pos_tag(&self) -> POSTag { + match self { + PadaEntry::Unknown => POSTag::Unknown, + PadaEntry::Subanta(_) => POSTag::Subanta, + PadaEntry::Avyaya(_) => POSTag::Avyaya, + PadaEntry::Tinanta(_) => POSTag::Tinanta, + } + } +} diff --git a/vidyut-kosha/src/errors.rs b/vidyut-kosha/src/errors.rs index 8e87be9..cb6b5d1 100644 --- a/vidyut-kosha/src/errors.rs +++ b/vidyut-kosha/src/errors.rs @@ -1,3 +1,4 @@ +use serde_json::Error as JsonError; use std::fmt; use std::io; use std::num; @@ -10,6 +11,8 @@ pub type Result = std::result::Result; pub enum Error { /// An IO error. Io(io::Error), + /// A JSON-related IO error. + Json(JsonError), /// An FST error. Fst(fst::raw::Error), /// An integer couldn't be parsed. @@ -50,6 +53,13 @@ impl From for Error { } } +impl From for Error { + #[inline] + fn from(err: JsonError) -> Error { + Error::Json(err) + } +} + impl std::error::Error for Error {} impl fmt::Display for Error { @@ -58,6 +68,7 @@ impl fmt::Display for Error { match self { Io(e) => e.fmt(f), + Json(e) => e.fmt(f), Fst(e) => e.fmt(f), TooManyDuplicates(s) => write!(f, "Key `{}` has been inserted too many times.", s), UnknownDhatuId(id) => write!(f, "Unknown dhatu ID {}", id), diff --git a/vidyut-kosha/src/kosha.rs b/vidyut-kosha/src/kosha.rs index 1d7dc6f..8864334 100644 --- a/vidyut-kosha/src/kosha.rs +++ b/vidyut-kosha/src/kosha.rs @@ -23,7 +23,7 @@ //! //! To work around (2), we pack the semantics of Sanskrit words into integers with our //! `vidyut::packing` crate. Since strings are difficult to pack, we instead store them in a lookup -//! table and pack their integer ID instead. 
For details, see `packing::Unpacker`. +//! table and pack their integer ID instead. //! //! //! Efficiency @@ -33,8 +33,8 @@ //! roughly 1.5x slower than a default `HashMap`. Our production kosha stores more than 29 million //! words in around 31MB of data with an average storage cost of 1 byte per word. Of course, the //! specific storage cost will vary depending on the words in the input list. +use crate::entries::{DhatuEntry, PadaEntry, PratipadikaEntry}; use crate::errors::*; -use crate::morph::Pada; use crate::packing::*; use fst::map::Stream; use fst::raw::{Fst, Node, Output}; @@ -62,22 +62,25 @@ impl Paths { base: base_path.as_ref().to_path_buf(), } } + /// Path to the underlying FST. fn fst(&self) -> PathBuf { self.base.join("padas.fst") } - /// Path to the dhatus table, which maps indices to `Dhatu`s. + + /// Path to the dhatus registry. fn dhatus(&self) -> PathBuf { - self.base.join("dhatus.csv") + self.base.join("dhatus.json") } - /// Path to the pratipadikas table, which maps indices to `Pratipadika`s. + + /// Path to the pratipadikas registry. fn pratipadikas(&self) -> PathBuf { - self.base.join("pratipadikas.csv") + self.base.join("pratipadikas.json") } } -fn to_packed_pada(output: Output) -> PackedPada { - PackedPada::from_u32(output.value() as u32) +fn to_packed_pada(output: Output) -> PackedEntry { + PackedEntry::from_u32(output.value() as u32) } /// A compact Sanskrit kosha. @@ -85,7 +88,7 @@ pub struct Kosha { /// The underlying FST object. fst: Map>, /// Maps indices to semantics objects. 
- unpacker: Unpacker, + packer: Packer, } impl Kosha { @@ -95,12 +98,9 @@ impl Kosha { info!("Loading fst from `{:?}`", paths.fst()); let fst = Map::new(std::fs::read(paths.fst())?)?; - let unpacker = Unpacker::from_data( - PratipadikaTable::read(&paths.pratipadikas())?, - DhatuTable::read(&paths.dhatus())?, - ); + let packer = Packer::read(&paths.dhatus(), &paths.pratipadikas())?; - Ok(Self { fst, unpacker }) + Ok(Self { fst, packer }) } /// Returns a reference to this kosha's underlying FST. @@ -137,13 +137,13 @@ impl Kosha { } /// Unpacks the given word via this kosha's `Unpacker` instance. - pub fn unpack(&self, p: &PackedPada) -> Result { - self.unpacker.unpack(p) + pub fn unpack(&self, p: &PackedEntry) -> Result { + self.packer.unpack(p) } /// Gets all results for the given `key`, including duplicates. #[inline] - pub fn get_all(&self, key: &str) -> Vec { + pub fn get_all(&self, key: &str) -> Vec { // Adapted from `FstRef::get` // https://docs.rs/fst/0.4.7/src/fst/raw/mod.rs.html#682 let fst = self.fst.as_fst(); @@ -183,7 +183,7 @@ impl Kosha { /// - `out`: the output corresponding to this state. /// - `fst`: the underlying FST. /// - `results`: the results list. -fn add_duplicates(node: Node, out: Output, fst: &Fst>, results: &mut Vec) { +fn add_duplicates(node: Node, out: Output, fst: &Fst>, results: &mut Vec) { for c1 in 0..=DUPES_PER_BYTE { if let Some(i1) = node.find_input(c1) { let t1 = node.transition(i1); @@ -264,36 +264,59 @@ impl Builder { /// /// Keys must be inserted in lexicographic order. If a key is received out of order, /// the build process will fail. - pub fn insert(&mut self, key: &str, value: &Pada) -> Result<()> { - let seen_keys = &mut self.seen_keys; + pub fn insert(&mut self, key: &str, value: &PadaEntry) -> Result<()> { + let value = self.pack(value)?; + self.insert_packed(key, &value) + } + + /// Inserts the given `key` with the packed semantics in `value`. + /// + /// Keys must be inserted in lexicographic order. 
If a key is received out of order, + /// the build process will fail. + pub fn insert_packed(&mut self, key: &str, value: &PackedEntry) -> Result<()> { + let u64_payload = value.to_u32() as u64; + let seen_keys = &mut self.seen_keys; let num_repeats = match seen_keys.get(key) { Some(c) => *c, None => 0, }; seen_keys.insert(key.to_string(), num_repeats + 1); - let value = u64::from(self.packer.pack(value)?.to_u32()); - // For duplicates, add another u8 to make this key unique. if num_repeats > 0 { // Subtract 1 so that the duplicate tag always starts at 0. let final_key = create_extended_key(key, num_repeats - 1)?; - self.fst_builder.insert(&final_key, value)?; + self.fst_builder.insert(&final_key, u64_payload)?; } else { - self.fst_builder.insert(key, value)?; + self.fst_builder.insert(key, u64_payload)?; }; Ok(()) } + /// Registers the given dhatus on the internal packer. Duplicate dhatus are ignored. + pub fn register_dhatus(&mut self, dhatus: &[DhatuEntry]) { + self.packer.register_dhatus(dhatus); + } + + /// Registers the given pratipadikas on the internal packer. Duplicate pratipadikas are ignored. + pub fn register_pratipadikas(&mut self, pratipadikas: &[PratipadikaEntry]) { + self.packer.register_pratipadikas(pratipadikas); + } + + /// Packs the given `pada` into a more compact format. + pub fn pack(&self, pada: &PadaEntry) -> Result { + self.packer.pack(pada) + } + /// Writes all FST data to disk. 
pub fn finish(self) -> Result<()> { - info!("Writing FST and packer data to `{:?}`.", self.paths.base); + info!("Writing FST and packer data to {:?}.", self.paths.base); self.fst_builder.finish()?; - let unpacker = Unpacker::from_packer(&self.packer); - unpacker.write(&self.paths.dhatus(), &self.paths.pratipadikas())?; + self.packer + .write(&self.paths.dhatus(), &self.paths.pratipadikas())?; Ok(()) } @@ -303,14 +326,26 @@ impl Builder { #[allow(clippy::unwrap_used)] mod tests { use super::*; + use vidyut_prakriya::args as vp; + use vidyut_prakriya::args::{Dhatu, Pratipadika}; - use crate::morph::*; - use fst::Streamer; + use crate::entries::*; use tempfile::tempdir; - use vidyut_prakriya::args as vp; type TestResult = Result<()>; + fn safe(s: &str) -> vp::Slp1String { + vp::Slp1String::from(s).expect("ok") + } + + fn d_entry(d: Dhatu) -> DhatuEntry { + DhatuEntry::new(d, "text".to_string()) + } + + fn entry(p: Pratipadika) -> PratipadikaEntry { + PratipadikaEntry::new(p, vec![]) + } + #[test] fn test_paths() { let paths = Paths { @@ -323,42 +358,52 @@ mod tests { #[test] fn write_and_load() -> TestResult { - let tin = Pada::Tinanta(Tinanta { - dhatu: Dhatu::mula("gam".to_string()), - purusha: Purusha::Prathama, - vacana: Vacana::Eka, - lakara: Lakara::Lat, - pada: PadaPrayoga::Parasmaipada, - }); - let krdanta = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Krdanta { - dhatu: Dhatu::mula("gam".to_string()), - krt: Krt::new(vp::BaseKrt::Satf), - }, - linga: Some(Linga::Pum), - vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::Dvitiya), - is_purvapada: false, - }); - let sup = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: "agni".to_string(), - lingas: vec![Linga::Pum], - }, - linga: Some(Linga::Pum), - vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::Dvitiya), - is_purvapada: false, - }); + let gam = Dhatu::mula(safe("gam"), vp::Gana::Bhvadi); + let tin = PadaEntry::Tinanta(vp::Tinanta::new( + gam.clone(), + vp::Prayoga::Kartari, 
+ vp::Lakara::Lat, + vp::Purusha::Prathama, + vp::Vacana::Eka, + )); + + let gacchan = vp::Krdanta::new( + Dhatu::mula(safe("gam"), vp::Gana::Bhvadi), + vp::BaseKrt::Satf, + ); + let gacchati = PadaEntry::Subanta( + vp::Subanta::new( + gacchan.clone(), + vp::Linga::Pum, + vp::Vibhakti::Saptami, + vp::Vacana::Eka, + ) + .into(), + ); + + let agni = Pratipadika::basic(safe("agni")); + let sup = PadaEntry::Subanta( + vp::Subanta::new( + agni.clone(), + vp::Linga::Pum, + vp::Vibhakti::Dvitiya, + vp::Vacana::Eka, + ) + .into(), + ); // Builder let dir = tempdir()?; let mut builder = Builder::new(dir.path())?; + builder.register_dhatus(&[d_entry(gam)]); + builder.register_pratipadikas(&[entry(gacchan.into()), entry(agni)]); + builder.insert("agnim", &sup)?; builder.insert("gacCati", &tin)?; - builder.insert("gacCati", &krdanta)?; + builder.insert("gacCati", &gacchati)?; builder.finish()?; + /* // Constructor let lex = Kosha::new(dir.path())?; @@ -383,7 +428,7 @@ mod tests { } assert_eq!(get_all_padas(&lex, "agnim")?, vec![sup]); - assert_eq!(get_all_padas(&lex, "gacCati")?, vec![tin, krdanta]); + assert_eq!(get_all_padas(&lex, "gacCati")?, vec![tin, gacchati]); assert_eq!(get_all_padas(&lex, "gacCat")?, vec![]); assert_eq!(get_all_padas(&lex, "123")?, vec![]); @@ -403,6 +448,7 @@ mod tests { ] ); + */ Ok(()) } diff --git a/vidyut-kosha/src/lib.rs b/vidyut-kosha/src/lib.rs index eaf7f35..75533f9 100644 --- a/vidyut-kosha/src/lib.rs +++ b/vidyut-kosha/src/lib.rs @@ -5,7 +5,7 @@ pub use errors::Error; pub use kosha::{Builder, Kosha}; -pub mod morph; +pub mod entries; pub mod packing; mod errors; diff --git a/vidyut-kosha/src/morph.rs b/vidyut-kosha/src/morph.rs deleted file mode 100644 index ee54574..0000000 --- a/vidyut-kosha/src/morph.rs +++ /dev/null @@ -1,835 +0,0 @@ -//! Models the morphology of Sanskrit words, including their bases and endings. -//! -//! For details on how we represent morphological data, see the `Pada` enum and its comments. -//! -//! 
We designed this module with the following design principles in mind: -//! -//! 1. Aim for pragmatism. Our goal is to model Sanskrit words with enough detail to be useful but -//! not with so much detail that we merely replicate the Ashtadhyayi. -//! -//! 2. Prefer traditional terms. The vocabulary and conceptual schema of traditional Sanskrit -//! grammar was designed specifically for Sanskrit and fits Sanskrit like a glove. -//! -//! 3. Prefer morphological names. For example, we refer to the various senses of the `-tum` suffix -//! with the simple label `KrtPratyaya::Tum`. As a counterexample, we explicitly model `Linga`, -//! `Vacana`, `Vibhakti`, etc. because using a single `Sup` enum is more trouble than it's -//! worth. - -use crate::errors::*; -use modular_bitfield::prelude::*; -use std::collections::HashMap; -use std::fmt::{Display, Formatter, Result as FmtResult}; -use std::str::FromStr; -use vidyut_prakriya::args as vp; - -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; - -/// Implements various boilerplate for our enums: -/// -/// - `as_str` -/// - `iter` -/// - `FromStr` -/// - `Display` -macro_rules! enum_boilerplate { - ($Enum:ident, { $( $variant:ident => $str:literal ),* $(,)? }) => { - impl $Enum { - /// Returns a string representation of this enum. - pub fn as_str(&self) -> &'static str { - match self { - $( - $Enum::$variant => $str, - )* - } - } - - /// Iterates over the values of this enum in order. - pub fn iter() -> impl Iterator { - const ITEMS: &[$Enum] = &[ - $( - $Enum::$variant, - )* - ]; - ITEMS.iter() - } - } - - impl FromStr for $Enum { - type Err = Error; - fn from_str(s: &str) -> Result { - let val = match s { - $( - $str => $Enum::$variant, - )* - _ => return Err(Error::ParseEnum(stringify!($Enum), s.to_string())), - }; - Ok(val) - } - } - - impl Display for $Enum { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", self.as_str()) - } - } - } -} - -macro_rules! 
from_vidyut_prakriya { - ($Enum:ident, [ $( $variant:ident ),* $(,)? ]) => { - impl From for $Enum { - fn from(val: vp::$Enum) -> Self { - match val { - $( - vp::$Enum::$variant => $Enum::$variant, - )* - } - } - } - } -} - -/// Lemma for `None` semantics or any other case where the lemma is unknown. -pub const NONE_LEMMA: &str = "[none]"; - -/// Utility struct for reading complex serialized data. -struct FeatureMap(HashMap); - -impl FeatureMap { - fn from_str(s: &str) -> Self { - let map = s - .split('|') - .flat_map(|x| x.split_once('=')) - .map(|(x, y)| (x.to_string(), y.to_string())) - .collect::>(); - - FeatureMap(map) - } - fn get(&self, s: &str) -> Result<&String> { - if let Some(val) = self.0.get(s) { - Ok(val) - } else { - Err(Error::Generic(format!( - "Could not parse `{s}` as a feature map." - ))) - } - } -} - -/// The *liṅga* (gender) of a *subanta*. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] -#[bits = 2] -pub enum Linga { - /// The masculine gender. - Pum, - /// The feminine gender. - Stri, - /// The neuter gender. - Napumsaka, -} - -enum_boilerplate!(Linga, { - Pum => "m", - Stri => "f", - Napumsaka => "n", -}); - -from_vidyut_prakriya!(Linga, [Pum, Stri, Napumsaka]); - -/// The *vacana* (number) of a *subanta* or *tiṅanta*. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] -#[bits = 2] -pub enum Vacana { - /// The singular. - Eka, - /// The dual. - Dvi, - /// The plural. - Bahu, -} - -enum_boilerplate!(Vacana, { - Eka => "s", - Dvi => "d", - Bahu => "p", -}); - -from_vidyut_prakriya!(Vacana, [Eka, Dvi, Bahu]); - -/// The *vibhakti* (case) of a *subanta*. -/// -/// The term *vibhakti* refers generally to any triad of inflectional endings for a *subanta* -/// or *tiṅanta*. Here, `Vibhakti` refers specifically to the *subanta* tridas. 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[bits = 4] -pub enum Vibhakti { - /// The first *vibhakti* (nominative case). - Prathama, - /// The second *vibhakti* (accusative case). - Dvitiya, - /// The third *vibhakti* (instrumental case). - Trtiya, - /// The fourth *vibhakti* (dative case). - Caturthi, - /// The fifth *vibhakti* (ablative case). - Panchami, - /// The sixth *vibhakti* (genitive case). - Sasthi, - /// The seventh *vibhakti* (locative case). - Saptami, - /// The first *vibhakti* in the condition of *sambodhana* (vocative case). - Sambodhana, -} - -enum_boilerplate!(Vibhakti, { - Prathama => "1", - Dvitiya => "2", - Trtiya => "3", - Caturthi => "4", - Panchami => "5", - Sasthi => "6", - Saptami => "7", - Sambodhana => "8", -}); - -from_vidyut_prakriya!( - Vibhakti, - [Prathama, Dvitiya, Trtiya, Caturthi, Panchami, Sasthi, Saptami, Sambodhana] -); - -/// The *puruṣa* (person) of a *tiṅanta*. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] -#[bits = 2] -pub enum Purusha { - /// The first *puruṣa* (third person). - Prathama, - /// The middle *puruṣa* (second person). - Madhyama, - /// The last *puruṣa* (first person). - Uttama, -} - -enum_boilerplate!(Purusha, { - Prathama => "3", - Madhyama => "2", - Uttama => "1", -}); - -from_vidyut_prakriya!(Purusha, [Prathama, Madhyama, Uttama]); - -/// The *lakāra* (tense/mood) of a *tiṅanta*. -/// -/// The *lakāras* are morphological categories, but each typically expresses a specific meaning. -/// For example, *laṭ-lakāra* almost always expresses an action in the present tense. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] -#[bits = 4] -pub enum Lakara { - /// *laṭ-lakāra* (present indicative). 
- Lat, - /// *liṭ-lakāra* (perfect). - Lit, - /// *luṭ-lakāra* (periphrastic future). - Lut, - /// *lṛṭ-lakāra* (simple future). - Lrt, - /// *leṭ-lakāra* (Vedic subjunctive). - Let, - /// *loṭ-lakāra* (imperative). - Lot, - /// *laṅ-lakāra* (imperfect). - Lan, - /// *liṅ-lakāra* in the sense of benediction (benedictive). - AshirLin, - /// *liṅ-lakāra* in the sense of a rule or injunction (optative). - VidhiLin, - /// *luṅ-lakāra* (aorist). - Lun, - /// *luṅ-lakāra* without its *a-* prefix (injunctive). - LunNoAgama, - /// *lṛṅ-lakāra* (conditional). - Lrn, -} - -enum_boilerplate!(Lakara, { - Lat => "lat", - Lit => "lit", - Lut => "lut", - Lrt => "lrt", - Let => "let", - Lot => "lot", - Lan => "lan", - VidhiLin => "vidhi-lin", - AshirLin => "ashir-lin", - Lun => "lun", - LunNoAgama => "lun-no-agama", - Lrn => "lrn", -}); - -from_vidyut_prakriya!( - Lakara, - [Lat, Lit, Lut, Lrt, Let, Lot, Lan, VidhiLin, AshirLin, Lun, Lrn] -); - -/// A *pratyaya* (suffix) that creates a new *dhātu* (verb root) -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[bits = 2] -pub enum DhatuPratyaya { - /// *ṇic-pratyaya* (*i*), which expresses a causal action. - Nic, - /// *san-pratyaya* (*sa*), which expresses a desiderative action. - San, - /// *yaṅ-pratyaya* (*ya*), which expresses an intensive or frequentative action. - Yan, -} - -/// The *pada* and *prayoga* of the *tiṅanta*. Roughly, these correspond respectively to the -/// concepts of "voice" and "thematic relation." -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[bits = 2] -pub enum PadaPrayoga { - /// *parasmaipada*, which is always in *kartari prayoga*. - Parasmaipada, - /// *ātmanepada* in *kartari prayoga*. - AtmanepadaKartari, - /// *ātmanepada* in *bhāve* or *karmaṇi prayoga*. 
- AtmanepadaNotKartari, -} - -enum_boilerplate!(PadaPrayoga, { - Parasmaipada => "para", - AtmanepadaKartari => "atma-kartari", - AtmanepadaNotKartari => "atma-not-kartari", -}); - -/// Models the semantics of a *dhātu* (verb root). -#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct Dhatu { - /// The prefixes that this dhatu uses. - prefixes: Vec, - /// The sanAdi-pratyayas that this dhatu uses. - sanadi: Vec, - /// The base text of the dhatu. - text: String, -} - -impl Dhatu { - /// Creates a new `Dhatu`. - pub fn mula(text: String) -> Self { - Self { - prefixes: Vec::new(), - sanadi: Vec::new(), - text, - } - } - - /// Sets prefixes on the dhatu. - pub fn with_prefixes(mut self, prefixes: Vec) -> Self { - self.prefixes = prefixes; - self - } - - /// Sets sanAdi-pratyayas on the dhatu. - pub fn with_sanadi(mut self, sanadi: Vec) -> Self { - self.sanadi = sanadi; - self - } - - /// Returns the prefixes that this dhatu uses. - pub fn prefixes(&self) -> &[String] { - &self.prefixes - } - - /// Returns the sanAdi-pratyayas that this dhatu uses. - pub fn sanadi(&self) -> &[vp::Sanadi] { - &self.sanadi - } - - /// The text of this dhatu. - pub fn text(&self) -> &String { - &self.text - } - - /// Returns a string representation of this dhatu. - pub fn as_str(&self) -> String { - let prefixes = self.prefixes.join("-"); - let sanadi_strings: Vec<_> = self.sanadi.iter().map(|s| s.to_string()).collect(); - let text = self.text(); - let sanadi = sanadi_strings.join("-"); - format!("{prefixes},{text},{sanadi}") - } -} - -impl From for Dhatu { - fn from(vp: vp::Dhatu) -> Self { - Dhatu { - prefixes: vp.prefixes().to_vec(), - sanadi: vp.sanadi().to_vec(), - text: match vp.aupadeshika() { - Some(s) => s.to_string(), - None => String::new(), - }, - } - } -} - -impl FromStr for Dhatu { - type Err = Error; - - /// Parses the string representation of this dhatu. 
- fn from_str(text: &str) -> Result { - let fields: Vec<_> = text.split(',').collect(); - - let prefixes = fields.get(0).map_or(Vec::new(), |s| { - if s.is_empty() { - Vec::new() - } else { - s.split("-").map(|s| s.to_string()).collect() - } - }); - let text = fields.get(1).map_or(String::new(), |s| s.to_string()); - let sanadi: Vec = fields.get(2).map_or(Vec::new(), |s| { - s.split("-").flat_map(|s| vp::Sanadi::from_str(s)).collect() - }); - - Ok(Dhatu { - prefixes, - sanadi, - text, - }) - } -} - -/// Models the semantics of a *prātipadika*. -/// -/// An *prātipadika* is generally synonymous with a nominal base. -#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum Pratipadika { - /// A basic *prātipadika* that cannot be analyzed further. - Basic { - /// The text of the *prātipadika*. - text: String, - /// The lingas this *prātipadika* uses in most contexts. - lingas: Vec, - }, - /// A *prātipadika* formed by combining a *dhātu* with one or more suffixes. - Krdanta { - /// The dhatu on which this krdanta is based. - dhatu: Dhatu, - /// The pratyaya that created this krdanta. - krt: Krt, - }, -} - -/// A *kṛt-pratyaya* (root or primary suffix). -/// -/// This list is not exhaustive. -#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq, Ord, PartialOrd)] -pub struct Krt(vp::Krt); - -impl Krt { - /// Creates a new `Krt` pratyaya. - pub fn new(k: impl Into) -> Self { - Self(k.into()) - } - - /// Returns the underlying krt-pratyaya. - pub fn value(&self) -> vp::Krt { - self.0 - } -} - -impl Pratipadika { - /// Returns the lemma that the *prātipadika* is based on. - pub fn lemma(&self) -> &str { - match &self { - Pratipadika::Basic { text, .. } => text, - Pratipadika::Krdanta { dhatu, .. } => &dhatu.text(), - } - } - - /// Returns a string representation of this enum. 
- pub fn as_str(&self) -> String { - match self { - Pratipadika::Basic { text, lingas } => { - let lingas = lingas - .iter() - .map(Linga::as_str) - .collect::>() - .join(","); - format!("basic:text={text}|lingas={lingas}") - } - Pratipadika::Krdanta { dhatu, krt } => { - format!("krdanta:dhatu={}|krt={}", dhatu.as_str(), krt.0.as_str()) - } - } - } -} - -impl FromStr for Pratipadika { - type Err = Error; - fn from_str(s: &str) -> Result { - if let Some(s) = s.strip_prefix("basic:") { - let kv = FeatureMap::from_str(s); - let text = kv.get("text")?.clone(); - - let linga_str = kv.get("lingas")?; - let lingas = if linga_str.is_empty() { - Vec::new() - } else { - linga_str - .split(',') - .map(Linga::from_str) - .collect::>>()? - }; - - Ok(Pratipadika::Basic { text, lingas }) - } else if let Some(s) = s.strip_prefix("krdanta:") { - let kv = FeatureMap::from_str(s); - - let dhatu_str = kv.get("dhatu")?.clone(); - let krt = Krt(vp::BaseKrt::from_str(kv.get("krt")?) - // TODO: expect is dangerous here - .expect("ok") - .into()); - - Ok(Pratipadika::Krdanta { - dhatu: dhatu_str.parse()?, - krt, - }) - } else { - Err(Error::ParseEnum("Pratipadika", s.to_string())) - } - } -} - -/// A short part-of-speech tag for some `Pada`. -/// -/// We use this tag when calculating lemma counts. For example, *eva* is a common *avyaya* but -/// not a common *subanta*, and our statistics should reflect that distinction. Coarser -/// distinctions that include linga, vacana, etc. are interesting but less useful given our -/// limited training data. -#[derive(Clone, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[bits = 2] -pub enum POSTag { - /// A token with missing, unknown, or undefined semantics. - Unknown, - /// A nominal. - Subanta, - /// A verb. - Tinanta, - /// An indeclinable. 
- Avyaya, -} - -enum_boilerplate!(POSTag, { - Unknown => "_", - Subanta => "s", - Tinanta => "t", - Avyaya => "a", -}); - -/// Models the semantics of a *subanta* if it is not an *avyaya*. -/// -/// A *subanta* is any word that ends with one of the twenty-one suffixes in the *sup* list: -/// -/// | Singular | Dual | Plural | -/// |-----------|-----------|-----------| -/// | su । au । jas | -/// | am । auṭ । śas | -/// | ṭā । bhyām । bhis | -/// | ṅe । bhyām । bhyas | -/// | ṅasi । bhyām । bhyas | -/// | ṅas । os । ām | -/// | ṅi । os । sup | -/// -/// For *avyaya*s (indeclinables), see `Avyaya`. -#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct Subanta { - /// The nominal's stem. - pub pratipadika: Pratipadika, - /// The nominal's gender. - pub linga: Option, - /// The nominal's number. - pub vacana: Option, - /// The nominal's case. - pub vibhakti: Option, - /// Whether this *subanta* is part of some compound but not the final member of it. - pub is_purvapada: bool, -} - -/// Models the semantics of a *tiṅanta*. -/// -/// A *tiṅanta* (verb) is any word that ends with one of the eighteen suffixes in the *tiṅ* list: -/// -/// | Singular | Dual | Plural | -/// |-------------|-------------|-------------| -/// | *tip* | *tas* | *jhi (nti)* | -/// | *sip* | *tas* | *tha* | -/// | *mip* | *vas* | *mas* | -/// -/// | Singular | Dual | Plural | -/// |-------------|-------------|-------------| -/// | *ta* | *ātām* | *jha (nta)* | -/// | *thās* | *āthām* | *dhvam* | -/// | *iṭ* | *vahi* | *mahiṅ* | -/// -/// A *tiṅanta* expresses person, number, tense/mood, and voice in addition to whatever semantics -/// are conveyed by the *dhātu* and its prefixes. -#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct Tinanta { - /// The verb's root. - pub dhatu: Dhatu, - /// The verb's person. 
- pub purusha: Purusha, - /// The verb's number. - pub vacana: Vacana, - /// The verb's tense/mood. - pub lakara: Lakara, - /// The verb's voice, roughly speaking. - pub pada: PadaPrayoga, -} - -/// Models the semantics of an *avyaya*. -/// -/// An *avyaya*s (indeclinable) is traditionally modeled as a subtype of the *subanta* that has had -/// its *sup* suffix elided. But we model the *avyaya* separately because we felt that doing so -/// would be easier to reason about in downstream code. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] -pub struct Avyaya { - /// The indeclinable's stem. - pub pratipadika: Pratipadika, -} - -/// Models the semantics of a Sanskrit *pada* (word). -/// -/// This enum can be packed into an unsigned integer via the `packing` module. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] -pub enum Pada { - /// Unknown or missing semantics. - Unknown, - /// A *subanta* (nominal, excluding *avyaya*s) - Subanta(Subanta), - /// A *tiṅanta* (verb). - Tinanta(Tinanta), - /// A basic *avyaya* (indeclinable). - Avyaya(Avyaya), -} - -impl Pada { - /// Returns the lemma of the given *pada*. - /// - /// The *lemma* of a word is a canonical form that represents a set of inflectional variants. - /// For example, the word *gacchati*, *gantum*, *gamanam*, and *jagāma* are all inflectional - /// variants of the lemma *gam*. - /// - /// In Vidyut, we use lemma frequencies to score different padaccheda solutions. - /// - /// In Sanskrit, a lemma is either a *dhātu* or a *prātipadika*. - pub fn lemma(&self) -> &str { - match &self { - Pada::Tinanta(t) => &t.dhatu.text(), - Pada::Subanta(s) => s.pratipadika.lemma(), - Pada::Avyaya(a) => a.pratipadika.lemma(), - Pada::Unknown => NONE_LEMMA, - } - } - - /// Returns the part of speech tag for the given `Pada`. 
- pub fn part_of_speech_tag(&self) -> POSTag { - match self { - Pada::Tinanta(_) => POSTag::Tinanta, - Pada::Subanta(_) => POSTag::Subanta, - Pada::Avyaya(_) => POSTag::Avyaya, - Pada::Unknown => POSTag::Unknown, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - type TestResult = Result<()>; - - #[test] - fn test_linga_serde() -> TestResult { - use Linga::*; - for val in [Pum, Stri, Napumsaka] { - assert_eq!(val, val.as_str().parse()?); - } - Ok(()) - } - - #[test] - fn test_vacana_serde() -> TestResult { - use Vacana::*; - for val in [Eka, Dvi, Bahu] { - assert_eq!(val, val.as_str().parse()?); - } - Ok(()) - } - - #[test] - fn test_vibhakti_serde() -> TestResult { - use Vibhakti::*; - for val in [ - Prathama, Dvitiya, Trtiya, Caturthi, Panchami, Sasthi, Saptami, Sambodhana, - ] { - assert_eq!(val, val.as_str().parse()?); - } - Ok(()) - } - - #[test] - fn test_purusha_serde() -> TestResult { - use Purusha::*; - for val in [Prathama, Madhyama, Uttama] { - assert_eq!(val, val.as_str().parse()?); - } - Ok(()) - } - - #[test] - fn test_lakara_serde() -> TestResult { - use Lakara::*; - for val in [ - Lat, Lit, Lut, Lrt, Let, Lot, Lan, VidhiLin, AshirLin, Lun, Lrn, - ] { - assert_eq!(val, val.as_str().parse()?); - } - Ok(()) - } - - #[test] - fn test_pada_prayoga() -> TestResult { - use PadaPrayoga::*; - for val in [Parasmaipada, AtmanepadaKartari, AtmanepadaNotKartari] { - assert_eq!(val, val.as_str().parse()?); - } - Ok(()) - } - - #[test] - fn test_dhatu() -> TestResult { - let bhu = Dhatu::mula("BU".to_string()); - assert_eq!(bhu, bhu.as_str().parse()?); - - let abhibhu = Dhatu::mula("BU".to_string()).with_prefixes(vec!["aBi".to_string()]); - assert_eq!(abhibhu, abhibhu.as_str().parse()?); - - let abhibobhuya = Dhatu::mula("BU".to_string()) - .with_prefixes(vec!["aBi".to_string()]) - .with_sanadi(vec![vp::Sanadi::yaN]); - assert_eq!(abhibobhuya, abhibobhuya.as_str().parse()?); - - let pratyabhibubhushaya = Dhatu::mula("BU".to_string()) - 
.with_prefixes(vec!["prati".to_string(), "aBi".to_string()]) - .with_sanadi(vec![vp::Sanadi::san, vp::Sanadi::Ric]); - assert_eq!(pratyabhibubhushaya, pratyabhibubhushaya.as_str().parse()?); - - Ok(()) - } - - #[test] - fn test_pratipadika_serde_with_basic() -> TestResult { - let p = Pratipadika::Basic { - text: "agni".to_string(), - lingas: vec![Linga::Pum], - }; - assert_eq!(p, p.as_str().parse()?); - Ok(()) - } - - #[test] - fn test_pratipadika_serde_with_krdanta() -> TestResult { - let p = Pratipadika::Krdanta { - dhatu: Dhatu::mula("gam".to_string()), - krt: Krt(vp::BaseKrt::Satf.into()), - }; - assert_eq!(p, p.as_str().parse()?); - Ok(()) - } - - #[test] - fn test_subanta_lemma_with_basic_stem() { - let p = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: "agni".to_string(), - lingas: vec![Linga::Pum], - }, - linga: Some(Linga::Pum), - vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::Dvitiya), - is_purvapada: false, - }); - assert_eq!(p.lemma(), "agni"); - } - - #[test] - fn test_subanta_lemma_with_krdanta_stem() { - let p = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Krdanta { - dhatu: Dhatu::mula("gam".to_string()), - krt: Krt(vp::BaseKrt::Satf.into()), - }, - linga: Some(Linga::Pum), - vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::Dvitiya), - is_purvapada: false, - }); - assert_eq!(p.lemma(), "gam"); - } - - #[test] - fn test_tinanta_lemma() { - let p = Pada::Tinanta(Tinanta { - dhatu: Dhatu::mula("gam".to_string()), - purusha: Purusha::Prathama, - vacana: Vacana::Eka, - lakara: Lakara::Lat, - pada: PadaPrayoga::Parasmaipada, - }); - assert_eq!(p.lemma(), "gam"); - } - - #[test] - fn test_avyaya_lemma_with_basic_stem() { - let p = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Basic { - text: "svar".to_string(), - lingas: vec![], - }, - }); - assert_eq!(p.lemma(), "svar"); - } - - #[test] - fn test_avyaya_lemma_with_krdanta_stem() { - let p = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Krdanta { - dhatu: 
Dhatu::mula("gam".to_string()), - krt: Krt(vp::BaseKrt::tumun.into()), - }, - }); - assert_eq!(p.lemma(), "gam"); - } - - #[test] - fn test_none_lemma() { - let p = Pada::Unknown; - assert_eq!(p.lemma(), NONE_LEMMA); - } -} diff --git a/vidyut-kosha/src/packing.rs b/vidyut-kosha/src/packing.rs index a30d820..e0d3f9e 100644 --- a/vidyut-kosha/src/packing.rs +++ b/vidyut-kosha/src/packing.rs @@ -4,18 +4,16 @@ Code for packing and unpacking Sanskrit morphological data. **Packing* is the process of converting some data into a dense integer representation. The reverse process is accordingly called *unpacking*, -Packed data is useful for two reasons. First, packed data takes up less space in memory with little -or no performance penalty. Second, our finite-state transducer can store values only if they are -integers. In other words, packing is a necessary precondition to storing data in an FST. +Packed data is useful for two reasons; + +1. Packed data takes up less space in memory with little or no performance penalty. +2. Our finite-state transducer can store values only if they are integers. In other words, packing + is a necessary precondition to storing data in an FST. The downside of packed data is that it cannot easily store string data. To work around this problem, we can use a lookup table that maps integer indices to string values. But lookup tables are much more cumbersome than simple structs. -Therefore, we recommend using packed data only when the following conditions obtain: -- - - Approach ======== @@ -64,117 +62,56 @@ we need to convert between representations. TODO: investigate different packing orders to see if we can reduce the size of the FST. 
*/ +use crate::entries::*; use crate::errors::*; -use crate::morph::*; use modular_bitfield::prelude::*; use rustc_hash::FxHashMap; +use vidyut_prakriya::args::{ + Dhatu, Lakara, Linga, Pratipadika, Prayoga, Purusha, Subanta, Tinanta, Vacana, Vibhakti, +}; use std::fs::File; -use std::io::{BufRead, BufReader}; +use std::io::{BufReader, BufWriter}; use std::path::Path; -/// Defines boilerplate methods for packing and unpacking enums. -/// -/// Requirements: `$Packed` and `$Raw` should be ordinary C-style enums. `$Packed` must have the -/// same exact values as `$Raw` plus an extra `None` value. -macro_rules! boilerplate { - ($Packed:ident, $Raw:ident, [$( $variant:ident ),*]) => { - impl From> for $Packed { - fn from(val: Option<$Raw>) -> $Packed { - match val { +macro_rules! packed_enum { + ($Packed:ident, $Raw:ident, [$( $variant:ident ),*], $num_bits:literal ) => { + #[doc = concat!("A space-efficient version of ", stringify!($Raw))] + #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, BitfieldSpecifier)] + #[bits = $num_bits] + pub enum $Packed { + $( + #[doc = concat!("A packed version of ", stringify!($Raw), "::", stringify!($variant), ".")] + $variant, + )* + } + + impl $Packed { + #[doc = concat!("Converts from ", stringify!($Raw), " to ", stringify!($Packed), ".")] + pub fn pack(value: $Raw) -> $Packed { + match value { $( - Some($Raw::$variant) => Self::$variant, + $Raw::$variant => $Packed::$variant, )* - None => Self::None, } } - } - impl $Packed { - /// Unpack this data into its corresponding Raw value. - fn unpack(&self) -> Option<$Raw> { + #[doc = concat!("Converts from ", stringify!($Packed), " to ", stringify!($Raw), ".")] + pub fn unpack(&self) -> $Raw { match self { $( - $Packed::$variant => Some($Raw::$variant), + $Packed::$variant => $Raw::$variant, )* - Self::None => None, } } } - } -} - -/// A lookup table for `Dhatu`s. 
-#[derive(Default, Debug)] -pub struct DhatuTable(Vec); - -impl DhatuTable { - /// Returns the dhatu at the given index. - pub fn get(&self, index: usize) -> Option<&Dhatu> { - self.0.get(index) - } - /// Reads this table from disk. - pub fn read(path: &Path) -> Result { - let f = File::open(path)?; - let reader = BufReader::new(f); - - let mut ret = Vec::new(); - for line in reader.lines() { - match line?.parse() { - Ok(s) => ret.push(s), - _ => {} + impl From<$Raw> for $Packed { + fn from(val: $Raw) -> $Packed { + $Packed::pack(val) } } - Ok(Self(ret)) - } - - /// Writes this table to disk. - pub fn write(&self, path: &Path) -> Result<()> { - let data: String = self - .0 - .iter() - .map(|d| d.as_str()) - .fold(String::new(), |x, y| x + &y + "\n"); - std::fs::write(path, data)?; - - Ok(()) - } -} - -/// A lookup table for `Pratipadika` data. -#[derive(Debug, Default)] -pub struct PratipadikaTable(Vec); - -impl PratipadikaTable { - /// Returns the pratipadika at the given index. - pub fn get(&self, index: usize) -> Option<&Pratipadika> { - self.0.get(index) - } - - /// Reads this table from disk. - pub fn read(path: &Path) -> Result { - let f = File::open(path)?; - let reader = BufReader::new(f); - - let mut ret = Vec::new(); - for line in reader.lines() { - ret.push(line?.to_string().parse()?); - } - Ok(Self(ret)) - } - - /// Writes this table to disk. - pub fn write(&self, out: &Path) -> Result<()> { - let data: String = self - .0 - .iter() - .map(Pratipadika::as_str) - .fold(String::new(), |x, y| x + &y + "\n"); - - std::fs::write(out, data)?; - Ok(()) - } + }; } /// Models the part of speech for the given `Pada`. The value of `PartOfSpeech` controls how we @@ -182,7 +119,7 @@ impl PratipadikaTable { #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, BitfieldSpecifier)] #[bits = 2] enum PartOfSpeech { - None, + Unknown, Subanta, Tinanta, Avyaya, @@ -190,84 +127,39 @@ enum PartOfSpeech { /// Semantics for an unknown term. 
#[bitfield(bits = 30)] -pub struct PackedNone { +pub struct PackedUnknown { #[skip] unused: B30, } -impl PackedNone { +impl PackedUnknown { #[allow(unused)] fn pack() -> Self { Self::new() } #[allow(unused)] - fn unpack(&self) -> Pada { - Pada::Unknown + fn unpack(&self) -> PadaEntry { + PadaEntry::Unknown } } -/// A space-efficient version of `Linga`. -#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, BitfieldSpecifier)] -#[bits = 2] -pub enum PackedLinga { - /// Unknown or missing `Linga`. - None, - /// The masculine gender. - Pum, - /// The feminine gender. - Stri, - /// The neuter gender. - Napumsaka, -} - -boilerplate!(PackedLinga, Linga, [Pum, Stri, Napumsaka]); - -/// A space-efficient version of `Vacana`. -#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, BitfieldSpecifier)] -#[bits = 2] -pub enum PackedVacana { - /// Unknown or missing vacana. - None, - /// The singular. - Eka, - /// The dual. - Dvi, - /// The plural. - Bahu, -} - -boilerplate!(PackedVacana, Vacana, [Eka, Dvi, Bahu]); - -/// A space-efficient version of `Vibhakti`. -#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, BitfieldSpecifier)] -#[bits = 4] -pub enum PackedVibhakti { - /// Unknown or missing vibhakti. - None, - /// The first *vibhakti* (nominative case). - Prathama, - /// The second *vibhakti* (accusative case). - Dvitiya, - /// The third *vibhakti* (instrumental case). - Trtiya, - /// The fourth *vibhakti* (dative case). - Caturthi, - /// The fifth *vibhakti* (ablative case). - Panchami, - /// The sixth *vibhakti* (genitive case). - Sasthi, - /// The seventh *vibhakti* (locative case). - Saptami, - /// The first *vibhakti* in the condition of *sambodhana* (vocative case). 
- Sambodhana, -} - -boilerplate!( +packed_enum!(PackedPrayoga, Prayoga, [Kartari, Karmani, Bhave], 2); +packed_enum!(PackedLinga, Linga, [Pum, Stri, Napumsaka], 2); +packed_enum!(PackedVacana, Vacana, [Eka, Dvi, Bahu], 2); +packed_enum!( PackedVibhakti, Vibhakti, - [Prathama, Dvitiya, Trtiya, Caturthi, Panchami, Sasthi, Saptami, Sambodhana] + [Prathama, Dvitiya, Trtiya, Caturthi, Panchami, Sasthi, Saptami, Sambodhana], + 4 ); +packed_enum!( + PackedLakara, + Lakara, + [Lat, Lit, Lut, Lrt, Let, Lot, Lan, VidhiLin, AshirLin, Lun, Lrn], + 4 +); +packed_enum!(PackedPurusha, Purusha, [Prathama, Madhyama, Uttama], 2); /// Semantics for a *subanta*. #[bitfield(bits = 30)] @@ -275,31 +167,33 @@ pub struct PackedSubanta { linga: PackedLinga, vacana: PackedVacana, vibhakti: PackedVibhakti, - is_purvapada: bool, + is_avyaya: bool, pratipadika_id: B21, } impl PackedSubanta { - fn pack(s: &Subanta, pratipadika_id: usize) -> Result { + fn pack(s: &SubantaEntry, pratipadika_id: usize) -> Result { + let s = s.subanta(); Ok(Self::new() .with_pratipadika_id(pratipadika_id.try_into()?) - .with_linga(s.linga.into()) - .with_vacana(s.vacana.into()) - .with_vibhakti(s.vibhakti.into()) - .with_is_purvapada(s.is_purvapada)) - } - - fn unpack(&self, pratipadikas: &PratipadikaTable) -> Result { - let val = Pada::Subanta(Subanta { - pratipadika: pratipadikas - .get(self.pratipadika_id() as usize) - .ok_or_else(|| Error::UnknownPratipadikaId(self.pratipadika_id()))? 
- .clone(), - linga: self.linga().unpack(), - vacana: self.vacana().unpack(), - vibhakti: self.vibhakti().unpack(), - is_purvapada: self.is_purvapada(), - }); + .with_linga(s.linga().into()) + .with_vacana(s.vacana().into()) + .with_vibhakti(s.vibhakti().into()) + .with_is_avyaya(s.is_avyaya())) + } + + fn unpack(&self, pratipadikas: &[PratipadikaEntry]) -> Result { + let p_entry = pratipadikas + .get(self.pratipadika_id() as usize) + .ok_or_else(|| Error::UnknownPratipadikaId(self.pratipadika_id()))?; + let subanta = Subanta::builder() + .pratipadika(p_entry.pratipadika().clone()) + .linga(self.linga().unpack()) + .vacana(self.vacana().unpack()) + .vibhakti(self.vibhakti().unpack()) + .build() + .expect("has required fields"); + let val = PadaEntry::Subanta(SubantaEntry::new(subanta, p_entry.clone())); Ok(val) } } @@ -307,10 +201,10 @@ impl PackedSubanta { /// Semantics for a *tinanta*. #[bitfield(bits = 30)] pub struct PackedTinanta { - lakara: Lakara, - purusha: Purusha, - vacana: Vacana, - pada: PadaPrayoga, + lakara: PackedLakara, + purusha: PackedPurusha, + vacana: PackedVacana, + prayoga: PackedPrayoga, dhatu_id: B20, } @@ -318,23 +212,28 @@ impl PackedTinanta { fn pack(t: &Tinanta, dhatu_id: usize) -> Result { Ok(Self::new() .with_dhatu_id(dhatu_id.try_into()?) - .with_lakara(t.lakara) - .with_purusha(t.purusha) - .with_vacana(t.vacana) - .with_pada(t.pada)) - } - - fn unpack(&self, dhatus: &DhatuTable) -> Result { - let val = Pada::Tinanta(Tinanta { - dhatu: dhatus - .get(self.dhatu_id() as usize) - .ok_or_else(|| Error::UnknownDhatuId(self.dhatu_id()))? 
- .clone(), - purusha: self.purusha(), - lakara: self.lakara(), - vacana: self.vacana(), - pada: self.pada(), - }); + .with_lakara(t.lakara().into()) + .with_purusha(t.purusha().into()) + .with_vacana(t.vacana().into()) + .with_prayoga(t.prayoga().into())) + } + + fn unpack(&self, dhatus: &[DhatuEntry]) -> Result { + let dhatu = dhatus + .get(self.dhatu_id() as usize) + .ok_or_else(|| Error::UnknownDhatuId(self.dhatu_id()))? + .dhatu() + .clone(); + let val = PadaEntry::Tinanta( + Tinanta::builder() + .dhatu(dhatu) + .purusha(self.purusha().unpack()) + .lakara(self.lakara().unpack()) + .vacana(self.vacana().unpack()) + .prayoga(self.prayoga().unpack()) + .build() + .expect("has required fields"), + ); Ok(val) } } @@ -346,24 +245,26 @@ pub struct PackedAvyaya { } impl PackedAvyaya { - fn pack(_a: &Avyaya, pratipadika_id: usize) -> Result { + fn pack(pratipadika_id: usize) -> Result { Ok(Self::new().with_pratipadika_id(pratipadika_id.try_into()?)) } - fn unpack(&self, pratipadikas: &PratipadikaTable) -> Result { - let val = Pada::Avyaya(Avyaya { - pratipadika: pratipadikas - .get(self.pratipadika_id() as usize) - .ok_or_else(|| Error::UnknownPratipadikaId(self.pratipadika_id()))? - .clone(), - }); + fn unpack(&self, pratipadikas: &[PratipadikaEntry]) -> Result { + let p_entry = pratipadikas + .get(self.pratipadika_id() as usize) + .ok_or_else(|| Error::UnknownPratipadikaId(self.pratipadika_id()))?; + + let pratipadika = p_entry.pratipadika().clone(); + let subanta = Subanta::avyaya(pratipadika); + let val = PadaEntry::Avyaya(SubantaEntry::new(subanta, p_entry.clone())); Ok(val) } } /// Semantics for a *pada*. #[bitfield] -pub struct PackedPada { +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +pub struct PackedEntry { /// The part of speech for these semantics. We use this value to decide how to interpret the /// `payload` field. 
pos: PartOfSpeech, @@ -372,7 +273,7 @@ pub struct PackedPada { payload: B30, } -impl PackedPada { +impl PackedEntry { /// Unsafely interprets this packed pada as an avyaya. pub fn unwrap_as_avyaya(&self) -> PackedAvyaya { PackedAvyaya::from_bytes(self.payload().to_le_bytes()) @@ -388,7 +289,7 @@ impl PackedPada { PackedTinanta::from_bytes(self.payload().to_le_bytes()) } - /// Unwraps the bitfield as an ordinary integer. + /// Unwraps the bitfield as an ordinary `u32`. pub fn to_u32(self) -> u32 { u32::from_le_bytes(self.into_bytes()) } @@ -399,143 +300,151 @@ impl PackedPada { } } -/// Packs a `Pada` enum into a u32 code. +/// Packs and unpacks linguistic data. #[derive(Clone, Default)] pub struct Packer { - /// Maps a pratipadika to its numeric ID. - stem_mapper: FxHashMap, - /// Maps a dhatu to its numeric ID. - dhatu_mapper: FxHashMap, + dhatus: Vec, + pratipadikas: Vec, + pratipadika_to_index: FxHashMap, + dhatu_to_index: FxHashMap, } impl Packer { - /// Creates a new packer with no data. + /// Creates a new packer. pub fn new() -> Self { - Packer { - stem_mapper: FxHashMap::default(), - dhatu_mapper: FxHashMap::default(), - } + Self::default() } - /// Creates a mapping from integers to dhatus. - /// - /// Here, our integers are just the values 0, 1, ..., *n*. So to create a mapping from integers - /// to dhatus, we can return a simple vector. Then the dhatu at index i implicitly defines a - /// mpping from i to that dhatu. - pub fn create_dhatu_table(&self) -> DhatuTable { - let mut unsorted = self.dhatu_mapper.iter().collect::>(); - unsorted.sort_by_key(|x| x.1); - DhatuTable( - unsorted - .into_iter() - .map(|(dhatu, _)| dhatu.clone()) - .collect::>(), - ) + /// Loads a packer from disk. + pub fn read(dhatu_path: impl AsRef, pratipadika_path: impl AsRef) -> Result { + Self::read_inner(dhatu_path.as_ref(), pratipadika_path.as_ref()) } - /// Creates a mapping from integers to pratipadikas. 
- /// - /// The construction here is similar to what do we do in `create_dhatu_table`. - pub fn create_stem_table(&self) -> PratipadikaTable { - let mut unsorted = self.stem_mapper.iter().collect::>(); - unsorted.sort_by_key(|x| x.1); - PratipadikaTable( - unsorted - .into_iter() - .map(|(stem, _)| stem.clone()) - .collect::>(), - ) - } + fn read_inner(dhatu_path: &Path, pratipadika_path: &Path) -> Result { + let file = File::open(dhatu_path)?; + let reader = BufReader::new(file); + let dhatus: Vec = serde_json::from_reader(reader)?; - /// Packs the given semantics into an integer value. - pub fn pack(&mut self, semantics: &Pada) -> Result { - let to_u32 = u32::from_le_bytes; + let file = File::open(pratipadika_path)?; + let reader = BufReader::new(file); + let pratipadikas: Vec = serde_json::from_reader(reader)?; - let val = match semantics { - Pada::Subanta(s) => { - let stem_index = self.stem_index_for(&s.pratipadika); - let payload = PackedSubanta::pack(s, stem_index)?.into_bytes(); - PackedPada::new() - .with_pos(PartOfSpeech::Subanta) - .with_payload(to_u32(payload)) - } - Pada::Tinanta(t) => { - let dhatu_index = self.dhatu_index_for(&t.dhatu); - let payload = PackedTinanta::pack(t, dhatu_index)?.into_bytes(); - PackedPada::new() - .with_pos(PartOfSpeech::Tinanta) - .with_payload(to_u32(payload)) - } - Pada::Avyaya(a) => { - let stem_index = self.stem_index_for(&a.pratipadika); - let payload = PackedAvyaya::pack(a, stem_index)?.into_bytes(); - PackedPada::new() - .with_pos(PartOfSpeech::Avyaya) - .with_payload(to_u32(payload)) - } - Pada::Unknown => PackedPada::new().with_pos(PartOfSpeech::None), - }; - Ok(val) + let pratipadika_to_index: FxHashMap<_, _> = pratipadikas + .iter() + .enumerate() + .map(|(i, x)| (x.pratipadika().clone(), i)) + .collect(); + let dhatu_to_index: FxHashMap<_, _> = dhatus + .iter() + .enumerate() + .map(|(i, x)| (x.dhatu().clone(), i)) + .collect(); + + Ok(Self { + dhatus, + pratipadikas, + pratipadika_to_index, + dhatu_to_index, + 
}) } - fn stem_index_for(&mut self, p: &Pratipadika) -> usize { - if let Some(i) = self.stem_mapper.get(p) { - *i - } else { - let n = self.stem_mapper.len(); - self.stem_mapper.insert(p.clone(), n); - n - } + /// Writes data in the registry to disk. + pub fn write( + &self, + dhatu_path: impl AsRef, + pratipadika_path: impl AsRef, + ) -> Result<()> { + self.write_inner(dhatu_path.as_ref(), pratipadika_path.as_ref()) } - fn dhatu_index_for(&mut self, d: &Dhatu) -> usize { - if let Some(i) = self.dhatu_mapper.get(d) { - *i - } else { - let n = self.dhatu_mapper.len(); - self.dhatu_mapper.insert(d.clone(), n); - n - } + fn write_inner(&self, dhatu_path: &Path, pratipadika_path: &Path) -> Result<()> { + let file = File::create(dhatu_path)?; + let writer = BufWriter::new(file); + serde_json::to_writer(writer, &self.dhatus)?; + + let file = File::create(pratipadika_path)?; + let writer = BufWriter::new(file); + serde_json::to_writer(writer, &self.pratipadikas)?; + + Ok(()) } -} -/// Unpacks a u32 code into a `Pada` enum. -pub struct Unpacker { - pratipadikas: PratipadikaTable, - dhatus: DhatuTable, -} + /// Registers the given dhatus on the packer. Duplicate dhatus are ignored. + pub fn register_dhatus(&mut self, entries: &[DhatuEntry]) { + let n = self.dhatus.len(); + for e in entries { + if !self.dhatu_to_index.contains_key(&e.dhatu()) { + self.dhatus.push(e.clone()); + } + } -impl Unpacker { - /// Creates an unpacker from the given packer. - pub fn from_packer(p: &Packer) -> Self { - Unpacker { - pratipadikas: p.create_stem_table(), - dhatus: p.create_dhatu_table(), + for (i, d) in self.dhatus[n..].iter().enumerate() { + self.dhatu_to_index.insert(d.dhatu().clone(), n + i); } } - /// Creates an unpacker from the given data. - pub fn from_data(pratipadikas: PratipadikaTable, dhatus: DhatuTable) -> Self { - Unpacker { - pratipadikas, - dhatus, + /// Registers the given pratipadikas on the packer. Duplicate pratipadikas are ignored. 
+ pub fn register_pratipadikas(&mut self, entries: &[PratipadikaEntry]) { + let n = self.pratipadikas.len(); + for e in entries { + if !self.pratipadika_to_index.contains_key(&e.pratipadika()) { + self.pratipadikas.push(e.clone()); + } + } + + for (i, p) in self.pratipadikas[n..].iter().enumerate() { + self.pratipadika_to_index + .insert(p.pratipadika().clone(), n + i); } } - /// Writes this unpacker's data files to disk. - pub fn write(&self, dhatu_path: &Path, pratipadika_path: &Path) -> Result<()> { - self.dhatus.write(dhatu_path)?; - self.pratipadikas.write(pratipadika_path)?; - Ok(()) + /// Packs the given semantics into an integer value. + pub fn pack(&self, pada: &PadaEntry) -> Result { + let to_u32 = u32::from_le_bytes; + + let val = match pada { + PadaEntry::Avyaya(a) => { + match self.pratipadika_to_index.get(a.subanta().pratipadika()) { + Some(i) => { + let payload = PackedAvyaya::pack(*i)?.into_bytes(); + PackedEntry::new() + .with_pos(PartOfSpeech::Avyaya) + .with_payload(to_u32(payload)) + } + None => return Err(Error::Generic("Pratipadika not in index.".to_string())), + } + } + PadaEntry::Subanta(s) => match self.pratipadika_to_index.get(s.subanta().pratipadika()) + { + Some(i) => { + let payload = PackedSubanta::pack(s, *i)?.into_bytes(); + PackedEntry::new() + .with_pos(PartOfSpeech::Subanta) + .with_payload(to_u32(payload)) + } + None => return Err(Error::Generic("Pratipadika not in index.".to_string())), + }, + PadaEntry::Tinanta(t) => match self.dhatu_to_index.get(t.dhatu()) { + Some(i) => { + let payload = PackedTinanta::pack(t, *i)?.into_bytes(); + PackedEntry::new() + .with_pos(PartOfSpeech::Tinanta) + .with_payload(to_u32(payload)) + } + None => return Err(Error::Generic("Dhatu not in index.".to_string())), + }, + PadaEntry::Unknown => PackedEntry::new().with_pos(PartOfSpeech::Unknown), + }; + Ok(val) } /// Unpacks the given packed pada. 
- pub fn unpack(&self, pada: &PackedPada) -> Result { + pub fn unpack(&self, pada: &PackedEntry) -> Result { match pada.pos() { PartOfSpeech::Avyaya => pada.unwrap_as_avyaya().unpack(&self.pratipadikas), PartOfSpeech::Subanta => pada.unwrap_as_subanta().unpack(&self.pratipadikas), PartOfSpeech::Tinanta => pada.unwrap_as_tinanta().unpack(&self.dhatus), - PartOfSpeech::None => Ok(Pada::Unknown), + PartOfSpeech::Unknown => Ok(PadaEntry::Unknown), } } } @@ -543,91 +452,95 @@ impl Unpacker { #[cfg(test)] mod tests { use super::*; + use vidyut_prakriya::args as vp; type TestResult = Result<()>; + fn safe(s: &str) -> vp::Slp1String { + vp::Slp1String::from(s).expect("static") + } + + fn entry(p: Pratipadika) -> PratipadikaEntry { + PratipadikaEntry::new(p, vec![]) + } + #[test] fn test_subanta_packing() -> TestResult { - let devasya = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: "deva".to_owned(), - lingas: vec![Linga::Pum], - }, - linga: Some(Linga::Pum), - vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::Sasthi), - is_purvapada: false, - }); - let narasya = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: "nara".to_owned(), - lingas: vec![Linga::Pum], - }, - linga: Some(Linga::Pum), - vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::Sasthi), - is_purvapada: false, - }); + let deva = Pratipadika::basic(safe("deva")); + let nara = Pratipadika::basic(safe("nara")); + let devasya = PadaEntry::Subanta( + Subanta::new(deva.clone(), Linga::Pum, Vibhakti::Sasthi, Vacana::Eka).into(), + ); + let narasya = PadaEntry::Subanta( + Subanta::new(nara.clone(), Linga::Pum, Vibhakti::Sasthi, Vacana::Eka).into(), + ); let mut packer = Packer::new(); + packer.register_pratipadikas(&[entry(deva), entry(nara)]); + let devasya_code = packer.pack(&devasya)?; let narasya_code = packer.pack(&narasya)?; - - let unpacker = Unpacker::from_packer(&packer); - assert_eq!(unpacker.unpack(&narasya_code)?, narasya); - 
assert_eq!(unpacker.unpack(&devasya_code)?, devasya); + assert_eq!(packer.unpack(&narasya_code)?, narasya); + assert_eq!(packer.unpack(&devasya_code)?, devasya); Ok(()) } #[test] fn test_tinanta_packing() -> TestResult { - let gacchati = Pada::Tinanta(Tinanta { - dhatu: Dhatu::mula("gam".to_string()), - purusha: Purusha::Prathama, - vacana: Vacana::Eka, - lakara: Lakara::Lat, - pada: PadaPrayoga::Parasmaipada, - }); - - let carati = Pada::Tinanta(Tinanta { - dhatu: Dhatu::mula("car".to_string()), - purusha: Purusha::Prathama, - vacana: Vacana::Eka, - lakara: Lakara::Lat, - pada: PadaPrayoga::Parasmaipada, - }); + let gacchati = PadaEntry::Tinanta( + Tinanta::builder() + .dhatu(Dhatu::mula(safe("ga\\mx~"), vp::Gana::Bhvadi)) + .purusha(Purusha::Prathama) + .vacana(Vacana::Eka) + .lakara(Lakara::Lat) + .prayoga(Prayoga::Kartari) + .build() + .unwrap(), + ); + + let carati = PadaEntry::Tinanta( + Tinanta::builder() + .dhatu(Dhatu::mula(safe("cara~"), vp::Gana::Bhvadi)) + .purusha(Purusha::Prathama) + .vacana(Vacana::Eka) + .lakara(Lakara::Lat) + .prayoga(Prayoga::Kartari) + .build() + .unwrap(), + ); let mut packer = Packer::new(); + packer.register_dhatus(&[ + DhatuEntry::new( + Dhatu::mula(safe("ga\\mx~"), vp::Gana::Bhvadi), + "gam".to_string(), + ), + DhatuEntry::new( + Dhatu::mula(safe("cara~"), vp::Gana::Bhvadi), + "car".to_string(), + ), + ]); let gacchati_code = packer.pack(&gacchati)?; let carati_code = packer.pack(&carati)?; - let unpacker = Unpacker::from_packer(&packer); - assert_eq!(unpacker.unpack(&carati_code)?, carati); - assert_eq!(unpacker.unpack(&gacchati_code)?, gacchati); + assert_eq!(packer.unpack(&carati_code)?, carati); + assert_eq!(packer.unpack(&gacchati_code)?, gacchati); Ok(()) } #[test] fn test_avyaya_packing() -> TestResult { - let iti = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Basic { - text: "iti".to_owned(), - lingas: vec![], - }, - }); - let ca = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Basic { - text: 
"ca".to_owned(), - lingas: vec![], - }, - }); + let iti_stem = Pratipadika::basic(safe("iti")); + let ca_stem = Pratipadika::basic(safe("ca")); + let iti = PadaEntry::Avyaya(Subanta::avyaya(iti_stem.clone()).into()); + let ca = PadaEntry::Avyaya(Subanta::avyaya(ca_stem.clone()).into()); let mut packer = Packer::new(); + packer.register_pratipadikas(&[entry(iti_stem), entry(ca_stem)]); let iti_code = packer.pack(&iti)?; let ca_code = packer.pack(&ca)?; - let unpacker = Unpacker::from_packer(&packer); - assert_eq!(unpacker.unpack(&ca_code)?, ca); - assert_eq!(unpacker.unpack(&iti_code)?, iti); + assert_eq!(packer.unpack(&ca_code)?, ca); + assert_eq!(packer.unpack(&iti_code)?, iti); Ok(()) } } diff --git a/vidyut-kosha/tests/error_messages.rs b/vidyut-kosha/tests/error_messages.rs index 3522a57..b93a819 100644 --- a/vidyut-kosha/tests/error_messages.rs +++ b/vidyut-kosha/tests/error_messages.rs @@ -1,6 +1,6 @@ use std::fs::File; use tempfile::{tempdir, NamedTempFile}; -use vidyut_kosha::morph::Pada; +use vidyut_kosha::entries::PadaEntry; use vidyut_kosha::{Builder, Error, Kosha}; fn assert_is_fst_error(ret: Result) { @@ -37,10 +37,10 @@ fn build_with_out_of_order_keys() { let mut builder = Builder::new(&path).unwrap(); - let ret = builder.insert("b", &Pada::Unknown); + let ret = builder.insert("b", &PadaEntry::Unknown); assert!(ret.is_ok()); - let ret = builder.insert("a", &Pada::Unknown); + let ret = builder.insert("a", &PadaEntry::Unknown); assert_is_fst_error(ret); } @@ -52,10 +52,10 @@ fn build_with_too_many_duplicates() { let mut builder = Builder::new(&path).unwrap(); for _ in 0..=4225 { - let ret = builder.insert("a", &Pada::Unknown); + let ret = builder.insert("a", &PadaEntry::Unknown); assert!(ret.is_ok()); } - let ret = builder.insert("a", &Pada::Unknown); + let ret = builder.insert("a", &PadaEntry::Unknown); assert!(ret.is_err()); } diff --git a/vidyut-prakriya/Cargo.toml b/vidyut-prakriya/Cargo.toml index 794e402..769bc4d 100644 --- 
a/vidyut-prakriya/Cargo.toml +++ b/vidyut-prakriya/Cargo.toml @@ -25,15 +25,15 @@ serde-wasm-bindgen = "0.4" console_error_panic_hook = "0.1.7" rustc-hash = "2.0.0" -[features] -default = ["serde"] -serde = [] - [dev-dependencies] criterion = "0.5.1" test_utils = { path = "test_utils" } vidyut-lipi = { path = "../vidyut-lipi" } +[features] +default = ["serde"] +serde = [] + [[bench]] name = "microbenchmarks" harness = false diff --git a/vidyut-prakriya/examples/create_pada.rs b/vidyut-prakriya/examples/create_pada.rs new file mode 100644 index 0000000..5930b97 --- /dev/null +++ b/vidyut-prakriya/examples/create_pada.rs @@ -0,0 +1,91 @@ +//! A simple debugger that creates a form from the input arguments. + +use clap::Parser; +use std::error::Error; +use vidyut_prakriya::args::*; +use vidyut_prakriya::{Dhatupatha, Prakriya, Vyakarana}; + +#[derive(Parser)] +#[command(author, version, about)] +struct Args { + #[arg(long)] + kind: String, + + #[arg(long)] + prayoga: Option, + #[arg(long)] + lakara: Option, + #[arg(long)] + purusha: Option, + #[arg(long)] + vacana: Option, + + #[arg(long)] + pratipadika: Option, + #[arg(long)] + vibhakti: Option, + #[arg(long)] + linga: Option, +} + +/// Prints the `prakriyas` provided. 
+fn print_prakriyas(prakriyas: &[Prakriya]) { + for p in prakriyas { + println!("{}", p.text()); + println!("---------------------------"); + for step in p.history() { + let code = step.rule().code(); + let terms: Vec<_> = step + .result() + .iter() + .map(|x| x.text()) + .filter(|x| !x.is_empty()) + .collect(); + let result = terms.join(" + "); + println!("{:<10} | {}", code, result); + } + println!("---------------------------"); + println!("\n"); + } +} + +fn run(_dhatupatha: Dhatupatha, args: Args) -> Result<(), Box> { + let v = Vyakarana::new(); + + if args.kind == "subanta" { + (|| { + let sup = Subanta::new( + Pratipadika::basic(Slp1String::from(args.pratipadika?).expect("ok")), + args.linga?, + args.vibhakti?, + args.vacana?, + ); + let prakriyas = v.derive_subantas(&sup); + print_prakriyas(&prakriyas); + + Some(()) + })(); + } + + Ok(()) +} + +fn main() { + let args = Args::parse(); + + let dhatupatha = match Dhatupatha::from_path("data/dhatupatha.tsv") { + Ok(res) => res, + Err(err) => { + println!("{}", err); + std::process::exit(1); + } + }; + + match run(dhatupatha, args) { + Ok(()) => (), + Err(err) => { + eprintln!("{}", err); + std::process::exit(1); + } + } +} diff --git a/vidyut-prakriya/src/angasya/subanta.rs b/vidyut-prakriya/src/angasya/subanta.rs index 0fdaeb2..55538d9 100644 --- a/vidyut-prakriya/src/angasya/subanta.rs +++ b/vidyut-prakriya/src/angasya/subanta.rs @@ -275,7 +275,7 @@ fn try_sup_adesha(p: &mut Prakriya, i_anga: usize, i_sup: usize) -> Option<()> { fn try_add_num_agama_to_anga(p: &mut Prakriya, i_anga: usize) -> Option<()> { let anga = p.get(i_anga)?; - let sup = p.get(i_anga + 1)?; + let sup = p.get_if(i_anga + 1, |t| !t.is_lupta())?; let napum = p.has_tag(PT::Napumsaka); let is_ugit = anga.has_tag_in(&[T::udit, T::fdit]); diff --git a/vidyut-prakriya/src/args/krt.rs b/vidyut-prakriya/src/args/krt.rs index 66a8be3..45a88f7 100644 --- a/vidyut-prakriya/src/args/krt.rs +++ b/vidyut-prakriya/src/args/krt.rs @@ -581,8 +581,8 @@ 
impl Krdanta { } /// The upapada that conditions the krt pratyaya. - pub fn upapada(&self) -> &Option { - &self.upapada + pub fn upapada(&self) -> Option<&Subanta> { + self.upapada.as_ref() } /// The artha condition to use in the derivation. If not set, any artha is allowed. @@ -591,8 +591,8 @@ impl Krdanta { } /// The value that the krdanta must match, if defined. - pub fn require(&self) -> &Option { - &self.require + pub fn require(&self) -> Option<&String> { + self.require.as_ref() } /// Sets the required value for this krdanta. diff --git a/vidyut-prakriya/src/args/pada.rs b/vidyut-prakriya/src/args/pada.rs index d2b6fee..3cf7653 100644 --- a/vidyut-prakriya/src/args/pada.rs +++ b/vidyut-prakriya/src/args/pada.rs @@ -13,7 +13,7 @@ pub enum Pada { Tinanta(Tinanta), /// A "chunk of text" without any specific morphology. This is a temporary variant that we hope /// to clean up later. - Dummy(String), + Unknown(String), /// A dummy variant that we hope to clean up later. Nipata(String), } @@ -21,7 +21,7 @@ pub enum Pada { impl Pada { /// Creates a dummy pada from the given text. pub fn from_text(text: impl AsRef) -> Self { - Self::Dummy(text.as_ref().to_string()) + Self::Unknown(text.as_ref().to_string()) } /// Creates a dummy pada from the given text. diff --git a/vidyut-prakriya/src/args/pratipadika.rs b/vidyut-prakriya/src/args/pratipadika.rs index e5c5326..76ea381 100644 --- a/vidyut-prakriya/src/args/pratipadika.rs +++ b/vidyut-prakriya/src/args/pratipadika.rs @@ -71,6 +71,15 @@ impl Pratipadika { is_avyaya: false, }) } + + /// Returns whether the pratipadika describes an avyaya. 
+ pub fn is_avyaya(&self) -> bool { + match self { + Self::Basic(b) => b.is_avyaya, + Self::Krdanta(k) => k.krt().is_avyaya(), + _ => false, + } + } } impl TryFrom<&str> for Pratipadika { diff --git a/vidyut-prakriya/src/args/slp1_string.rs b/vidyut-prakriya/src/args/slp1_string.rs index 491bbf1..3622eaa 100644 --- a/vidyut-prakriya/src/args/slp1_string.rs +++ b/vidyut-prakriya/src/args/slp1_string.rs @@ -158,6 +158,13 @@ impl Slp1String { } } +impl std::ops::Deref for Slp1String { + type Target = String; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + impl TryFrom for Slp1String { type Error = Error; fn try_from(val: String) -> Result { diff --git a/vidyut-prakriya/src/ashtadhyayi.rs b/vidyut-prakriya/src/ashtadhyayi.rs index 40b8d63..a584018 100644 --- a/vidyut-prakriya/src/ashtadhyayi.rs +++ b/vidyut-prakriya/src/ashtadhyayi.rs @@ -648,13 +648,18 @@ pub fn derive_subanta(mut prakriya: Prakriya, args: &Subanta) -> Result 0 { + let i_last = p.len() - 1; + p.set(i_last, |t| t.add_tag(Tag::Avyaya)); + } + p.add_tag(args.linga().as_tag().into()); pratipadika_karya::run_napumsaka_rules(p); sup_karya::run(p, args.linga(), args.vibhakti(), args.vacana()); samjna::run(p); - samasa::run_rules_for_avyayibhava(p); + samasa::run_avyaya_sup_lopa(p); run_main_rules(p, None, MainArgs::default()); tripadi::run(p); @@ -736,7 +741,7 @@ pub fn derive_samasa(mut prakriya: Prakriya, args: &Samasa) -> Result if args.samasa_type() == SamasaType::Avyayibhava { samjna::run(p); - samasa::run_rules_for_avyayibhava(p); + samasa::run_avyaya_sup_lopa(p); } samjna::try_decide_pratipadika(p); @@ -766,7 +771,7 @@ pub fn derive_vakya(mut prakriya: Prakriya, padas: &[Pada]) -> Result prakriya.extend(p.terms()); } } - Pada::Dummy(s) => { + Pada::Unknown(s) => { let mut pada = Term::make_upadesha(s); pada.add_tags(&[Tag::Pada]); prakriya.push(pada); diff --git a/vidyut-prakriya/src/dhatu_karya.rs b/vidyut-prakriya/src/dhatu_karya.rs index babce15..24a34c2 100644 --- 
a/vidyut-prakriya/src/dhatu_karya.rs +++ b/vidyut-prakriya/src/dhatu_karya.rs @@ -261,10 +261,13 @@ pub fn try_add_prefixes(p: &mut Prakriya, prefixes: &[String]) -> Option<()> { // TODO: prefixes that aren't upasargas? for prefix in prefixes { - let t: Term = match prefix.parse::() { + let mut t: Term = match prefix.parse::() { Ok(u) => u.into(), _ => Term::make_upadesha(prefix), }; + // For now, assume all dhatu prefixes are gati. + t.add_tag(T::Gati); + p.insert(i_offset, t); samjna::try_nipata_rules(p, i_offset); diff --git a/vidyut-prakriya/src/pratipadika_karya.rs b/vidyut-prakriya/src/pratipadika_karya.rs index 5fa2b54..918ca06 100644 --- a/vidyut-prakriya/src/pratipadika_karya.rs +++ b/vidyut-prakriya/src/pratipadika_karya.rs @@ -46,7 +46,7 @@ pub fn add_basic(p: &mut Prakriya, basic: &BasicPratipadika) { pub fn run_napumsaka_rules(p: &mut Prakriya) -> Option<()> { if p.has_tag(PT::Napumsaka) { let i_last_not_empty = p.find_last_where(|t| !t.is_empty() && !t.is_sup())?; - let t = p.get(i_last_not_empty)?; + let t = p.get_if(i_last_not_empty, |t| !t.is_avyaya())?; let sub = al::to_hrasva(t.antya()?)?; if !t.has_antya(sub) { p.run_at("1.2.47", i_last_not_empty, op::antya_char(&sub)); diff --git a/vidyut-prakriya/src/samasa.rs b/vidyut-prakriya/src/samasa.rs index f3795eb..85375cf 100644 --- a/vidyut-prakriya/src/samasa.rs +++ b/vidyut-prakriya/src/samasa.rs @@ -1,6 +1,7 @@ use crate::args::BaseKrt as K; use crate::args::Samasa; use crate::args::SamasaType; +use crate::args::Stri; use crate::args::Sup; use crate::core::operators as op; use crate::core::Rule::Varttika; @@ -392,21 +393,32 @@ pub fn try_sup_luk(p: &mut Prakriya) -> Option<()> { Some(()) } -pub fn run_rules_for_avyayibhava(p: &mut Prakriya) { - p.debug("run_rules_for_avyayibhava"); - if p.has_tag(PT::Avyayibhava) { +pub fn run_avyaya_sup_lopa(p: &mut Prakriya) -> Option<()> { + p.debug("run_avyaya_sup_lopa"); + + let i_avyaya = p.find_last_where(|t| t.is_avyaya())?; + let i_n = i_avyaya + 1; + + 
if p.is_avyayibhava() { p.run("2.4.17", |p| p.add_tag(PT::Napumsaka)); - let i_last = p.terms().len() - 1; - if p.has(i_last, |t| !t.is_sup()) { + if !p.has(i_n, |t| t.is_sup()) { p.run("4.1.2", |p| p.push(make_su_pratyaya())); - if p.has(i_last, |t| t.has_antya('a')) { - p.run_at("2.4.83", i_last + 1, |t| t.set_text("am")); - } else { - p.run_at("2.4.82", i_last + 1, op::luk); - } } } + + if p.has(i_n, |t| { + t.is(Stri::cAp) || t.is(Stri::qAp) || t.is(Stri::wAp) || t.is_sup() + }) { + if p.is_avyayibhava() && p.has(i_avyaya, |t| t.has_antya('a')) { + p.run_at("2.4.83", i_n, |t| t.set_text("am")); + } else { + // kftvA, hftvA + p.run_at("2.4.82", i_avyaya + 1, op::luk); + } + } + + Some(()) } pub fn run(p: &mut Prakriya, args: &Samasa) -> bool { diff --git a/vidyut-prakriya/src/samjna.rs b/vidyut-prakriya/src/samjna.rs index 496c058..81f3981 100644 --- a/vidyut-prakriya/src/samjna.rs +++ b/vidyut-prakriya/src/samjna.rs @@ -68,7 +68,6 @@ pub fn try_pragrhya_rules(p: &mut Prakriya) -> Option<()> { Some(()) } -#[allow(unused)] pub fn try_avyaya_rules(p: &mut Prakriya, i: usize) -> Option<()> { let t = p.get(i)?; @@ -77,7 +76,9 @@ pub fn try_avyaya_rules(p: &mut Prakriya, i: usize) -> Option<()> { // svarAdi contains more than 150 items, so short-circuit the check however we can. false } else { - t.has_text_in(gana::SVAR_ADI) + // HACK to allow this rule to apply only if explicitly an avyaya, otherwise we can't + // add sup to BUyas used as a nominal (BUyAMsi). 
+ t.has_tag(T::Avyaya) && t.has_text_in(gana::SVAR_ADI) } }; @@ -193,10 +194,10 @@ fn try_run_for_pratipadika_at_index(p: &mut Prakriya, i: usize) -> Option<()> { if i_u && !decided && !(prati.has_text("saKi") && !prati.is_samasa()) { if prati.has_text("pati") { if prati.is_samasa() { - p.add_tag_at("1.4.8", i_sup - 1, T::Ghi); + p.add_tag_at("1.4.8", i, T::Ghi); } } else { - p.add_tag_at("1.4.7", i_sup - 1, T::Ghi); + p.add_tag_at("1.4.7", i, T::Ghi); } } else if ii_uu && !decided { if iyan_uvan_astri { @@ -478,4 +479,7 @@ pub fn run(p: &mut Prakriya) { try_run_for_pratipadika(p); try_run_for_sup(p); try_run_for_taddhita(p); + for i in 0..p.len() { + try_avyaya_rules(p, i); + } } diff --git a/vidyut-prakriya/src/stem_gana.rs b/vidyut-prakriya/src/stem_gana.rs index aecd26a..7965c97 100644 --- a/vidyut-prakriya/src/stem_gana.rs +++ b/vidyut-prakriya/src/stem_gana.rs @@ -9,10 +9,14 @@ pub const LAUKIKA_SANKHYA: &[&str] = &[ "azwan", "navan", "daSan", - "ekadaSan", + "ekAdaSan", "dvAdaSan", "trayodaSan", + "caturdaSan", + "paYcadaSan", "zoqaSan", + "saptadaSan", + "azwAdaSan", "viMSati", "triMSat", "catvAriMSat", diff --git a/vidyut-prakriya/src/stritva.rs b/vidyut-prakriya/src/stritva.rs index 76949e3..4ad0997 100644 --- a/vidyut-prakriya/src/stritva.rs +++ b/vidyut-prakriya/src/stritva.rs @@ -147,6 +147,11 @@ pub fn run(p: &mut Prakriya) -> Option<()> { let i_prati = sp.i_prati; let last = sp.last(); + // Not sure how to handle avyayas, but all of these seem out of scope. + if last.is_avyaya() { + return None; + } + // HACK: block uzRihA for now. if last.has_text("uzRih") { return None; diff --git a/vidyut-prakriya/src/tripadi/pada_8_2.rs b/vidyut-prakriya/src/tripadi/pada_8_2.rs index e6b2c4b..8a1b298 100644 --- a/vidyut-prakriya/src/tripadi/pada_8_2.rs +++ b/vidyut-prakriya/src/tripadi/pada_8_2.rs @@ -750,7 +750,7 @@ fn try_change_final_s_and_others(p: &mut Prakriya) -> Option<()> { { // vidvadByAm, uKAsradByAm, ... 
p.run_at("8.2.72", i, |t| t.set_antya("d")); - } else if is_sa_sajush && !p.nlp_mode() { + } else if is_sa_sajush && !(p.nlp_mode() && p.next_not_empty(i).is_none()) { // agnir atra, sajUr ftuBiH, ... do_ru_adesha("8.2.66", p, i); } diff --git a/vidyut-prakriya/src/tripadi/pada_8_3.rs b/vidyut-prakriya/src/tripadi/pada_8_3.rs index 9b1bd58..b9d766b 100644 --- a/vidyut-prakriya/src/tripadi/pada_8_3.rs +++ b/vidyut-prakriya/src/tripadi/pada_8_3.rs @@ -174,7 +174,7 @@ fn try_visarjaniyasya(p: &mut Prakriya) -> Option<()> { } else if y.has_at(1, SHAR) { p.run_at("8.3.35", i_x, |_| {}); } else if y.has_adi(KU_PU) { - if x.has_text_in(&["namas", "puras"]) && x.is_gati() { + if x.has_u_in(&["namas", "puras"]) && x.is_gati() { p.run_at("8.3.40", i_x, |t| t.set_antya("s")); } else if is_it_ut_upadha(x) && !x.is_pratyaya() { p.run_at("8.3.41", i_x, |t| t.set_antya("z")); diff --git a/vidyut-prakriya/tests/integration/kashika_6_4.rs b/vidyut-prakriya/tests/integration/kashika_6_4.rs index 51e4333..d164ad2 100644 --- a/vidyut-prakriya/tests/integration/kashika_6_4.rs +++ b/vidyut-prakriya/tests/integration/kashika_6_4.rs @@ -111,11 +111,13 @@ fn sutra_6_4_10() { assert_has_sup_1p(&shreyas, Napumsaka, &["SreyAMsi"]); assert_has_sup_1p("payas", Napumsaka, &["payAMsi"]); assert_has_sup_1p("yaSas", Napumsaka, &["yaSAMsi"]); + // mahat let mahat = create_krdanta("mahat", &[], &d("maha~", Bhvadi), Unadi::ati); assert_has_sup_1s(&mahat, Pum, &["mahAn"]); assert_has_sup_1d(&mahat, Pum, &["mahAntO"]); assert_has_sup_1p(&mahat, Pum, &["mahAntaH"]); + // asambudDo assert_has_sup_ss(&shreyas, Pum, &["Sreyan"]); assert_has_sup_ss(&mahat, Pum, &["mahan"]); diff --git a/vidyut-prakriya/tests/integration/kaumudi_13.rs b/vidyut-prakriya/tests/integration/kaumudi_13.rs index f9745cf..7ed48dc 100644 --- a/vidyut-prakriya/tests/integration/kaumudi_13.rs +++ b/vidyut-prakriya/tests/integration/kaumudi_13.rs @@ -132,6 +132,7 @@ fn sk_446() { assert_has_sup_1p(&caksus, Napumsaka, &["cakzUMzi"]); 
assert_has_sup_3s(&caksus, Napumsaka, &["cakzuzA"]); assert_has_sup_3d(&caksus, Napumsaka, &["cakzurByAm"]); + let havis = create_krdanta("havis", &[], &d("hu\\", Juhotyadi), Unadi::isi); assert_has_sup_1s(&havis, Napumsaka, &["haviH"]); assert_has_sup_1d(&havis, Napumsaka, &["havizI"]); diff --git a/vidyut-prakriya/tests/integration/regressions.rs b/vidyut-prakriya/tests/integration/regressions.rs index 9620c97..271bae1 100644 --- a/vidyut-prakriya/tests/integration/regressions.rs +++ b/vidyut-prakriya/tests/integration/regressions.rs @@ -8,7 +8,7 @@ use vidyut_prakriya::args::Gana::*; use vidyut_prakriya::args::Krdanta; use vidyut_prakriya::args::Lakara::*; use vidyut_prakriya::args::Linga::*; -use vidyut_prakriya::args::{BaseKrt as Krt, Dhatu, Lakara, Prayoga}; +use vidyut_prakriya::args::{BaseKrt as Krt, Dhatu, Lakara, Prayoga, Taddhita}; use vidyut_prakriya::Vyakarana; #[test] @@ -279,3 +279,24 @@ fn iccha_nipatana() { fn babdhi() { assert_has_sip(&[], &d("Basa~", Juhotyadi), Lot, &["babDi", "babDAt"]); } + +// Tests that sup-luk occurs after avyayas, regardless of which sup we try adding. 
+#[test] +fn gantum_sup() { + let gantum = krdanta(&[], &d("ga\\mx~", Bhvadi), Krt::tumun); + assert_has_sup_1s(&gantum, Pum, &["gantum"]); + assert_has_sup_1d(&gantum, Pum, &["gantum"]); + assert_has_sup_1p(&gantum, Pum, &["gantum"]); + assert_has_sup_1s(&gantum, Stri, &["gantum"]); + + let gatva = krdanta(&[], &d("ga\\mx~", Bhvadi), Krt::ktvA); + assert_has_sup_1s(&gatva, Pum, &["gatvA"]); + assert_has_sup_4s(&gatva, Napumsaka, &["gatvA"]); +} + +// Simultaneous derivation of taddhita + sup +#[test] +fn hanumataa() { + let hanumat = taddhitanta("hanu", Taddhita::matup); + assert_has_sup_3s(&hanumat, Pum, &["hanumatA"]); +} diff --git a/vidyut-sandhi/src/generator.rs b/vidyut-sandhi/src/generator.rs index 9d126f8..e88f31e 100644 --- a/vidyut-sandhi/src/generator.rs +++ b/vidyut-sandhi/src/generator.rs @@ -19,7 +19,7 @@ const HAL: &str = "kKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzsh"; /// - the first part is `a` /// - the second part is `i` /// - the result is `e`. -#[derive(Debug)] +#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] pub struct Rule { first: String, second: String, @@ -28,15 +28,15 @@ pub struct Rule { impl Rule { /// Returns the first part of the rule. - pub fn first(&self) -> &String { + pub fn first(&self) -> &str { &self.first } /// Returns the second part of the rule. - pub fn second(&self) -> &String { + pub fn second(&self) -> &str { &self.second } /// Returns the result of the rule. - pub fn result(&self) -> &String { + pub fn result(&self) -> &str { &self.result } } diff --git a/vidyut-sandhi/src/sounds.rs b/vidyut-sandhi/src/sounds.rs index 94bd76d..1a92159 100644 --- a/vidyut-sandhi/src/sounds.rs +++ b/vidyut-sandhi/src/sounds.rs @@ -6,6 +6,7 @@ use lazy_static::lazy_static; /// /// This implementation is copied directly from `vidyut_prakriya::sounds`. For details, see the /// comments there. 
+#[derive(Clone, Eq, PartialEq, Ord, PartialOrd)] pub struct Set([u8; 256]); impl Set { diff --git a/vidyut-sandhi/src/splitter.rs b/vidyut-sandhi/src/splitter.rs index 42ea834..72a3d7c 100644 --- a/vidyut-sandhi/src/splitter.rs +++ b/vidyut-sandhi/src/splitter.rs @@ -24,7 +24,7 @@ use std::collections::hash_map::Keys; use std::path::Path; /// Describes the type of sandhi split that occurred. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] pub enum Kind { /// A split created by slicing the input string, with no sandhi rules applied. That is, /// `split.first` is a *prefix* of the original string. @@ -34,7 +34,7 @@ pub enum Kind { } /// Describes the type of sandhi split that occurred. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] pub enum Location { /// Indicates that the split occurs within a chunk. WithinChunk, @@ -43,7 +43,7 @@ pub enum Location { } /// Models a sandhi split. -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] pub struct Split { first: CompactString, second: String, @@ -99,7 +99,7 @@ impl Split { } /// Maps a combination to the two strings (first, second) that created it. -#[derive(Default, Debug)] +#[derive(Clone, Default, Debug, Eq, PartialEq)] pub struct SplitsMap(FxHashMap>); impl SplitsMap { @@ -128,6 +128,7 @@ impl SplitsMap { } /// Splits Sanskrit words and expressions according to the specified rules. +#[derive(Clone, Default, Debug, Eq, PartialEq)] pub struct Splitter { map: SplitsMap, len_longest_key: usize,