From 60f5f9517291ff90f9536ea2a9517743d834913a Mon Sep 17 00:00:00 2001 From: Arun Prasad Date: Mon, 25 Dec 2023 19:06:39 -0800 Subject: [PATCH] [lipi] Add a stronger implementation This commit adds support for a variety of schemes as defined by the `indic_transliteration` project. It also adds a minimal test suite as a stronger guarantee on program correctness. `vidyut-lipi` is still immature compared to transliterators like Aksharamukha or `indic_transliteration`. However, it is on a good trajectory, and I think it will become a compelling transliterator backend over time. Some notes on design: - This commit does not use any of the code from @skmnktl's in-progress transliterator, but it does borrow the idea of incorporating `indic_transliteration`'s TOML maps directly into the program source code. - I liked @skmnktl's idea of using a `Token` enum as an intermediate representation between the input scheme and the output scheme, but pursuing that approach felt cumbersome when mapping between *sequences* of characters (e.g. when working with ITRANS), so I've stayed with the approach used by `indic_transliteration`, i.e. using Devanagari as the intermediate representation. 
--- Cargo.lock | 4 + vidyut-lipi/.gitignore | 1 + vidyut-lipi/Cargo.toml | 14 + vidyut-lipi/Makefile | 5 + vidyut-lipi/README.md | 75 +- vidyut-lipi/scripts/create_schemes.py | 186 +++ vidyut-lipi/scripts/run-debugger.sh | 16 + vidyut-lipi/src/lib.rs | 645 ++++++--- vidyut-lipi/src/schemes.rs | 1899 +++++++++++++++++++++++++ vidyut-lipi/src/wasm.rs | 38 + vidyut-lipi/tests/basic.rs | 142 ++ vidyut-lipi/www/index.html | 125 ++ vidyut-lipi/www/static/app.js | 266 ++++ 13 files changed, 3230 insertions(+), 186 deletions(-) create mode 100644 vidyut-lipi/.gitignore create mode 100644 vidyut-lipi/Makefile create mode 100755 vidyut-lipi/scripts/create_schemes.py create mode 100755 vidyut-lipi/scripts/run-debugger.sh create mode 100644 vidyut-lipi/src/schemes.rs create mode 100644 vidyut-lipi/src/wasm.rs create mode 100644 vidyut-lipi/tests/basic.rs create mode 100644 vidyut-lipi/www/index.html create mode 100644 vidyut-lipi/www/static/app.js diff --git a/Cargo.lock b/Cargo.lock index d525702..3e2a048 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1004,6 +1004,10 @@ name = "vidyut-lipi" version = "0.1.0" dependencies = [ "clap", + "console_error_panic_hook", + "rustc-hash", + "serde-wasm-bindgen", + "wasm-bindgen", ] [[package]] diff --git a/vidyut-lipi/.gitignore b/vidyut-lipi/.gitignore new file mode 100644 index 0000000..4841ba7 --- /dev/null +++ b/vidyut-lipi/.gitignore @@ -0,0 +1 @@ +www/static/wasm diff --git a/vidyut-lipi/Cargo.toml b/vidyut-lipi/Cargo.toml index f932338..9eb9db2 100644 --- a/vidyut-lipi/Cargo.toml +++ b/vidyut-lipi/Cargo.toml @@ -1,9 +1,23 @@ [package] name = "vidyut-lipi" version = "0.1.0" +authors = ["Arun Prasad "] +description = "A Sanskrit transliterator" +homepage = "https://github.com/ambuda-org/vidyut" +repository = "https://github.com/ambuda-org/vidyut" +categories = ["text-processing"] +keywords = ["sanskrit"] +license = "MIT" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 
[dependencies] +rustc-hash = "1.1.0" clap = { version = "4.0.12", features = ["derive"] } +wasm-bindgen = "0.2" +serde-wasm-bindgen = "0.4" +console_error_panic_hook = "0.1.7" + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/vidyut-lipi/Makefile b/vidyut-lipi/Makefile new file mode 100644 index 0000000..638329d --- /dev/null +++ b/vidyut-lipi/Makefile @@ -0,0 +1,5 @@ +debugger: + ./scripts/run-debugger.sh + +test: + cargo nextest run --no-fail-fast --status-level=fail diff --git a/vidyut-lipi/README.md b/vidyut-lipi/README.md index 852b75f..8135d07 100644 --- a/vidyut-lipi/README.md +++ b/vidyut-lipi/README.md @@ -1,17 +1,78 @@ -*vidyut-lipi* is a work-in-progress transliterator. It is not ready for public use. +
+

vidyut-lipi

+

A fast Indic transliterator

+
+ +`vidyut-lipi` is an experimental Sanskrit transliteration library that also +supports many of the scripts used within the Indosphere. Our goal is to provide +a standard transliterator for the Sanskrit ecosystem that is easy to bind to +other programming languages. + +This [crate][crate] is under active development as part of the [Ambuda][ambuda] +project. If you enjoy our work and wish to contribute to it, we encourage you +to [join our Discord server][discord], where you can meet other Sanskrit +programmers and enthusiasts. + +An online demo is available [here][demo]. + +[crate]: https://doc.rust-lang.org/book/ch07-01-packages-and-crates.html +[ambuda]: https://ambuda.org +[discord]: https://discord.gg/7rGdTyWY7Z +[demo]: https://ambuda-org.github.io/vidyut-lipi/ + +- [Overview](#overview) +- [Usage](#usage) +- [Design](#design) + + +Overview +-------- + +Communities around the world write Sanskrit and other Indian languages in +different scripts in different contexts. For example, a user might type +Sanskrit in ITRANS, read it in Kannada, and publish it in Devanagari. Such +communities often rely on a *transliterator*, which converts text from one +scheme to another. + +While various transliterators exist, none are both high-quality and widely +available in different programming languages. The result is that maintenance +and feature work is diluted across several different implementations. + +`vidyut-lipi` aims to provide a standard transliterator for the Sanskrit +ecosystem. Our priorities are: + +- quality, including a comprehensive test suite. +- coverage across all of the schemes in common use. +- ease of use (and reuse) for developers. +- high performance across various metrics, including runtime, startup time, and + file size. + +We recommend `vidyut-lipi` if you need a simple and high-quality +transliteration library, and we encourage you to [file an issue][issue] if +`vidyut-lipi` does not support your use case. 
We are especially excited about +supporting new scripts and new programming languages. + +[issue]: https://github.com/ambuda-org/vidyut/issues + +If `vidyut-lipi` is not right for your needs, we also strongly recommend +the [Aksharamukha][aksharamukha] and the [indic-transliteration][indic-trans] +projects, which have each been highly influential in our work on `vidyut-lipi`. + +[aksharamukha]: https://github.com/virtualvinodh/aksharamukha/ +[indic-trans]: https://github.com/indic-transliteration Usage ----- +For simple use cases that aren't very performance-sensitive, we recommend using +`vidyut-lipi` like so: + ```rust use vidyut_lipi::{Scheme, transliterate}; -let result = transliterate("devau", Scheme::Iast, Scheme::Slp1); -assert_eq!(result, "devO"); +let result = transliterate("devO", Scheme::Slp1, Scheme::Iast); +assert_eq!(result, "devau"); ``` -```shell -# Run transliteration -$ cargo run --bin transliterate -- --text rāmau -``` +We are still stabilizing our API and will share more examples here soon. diff --git a/vidyut-lipi/scripts/create_schemes.py b/vidyut-lipi/scripts/create_schemes.py new file mode 100755 index 0000000..5b596cb --- /dev/null +++ b/vidyut-lipi/scripts/create_schemes.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +"""Create schemes for vidyut-lipi and writes them to `src/schemes.rs`. + +We create these mappings by modifying the data in the `common_maps` dir from +the indic-transliteration project. 
+""" + +import tomllib +import subprocess +from pathlib import Path +from glob import glob +import shutil + +CRATE_DIR = Path(__file__).parent.parent + +VOWEL_TO_MARK = { + "आ": "\u093e", + "इ": "\u093f", + "ई": "\u0940", + "उ": "\u0941", + "ऊ": "\u0942", + "ऋ": "\u0943", + "ॠ": "\u0944", + "ऌ": "\u0962", + "ॡ": "\u0963", + "ऎ": "\u0946", + "ए": "\u0947", + "ऐ": "\u0948", + "ऒ": "\u094a", + "ओ": "\u094b", + "औ": "\u094c", +} + +ALLOWED = { + "BENGALI", + "BRAHMI", + "DEVANAGARI", + "GUJARATI", + "GURMUKHI", + "GRANTHA", + "KANNADA", + "MALAYALAM", + "ORIYA", + "SINHALA", + "TAMIL", + "TELUGU", + "TIBETAN", + + "HK", + "IAST", + "ITRANS", + "SLP1", + "VELTHUIS", +} + + +def _sanitize(s: str) -> str: + return s.replace("\\", "\\\\").replace('"', '\\"') + + +def _maybe_override(name: str, deva: str, raw: str) -> str | None: + if name == "BRAHMI": + if deva == "\u0946": + # short e mark + return None + if deva == "\u094a": + # short o mark + return None + elif name == "HK": + if raw == "|": + return "." + if raw == "||": + return ".." + elif name == "IAST": + if deva == "ळ": + return "ḻ" + if deva == "ऴ": + return None + if raw == "|": + return "." + if raw == "||": + return ".." + elif name == "VELTHUIS": + # These are part of the Velthuis spec but are errors in indic-transliteration. 
+ if deva == "ॠ": + return ".R" + if deva == "ॡ": + return ".L" + return raw + + +def create_scheme_str(name: str, items: list[tuple[str, str]]) -> str: + buf = [] + + buf.append(f"pub const {name}: &[(&str, &str)] = &[") + for deva, raw in items: + deva = _sanitize(deva) + raw = _sanitize(raw) + buf.append(f' ("{deva}", "{raw}"),') + buf.append("];\n") + + return "\n".join(buf) + + +def main(): + repo = "https://github.com/indic-transliteration/common_maps.git" + common_maps = Path("common_maps") + if not common_maps.exists(): + print("Cloning `common_maps` ...") + subprocess.run(f"git clone --depth 1 {repo}", shell=True) + + print("Creating schemes ...") + buf = [ + "#![allow(unused)]", + "", + "//! Auto-generated scheme data.", + "//!", + "//! These schemes were auto-generated from the `common_maps` repository", + "//! from the `indic-transliteration` project.", + "", + ] + for path in sorted(glob("common_maps/**/*.toml")): + with open(path, "rb") as f: + data = tomllib.load(f) + + scheme_name = Path(path).stem.upper() + if scheme_name not in ALLOWED: + continue + + scheme_type = Path(path).parent.stem + assert scheme_type in {"roman", "brahmic"}, scheme_type + + scheme_items = [] + raw_to_deva = {} + + for category in data: + if category.startswith("_"): + # Ignore file comments, etc. 
+ continue + + if category == "shortcuts": + # TODO: support these + continue + + if category.endswith("alternates"): + for raw_main, alts in data[category].items(): + deva = raw_to_deva.get(raw_main) + if deva is None: + continue + for alt in alts: + assert isinstance(deva, str) + assert isinstance(alt, str) + alt = _maybe_override(scheme_name, deva, alt) + if alt is not None: + scheme_items.append((deva, alt)) + else: + for deva, raw in data[category].items(): + assert isinstance(deva, str) + assert isinstance(raw, str) + raw = _maybe_override(scheme_name, deva, raw) + if raw is not None: + raw_to_deva[raw] = deva + scheme_items.append((deva, raw)) + + if scheme_type == "roman" and category == "vowels": + for vowel, raw in data[category].items(): + raw = _maybe_override(scheme_name, vowel, raw) + mark = VOWEL_TO_MARK.get(vowel) + if mark: + assert isinstance(mark, str) + assert isinstance(raw, str) + scheme_items.append((mark, raw)) + + buf.append(create_scheme_str(scheme_name, scheme_items)) + + with open(CRATE_DIR / "src/schemes.rs", "w") as f: + f.write("\n".join(buf)) + + print("Cleaning up ...") + shutil.rmtree(common_maps) + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/vidyut-lipi/scripts/run-debugger.sh b/vidyut-lipi/scripts/run-debugger.sh new file mode 100755 index 0000000..e4f0d40 --- /dev/null +++ b/vidyut-lipi/scripts/run-debugger.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env sh +if [[ ! $(command -v wasm-pack) ]] +then + echo "Our debugger requires wasm-pack. Please install wasm-pack:" + echo "https://rustwasm.github.io/wasm-pack/installer/" + echo + exit 1 +fi + +# `cargo` uses the debug build by default, but `wasm-pack` uses the release +# build by default instead. Creating this release build is slow, but the debug +# build seems to have issues with enum parsing. So, stick with the release +# build. 
+wasm-pack build --target web --release +mkdir -p www/static/wasm && cp pkg/* www/static/wasm +cd www && python3 -m http.server diff --git a/vidyut-lipi/src/lib.rs b/vidyut-lipi/src/lib.rs index 133c89d..26390c2 100644 --- a/vidyut-lipi/src/lib.rs +++ b/vidyut-lipi/src/lib.rs @@ -3,203 +3,440 @@ #![deny(missing_docs)] #![deny(clippy::unwrap_used)] -use std::cmp; +use rustc_hash::FxHashMap; +use wasm_bindgen::prelude::wasm_bindgen; -/// Defines the available transliteration schemes. +mod schemes; +pub mod wasm; + +type Pair = (&'static str, &'static str); + +/// A method of encoding text. +/// +/// Schemes vary on various dimensions, including: +/// +/// - writing system (alphabet vs. abugida) +/// - text encoding (ASCII vs. Unicode) +/// - support for Sanskrit (complete vs. partial) #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] +#[wasm_bindgen] pub enum Scheme { - /// SlP1 transliteration. - Slp1, + /// Bengali script. + /// + /// https://unicode.org/charts/PDF/U0980.pdf + Bengali, + + /// Brahmi script. + /// + /// https://unicode.org/charts/PDF/U11000.pdf + Brahmi, + + /// Devanagari script. + /// + /// https://unicode.org/charts/PDF/U0900.pdf + Devanagari, + + /// Gujarati script. + /// + /// https://unicode.org/charts/PDF/U0A80.pdf + Gujarati, + + /// Gurmukhi script. + /// + /// https://unicode.org/charts/PDF/U0A00.pdf + Gurmukhi, + + /// Grantha script. + /// + /// http://www.unicode.org/charts/PDF/U11300.pdf + Grantha, + + /// Kannada script. + /// + /// https://unicode.org/charts/PDF/U0C80.pdf + Kannada, + + /// Malayalam script. + /// + /// https://unicode.org/charts/PDF/U0D00.pdf + Malayalam, + + /// Oriya script. + /// + /// https://unicode.org/charts/PDF/U0B00.pdf + Oriya, + + /// Sinhala script. + /// + /// https://unicode.org/charts/PDF/U0D80.pdf + Sinhala, + + /// Tamil script. + /// + /// https://unicode.org/charts/PDF/U0B80.pdf + Tamil, + + /// Tibetan script. 
+ /// + /// https://unicode.org/charts/PDF/U0F00.pdf + // Tibetan, + + /// Telugu script. + /// + /// https://unicode.org/charts/PDF/U0C00.pdf + Telugu, + + /// Harvard-Kyoto transliteration. + /// + /// TODO: find documentation link for HK. + HarvardKyoto, + + /// ITRANS transliteration. + /// + /// https://www.aczoom.com/itrans/online/itrans6/itrans-tables-unicode.pdf + Itrans, + /// IAST transliteration. + /// + /// TODO: find documentation link for IAST. Iast, - /// Devanagari. - Devanagari, + + /// SLP1 transliteration. + /// + /// https://www.sanskritlibrary.org/pub/SLP1LiesAppendixB.pdf + Slp1, + + /// Velthuis transliteration. + /// + /// https://mirrors.mit.edu/CTAN/language/devanagari/velthuis/doc/manual.pdf + Velthuis, } -fn map_char(cur: &str) -> Option<&'static str> { - let val = match cur { - "ā" => "A", - "ī" => "I", - "ū" => "U", - "ṛ" => "f", - "ṝ" => "F", - "ḷ" => "x", - "ḹ" => "X", - "ai" => "E", - "au" => "O", - "ṃ" => "M", - "ḥ" => "H", - "ṅ" => "N", - "kh" => "K", - "gh" => "G", - "ch" => "C", - "jh" => "J", - "ñ" => "Y", - "ṭ" => "w", - "ṭh" => "W", - "ḍ" => "q", - "ḍh" => "Q", - "th" => "T", - "dh" => "D", - "ph" => "P", - "bh" => "B", - "ṇ" => "R", - "ś" => "S", - "ṣ" => "z", - "ḻ" => "L", - &_ => return None, - }; - Some(val) +impl Scheme { + fn token_pairs(&self) -> &[Pair] { + match self { + Scheme::Bengali => schemes::BENGALI, + Scheme::Brahmi => schemes::BRAHMI, + Scheme::Devanagari => schemes::DEVANAGARI, + Scheme::Gujarati => schemes::GUJARATI, + Scheme::Gurmukhi => schemes::GURMUKHI, + Scheme::Grantha => schemes::GRANTHA, + Scheme::Kannada => schemes::KANNADA, + Scheme::Malayalam => schemes::MALAYALAM, + Scheme::Oriya => schemes::ORIYA, + Scheme::Sinhala => schemes::SINHALA, + Scheme::Tamil => schemes::TAMIL, + Scheme::Telugu => schemes::TELUGU, + // Scheme::Tibetan => schemes::TIBETAN, + Scheme::Slp1 => schemes::SLP1, + Scheme::HarvardKyoto => schemes::HK, + Scheme::Itrans => schemes::ITRANS, + Scheme::Iast => schemes::IAST, + 
Scheme::Velthuis => schemes::VELTHUIS, + } + } + + /// Returns whether this scheme represents an abugida. + pub fn is_abugida(&self) -> bool { + use Scheme::*; + + // Use an exhaustive match (no `_`) so that we explicitly account for all schemes. + match self { + // Abugidas are all `true`. + Bengali | Brahmi | Devanagari | Gujarati | Gurmukhi | Grantha | Kannada | Malayalam + | Oriya | Sinhala | Tamil | Telugu => true, + + // Alphabets are all `false`. + HarvardKyoto | Itrans | Iast | Slp1 | Velthuis => false, + } + } + + /// Returns whether this scheme represents an alphabet. + pub fn is_alphabet(&self) -> bool { + !self.is_abugida() + } + + /// Returns whether this scheme supports all sounds in post-Vedic Sanskrit. + /// + /// This check excludes accent and other vedic symbols. + #[allow(unused)] + pub(crate) fn supports_basic_sanskrit(&self) -> bool { + use Scheme::*; + + matches!( + self, + Devanagari | Gujarati | Grantha | Kannada | Malayalam | Oriya | Sinhala | Telugu + ) + } +} + +/// Defines a mapping between two schemes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Mapping { + from: Scheme, + to: Scheme, + all: FxHashMap, + marks: FxHashMap, + input_virama: String, + output_virama: String, + consonants: FxHashMap, + len_longest_key: usize, +} + +struct OneWayMapping { + // Maps from Devanagari to all options available in the given scheme. + data: FxHashMap>, + virama: String, +} + +fn create_kv_map(pairs: &[Pair]) -> OneWayMapping { + const VIRAMA: &str = "\u{094d}"; + + let mut data = FxHashMap::default(); + let mut virama = String::new(); + for (key, value) in pairs { + let key = key.to_string(); + if key == VIRAMA { + virama += value; + } + let vals: &mut Vec<_> = data.entry(key).or_default(); + vals.push(value.to_string()); + } + OneWayMapping { data, virama } +} + +#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] +enum TokenType { + /// A consonant. A following vowel becomes a vowel mark. 
+ Consonant, + /// A vowel mark, which must follow a consonant. + VowelMark, + /// Any other token. + Other, +} + +fn decide_token_type(s: &str) -> TokenType { + const MARK_AA: u32 = 0x093e; + const MARK_AU: u32 = 0x094c; + const MARK_L: u32 = 0x0962; + const MARK_LL: u32 = 0x0963; + const MARK_PRISHTAMATRA_E: u32 = 0x094e; + const MARK_AW: u32 = 0x094f; + + const CONS_KA: u32 = 0x0915; + const CONS_HA: u32 = 0x0939; + const CONS_QA: u32 = 0x0958; + const CONS_YYA: u32 = 0x095f; + const CONS_DDDA: u32 = 0x097e; + const CONS_BBA: u32 = 0x097f; + + // const VIRAMA: u32 = 0x094d; + + if let Some(c) = s.chars().last() { + let code = c as u32; + if (code >= MARK_AA && code <= MARK_AU) + || code == MARK_PRISHTAMATRA_E + || code == MARK_AW + || code == MARK_L + || code == MARK_LL + { + TokenType::VowelMark + } else if (code >= CONS_KA && code <= CONS_HA) + || (code >= CONS_QA && code <= CONS_YYA) + || code == CONS_DDDA + || code == CONS_BBA + // || code == VIRAMA + { + TokenType::Consonant + } else { + TokenType::Other + } + } else { + TokenType::Other + } +} + +impl Mapping { + /// Creates a mapping between the given `Scheme`s. + fn new(from_scheme: Scheme, to_scheme: Scheme) -> Mapping { + let from = create_kv_map(from_scheme.token_pairs()); + let to = create_kv_map(to_scheme.token_pairs()); + + let mut all = FxHashMap::default(); + let mut marks = FxHashMap::default(); + let mut consonants = FxHashMap::default(); + + // Iterate over token pairs so that we maintain the input order. 
+ for (deva_key, f) in from_scheme.token_pairs() { + let token_type = decide_token_type(deva_key); + let to_values = match to.data.get(*deva_key) { + Some(x) => x, + None => continue, + }; + let t = match to_values.first() { + Some(x) => x, + None => continue, + }; + + match token_type { + TokenType::VowelMark => { + marks.insert(f.to_string(), t.to_string()); + } + TokenType::Consonant => { + consonants.insert(f.to_string(), t.to_string()); + } + TokenType::Other => (), + } + + // Insert only the first match seen. Consequences: + // + // - If a sound maps to both a vowel and a vowel mark, we insert the vowel mark, + // which comes first in our representation. + // + // - If a sound has alternates, we store only the first. + if !all.contains_key(*f) { + all.insert(f.to_string(), t.to_string()); + } + } + + let len_longest_key = all.keys().map(|x| x.len()).max().unwrap_or(0); + + Self { + from: from_scheme, + to: to_scheme, + all, + marks, + consonants, + input_virama: from.virama, + output_virama: to.virama, + len_longest_key, + } + } + + /// The input scheme for this mapping. + pub fn from(&self) -> Scheme { + self.from + } + + /// The output scheme for this mapping. + pub fn to(&self) -> Scheme { + self.to + } + + fn get(&self, key: &str) -> Option<&String> { + self.all.get(key) + } } -/// Hackily transliterate from IAST to SLP1. -fn iast_to_slp1(input: &str) -> String { +/// Transliterates from an abugida. +fn transliterate_from_abugida(input: &str, mapping: Mapping) -> String { let chars: Vec = input.chars().collect(); - let mut ret = String::new(); + let is_to_alpha = mapping.to.is_alphabet(); + + let mut output = String::new(); let mut i = 0; + let mut key = String::new(); + let mut had_consonant = false; while i < chars.len() { - let mut next: Option<&str> = None; - let mut offset = 0; - - // Search for matches against our mapping. The longest IAST glyph has two characters, - // so search up to length 2. Start with 2 first so that we match greedily. 
- for j in [2, 1] { - let limit = cmp::min(i + j, chars.len()); - let cur = String::from_iter(&chars[i..limit]); - offset = limit - i; - - next = map_char(cur.as_str()); - if let Some(_s) = next { - break; - } - } + key.clear(); + key.extend(&chars[i..=i]); - match next { + match mapping.get(&key) { Some(s) => { - ret += s; - i += offset; + if had_consonant + && (mapping.marks.contains_key(&key) || key == mapping.input_virama) + { + // Pop implicit "a" vowel. + output.pop(); + } + + output += s; + + if is_to_alpha && mapping.consonants.contains_key(&key) { + // Add implicit "a" vowel. + output += "a"; + had_consonant = true; + } } None => { - // Use the original character as-is. - ret += &String::from_iter(&chars[i..=i]); - i += 1; + output.push_str(&key); } } + i += 1; } - ret + + output } -fn slp1_to_devanagari(text: &str) -> String { - const VIRAMA: char = '\u{094D}'; - - let mut ret = String::new(); - for c in text.chars() { - let out = match c { - 'a' => "अ", - 'A' => "आ", - 'i' => "इ", - 'I' => "ई", - 'u' => "उ", - 'U' => "ऊ", - 'f' => "ऋ", - 'F' => "ॠ", - 'x' => "ऌ", - 'X' => "ॡ", - 'e' => "ए", - 'E' => "ऐ", - 'o' => "ओ", - 'O' => "औ", - '~' => "\u{0901}", - 'M' => "\u{0902}", - 'H' => "\u{0903}", - 'k' => "क", - 'K' => "ख", - 'g' => "ग", - 'G' => "घ", - 'N' => "ङ", - 'c' => "च", - 'C' => "छ", - 'j' => "ज", - 'J' => "झ", - 'Y' => "ञ", - 'w' => "ट", - 'W' => "ठ", - 'q' => "ड", - 'Q' => "ढ", - 'R' => "ण", - 't' => "त", - 'T' => "थ", - 'd' => "द", - 'D' => "ध", - 'n' => "न", - 'p' => "प", - 'P' => "फ", - 'b' => "ब", - 'B' => "भ", - 'm' => "म", - 'y' => "य", - 'r' => "र", - 'l' => "ल", - 'v' => "व", - 'S' => "श", - 'z' => "ष", - 's' => "स", - 'h' => "ह", - 'L' => "ळ", - other => { - ret.push(other); - continue; - } - }; +/// Transliterates from an alphabet. 
+fn transliterate_from_alphabet(input: &str, mapping: Mapping) -> String { + let chars: Vec = input.chars().collect(); + let is_to_abugida = mapping.to.is_abugida(); - let vowel_mark = match c { - 'a' => Some(""), - 'A' => Some("\u{093E}"), - 'i' => Some("\u{093F}"), - 'I' => Some("\u{0940}"), - 'u' => Some("\u{0941}"), - 'U' => Some("\u{0942}"), - 'f' => Some("\u{0943}"), - 'F' => Some("\u{0944}"), - 'x' => Some("\u{0962}"), - 'X' => Some("\u{0963}"), - 'e' => Some("\u{0947}"), - 'E' => Some("\u{0948}"), - 'o' => Some("\u{094B}"), - 'O' => Some("\u{094C}"), - _ => None, - }; + let mut output = String::new(); + let mut i = 0; + let mut key = String::new(); + let mut had_consonant = false; + while i < chars.len() { + let mut o: Option<&String> = None; - if ret.chars().last() == Some(VIRAMA) && vowel_mark.is_some() { - // Pop virama and add. - ret.pop(); - ret += vowel_mark.expect("ok"); - } else { - ret += out; + let mut key_len_in_chars = 0; + for j in (1..=mapping.len_longest_key).rev() { + key_len_in_chars = j; + let limit = std::cmp::min(i + j, chars.len()); + key.clear(); + key.extend(&chars[i..limit]); + + o = mapping.get(&key); + if let Some(_s) = o { + break; + } } - let is_consonant = "kKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzshL".contains(c); - if is_consonant { - ret.push(VIRAMA); + match o { + Some(o) => { + if had_consonant { + if let Some(mark) = mapping.marks.get(&key) { + if is_to_abugida { + output.pop(); + } + output += mark; + } else if key == "a" && is_to_abugida { + output.pop(); + } else { + output += o; + } + } else { + output += o; + } + + had_consonant = mapping.consonants.contains_key(&key); + if had_consonant && is_to_abugida { + output += &mapping.output_virama; + } + } + None => { + // Use the original character as-is. + output.push_str(&key); + had_consonant = false; + } } + + // Add length in *chars*, not in *bytes*. Otherwise we get weird output. 
+ debug_assert!(key_len_in_chars > 0); + i += key_len_in_chars; } - ret + output } /// Transliterates the given input text. -/// -/// ### Panics -/// -/// Only the IAST -> SLP1 and SLP1 -> Devanagari mappings are defined. All other mappings will -/// panic. pub fn transliterate(input: &str, from: Scheme, to: Scheme) -> String { - use Scheme::*; - if from == Iast && to == Slp1 { - iast_to_slp1(input) - } else if from == Slp1 && to == Devanagari { - slp1_to_devanagari(input) + let mapping = Mapping::new(from, to); + + if from.is_abugida() { + transliterate_from_abugida(input, mapping) } else { - panic!("Unsupported scheme combination: {from:?} -> {to:?}") + transliterate_from_alphabet(input, mapping) } } @@ -208,19 +445,69 @@ mod tests { use super::*; #[test] - fn test_to_slp1() { - let t = |s| transliterate(s, Scheme::Iast, Scheme::Slp1); - - assert_eq!(t("a ā i ī u ū ṛ ṝ ḷ ḹ"), "a A i I u U f F x X"); - assert_eq!(t("e ai o au ṃ ḥ"), "e E o O M H"); - assert_eq!(t("k kh g gh ṅ"), "k K g G N"); - assert_eq!(t("c ch j jh ñ"), "c C j J Y"); - assert_eq!(t("ṭ ṭh ḍ ḍh ṇ"), "w W q Q R"); - assert_eq!(t("t th d dh n"), "t T d D n"); - assert_eq!(t("p ph b bh m"), "p P b B m"); - assert_eq!(t("y r l v"), "y r l v"); - assert_eq!(t("ś ṣ s h ḻ"), "S z s h L"); - - assert_eq!(t("vāgarthāviva saṃpṛktau"), "vAgarTAviva saMpfktO"); + fn schemes() { + let mark_aa = "\u{093e}"; + + let slp1 = Scheme::Slp1.token_pairs(); + assert!(slp1.contains(&("आ", "A"))); + assert!(slp1.contains(&(mark_aa, "A"))); + + let hk = Scheme::HarvardKyoto.token_pairs(); + assert!(hk.contains(&("आ", "A"))); + assert!(hk.contains(&(mark_aa, "A"))); + + let deva = Scheme::Devanagari.token_pairs(); + assert!(deva.contains(&("आ", "आ"))); + assert!(deva.contains(&(mark_aa, mark_aa))); + + let deva = Scheme::Devanagari; + assert_ne!(deva.is_abugida(), deva.is_alphabet()); + } + + #[test] + fn test_decide_char_type() { + let is_mark = |c| decide_token_type(c) == TokenType::VowelMark; + let is_consonant = |c| 
decide_token_type(c) == TokenType::Consonant; + let is_other = |c| decide_token_type(c) == TokenType::Other; + + assert!(is_mark("\u{093e}")); + assert!(is_mark("\u{093f}")); + assert!(is_mark("\u{094b}")); + assert!(is_mark("\u{094c}")); + assert!(is_mark("\u{094e}")); + assert!(is_mark("\u{094f}")); + + assert!(is_consonant("क")); + assert!(is_consonant("ख")); + assert!(is_consonant("स")); + assert!(is_consonant("ह")); + // Consonant clusters + assert!(is_consonant("क्ष")); + assert!(is_consonant("ज्ञ")); + + assert!(is_other("१")); + } + + #[test] + fn test_mapping() { + let assert_has = |m: &Mapping, x: &str, y: &str| { + assert_eq!(m.get(x), Some(&y.to_string())); + }; + + let m = Mapping::new(Scheme::Devanagari, Scheme::Itrans); + assert_has(&m, "आ", "A"); + assert_has(&m, "\u{093e}", "A"); + assert_has(&m, "ए", "e"); + assert_has(&m, "\u{0947}", "e"); + + let m = Mapping::new(Scheme::Bengali, Scheme::Itrans); + assert_has(&m, "\u{09be}", "A"); + assert_has(&m, "\u{09c7}", "e"); + } + + #[test] + fn test_transliterate() { + let t = |s| transliterate(s, Scheme::HarvardKyoto, Scheme::Devanagari); + assert_eq!(t("namaskRtya"), "नमस्कृत्य"); } } diff --git a/vidyut-lipi/src/schemes.rs b/vidyut-lipi/src/schemes.rs new file mode 100644 index 0000000..601a17e --- /dev/null +++ b/vidyut-lipi/src/schemes.rs @@ -0,0 +1,1899 @@ +#![allow(unused)] + +//! Auto-generated scheme data. +//! +//! These schemes were auto-generated from the `common_maps` repository +//! from the `indic-transliteration` project. 
+ +pub const BENGALI: &[(&str, &str)] = &[ + ("अ", "অ"), + ("आ", "আ"), + ("इ", "ই"), + ("ई", "ঈ"), + ("उ", "উ"), + ("ऊ", "ঊ"), + ("ऋ", "ঋ"), + ("ॠ", "ৠ"), + ("ऌ", "ঌ"), + ("ॡ", "ৡ"), + ("ऎ", "ऎ"), + ("ए", "এ"), + ("ऐ", "ঐ"), + ("ऒ", "ऒ"), + ("ओ", "ও"), + ("औ", "ঔ"), + ("ा", "া"), + ("ि", "ি"), + ("ी", "ী"), + ("ु", "ু"), + ("ू", "ূ"), + ("ृ", "ৃ"), + ("ॄ", "ৄ"), + ("ॢ", "ৢ"), + ("ॣ", "ৣ"), + ("ॆ", "ॆ"), + ("े", "ে"), + ("ै", "ৈ"), + ("ॊ", "ॊ"), + ("ो", "ো"), + ("ौ", "ৌ"), + ("ं", "ং"), + ("ः", "ঃ"), + ("ँ", "ঁ"), + ("्", "্"), + ("क", "ক"), + ("ख", "খ"), + ("ग", "গ"), + ("घ", "ঘ"), + ("ङ", "ঙ"), + ("च", "চ"), + ("छ", "ছ"), + ("ज", "জ"), + ("झ", "ঝ"), + ("ञ", "ঞ"), + ("ट", "ট"), + ("ठ", "ঠ"), + ("ड", "ড"), + ("ढ", "ঢ"), + ("ण", "ণ"), + ("त", "ত"), + ("थ", "থ"), + ("द", "দ"), + ("ध", "ধ"), + ("न", "ন"), + ("प", "প"), + ("फ", "ফ"), + ("ब", "ব"), + ("भ", "ভ"), + ("म", "ম"), + ("य", "য"), + ("र", "র"), + ("ल", "ল"), + ("व", "ব"), + ("श", "শ"), + ("ष", "ষ"), + ("स", "স"), + ("ह", "হ"), + ("ळ", "ল়"), + ("क्ष", "ক্ষ"), + ("ज्ञ", "জ্ঞ"), + ("०", "০"), + ("१", "১"), + ("२", "২"), + ("३", "৩"), + ("४", "৪"), + ("५", "৫"), + ("६", "৬"), + ("७", "৭"), + ("८", "৮"), + ("९", "৯"), + ("ॐ", "ওঁ"), + ("ऽ", "ঽ"), + ("।", "।"), + ("॥", "॥"), + ("‍", ""), + ("", ""), + ("॑", "॑"), + ("॒", "॒"), + ("ॅ", "ে"), + ("क़", "ক়"), + ("ख़", "খ়"), + ("ग़", "গ়"), + ("ज़", "জ়"), + ("ड़", "ড়"), + ("ढ़", "ঢ়"), + ("फ़", "ফ়"), + ("य़", "য়"), + ("ऱ", "র়"), + ("ऴ", "ষ়"), +]; + +pub const BRAHMI: &[(&str, &str)] = &[ + ("अ", "𑀅"), + ("आ", "𑀆"), + ("इ", "𑀇"), + ("ई", "𑀈"), + ("उ", "𑀉"), + ("ऊ", "𑀊"), + ("ऋ", "𑀋"), + ("ॠ", "𑀌"), + ("ऌ", "𑀍"), + ("ॡ", "𑀎"), + ("ऎ", "𑀏𑁆"), + ("ए", "𑀏"), + ("ऐ", "𑀐"), + ("ऒ", "𑀑𑁆"), + ("ओ", "𑀑"), + ("औ", "𑀒"), + ("ा", "𑀸"), + ("ि", "𑀺"), + ("ी", "𑀻"), + ("ु", "𑀼"), + ("ू", "𑀽"), + ("ृ", "𑀾"), + ("ॄ", "𑀿"), + ("ॢ", "𑁀"), + ("ॣ", "𑁁"), + ("े", "𑁂"), + ("ै", "𑁃"), + ("ो", "𑁄"), + ("ौ", "𑁅"), + ("ं", "𑀁"), + ("ः", "𑀂"), + ("ँ", "𑀀"), + ("्", "𑁆"), + ("क", "𑀓"), + ("ख", 
"𑀔"), + ("ग", "𑀕"), + ("घ", "𑀖"), + ("ङ", "𑀗"), + ("च", "𑀘"), + ("छ", "𑀙"), + ("ज", "𑀚"), + ("झ", "𑀛"), + ("ञ", "𑀜"), + ("ट", "𑀝"), + ("ठ", "𑀞"), + ("ड", "𑀟"), + ("ढ", "𑀠"), + ("ण", "𑀡"), + ("त", "𑀢"), + ("थ", "𑀣"), + ("द", "𑀤"), + ("ध", "𑀥"), + ("न", "𑀦"), + ("प", "𑀧"), + ("फ", "𑀨"), + ("ब", "𑀩"), + ("भ", "𑀪"), + ("म", "𑀫"), + ("य", "𑀬"), + ("र", "𑀭"), + ("ल", "𑀮"), + ("व", "𑀯"), + ("श", "𑀰"), + ("ष", "𑀱"), + ("स", "𑀲"), + ("ह", "𑀳"), + ("ळ", "𑀴"), + ("क्ष", "𑀓𑁆𑀱"), + ("ज्ञ", "𑀚𑁆𑀜"), + ("०", "𑁦"), + ("१", "𑁧"), + ("२", "𑁨"), + ("३", "𑁩"), + ("४", "𑁪"), + ("५", "𑁫"), + ("६", "𑁬"), + ("७", "𑁭"), + ("८", "𑁮"), + ("९", "𑁯"), + ("ॐ", "𑀑𑀁"), + ("ऽ", ""), + ("।", "𑁇"), + ("॥", "𑁈"), + ("‍", "‍"), + ("", ""), + ("॑", "॑"), + ("॒", "॒"), + ("ॅ", "𑁂"), + ("क़", "𑀓"), + ("ख़", "𑀔"), + ("ग़", "𑀕"), + ("ज़", "𑀚"), + ("ड़", "𑀟"), + ("ढ़", "𑀠"), + ("फ़", "𑀨"), + ("य़", "𑀬"), + ("ऱ", "𑀭"), +]; + +pub const DEVANAGARI: &[(&str, &str)] = &[ + ("अ", "अ"), + ("आ", "आ"), + ("इ", "इ"), + ("ई", "ई"), + ("उ", "उ"), + ("ऊ", "ऊ"), + ("ऋ", "ऋ"), + ("ॠ", "ॠ"), + ("ऌ", "ऌ"), + ("ॡ", "ॡ"), + ("ऎ", "ऎ"), + ("ए", "ए"), + ("ऐ", "ऐ"), + ("ऒ", "ऒ"), + ("ओ", "ओ"), + ("औ", "औ"), + ("ऍ", "ऍ"), + ("ऑ", "ऑ"), + ("ा", "ा"), + ("ि", "ि"), + ("ी", "ी"), + ("ु", "ु"), + ("ू", "ू"), + ("ृ", "ृ"), + ("ॄ", "ॄ"), + ("ॢ", "ॢ"), + ("ॣ", "ॣ"), + ("ॆ", "ॆ"), + ("े", "े"), + ("ै", "ै"), + ("ॊ", "ॊ"), + ("ो", "ो"), + ("ौ", "ौ"), + ("ॅ", "ॅ"), + ("ॉ", "ॉ"), + ("ं", "ं"), + ("ः", "ः"), + ("ँ", "ँ"), + ("ᳵ", "ᳵ"), + ("ᳶ", "ᳶ"), + ("ꣳ", "ꣳ"), + ("्", "्"), + ("क", "क"), + ("ख", "ख"), + ("ग", "ग"), + ("घ", "घ"), + ("ङ", "ङ"), + ("च", "च"), + ("छ", "छ"), + ("ज", "ज"), + ("झ", "झ"), + ("ञ", "ञ"), + ("ट", "ट"), + ("ठ", "ठ"), + ("ड", "ड"), + ("ढ", "ढ"), + ("ण", "ण"), + ("त", "त"), + ("थ", "थ"), + ("द", "द"), + ("ध", "ध"), + ("न", "न"), + ("प", "प"), + ("फ", "फ"), + ("ब", "ब"), + ("भ", "भ"), + ("म", "म"), + ("य", "य"), + ("र", "र"), + ("ल", "ल"), + ("व", "व"), + ("श", "श"), + ("ष", "ष"), + ("स", "स"), + ("ह", "ह"), + ("ळ", 
"ळ"), + ("क्ष", "क्ष"), + ("ज्ञ", "ज्ञ"), + ("०", "०"), + ("१", "१"), + ("२", "२"), + ("३", "३"), + ("४", "४"), + ("५", "५"), + ("६", "६"), + ("७", "७"), + ("८", "८"), + ("९", "९"), + ("ॐ", "ॐ"), + ("ऽ", "ऽ"), + ("।", "।"), + ("॥", "॥"), + ("‍", "‍"), + ("‌", "‌"), + ("", ""), + ("॑", "॑"), + ("॒", "॒"), + ("᳡", "᳡"), + ("꣡", "꣡"), + ("꣢", "꣢"), + ("꣣", "꣣"), + ("꣤", "꣤"), + ("꣥", "꣥"), + ("꣦", "꣦"), + ("꣧", "꣧"), + ("꣨", "꣨"), + ("꣩", "꣩"), + ("꣪", "꣪"), + ("꣫", "꣫"), + ("꣬", "꣬"), + ("꣭", "꣭"), + ("꣮", "꣮"), + ("꣯", "꣯"), + ("꣰", "꣰"), + ("꣱", "꣱"), + ("ॅ", "ॅ"), + ("क़", "क़"), + ("ख़", "ख़"), + ("ग़", "ग़"), + ("ज़", "ज़"), + ("ड़", "ड़"), + ("ढ़", "ढ़"), + ("फ़", "फ़"), + ("य़", "य़"), + ("ऱ", "ऱ"), + ("ऴ", "ऴ"), + ("ऩ", "ऩ"), + ("क़", "क़"), + ("ख़", "ख़"), + ("फ़", "फ़"), + ("ज़", "ज़"), + ("ऩ", "ऩ"), + ("ड़", "ड़"), + ("ग़", "ग़"), + ("ढ़", "ढ़"), + ("य़", "य़"), + ("ऱ", "ऱ"), + ("ऴ", "ऴ"), +]; + +pub const GRANTHA: &[(&str, &str)] = &[ + ("अ", "𑌅"), + ("आ", "𑌆"), + ("इ", "𑌇"), + ("ई", "𑌈"), + ("उ", "𑌉"), + ("ऊ", "𑌊"), + ("ऋ", "𑌋"), + ("ॠ", "𑍠"), + ("ऌ", "𑌌"), + ("ॡ", "𑍡"), + ("ऎ", "𑌏𑌀"), + ("ए", "𑌏"), + ("ऐ", "𑌐"), + ("ऒ", "𑌓𑌀"), + ("ओ", "𑌓"), + ("औ", "𑌔"), + ("ा", "𑌾"), + ("ि", "𑌿"), + ("ी", "𑍀"), + ("ु", "𑍁"), + ("ू", "𑍂"), + ("ृ", "𑍃"), + ("ॄ", "𑍄"), + ("ॢ", "𑍢"), + ("ॣ", "𑍣"), + ("ॆ", "𑍇𑌀"), + ("े", "𑍇"), + ("ै", "𑍈"), + ("ॊ", "𑍋𑌀"), + ("ो", "𑍋"), + ("ौ", "𑍗"), + ("ं", "𑌂"), + ("ः", "𑌃"), + ("ँ", "𑌁"), + ("्", "𑍍"), + ("क", "𑌕"), + ("ख", "𑌖"), + ("ग", "𑌗"), + ("घ", "𑌘"), + ("ङ", "𑌙"), + ("च", "𑌚"), + ("छ", "𑌛"), + ("ज", "𑌜"), + ("झ", "𑌝"), + ("ञ", "𑌞"), + ("ट", "𑌟"), + ("ठ", "𑌠"), + ("ड", "𑌡"), + ("ढ", "𑌢"), + ("ण", "𑌣"), + ("त", "𑌤"), + ("थ", "𑌥"), + ("द", "𑌦"), + ("ध", "𑌧"), + ("न", "𑌨"), + ("प", "𑌪"), + ("फ", "𑌫"), + ("ब", "𑌬"), + ("भ", "𑌭"), + ("म", "𑌮"), + ("य", "𑌯"), + ("र", "𑌰"), + ("ल", "𑌲"), + ("व", "𑌵"), + ("श", "𑌶"), + ("ष", "𑌷"), + ("स", "𑌸"), + ("ह", "𑌹"), + ("ळ", "𑌳"), + ("क्ष", "𑌕𑍍𑌷"), + ("ज्ञ", "𑌜𑍍𑌞"), + ("०", "௦"), + ("१", "௧"), + 
("२", "௨"), + ("३", "௩"), + ("४", "௪"), + ("५", "௫"), + ("६", "௬"), + ("७", "௭"), + ("८", "௮"), + ("९", "௯"), + ("ॐ", "𑍐"), + ("ऽ", "𑌽"), + ("।", "।"), + ("॥", "॥"), + ("ॅ", "𑍇"), + ("क़", "𑌕𑌼"), + ("ख़", "𑌖𑌼"), + ("ग़", "𑌗𑌼"), + ("ज़", "𑌜𑌼"), + ("ड़", "𑌡𑌼"), + ("ढ़", "𑌢𑌼"), + ("फ़", "𑌫𑌼"), + ("य़", "𑌯𑌼"), + ("ऱ", "𑌰𑌼"), +]; + +pub const GUJARATI: &[(&str, &str)] = &[ + ("अ", "અ"), + ("आ", "આ"), + ("इ", "ઇ"), + ("ई", "ઈ"), + ("उ", "ઉ"), + ("ऊ", "ઊ"), + ("ऋ", "ઋ"), + ("ॠ", "ૠ"), + ("ऌ", "ઌ"), + ("ॡ", "ૡ"), + ("ऎ", "ऎ"), + ("ए", "એ"), + ("ऐ", "ઐ"), + ("ऒ", "ऒ"), + ("ओ", "ઓ"), + ("औ", "ઔ"), + ("ा", "ા"), + ("ि", "િ"), + ("ी", "ી"), + ("ु", "ુ"), + ("ू", "ૂ"), + ("ृ", "ૃ"), + ("ॄ", "ૄ"), + ("ॢ", "ૢ"), + ("ॣ", "ૣ"), + ("ॆ", "ॆ"), + ("े", "ે"), + ("ै", "ૈ"), + ("ॊ", "ॊ"), + ("ो", "ો"), + ("ौ", "ૌ"), + ("ं", "ં"), + ("ः", "ઃ"), + ("ँ", "ઁ"), + ("्", "્"), + ("क", "ક"), + ("ख", "ખ"), + ("ग", "ગ"), + ("घ", "ઘ"), + ("ङ", "ઙ"), + ("च", "ચ"), + ("छ", "છ"), + ("ज", "જ"), + ("झ", "ઝ"), + ("ञ", "ઞ"), + ("ट", "ટ"), + ("ठ", "ઠ"), + ("ड", "ડ"), + ("ढ", "ઢ"), + ("ण", "ણ"), + ("त", "ત"), + ("थ", "થ"), + ("द", "દ"), + ("ध", "ધ"), + ("न", "ન"), + ("प", "પ"), + ("फ", "ફ"), + ("ब", "બ"), + ("भ", "ભ"), + ("म", "મ"), + ("य", "ય"), + ("र", "ર"), + ("ल", "લ"), + ("व", "વ"), + ("श", "શ"), + ("ष", "ષ"), + ("स", "સ"), + ("ह", "હ"), + ("ळ", "ળ"), + ("क्ष", "ક્ષ"), + ("ज्ञ", "જ્ઞ"), + ("०", "૦"), + ("१", "૧"), + ("२", "૨"), + ("३", "૩"), + ("४", "૪"), + ("५", "૫"), + ("६", "૬"), + ("७", "૭"), + ("८", "૮"), + ("९", "૯"), + ("ॐ", "ૐ"), + ("ऽ", "ઽ"), + ("।", "।"), + ("॥", "॥"), + ("‍", ""), + ("", ""), + ("॑", "॑"), + ("॒", "॒"), + ("ॅ", "ૅ"), + ("क़", "ક઼"), + ("ख़", "ખ઼"), + ("ग़", "ગ઼"), + ("ज़", "જ઼"), + ("ड़", "ડ઼"), + ("ढ़", "ઢ઼"), + ("फ़", "ફ઼"), + ("य़", "ય઼"), + ("ऱ", "ર઼"), + ("ऴ", "ળ઼"), +]; + +pub const GURMUKHI: &[(&str, &str)] = &[ + ("अ", "ਅ"), + ("आ", "ਆ"), + ("इ", "ਇ"), + ("ई", "ਈ"), + ("उ", "ਉ"), + ("ऊ", "ਊ"), + ("ऋ", ""), + ("ॠ", ""), + ("ऌ", ""), + ("ॡ", ""), + ("ऎ", ""), + ("ए", 
"ਏ"), + ("ऐ", "ਐ"), + ("ऒ", ""), + ("ओ", "ਓ"), + ("औ", "ਔ"), + ("ा", "ਾ"), + ("ि", "ਿ"), + ("ी", "ੀ"), + ("ु", "ੁ"), + ("ू", "ੂ"), + ("ृ", ""), + ("ॄ", ""), + ("ॢ", ""), + ("ॣ", ""), + ("ॆ", ""), + ("े", "ੇ"), + ("ै", "ੈ"), + ("ॊ", ""), + ("ो", "ੋ"), + ("ौ", "ੌ"), + ("ं", "ਂ"), + ("ः", "ਃ"), + ("ँ", "ਁ"), + ("्", "੍"), + ("क", "ਕ"), + ("ख", "ਖ"), + ("ग", "ਗ"), + ("घ", "ਘ"), + ("ङ", "ਙ"), + ("च", "ਚ"), + ("छ", "ਛ"), + ("ज", "ਜ"), + ("झ", "ਝ"), + ("ञ", "ਞ"), + ("ट", "ਟ"), + ("ठ", "ਠ"), + ("ड", "ਡ"), + ("ढ", "ਢ"), + ("ण", "ਣ"), + ("त", "ਤ"), + ("थ", "ਥ"), + ("द", "ਦ"), + ("ध", "ਧ"), + ("न", "ਨ"), + ("प", "ਪ"), + ("फ", "ਫ"), + ("ब", "ਬ"), + ("भ", "ਭ"), + ("म", "ਮ"), + ("य", "ਯ"), + ("र", "ਰ"), + ("ल", "ਲ"), + ("व", "ਵ"), + ("श", "ਸ਼"), + ("ष", ""), + ("स", "ਸ"), + ("ह", "ਹ"), + ("ळ", "ਲ਼"), + ("क्ष", "ਕ੍ਸ਼"), + ("ज्ञ", "ਜ੍ਞ"), + ("०", "੦"), + ("१", "੧"), + ("२", "੨"), + ("३", "੩"), + ("४", "੪"), + ("५", "੫"), + ("६", "੬"), + ("७", "੭"), + ("८", "੮"), + ("९", "੯"), + ("ॐ", "ੴ"), + ("ऽ", "ऽ"), + ("।", "।"), + ("॥", "॥"), + ("ं", "ੰ"), + ("क़", "ਕ਼"), + ("ख़", "ਖ਼"), + ("ग़", "ਗ਼"), + ("ज़", "ਜ਼"), + ("ड़", "ੜ"), + ("ढ़", "ੜ੍ਹ"), + ("फ़", "ਫ਼"), + ("य़", "ਯ਼"), + ("ऱ", "ਰ਼"), + ("ऴ", "ਲ਼਼"), + ("ऩ", "ਨ਼"), + ("॑", "ੑ"), + ("॒", "॒"), +]; + +pub const KANNADA: &[(&str, &str)] = &[ + ("अ", "ಅ"), + ("आ", "ಆ"), + ("इ", "ಇ"), + ("ई", "ಈ"), + ("उ", "ಉ"), + ("ऊ", "ಊ"), + ("ऋ", "ಋ"), + ("ॠ", "ೠ"), + ("ऌ", "ಌ"), + ("ॡ", "ೡ"), + ("ऎ", "ಎ"), + ("ए", "ಏ"), + ("ऐ", "ಐ"), + ("ऒ", "ಒ"), + ("ओ", "ಓ"), + ("औ", "ಔ"), + ("ा", "ಾ"), + ("ि", "ಿ"), + ("ी", "ೀ"), + ("ु", "ು"), + ("ू", "ೂ"), + ("ृ", "ೃ"), + ("ॄ", "ೄ"), + ("ॢ", "ೢ"), + ("ॣ", "ೣ"), + ("ॆ", "ೆ"), + ("े", "ೇ"), + ("ै", "ೈ"), + ("ॊ", "ೊ"), + ("ो", "ೋ"), + ("ौ", "ೌ"), + ("ं", "ಂ"), + ("ः", "ಃ"), + ("ँ", "ಁ"), + ("ᳵ", "ೱ"), + ("ᳶ", "ೲ"), + ("्", "್"), + ("क", "ಕ"), + ("ख", "ಖ"), + ("ग", "ಗ"), + ("घ", "ಘ"), + ("ङ", "ಙ"), + ("च", "ಚ"), + ("छ", "ಛ"), + ("ज", "ಜ"), + ("झ", "ಝ"), + ("ञ", "ಞ"), + ("ट", "ಟ"), + ("ठ", "ಠ"), + ("ड", "ಡ"), + 
("ढ", "ಢ"), + ("ण", "ಣ"), + ("त", "ತ"), + ("थ", "ಥ"), + ("द", "ದ"), + ("ध", "ಧ"), + ("न", "ನ"), + ("प", "ಪ"), + ("फ", "ಫ"), + ("ब", "ಬ"), + ("भ", "ಭ"), + ("म", "ಮ"), + ("य", "ಯ"), + ("र", "ರ"), + ("ल", "ಲ"), + ("व", "ವ"), + ("श", "ಶ"), + ("ष", "ಷ"), + ("स", "ಸ"), + ("ह", "ಹ"), + ("ळ", "ಳ"), + ("क्ष", "ಕ್ಷ"), + ("ज्ञ", "ಜ್ಞ"), + ("०", "೦"), + ("१", "೧"), + ("२", "೨"), + ("३", "೩"), + ("४", "೪"), + ("५", "೫"), + ("६", "೬"), + ("७", "೭"), + ("८", "೮"), + ("९", "೯"), + ("ॐ", "ಓಂ"), + ("ऽ", "ಽ"), + ("।", "।"), + ("॥", "॥"), + ("क़", "ಕ಼"), + ("ख़", "ಖ಼"), + ("ग़", "ಗ಼"), + ("ज़", "ಜ಼"), + ("ड़", "ಡ಼"), + ("ढ़", "ಢ಼"), + ("फ़", "ಫ಼"), + ("य़", "ಯ಼"), + ("ऱ", "ಱ"), + ("ऴ", "ೞ"), + ("ऩ", "ನ಼"), + ("ज़", "ಸ಼"), +]; + +pub const MALAYALAM: &[(&str, &str)] = &[ + ("अ", "അ"), + ("आ", "ആ"), + ("इ", "ഇ"), + ("ई", "ഈ"), + ("उ", "ഉ"), + ("ऊ", "ഊ"), + ("ऋ", "ഋ"), + ("ॠ", "ൠ"), + ("ऌ", "ഌ"), + ("ॡ", "ൡ"), + ("ऎ", "എ"), + ("ए", "ഏ"), + ("ऐ", "ഐ"), + ("ऒ", "ഒ"), + ("ओ", "ഓ"), + ("औ", "ഔ"), + ("ा", "ാ"), + ("ि", "ി"), + ("ी", "ീ"), + ("ु", "ു"), + ("ू", "ൂ"), + ("ृ", "ൃ"), + ("ॄ", "ൄ"), + ("ॢ", "ൢ"), + ("ॣ", "ൣ"), + ("ॆ", "െ"), + ("े", "േ"), + ("ै", "ൈ"), + ("ॊ", "ൊ"), + ("ो", "ോ"), + ("ौ", "ൌ"), + ("ं", "ം"), + ("ः", "ഃ"), + ("ँ", "ഁ"), + ("्", "്"), + ("क", "ക"), + ("ख", "ഖ"), + ("ग", "ഗ"), + ("घ", "ഘ"), + ("ङ", "ങ"), + ("च", "ച"), + ("छ", "ഛ"), + ("ज", "ജ"), + ("झ", "ഝ"), + ("ञ", "ഞ"), + ("ट", "ട"), + ("ठ", "ഠ"), + ("ड", "ഡ"), + ("ढ", "ഢ"), + ("ण", "ണ"), + ("त", "ത"), + ("थ", "ഥ"), + ("द", "ദ"), + ("ध", "ധ"), + ("न", "ന"), + ("प", "പ"), + ("फ", "ഫ"), + ("ब", "ബ"), + ("भ", "ഭ"), + ("म", "മ"), + ("य", "യ"), + ("र", "ര"), + ("ल", "ല"), + ("व", "വ"), + ("श", "ശ"), + ("ष", "ഷ"), + ("स", "സ"), + ("ह", "ഹ"), + ("ळ", "ള"), + ("क्ष", "ക്ഷ"), + ("ज्ञ", "ജ്ഞ"), + ("०", "൦"), + ("१", "൧"), + ("२", "൨"), + ("३", "൩"), + ("४", "൪"), + ("५", "൫"), + ("६", "൬"), + ("७", "൭"), + ("८", "൮"), + ("९", "൯"), + ("ॐ", "ഓം"), + ("ऽ", "ഽ"), + ("।", "।"), + ("॥", "॥"), + ("क़", ""), + ("ख़", ""), + ("ग़", 
""), + ("ज़", ""), + ("ड़", ""), + ("ढ़", ""), + ("फ़", ""), + ("य़", ""), + ("ऱ", "റ"), + ("ऴ", "ഴ"), +]; + +pub const ORIYA: &[(&str, &str)] = &[ + ("अ", "ଅ"), + ("आ", "ଆ"), + ("इ", "ଇ"), + ("ई", "ଈ"), + ("उ", "ଉ"), + ("ऊ", "ଊ"), + ("ऋ", "ଋ"), + ("ॠ", "ୠ"), + ("ऌ", "ଌ"), + ("ॡ", "ୡ"), + ("ऎ", "ऎ"), + ("ए", "ଏ"), + ("ऐ", "ଐ"), + ("ऒ", "ऒ"), + ("ओ", "ଓ"), + ("औ", "ଔ"), + ("ा", "ା"), + ("ि", "ି"), + ("ी", "ୀ"), + ("ु", "ୁ"), + ("ू", "ୂ"), + ("ृ", "ୃ"), + ("ॄ", "ୄ"), + ("ॢ", "ୢ"), + ("ॣ", "ୣ"), + ("ॆ", "ॆ"), + ("े", "େ"), + ("ै", "ୈ"), + ("ॊ", "ॊ"), + ("ो", "ୋ"), + ("ौ", "ୌ"), + ("ं", "ଂ"), + ("ः", "ଃ"), + ("ँ", "ଁ"), + ("्", "୍"), + ("क", "କ"), + ("ख", "ଖ"), + ("ग", "ଗ"), + ("घ", "ଘ"), + ("ङ", "ଙ"), + ("च", "ଚ"), + ("छ", "ଛ"), + ("ज", "ଜ"), + ("झ", "ଝ"), + ("ञ", "ଞ"), + ("ट", "ଟ"), + ("ठ", "ଠ"), + ("ड", "ଡ"), + ("ढ", "ଢ"), + ("ण", "ଣ"), + ("त", "ତ"), + ("थ", "ଥ"), + ("द", "ଦ"), + ("ध", "ଧ"), + ("न", "ନ"), + ("प", "ପ"), + ("फ", "ଫ"), + ("ब", "ବ"), + ("भ", "ଭ"), + ("म", "ମ"), + ("य", "ଯ"), + ("र", "ର"), + ("ल", "ଲ"), + ("व", "ଵ"), + ("श", "ଶ"), + ("ष", "ଷ"), + ("स", "ସ"), + ("ह", "ହ"), + ("ळ", "ଳ"), + ("क्ष", "କ୍ଷ"), + ("ज्ञ", "ଜ୍ଞ"), + ("०", "୦"), + ("१", "୧"), + ("२", "୨"), + ("३", "୩"), + ("४", "୪"), + ("५", "୫"), + ("६", "୬"), + ("७", "୭"), + ("८", "୮"), + ("९", "୯"), + ("ॐ", "ଓଁ"), + ("ऽ", "ଽ"), + ("।", "।"), + ("॥", "॥"), + ("‍", ""), + ("", ""), + ("॑", "॑"), + ("॒", "॒"), + ("ॅ", "େ"), + ("क़", "କ଼"), + ("ख़", "ଖ଼"), + ("ग़", "ଗ଼"), + ("ज़", "ଜ଼"), + ("ड़", "ଡ଼"), + ("ढ़", "ଢ଼"), + ("फ़", "ଫ଼"), + ("य़", "ୟ"), + ("ऱ", "ର଼"), + ("ऴ", "ଳ଼"), +]; + +pub const SINHALA: &[(&str, &str)] = &[ + ("अ", "අ"), + ("आ", "ආ"), + ("इ", "ඉ"), + ("ई", "ඊ"), + ("उ", "උ"), + ("ऊ", "ඌ"), + ("ऋ", "ඍ"), + ("ॠ", "ඎ"), + ("ऌ", "ඏ"), + ("ॡ", "ඐ"), + ("ऎ", "එ"), + ("ए", "ඒ"), + ("ऐ", "ඓ"), + ("ऒ", "ඔ"), + ("ओ", "ඕ"), + ("औ", "ඖ"), + ("ा", "ා"), + ("ि", "ි"), + ("ी", "ී"), + ("ु", "ු"), + ("ू", "ූ"), + ("ृ", "ෘ"), + ("ॄ", "ෲ"), + ("ॢ", "ෟ"), + ("ॣ", "ෳ"), + ("ॆ", "ෙ"), + ("े", "ේ"), + 
("ै", "ෛ"), + ("ॊ", "ො"), + ("ो", "ෝ"), + ("ौ", "ෞ"), + ("ं", "ං"), + ("ः", "ඃ"), + ("्", "්"), + ("क", "ක"), + ("ख", "ඛ"), + ("ग", "ග"), + ("घ", "ඝ"), + ("ङ", "ඞ"), + ("च", "ච"), + ("छ", "ඡ"), + ("ज", "ජ"), + ("झ", "ඣ"), + ("ञ", "ඤ"), + ("ट", "ට"), + ("ठ", "ඨ"), + ("ड", "ඩ"), + ("ढ", "ඪ"), + ("ण", "ණ"), + ("त", "ත"), + ("थ", "ථ"), + ("द", "ද"), + ("ध", "ධ"), + ("न", "න"), + ("प", "ප"), + ("फ", "ඵ"), + ("ब", "බ"), + ("भ", "භ"), + ("म", "ම"), + ("य", "ය"), + ("र", "ර"), + ("ल", "ල"), + ("व", "ව"), + ("श", "ශ"), + ("ष", "ෂ"), + ("स", "ස"), + ("ह", "හ"), + ("ळ", "ළ"), + ("क्ष", "ක්‍ෂ"), + ("ज्ञ", "ඥ"), + ("०", "0"), + ("१", "1"), + ("२", "2"), + ("३", "3"), + ("४", "4"), + ("५", "5"), + ("६", "6"), + ("७", "7"), + ("८", "8"), + ("९", "9"), + ("ॐ", "ඕං"), + ("ऽ", "(අ)"), + ("।", "."), + ("॥", ".."), + ("ॅ", "ැ"), +]; + +pub const TAMIL: &[(&str, &str)] = &[ + ("अ", "அ"), + ("आ", "ஆ"), + ("इ", "இ"), + ("ई", "ஈ"), + ("उ", "உ"), + ("ऊ", "ஊ"), + ("ऋ", "ரு'"), + ("ॠ", "ரூ'"), + ("ऌ", "லு'"), + ("ॡ", "லூ'"), + ("ऎ", "எ"), + ("ए", "ஏ"), + ("ऐ", "ஐ"), + ("ऒ", "ஒ"), + ("ओ", "ஓ"), + ("औ", "ஔ"), + ("ा", "ா"), + ("ि", "ி"), + ("ी", "ீ"), + ("ु", "ு"), + ("ू", "ூ"), + ("ृ", "்ரு'"), + ("ॄ", "்ரூ'"), + ("ॢ", "்லு'"), + ("ॣ", "்லூ'"), + ("ॆ", "ெ"), + ("े", "ே"), + ("ै", "ை"), + ("ॊ", "ொ"), + ("ो", "ோ"), + ("ौ", "ௌ"), + ("ं", "ம்"), + ("ः", "ஃ"), + ("ँ", ""), + ("्", "்"), + ("क", "க"), + ("ख", "க"), + ("ग", "க"), + ("घ", "க"), + ("ङ", "ங"), + ("च", "ச"), + ("छ", "ச"), + ("ज", "ஜ"), + ("झ", "ச"), + ("ञ", "ஞ"), + ("ट", "ட"), + ("ठ", "ட"), + ("ड", "ட"), + ("ढ", "ட"), + ("ण", "ண"), + ("त", "த"), + ("थ", "த"), + ("द", "த"), + ("ध", "த"), + ("न", "ந"), + ("प", "ப"), + ("फ", "ப"), + ("ब", "ப"), + ("भ", "ப"), + ("म", "ம"), + ("य", "ய"), + ("र", "ர"), + ("ल", "ல"), + ("व", "வ"), + ("श", "ஶ"), + ("ष", "ஷ"), + ("स", "ஸ"), + ("ह", "ஹ"), + ("ळ", "ள"), + ("क्ष", "க்ஷ"), + ("ज्ञ", "ஜ்ஞ"), + ("०", "௦"), + ("१", "௧"), + ("२", "௨"), + ("३", "௩"), + ("४", "௪"), + ("५", "௫"), + ("६", "௬"), + ("७", "௭"), 
+ ("८", "௮"), + ("९", "௯"), + ("ॐ", "ௐ"), + ("ऽ", "ऽ"), + ("।", "।"), + ("॥", "॥"), + ("क़", "ஃ'க"), + ("ख़", "ஃக²"), + ("ग़", "ஃக³"), + ("ज़", "ஃஜ"), + ("ड़", "ஃட²"), + ("ढ़", "ஃட³"), + ("फ़", "ஃப"), + ("य़", "ஃய"), + ("ऱ", "ற"), + ("ऴ", "ழ"), + ("ऩ", "ன"), +]; + +pub const TELUGU: &[(&str, &str)] = &[ + ("अ", "అ"), + ("आ", "ఆ"), + ("इ", "ఇ"), + ("ई", "ఈ"), + ("उ", "ఉ"), + ("ऊ", "ఊ"), + ("ऋ", "ఋ"), + ("ॠ", "ౠ"), + ("ऌ", "ఌ"), + ("ॡ", "ౡ"), + ("ऎ", "ఎ"), + ("ए", "ఏ"), + ("ऐ", "ఐ"), + ("ऒ", "ఒ"), + ("ओ", "ఓ"), + ("औ", "ఔ"), + ("ा", "ా"), + ("ि", "ి"), + ("ी", "ీ"), + ("ु", "ు"), + ("ू", "ూ"), + ("ृ", "ృ"), + ("ॄ", "ౄ"), + ("ॢ", "ౢ"), + ("ॣ", "ౣ"), + ("ॆ", "ె"), + ("े", "ే"), + ("ै", "ై"), + ("ॊ", "ొ"), + ("ो", "ో"), + ("ौ", "ౌ"), + ("ं", "ం"), + ("ः", "ః"), + ("ँ", "ఁ"), + ("्", "్"), + ("क", "క"), + ("ख", "ఖ"), + ("ग", "గ"), + ("घ", "ఘ"), + ("ङ", "ఙ"), + ("च", "చ"), + ("छ", "ఛ"), + ("ज", "జ"), + ("झ", "ఝ"), + ("ञ", "ఞ"), + ("ट", "ట"), + ("ठ", "ఠ"), + ("ड", "డ"), + ("ढ", "ఢ"), + ("ण", "ణ"), + ("त", "త"), + ("थ", "థ"), + ("द", "ద"), + ("ध", "ధ"), + ("न", "న"), + ("प", "ప"), + ("फ", "ఫ"), + ("ब", "బ"), + ("भ", "భ"), + ("म", "మ"), + ("य", "య"), + ("र", "ర"), + ("ल", "ల"), + ("व", "వ"), + ("श", "శ"), + ("ष", "ష"), + ("स", "స"), + ("ह", "హ"), + ("ळ", "ళ"), + ("क्ष", "క్ష"), + ("ज्ञ", "జ్ఞ"), + ("०", "౦"), + ("१", "౧"), + ("२", "౨"), + ("३", "౩"), + ("४", "౪"), + ("५", "౫"), + ("६", "౬"), + ("७", "౭"), + ("८", "౮"), + ("९", "౯"), + ("ॐ", "ఓం"), + ("ऽ", "ఽ"), + ("।", "।"), + ("॥", "॥"), + ("क़", ""), + ("ख़", ""), + ("ग़", ""), + ("ज़", ""), + ("ड़", ""), + ("ढ़", ""), + ("फ़", ""), + ("य़", ""), + ("ऱ", "ఱ"), + ("ऴ", "ఴ"), + ("ऩ", ""), +]; + +pub const TIBETAN: &[(&str, &str)] = &[ + ("अ", "ཨ"), + ("आ", "ཨཱ"), + ("इ", "ཨི"), + ("ई", "ཨཱི"), + ("उ", "ཨུ"), + ("ऊ", "ཨཱུ"), + ("ऋ", "རྀ"), + ("ॠ", "རཱྀ"), + ("ऌ", "ལྀ"), + ("ॡ", "ལཱྀ"), + ("ऎ", "ཨེ"), + ("ए", "ཨེ"), + ("ऐ", "ཨཻ"), + ("ऒ", "ཨོ"), + ("ओ", "ཨོ"), + ("औ", "ཨཽ"), + ("ा", "ཱ"), + ("ि", "ི"), + ("ी", "ཱི"), + ("ु", 
"ུ"), + ("ू", "ཱུ"), + ("ृ", "ྲྀ"), + ("ॄ", "ྲཱྀ"), + ("ॢ", "ླྀ"), + ("ॣ", "ླཱྀ"), + ("ॆ", "ེ"), + ("े", "ེ"), + ("ै", "ཻ"), + ("ॊ", "ོ"), + ("ो", "ོ"), + ("ौ", "ཽ"), + ("ं", "ཾ"), + ("ः", "ཿ"), + ("ँ", "ྃ"), + ("्", ""), + ("क", "ཀ"), + ("ख", "ཁ"), + ("ग", "ག"), + ("घ", "གྷ"), + ("ङ", "ང"), + ("च", "ཙ"), + ("छ", "ཚ"), + ("ज", "ཛ"), + ("झ", "ཛྷ"), + ("ञ", "ཉ"), + ("ट", "ཊ"), + ("ठ", "ཋ"), + ("ड", "ཌ"), + ("ढ", "ཌྷ"), + ("ण", "ཎ"), + ("त", "ཏ"), + ("थ", "ཐ"), + ("द", "ད"), + ("ध", "དྷ"), + ("न", "ན"), + ("प", "པ"), + ("फ", "ཕ"), + ("ब", "བ"), + ("भ", "བྷ"), + ("म", "མ"), + ("य", "ཡ"), + ("र", "ར"), + ("ल", "ལ"), + ("व", "བ"), + ("श", "ཤ"), + ("ष", "ཥ"), + ("स", "ས"), + ("ह", "ཧ"), + ("ळ", "ལ༹"), + ("क्ष", "ཀྵ"), + ("ज्ञ", "ཛྙ"), + ("०", "༠"), + ("१", "༡"), + ("२", "༢"), + ("३", "༣"), + ("४", "༤"), + ("५", "༥"), + ("६", "༦"), + ("७", "༧"), + ("८", "༨"), + ("९", "༩"), + ("ॐ", "ༀ"), + ("ऽ", "྅"), + ("।", "།"), + ("॥", "༎"), + ("ॅ", "ེ"), + ("क़", "ཀ༹"), + ("ख़", "ཁ༹"), + ("ग़", "ག༹"), + ("ज़", "ཟ"), + ("ड़", "ཌ༹"), + ("ढ़", "ཌྷ༹"), + ("फ़", "ཕ༹"), + ("य़", "ཡ༹"), + ("ऱ", "ར༹"), +]; + +pub const HK: &[(&str, &str)] = &[ + ("अ", "a"), + ("आ", "A"), + ("इ", "i"), + ("ई", "I"), + ("उ", "u"), + ("ऊ", "U"), + ("ऋ", "R"), + ("ॠ", "RR"), + ("ऌ", "lR"), + ("ॡ", "lRR"), + ("ऎ", "è"), + ("ए", "e"), + ("ऐ", "ai"), + ("ऒ", "ò"), + ("ओ", "o"), + ("औ", "au"), + ("ा", "A"), + ("ि", "i"), + ("ी", "I"), + ("ु", "u"), + ("ू", "U"), + ("ृ", "R"), + ("ॄ", "RR"), + ("ॢ", "lR"), + ("ॣ", "lRR"), + ("ॆ", "è"), + ("े", "e"), + ("ै", "ai"), + ("ॊ", "ò"), + ("ो", "o"), + ("ौ", "au"), + ("ं", "M"), + ("ः", "H"), + ("ँ", "~"), + ("्", ""), + ("क", "k"), + ("ख", "kh"), + ("ग", "g"), + ("घ", "gh"), + ("ङ", "G"), + ("च", "c"), + ("छ", "ch"), + ("ज", "j"), + ("झ", "jh"), + ("ञ", "J"), + ("ट", "T"), + ("ठ", "Th"), + ("ड", "D"), + ("ढ", "Dh"), + ("ण", "N"), + ("त", "t"), + ("थ", "th"), + ("द", "d"), + ("ध", "dh"), + ("न", "n"), + ("प", "p"), + ("फ", "ph"), + ("ब", "b"), + ("भ", "bh"), + ("म", "m"), + 
("य", "y"), + ("र", "r"), + ("ल", "l"), + ("व", "v"), + ("श", "z"), + ("ष", "S"), + ("स", "s"), + ("ह", "h"), + ("ळ", "L"), + ("क्ष", "kS"), + ("ज्ञ", "jJ"), + ("०", "0"), + ("१", "1"), + ("२", "2"), + ("३", "3"), + ("४", "4"), + ("५", "5"), + ("६", "6"), + ("७", "7"), + ("८", "8"), + ("९", "9"), + ("ॐ", "OM"), + ("ऽ", "'"), + ("।", "."), + ("॥", ".."), + ("क़", "q"), + ("ख़", "qh"), + ("ग़", "g2"), + ("ज़", "z2"), + ("ड़", "r3"), + ("ढ़", "r3h"), + ("फ़", "f"), + ("य़", "Y"), + ("ऱ", "r2"), + ("ऴ", "zh"), + ("ऩ", "n2"), +]; + +pub const IAST: &[(&str, &str)] = &[ + ("अ", "a"), + ("आ", "ā"), + ("इ", "i"), + ("ई", "ī"), + ("उ", "u"), + ("ऊ", "ū"), + ("ऋ", "ṛ"), + ("ॠ", "ṝ"), + ("ऌ", "ḷ"), + ("ॡ", "ḹ"), + ("ऎ", "è"), + ("ए", "e"), + ("ऐ", "ai"), + ("ऒ", "ò"), + ("ओ", "o"), + ("औ", "au"), + ("ा", "ā"), + ("ि", "i"), + ("ी", "ī"), + ("ु", "u"), + ("ू", "ū"), + ("ृ", "ṛ"), + ("ॄ", "ṝ"), + ("ॢ", "ḷ"), + ("ॣ", "ḹ"), + ("ॆ", "è"), + ("े", "e"), + ("ै", "ai"), + ("ॊ", "ò"), + ("ो", "o"), + ("ौ", "au"), + ("ं", "ṃ"), + ("ः", "ḥ"), + ("ँ", "~"), + ("ꣳ", "m̐"), + ("्", ""), + ("॑", "̭"), + ("॒", "॒"), + ("᳡", "̀"), + ("꣡", "́"), + ("꣢", "²"), + ("꣣", "³"), + ("꣤", "⁴"), + ("꣥", "⁵"), + ("꣦", "⁶"), + ("꣧", "⁷"), + ("꣨", "⁸"), + ("꣩", "⁹"), + ("꣪", "꣪"), + ("꣫", "꣫"), + ("꣬", "꣬"), + ("꣭", "꣭"), + ("꣮", "꣮"), + ("꣯", "꣯"), + ("꣰", "꣰"), + ("꣱", "꣱"), + ("क", "k"), + ("ख", "kh"), + ("ग", "g"), + ("घ", "gh"), + ("ङ", "ṅ"), + ("च", "c"), + ("छ", "ch"), + ("ज", "j"), + ("झ", "jh"), + ("ञ", "ñ"), + ("ट", "ṭ"), + ("ठ", "ṭh"), + ("ड", "ḍ"), + ("ढ", "ḍh"), + ("ण", "ṇ"), + ("त", "t"), + ("थ", "th"), + ("द", "d"), + ("ध", "dh"), + ("न", "n"), + ("प", "p"), + ("फ", "ph"), + ("ब", "b"), + ("भ", "bh"), + ("म", "m"), + ("य", "y"), + ("र", "r"), + ("ल", "l"), + ("व", "v"), + ("श", "ś"), + ("ष", "ṣ"), + ("स", "s"), + ("ह", "h"), + ("ळ", "ḻ"), + ("क्ष", "kṣ"), + ("ज्ञ", "jñ"), + ("०", "0"), + ("१", "1"), + ("२", "2"), + ("३", "3"), + ("४", "4"), + ("५", "5"), + ("६", "6"), + ("७", "7"), + ("८", 
"8"), + ("९", "9"), + ("ॐ", "oṃ"), + ("ऽ", "'"), + ("।", "."), + ("॥", ".."), + ("क़", "q"), + ("ख़", "k͟h"), + ("ग़", "ġ"), + ("ज़", "z"), + ("ड़", "r̤"), + ("ढ़", "r̤h"), + ("फ़", "f"), + ("य़", "ẏ"), + ("ऱ", "ṟ"), + ("ऩ", "ṉ"), + ("ऽ", "`"), + ("ऽ", "’"), + ("ꣳ", "ṁ"), + ("ꣳ", "ṁ"), + ("ं", "ṃ"), + ("ः", "ḥ"), + ("ट", "ṭ"), + ("ठ", "ṭh"), + ("ड", "ḍ"), + ("ढ", "ḍh"), + ("ण", "ṇ"), + ("ष", "ṣ"), + ("ङ", "ṅ"), + ("ञ", "ñ"), + ("ऋ", "r̥"), + ("ऋ", "ṛ"), + ("ॠ", "ṝ"), + ("ॠ", "r̥̄"), + ("ॠ", "r̥̄"), + ("ॠ", "ṝ"), + ("ॠ", "ṝ"), + ("꣡", "¹"), +]; + +pub const ITRANS: &[(&str, &str)] = &[ + ("अ", "a"), + ("आ", "A"), + ("इ", "i"), + ("ई", "I"), + ("उ", "u"), + ("ऊ", "U"), + ("ऋ", "RRi"), + ("ॠ", "RRI"), + ("ऌ", "LLi"), + ("ॡ", "LLI"), + ("ऎ", "è"), + ("ए", "e"), + ("ऐ", "ai"), + ("ऒ", "ò"), + ("ओ", "o"), + ("औ", "au"), + ("ा", "A"), + ("ि", "i"), + ("ी", "I"), + ("ु", "u"), + ("ू", "U"), + ("ृ", "RRi"), + ("ॄ", "RRI"), + ("ॢ", "LLi"), + ("ॣ", "LLI"), + ("ॆ", "è"), + ("े", "e"), + ("ै", "ai"), + ("ॊ", "ò"), + ("ो", "o"), + ("ौ", "au"), + ("ं", "M"), + ("ः", "H"), + ("ँ", ".N"), + ("्", ""), + ("क", "k"), + ("ख", "kh"), + ("ग", "g"), + ("घ", "gh"), + ("ङ", "~N"), + ("च", "ch"), + ("छ", "Ch"), + ("ज", "j"), + ("झ", "jh"), + ("ञ", "~n"), + ("ट", "T"), + ("ठ", "Th"), + ("ड", "D"), + ("ढ", "Dh"), + ("ण", "N"), + ("त", "t"), + ("थ", "th"), + ("द", "d"), + ("ध", "dh"), + ("न", "n"), + ("प", "p"), + ("फ", "ph"), + ("ब", "b"), + ("भ", "bh"), + ("म", "m"), + ("य", "y"), + ("र", "r"), + ("ल", "l"), + ("व", "v"), + ("श", "sh"), + ("ष", "Sh"), + ("स", "s"), + ("ह", "h"), + ("ळ", "L"), + ("क्ष", "kSh"), + ("ज्ञ", "j~n"), + ("०", "0"), + ("१", "1"), + ("२", "2"), + ("३", "3"), + ("४", "4"), + ("५", "5"), + ("६", "6"), + ("७", "7"), + ("८", "8"), + ("९", "9"), + ("ॐ", "OM"), + ("ऽ", ".a"), + ("।", "|"), + ("॥", "||"), + ("ॅ", ".c"), + ("‍", "{}"), + ("", "_"), + ("॑", "\\'"), + ("॒", "\\_"), + ("क़", "q"), + ("ख़", "K"), + ("ग़", "G"), + ("ज़", "z"), + ("ड़", ".D"), + ("ढ़", ".Dh"), + 
("फ़", "f"), + ("य़", "Y"), + ("ऱ", "R"), + ("ऴ", "zh"), + ("आ", "aa"), + ("ई", "ii"), + ("ई", "ee"), + ("ऊ", "uu"), + ("ऊ", "oo"), + ("ऋ", "R^i"), + ("ॠ", "R^I"), + ("ऌ", "L^i"), + ("ॡ", "L^I"), + ("ं", ".m"), + ("ं", ".n"), + ("ङ", "N^"), + ("च", "c"), + ("छ", "C"), + ("छ", "chh"), + ("ञ", "JN"), + ("व", "w"), + ("ष", "S"), + ("ष", "shh"), + ("क्ष", "kS"), + ("क्ष", "x"), + ("ज्ञ", "GY"), + ("ज्ञ", "dny"), + ("ॐ", "AUM"), + ("॒", "\\`"), + ("ऽ", "~"), + ("।", "."), + ("॥", ".."), + ("ज़", "J"), +]; + +pub const SLP1: &[(&str, &str)] = &[ + ("अ", "a"), + ("आ", "A"), + ("इ", "i"), + ("ई", "I"), + ("उ", "u"), + ("ऊ", "U"), + ("ऋ", "f"), + ("ॠ", "F"), + ("ऌ", "x"), + ("ॡ", "X"), + ("ऎ", "è"), + ("ए", "e"), + ("ऐ", "E"), + ("ऒ", "ò"), + ("ओ", "o"), + ("औ", "O"), + ("ा", "A"), + ("ि", "i"), + ("ी", "I"), + ("ु", "u"), + ("ू", "U"), + ("ृ", "f"), + ("ॄ", "F"), + ("ॢ", "x"), + ("ॣ", "X"), + ("ॆ", "è"), + ("े", "e"), + ("ै", "E"), + ("ॊ", "ò"), + ("ो", "o"), + ("ौ", "O"), + ("ं", "M"), + ("ः", "H"), + ("ँ", "~"), + ("्", ""), + ("क", "k"), + ("ख", "K"), + ("ग", "g"), + ("घ", "G"), + ("ङ", "N"), + ("च", "c"), + ("छ", "C"), + ("ज", "j"), + ("झ", "J"), + ("ञ", "Y"), + ("ट", "w"), + ("ठ", "W"), + ("ड", "q"), + ("ढ", "Q"), + ("ण", "R"), + ("त", "t"), + ("थ", "T"), + ("द", "d"), + ("ध", "D"), + ("न", "n"), + ("प", "p"), + ("फ", "P"), + ("ब", "b"), + ("भ", "B"), + ("म", "m"), + ("य", "y"), + ("र", "r"), + ("ल", "l"), + ("व", "v"), + ("श", "S"), + ("ष", "z"), + ("स", "s"), + ("ह", "h"), + ("ळ", "L"), + ("क्ष", "kz"), + ("ज्ञ", "jY"), + ("०", "0"), + ("१", "1"), + ("२", "2"), + ("३", "3"), + ("४", "4"), + ("५", "5"), + ("६", "6"), + ("७", "7"), + ("८", "8"), + ("९", "9"), + ("ॐ", "AUM"), + ("ऽ", "'"), + ("।", "."), + ("॥", ".."), + ("क़", "k0"), + ("ख़", "K0"), + ("ग़", "g0"), + ("ज़", "j0"), + ("ड़", "q0"), + ("ढ़", "Q0"), + ("फ़", "P0"), + ("य़", "Y0"), + ("ऱ", "r2"), + ("ऴ", "L0"), + ("ऩ", "n0"), +]; + +pub const VELTHUIS: &[(&str, &str)] = &[ + ("अ", "a"), + ("आ", "aa"), + 
("इ", "i"), + ("ई", "ii"), + ("उ", "u"), + ("ऊ", "uu"), + ("ऋ", ".r"), + ("ॠ", ".R"), + ("ऌ", ".l"), + ("ॡ", ".L"), + ("ऎ", "è"), + ("ए", "e"), + ("ऐ", "ai"), + ("ऒ", "ò"), + ("ओ", "o"), + ("औ", "au"), + ("ा", "aa"), + ("ि", "i"), + ("ी", "ii"), + ("ु", "u"), + ("ू", "uu"), + ("ृ", ".r"), + ("ॄ", ".R"), + ("ॢ", ".l"), + ("ॣ", ".L"), + ("ॆ", "è"), + ("े", "e"), + ("ै", "ai"), + ("ॊ", "ò"), + ("ो", "o"), + ("ौ", "au"), + ("ं", ".m"), + ("ः", ".h"), + ("ँ", "~m"), + ("्", ""), + ("क", "k"), + ("ख", "kh"), + ("ग", "g"), + ("घ", "gh"), + ("ङ", "\"n"), + ("च", "c"), + ("छ", "ch"), + ("ज", "j"), + ("झ", "jh"), + ("ञ", "~n"), + ("ट", ".t"), + ("ठ", ".th"), + ("ड", ".d"), + ("ढ", ".dh"), + ("ण", ".n"), + ("त", "t"), + ("थ", "th"), + ("द", "d"), + ("ध", "dh"), + ("न", "n"), + ("प", "p"), + ("फ", "ph"), + ("ब", "b"), + ("भ", "bh"), + ("म", "m"), + ("य", "y"), + ("र", "r"), + ("ल", "l"), + ("व", "v"), + ("श", "\"s"), + ("ष", ".s"), + ("स", "s"), + ("ह", "h"), + ("ळ", "L"), + ("क्ष", "k.s"), + ("ज्ञ", "j~n"), + ("०", "0"), + ("१", "1"), + ("२", "2"), + ("३", "3"), + ("४", "4"), + ("५", "5"), + ("६", "6"), + ("७", "7"), + ("८", "8"), + ("९", "9"), + ("ॐ", "O"), + ("ऽ", ".a"), + ("।", "|"), + ("॥", "||"), +]; diff --git a/vidyut-lipi/src/wasm.rs b/vidyut-lipi/src/wasm.rs new file mode 100644 index 0000000..e83e03b --- /dev/null +++ b/vidyut-lipi/src/wasm.rs @@ -0,0 +1,38 @@ +//! WebAssembly bindings for vidyut-lipi. +use crate::{transliterate, Scheme}; +extern crate console_error_panic_hook; + +use wasm_bindgen::prelude::{wasm_bindgen, JsValue}; + +#[wasm_bindgen] +extern "C" { + /// Exposes `console.error` in case we need to log anything to the JS console. + #[wasm_bindgen(js_namespace = console, js_name = error)] + fn error(s: &str); +} + +/// WebAssembly API for vidyut-lipi. +/// +/// Within reason, we have tried to mimic a native JavaScript API. At some point, we wish to +/// support optional arguments, perhaps by using `Reflect`. 
+#[wasm_bindgen] +pub struct VidyutLipi {} + +#[wasm_bindgen] +impl VidyutLipi { + /// Creates a new API manager. + /// + /// This constructor is not called `new` because `new` is a reserved word in JavaScript. + pub fn init() -> Self { + // Logs panics to the console. Without this, panics are logged as "RuntimeError: + // Unreachable executed", which is not useful. + console_error_panic_hook::set_once(); + Self {} + } + + /// Wrapper for `transliterate`. + pub fn transliterate(&self, input: &str, from: Scheme, to: Scheme) -> JsValue { + let output = transliterate(input, from.into(), to.into()); + JsValue::from_str(&output) + } +} diff --git a/vidyut-lipi/tests/basic.rs b/vidyut-lipi/tests/basic.rs new file mode 100644 index 0000000..4694d1a --- /dev/null +++ b/vidyut-lipi/tests/basic.rs @@ -0,0 +1,142 @@ +use vidyut_lipi::Scheme::*; +use vidyut_lipi::{transliterate, Scheme}; + +/// Transliterates all input strings against each other. +/// +/// This function assumes that all strings provided are lossless. 
+fn assert_exhaustive_pairwise(examples: &[(Scheme, &str)]) { + for (from, input) in examples { + for (to, expected) in examples { + let actual = transliterate(input, *from, *to); + assert_eq!(*expected, actual, "t(\"{input}\", {from:?}, {to:?})"); + } + } +} + +#[test] +fn vowels() { + assert_exhaustive_pairwise(&[ + (HarvardKyoto, "a A i I u U R RR lR lRR e ai o au"), + (Iast, "a ā i ī u ū ṛ ṝ ḷ ḹ e ai o au"), + (Itrans, "a A i I u U RRi RRI LLi LLI e ai o au"), + (Slp1, "a A i I u U f F x X e E o O"), + (Velthuis, "a aa i ii u uu .r .R .l .L e ai o au"), + // Indic + (Bengali, "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ"), + (Brahmi, "𑀅 𑀆 𑀇 𑀈 𑀉 𑀊 𑀋 𑀌 𑀍 𑀎 𑀏 𑀐 𑀑 𑀒"), + (Devanagari, "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ए ऐ ओ औ"), + (Grantha, "𑌅 𑌆 𑌇 𑌈 𑌉 𑌊 𑌋 𑍠 𑌌 𑍡 𑌏 𑌐 𑌓 𑌔"), + (Gujarati, "અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ એ ઐ ઓ ઔ"), + (Kannada, "ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಏ ಐ ಓ ಔ"), + (Malayalam, "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ ഏ ഐ ഓ ഔ"), + (Oriya, "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ"), + (Sinhala, "අ ආ ඉ ඊ උ ඌ ඍ ඎ ඏ ඐ ඒ ඓ ඕ ඖ"), + (Telugu, "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఏ ఐ ఓ ఔ"), + ]); +} + +/// Includes vowel marks and virama +#[test] +fn dependent_marks() { + assert_exhaustive_pairwise(&[ + ( + HarvardKyoto, + "ka kA ki kI ku kU kR kRR klR klRR ke kai ko kau k", + ), + (Iast, "ka kā ki kī ku kū kṛ kṝ kḷ kḹ ke kai ko kau k"), + ( + Itrans, + "ka kA ki kI ku kU kRRi kRRI kLLi kLLI ke kai ko kau k", + ), + (Slp1, "ka kA ki kI ku kU kf kF kx kX ke kE ko kO k"), + // Indic + (Bengali, "ক কা কি কী কু কূ কৃ কৄ কৢ কৣ কে কৈ কো কৌ ক্"), + (Brahmi, "𑀓 𑀓𑀸 𑀓𑀺 𑀓𑀻 𑀓𑀼 𑀓𑀽 𑀓𑀾 𑀓𑀿 𑀓𑁀 𑀓𑁁 𑀓𑁂 𑀓𑁃 𑀓𑁄 𑀓𑁅 𑀓𑁆"), + (Devanagari, "क का कि की कु कू कृ कॄ कॢ कॣ के कै को कौ क्"), + (Grantha, "𑌕 𑌕𑌾 𑌕𑌿 𑌕𑍀 𑌕𑍁 𑌕𑍂 𑌕𑍃 𑌕𑍄 𑌕𑍢 𑌕𑍣 𑌕𑍇 𑌕𑍈 𑌕𑍋 𑌕𑍗 𑌕𑍍"), + (Gujarati, "ક કા કિ કી કુ કૂ કૃ કૄ કૢ કૣ કે કૈ કો કૌ ક્"), + (Kannada, "ಕ ಕಾ ಕಿ ಕೀ ಕು ಕೂ ಕೃ ಕೄ ಕೢ ಕೣ ಕೇ ಕೈ ಕೋ ಕೌ ಕ್"), + (Malayalam, "ക കാ കി കീ കു കൂ കൃ കൄ കൢ കൣ കേ കൈ കോ കൌ ക്"), + (Oriya, "କ କା କି କୀ କୁ କୂ କୃ କୄ କୢ କୣ କେ କୈ କୋ କୌ କ୍"), + (Sinhala, "ක කා කි කී කු කූ කෘ කෲ කෟ කෳ කේ කෛ කෝ කෞ ක්"), + (Telugu, "క కా కి కీ కు కూ కృ కౄ కౢ కౣ 
కే కై కో కౌ క్"), + ]); +} + +#[test] +fn ayogavahas() { + assert_exhaustive_pairwise(&[ + (HarvardKyoto, "aM aH"), + (Iast, "aṃ aḥ"), + (Itrans, "aM aH"), + (Slp1, "aM aH"), + (Velthuis, "a.m a.h"), + // Indic + (Devanagari, "अं अः"), + (Kannada, "ಅಂ ಅಃ"), + (Malayalam, "അം അഃ"), + (Telugu, "అం అః"), + ]); +} + +#[test] +fn consonants() { + assert_exhaustive_pairwise(&[ + (HarvardKyoto, "ka kha ga gha Ga ca cha ja jha Ja Ta Tha Da Dha Na ta tha da dha na pa pha ba bha ma ya ra la va za Sa sa ha La",), + (Iast, "ka kha ga gha ṅa ca cha ja jha ña ṭa ṭha ḍa ḍha ṇa ta tha da dha na pa pha ba bha ma ya ra la va śa ṣa sa ha ḻa"), + (Itrans, "ka kha ga gha ~Na cha Cha ja jha ~na Ta Tha Da Dha Na ta tha da dha na pa pha ba bha ma ya ra la va sha Sha sa ha La"), + (Slp1, "ka Ka ga Ga Na ca Ca ja Ja Ya wa Wa qa Qa Ra ta Ta da Da na pa Pa ba Ba ma ya ra la va Sa za sa ha La"), + (Velthuis, "ka kha ga gha \"na ca cha ja jha ~na .ta .tha .da .dha .na ta tha da dha na pa pha ba bha ma ya ra la va \"sa .sa sa ha La"), + // Indic + (Brahmi, "𑀓 𑀔 𑀕 𑀖 𑀗 𑀘 𑀙 𑀚 𑀛 𑀜 𑀝 𑀞 𑀟 𑀠 𑀡 𑀢 𑀣 𑀤 𑀥 𑀦 𑀧 𑀨 𑀩 𑀪 𑀫 𑀬 𑀭 𑀮 𑀯 𑀰 𑀱 𑀲 𑀳 𑀴"), + (Devanagari, "क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न प फ ब भ म य र ल व श ष स ह ळ"), + (Grantha, "𑌕 𑌖 𑌗 𑌘 𑌙 𑌚 𑌛 𑌜 𑌝 𑌞 𑌟 𑌠 𑌡 𑌢 𑌣 𑌤 𑌥 𑌦 𑌧 𑌨 𑌪 𑌫 𑌬 𑌭 𑌮 𑌯 𑌰 𑌲 𑌵 𑌶 𑌷 𑌸 𑌹 𑌳"), + (Gujarati, "ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન પ ફ બ ભ મ ય ર લ વ શ ષ સ હ ળ"), + (Kannada, "ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ ಳ"), + (Malayalam, "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ ള"), + (Oriya, "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ ଳ"), + (Sinhala, "ක ඛ ග ඝ ඞ ච ඡ ජ ඣ ඤ ට ඨ ඩ ඪ ණ ත ථ ද ධ න ප ඵ බ භ ම ය ර ල ව ශ ෂ ස හ ළ"), + (Telugu, "క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న ప ఫ బ భ మ య ర ల వ శ ష స హ ళ"), + ]); +} + +#[test] +fn symbols() { + assert_exhaustive_pairwise(&[ + (HarvardKyoto, "0 1 2 3 4 5 6 7 8 9 . .. '"), + (Iast, "0 1 2 3 4 5 6 7 8 9 . .. 
'"), + (Itrans, "0 1 2 3 4 5 6 7 8 9 | || .a"), + (Slp1, "0 1 2 3 4 5 6 7 8 9 . .. '"), + (Velthuis, "0 1 2 3 4 5 6 7 8 9 | || .a"), + // Indic + (Bengali, "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ । ॥ ঽ"), + (Devanagari, "० १ २ ३ ४ ५ ६ ७ ८ ९ । ॥ ऽ"), + (Grantha, "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ । ॥ 𑌽"), + (Gujarati, "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ । ॥ ઽ"), + (Gurmukhi, "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ । ॥ ऽ"), + (Kannada, "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ । ॥ ಽ"), + (Malayalam, "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ । ॥ ഽ"), + (Oriya, "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ । ॥ ଽ"), + (Telugu, "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ । ॥ ఽ"), + ]); +} + +#[test] +fn basic_sentences() { + assert_exhaustive_pairwise(&[ + (HarvardKyoto, "nArAyaNaM namaskRtya naraM caiva narottamam . devIM sarasvatIM caiva tato jayamudIyaret .. 1 ..",), + (Iast, "nārāyaṇaṃ namaskṛtya naraṃ caiva narottamam . devīṃ sarasvatīṃ caiva tato jayamudīyaret .. 1 .."), + (Itrans, "nArAyaNaM namaskRRitya naraM chaiva narottamam | devIM sarasvatIM chaiva tato jayamudIyaret || 1 ||"), + (Slp1, "nArAyaRaM namaskftya naraM cEva narottamam . devIM sarasvatIM cEva tato jayamudIyaret .. 
1 .."), + (Velthuis, "naaraaya.na.m namask.rtya nara.m caiva narottamam | devii.m sarasvatii.m caiva tato jayamudiiyaret || 1 ||"), + // Indic + (Devanagari, "नारायणं नमस्कृत्य नरं चैव नरोत्तमम् । देवीं सरस्वतीं चैव ततो जयमुदीयरेत् ॥ १ ॥"), + (Brahmi, "𑀦𑀸𑀭𑀸𑀬𑀡𑀁 𑀦𑀫𑀲𑁆𑀓𑀾𑀢𑁆𑀬 𑀦𑀭𑀁 𑀘𑁃𑀯 𑀦𑀭𑁄𑀢𑁆𑀢𑀫𑀫𑁆 𑁇 𑀤𑁂𑀯𑀻𑀁 𑀲𑀭𑀲𑁆𑀯𑀢𑀻𑀁 𑀘𑁃𑀯 𑀢𑀢𑁄 𑀚𑀬𑀫𑀼𑀤𑀻𑀬𑀭𑁂𑀢𑁆 𑁈 𑁧 𑁈"), + (Grantha, "𑌨𑌾𑌰𑌾𑌯𑌣𑌂 𑌨𑌮𑌸𑍍𑌕𑍃𑌤𑍍𑌯 𑌨𑌰𑌂 𑌚𑍈𑌵 𑌨𑌰𑍋𑌤𑍍𑌤𑌮𑌮𑍍 । 𑌦𑍇𑌵𑍀𑌂 𑌸𑌰𑌸𑍍𑌵𑌤𑍀𑌂 𑌚𑍈𑌵 𑌤𑌤𑍋 𑌜𑌯𑌮𑍁𑌦𑍀𑌯𑌰𑍇𑌤𑍍 ॥ ௧ ॥"), + (Gujarati, "નારાયણં નમસ્કૃત્ય નરં ચૈવ નરોત્તમમ્ । દેવીં સરસ્વતીં ચૈવ તતો જયમુદીયરેત્ ॥ ૧ ॥"), + (Kannada, "ನಾರಾಯಣಂ ನಮಸ್ಕೃತ್ಯ ನರಂ ಚೈವ ನರೋತ್ತಮಮ್ । ದೇವೀಂ ಸರಸ್ವತೀಂ ಚೈವ ತತೋ ಜಯಮುದೀಯರೇತ್ ॥ ೧ ॥"), + (Malayalam, "നാരായണം നമസ്കൃത്യ നരം ചൈവ നരോത്തമമ് । ദേവീം സരസ്വതീം ചൈവ തതോ ജയമുദീയരേത് ॥ ൧ ॥"), + (Oriya, "ନାରାଯଣଂ ନମସ୍କୃତ୍ଯ ନରଂ ଚୈଵ ନରୋତ୍ତମମ୍ । ଦେଵୀଂ ସରସ୍ଵତୀଂ ଚୈଵ ତତୋ ଜଯମୁଦୀଯରେତ୍ ॥ ୧ ॥"), + (Telugu, "నారాయణం నమస్కృత్య నరం చైవ నరోత్తమమ్ । దేవీం సరస్వతీం చైవ తతో జయముదీయరేత్ ॥ ౧ ॥"), + ]); +} diff --git a/vidyut-lipi/www/index.html b/vidyut-lipi/www/index.html new file mode 100644 index 0000000..81f6cab --- /dev/null +++ b/vidyut-lipi/www/index.html @@ -0,0 +1,125 @@ + + + + + + + + vidyut-lipi demo + + + + + + + +
+ + +
+

+

+ (A demo of vidyut-lipi. + Please file bugs here.) +

+
+ +
+ + + +
+
+ + + → + + +
+ +
+ + +
+ +
+

vidyut-lipi is an experimental Sanskrit transliteration + library. Our goal is to provide a standard transliterator in Rust that can + be bound to other languages as needed. Our source code is available here.

+ +

This page demonstrates some of vidyut-lipi's features. The + Simple tab shows a simple transliteration UI. Multi + transliterates to dozens of scripts at once. Grid shows our full + mapping grid.

+
+
+ + +
+
+ + +
+ +
    + +
+
+ +
+ +
+ +
+ +
/**
 * A simple demo interface for vidyut-lipi.
 *
 * File: vidyut-lipi/www/static/app.js
 *
 * Constraints
 * ===========
 * - This demo is served on GitHub pages. So, no databases -- everything should
 *   be done client side!
 * - This demo should use our wasm build's public API.
 * - Although this is a production site, the stakes are low -- do things the
 *   hacky way if that fixes the problem.
 */

import init, { VidyutLipi as VidyutWasm, Scheme } from "/static/wasm/vidyut_lipi.js";

// ===================================================
// vidyut-lipi
// ===================================================

/**
 * Thin wrapper around the wasm transliterator.
 *
 * Call `init()` (the wasm module's default export) before constructing this
 * so that the WASM environment is initialized.
 */
class Vidyut {
  constructor() {
    this.wasm = VidyutWasm.init();
    console.log("Constructed Vidyut.");
  }

  /**
   * Transliterates `input` from one scheme to another.
   *
   * @param {string} input - the text to transliterate.
   * @param {number} from - a `Scheme` value naming the scheme of `input`.
   * @param {number} to - a `Scheme` value naming the output scheme.
   * @returns {string} whatever the wasm `transliterate` call returns.
   */
  transliterate(input, from, to) {
    return this.wasm.transliterate(input, from, to);
  }
}

// Memoized initialization. `null` until the first call to `loadVidyut()`.
let vidyutPromise = null;

/**
 * Initializes the wasm module and constructs a `Vidyut` at most once.
 *
 * Every caller shares the same in-flight promise, so repeated or concurrent
 * calls neither re-run `init()` nor construct extra wasm-backed objects.
 * (NOTE(review): this assumes `Vidyut` holds no per-caller state, which is
 * true of the wrapper above -- confirm for the wasm object itself.)
 *
 * @returns {Promise<Vidyut>} the shared transliterator.
 */
function loadVidyut() {
  if (vidyutPromise === null) {
    vidyutPromise = init()
      .then(() => new Vidyut())
      .catch((err) => {
        // Forget the failed attempt so that a later call can retry.
        vidyutPromise = null;
        throw err;
      });
  }
  return vidyutPromise;
}

// ===================================================
// App
// ===================================================

// Independent vowels, in Devanagari.
const vowels = [
  "अ",
  "आ",
  "इ",
  "ई",
  "उ",
  "ऊ",
  "ऋ",
  "ॠ",
  "ऌ",
  "ॡ",
  "ए",
  "ऐ",
  "ओ",
  "औ",
];

// Vowel marks (and the virama) shown on the consonant "क".
const marks = [
  "क",
  "का",
  "कि",
  "की",
  "कु",
  "कू",
  "कृ",
  "कॄ",
  "कॢ",
  "कॣ",
  "के",
  "कै",
  "को",
  "कौ",
  "क्",
];

const consonants = [
  "क",
  "ख",
  "ग",
  "घ",
  "ङ",
  "च",
  "छ",
  "ज",
  "झ",
  "ञ",
  "ट",
  "ठ",
  "ड",
  "ढ",
  "ण",
  "त",
  "थ",
  "द",
  "ध",
  "न",
  "प",
  "फ",
  "ब",
  "भ",
  "म",
  "य",
  "र",
  "ल",
  "व",
  "श",
  "ष",
  "स",
  "ह",
  "ळ",
];

// Digits, punctuation, and the avagraha.
const symbols = [
  "०",
  "१",
  "२",
  "३",
  "४",
  "५",
  "६",
  "७",
  "८",
  "९",
  "।",
  "॥",
  "ऽ",
];

// Schemes shown in the sound table, in display order.
const schemes = [
  Scheme.Devanagari,
  Scheme.Bengali,
  Scheme.Brahmi,
  Scheme.Grantha,
  Scheme.Gujarati,
  Scheme.Gurmukhi,
  Scheme.Kannada,
  Scheme.Malayalam,
  Scheme.Oriya,
  Scheme.Sinhala,
  Scheme.Tamil,
  Scheme.Telugu,
  Scheme.Tibetan,

  Scheme.HarvardKyoto,
  Scheme.Iast,
  Scheme.Itrans,
  Scheme.Slp1,
  Scheme.Velthuis,
];

// Human-readable labels, keyed by `Scheme` value.
const schemeNames = {
  [Scheme.Devanagari]: "Devanagari",
  [Scheme.Bengali]: "Bengali",
  [Scheme.Brahmi]: "Brahmi",
  [Scheme.Grantha]: "Grantha",
  [Scheme.Gujarati]: "Gujarati",
  [Scheme.Gurmukhi]: "Gurmukhi",
  [Scheme.Kannada]: "Kannada",
  [Scheme.Malayalam]: "Malayalam",
  [Scheme.Oriya]: "Oriya",
  [Scheme.Sinhala]: "Sinhala",
  [Scheme.Tamil]: "Tamil",
  [Scheme.Telugu]: "Telugu",
  [Scheme.Tibetan]: "Tibetan",

  [Scheme.HarvardKyoto]: "Harvard-Kyoto",
  [Scheme.Iast]: "IAST",
  [Scheme.Itrans]: "ITRANS",
  [Scheme.Slp1]: "SLP1",
  [Scheme.Velthuis]: "Velthuis",
};

/**
 * Factory for the component state object registered with Alpine as "app".
 *
 * The `*From` / `*To` fields hold `Scheme` key names (strings); they are
 * converted to `Scheme` values only when transliterating.
 */
const App = () => ({
  activeTab: null,

  simpleInput: "",
  simpleFrom: "HarvardKyoto",
  simpleTo: "Devanagari",

  multiInput: "",
  multiFrom: "HarvardKyoto",

  // Selects the default tab and loads the transliterator.
  // (Presumably invoked by Alpine when the component is created.)
  async init() {
    this.activeTab = 'simple';
    await this.initVidyut();
  },

  // Ensures `this.vidyut` is set. Safe to call repeatedly: initialization
  // itself happens only once (see `loadVidyut`).
  async initVidyut() {
    this.vidyut = await loadVidyut();
  },

  // Returns the CSS classes for tab `s`: highlighted if active, else none.
  tab(s) {
    if (s === this.activeTab) {
      return "font-bold p-2 bg-sky-100 rounded text-sky-800";
    }
    return "";
  },

  setTab(tab) {
    this.activeTab = tab;
  },

  // Transliterates `simpleInput` from `simpleFrom` to `simpleTo`.
  // Returns "" until the transliterator has loaded.
  simpleOutput() {
    if (!this.vidyut) {
      return "";
    }
    const from = Scheme[this.simpleFrom];
    const to = Scheme[this.simpleTo];
    return this.vidyut.transliterate(this.simpleInput, from, to);
  },

  // Transliterates `multiInput` from `multiFrom` to the given `Scheme` value.
  // Returns "" until the transliterator has loaded.
  multiOutput(toScheme) {
    if (!this.vidyut) {
      return "";
    }
    const from = Scheme[this.multiFrom];
    return this.vidyut.transliterate(this.multiInput, from, toScheme);
  },

  // Schemes as `{ text, value }` options: `text` is the display label and
  // `value` is what `Scheme[i]` yields for the integer `i` (presumably the
  // enum key name, as `simpleFrom` / `multiFrom` expect -- confirm against
  // the generated wasm bindings).
  schemes() {
    return Object.values(Scheme)
      .filter(Number.isInteger)
      .map((i) => {
        const label = schemeNames[i];
        return {
          // Fall back to the key name for schemes that have no label above.
          text: label !== undefined ? label : Scheme[i],
          value: Scheme[i],
        };
      });
  },

  // Transliterates SLP1 `text` to Devanagari, loading the wasm if needed.
  async deva(text) {
    await this.initVidyut();
    return this.vidyut.transliterate(text, Scheme.Slp1, Scheme.Devanagari);
  },

  // Builds one table per sound group (vowels, marks, consonants, symbols).
  // Each table has one `{ name, values }` row per entry in `schemes`, where
  // `values` are the group's sounds transliterated from Devanagari.
  async soundTable() {
    await this.initVidyut();
    return [vowels, marks, consonants, symbols].map((group) =>
      schemes.map((toScheme) => ({
        name: schemeNames[toScheme],
        values: group.map((sound) =>
          this.vidyut.transliterate(sound, Scheme.Devanagari, toScheme)),
      })));
  },
});

window.Scheme = Scheme;
window.Vidyut = Vidyut;

// Initialize the app.
window.addEventListener('alpine:init', () => {
  Alpine.data("app", App);
});