diff --git a/vidyut-lipi/README.md b/vidyut-lipi/README.md index 9e8e3c7..bd646e5 100644 --- a/vidyut-lipi/README.md +++ b/vidyut-lipi/README.md @@ -28,7 +28,7 @@ An online demo is available [here][demo]. Overview -------- -Communities around the world write Sanskrit and other Indian languages in +Communities around the world write Sanskrit and other Indian languages with different scripts in different contexts. For example, a user might type Sanskrit in ITRANS, read it in Kannada, and publish it in Devanagari. Such communities often rely on a *transliterator*, which converts text from one @@ -42,24 +42,48 @@ and feature work is diluted across several different implementations. ecosystem. Our priorities are: - quality, including a comprehensive test suite. -- coverage across all of the schemes in common use. -- ease of use (and reuse) for developers. +- test coverage across all of the schemes in common use. +- a precise and ergonomic API. +- availability in multiple languages, including Python and WebAssembly. - high performance across various metrics, including runtime, startup time, and file size. -We recommend `vidyut-lipi` if you need a simple and high-quality -transliteration library, and we encourage you to [file an issue][issue] if -`vidyut-lipi` does not support your use case. We are especially excited about -supporting new scripts and new programming languages. +We encourage you to [file an issue][issue] if `vidyut-lipi` does not support +your use case. We are especially excited about supporting new scripts and new +programming languages. [issue]: https://github.com/ambuda-org/vidyut/issues -If `vidyut-lipi` is not right for your needs, we also strongly recommend -the [Aksharamukha][aksharamukha] the [indic-transliteration][indic-trans] -projects, which have each been highly influential in our work on `vidyut-lipi`. -[aksharamukha]: https://github.com/virtualvinodh/aksharamukha/ -[indic-trans]: https://github.com/indic-transliteration +Alternatives to `vidyut-lipi` +----------------------------- + +There are two main alternatives to `vidyut-lipi`, both of which have been +influential on the design of `vidyut-lipi`: + +- [Aksharamukha][am] offers high quality and supports more than a hundred + different scripts. Aksharamukha offers best-in-class transliteration, but it + is available only in Python. + +- [indic-transliteration][it] implements the same basic transliterator in + multiple programming languages. indic-transliteration supports a large + software ecosystem, but its different implementations each have their own + quirks and limitations. + +[am]: https://github.com/virtualvinodh/aksharamukha/ +[it]: https://github.com/indic-transliteration + +Our long-term goal is to combine the quality of Aksharamukha with the +availability of indic-transliteration. Until then, `vidyut-lipi` provides the +following short-term benefits: + +- High-quality transliteration for Rust and WebAssembly. +- Smooth support for other programming languages through projects like + [pyo3][pyo3] (Python), [magnus][magnus] (Ruby), [cxx][cxx] (C++), etc. 
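
As a rough illustration of the pyo3 route, the sketch below wraps `Lipika` in a small Python extension module. The module name, the string-to-`Scheme` parsing, and the error handling are hypothetical; they are not part of this crate or of any published binding, and a real binding would cover every `Scheme` variant.

```rust
use pyo3::prelude::*;
use vidyut_lipi::{Lipika, Scheme};

// Illustrative only -- a real binding would map every `Scheme` variant.
fn parse_scheme(name: &str) -> PyResult<Scheme> {
    match name {
        "Devanagari" => Ok(Scheme::Devanagari),
        "HarvardKyoto" => Ok(Scheme::HarvardKyoto),
        "Iast" => Ok(Scheme::Iast),
        other => Err(pyo3::exceptions::PyValueError::new_err(format!(
            "unknown scheme: {other}"
        ))),
    }
}

/// A hypothetical Python-facing wrapper around `Lipika`.
#[pyclass]
struct PyLipika {
    inner: Lipika,
}

#[pymethods]
impl PyLipika {
    #[new]
    fn new() -> Self {
        Self {
            inner: Lipika::default(),
        }
    }

    /// Transliterates `text` from scheme `from` to scheme `to`.
    fn transliterate(&mut self, text: &str, from: &str, to: &str) -> PyResult<String> {
        Ok(self
            .inner
            .transliterate(text, parse_scheme(from)?, parse_scheme(to)?))
    }
}

// Hypothetical module name; exposed to Python as `vidyut_lipi_py`.
#[pymodule]
fn vidyut_lipi_py(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<PyLipika>()?;
    Ok(())
}
```

The same pattern applies to magnus and cxx: the Rust API remains the source of truth, and each binding is a thin layer of glue.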
+ +[pyo3]: https://pyo3.rs/v0.20.2/ +[magnus]: https://github.com/matsadler/magnus +[cxx]: https://cxx.rs/ Usage @@ -102,10 +126,11 @@ for scheme in Scheme::iter() { } ``` -As of 2023-12-29, this code prints the following: +As of 2024-01-27, this code prints the following: ```text Balinese ᬲᬂᬲ᭄ᬓᬺᬢᬫ᭄ +BarahaSouth saMskRutam Bengali সংস্কৃতম্ Brahmi 𑀲𑀁𑀲𑁆𑀓𑀾𑀢𑀫𑁆 Burmese သံသ်ကၖတမ် @@ -113,20 +138,27 @@ Devanagari संस्कृतम् Grantha 𑌸𑌂𑌸𑍍𑌕𑍃𑌤𑌮𑍍 Gujarati સંસ્કૃતમ્ Gurmukhi ਸਂਸ੍ਕਤਮ੍ -BarahaSouth saMskRutam HarvardKyoto saMskRtam Iast saṃskṛtam +Iso15919 saṁskr̥tam Itrans saMskRRitam Javanese ꦱꦁꦱ꧀ꦏꦽꦠꦩ꧀ Kannada ಸಂಸ್ಕೃತಮ್ +Khmer សំស្ក្ឫតម៑ Malayalam സംസ്കൃതമ് +Modi 𑘭𑘽𑘭𑘿𑘎𑘵𑘝𑘦𑘿 +Newa 𑐳𑑄𑐳𑑂𑐎𑐺𑐟𑐩𑑂 Odia ସଂସ୍କୃତମ୍ +Saurashtra ꢱꢀꢱ꣄ꢒꢺꢡꢪ꣄ Sharada 𑆱𑆁𑆱𑇀𑆑𑆸𑆠𑆩𑇀 Siddham 𑖭𑖽𑖭𑖿𑖎𑖴𑖝𑖦𑖿 Sinhala සංස්කෘතම් Slp1 saMskftam -Tamil ஸம்ஸ்க்ரு'தம் +Tamil ஸம்ʼஸ்க்ருʼதம் Telugu సంస్కృతమ్ +Thai สํสฺกฺฤตมฺ +Tibetan སཾསྐྲྀཏམ +Tirhuta 𑒮𑓀𑒮𑓂𑒏𑒵𑒞𑒧𑓂 Velthuis sa.msk.rtam Wx saMskqwam ``` diff --git a/vidyut-lipi/scripts/create_schemes.py b/vidyut-lipi/scripts/create_schemes.py index 3ed7388..1184f14 100755 --- a/vidyut-lipi/scripts/create_schemes.py +++ b/vidyut-lipi/scripts/create_schemes.py @@ -37,20 +37,29 @@ "BENGALI", "BRAHMI", "BURMESE", + "CHAM", "DEVANAGARI", "GUJARATI", "GURMUKHI", "GRANTHA", "JAVANESE", "KANNADA", + "KHMER", + "LAO", "MALAYALAM", + "MODI", + "NEWA", "ORIYA", "SHARADA", "SIDDHAM", "SINHALA", - "TAMIL", + # Not yet on indic-transliteration/master + "SAURASHTRA", + "TAMIL_SUPERSCRIPTED", "TELUGU", + "THAI", "TIBETAN", + "TIRHUTA_MAITHILI", "BARAHA", "HK", @@ -93,7 +102,7 @@ def to_unique(xs: list) -> list: def _maybe_override(name: str, deva: str, raw: str) -> str | None: overrides = {} - if name in {"BRAHMI", "BALINESE", "BURMESE", "SIDDHAM", "TIBETAN"}: + if name in {"BRAHMI", "BALINESE", "BURMESE", "SIDDHAM"}: if deva in {"\u0946", "\u094a", "\u090e", "\u0912"}: # - short e mark # - short o mark @@ -110,6 +119,14 @@ def _maybe_override(name: str, deva: str, raw: str) -> str | None: "\ua8e2": None, "\ua8e3": None, } + elif name == "CHAM": + overrides = { + # Short e and o, plus vowel marks + "\u0946": None, + "\u094a": None, + "\u090e": None, + "\u0912": None, + } elif name == "GRANTHA": overrides = { # vowel sign AU @@ -124,6 +141,9 @@ def _maybe_override(name: str, deva: str, raw: str) -> str | None: overrides = { "।": ".", "॥": "..", + "ख़": "k͟h", + # Delete -- common_maps maps this to "ḳ", which we need for aytam. + # We'll add a valid mapping for क़: further below. 
"क़": None, } elif name == "IAST": @@ -135,10 +155,64 @@ def _maybe_override(name: str, deva: str, raw: str) -> str | None: # candrabindu "\u0901": "m̐", } - elif name == "TAMIL": + elif name == "KHMER": + overrides = { + "।": "។", + "॥": "៕", + } + elif name == "MODI": + overrides = { + "\u0907": "\U00011602", # letter i + "\u0908": "\U00011603", # letter ii + "\u0909": "\U00011604", # letter u + "\u090a": "\U00011605", # letter uu + "\u090b": "\U00011606", # letter vocalic r + "\u090c": "\U00011608", # letter vocalic l + "\u093f": "\U00011631", # sign i + "\u0940": "\U00011632", # sign ii + "\u0941": "\U00011633", # sign u + "\u0942": "\U00011634", # sign uu + "\u0943": "\U00011635", # sign vocalic r + "\u0944": "\U00011636", # sign vocalic rr + "\u0960": "\U00011607", # letter vocalic rr + "\u0961": "\U00011609", # letter vocalic ll + "\u0962": "\U00011637", # sign vocalic l + "\u0963": "\U00011638", # sign vocalic ll + + "\u0964": "\U00011641", # danda + "\u0965": "\U00011642", # double danda + } + + elif name == "NEWA": overrides = { - # Visarga - "\u0903": None, + "\u0964": "\U0001144b", # danda + "\u0965": "\U0001144c", # double danda + } + elif name == "TAMIL_SUPERSCRIPTED": + # Use roman digits per Aksharamukha + overrides = { + "०": "0", + "१": "1", + "२": "2", + "३": "3", + "४": "4", + "५": "5", + "६": "6", + "७": "7", + "८": "8", + "९": "9", + } + elif name == "TIBETAN": + overrides = { + # Virama + "\u094d": "\u0f84", + # Short e and o, plus vowel marks + "\u0946": None, + "\u094a": None, + "\u090e": None, + "\u0912": None, + # Use distinct "va" character instead of "ba". + "व": "\u0f5d", } elif name == "VELTHUIS": # These are part of the Velthuis spec but are errors in indic-transliteration. @@ -185,7 +259,9 @@ def create_scheme_entry(name: str, items: list[tuple[str, str]]) -> str: def main(): - repo = "https://github.com/indic-transliteration/common_maps.git" + # We're waiting on some changes to be pushed to indic-transliteration, so + # use a fork for now. 
+ repo = "https://github.com/akprasad/common_maps.git" common_maps = Path("common_maps") if not common_maps.exists(): print("Cloning `common_maps` ...") @@ -333,6 +409,11 @@ def main(): # AU (AA + AU length mark) ("\u094c", "\U00011347\U00011357"), ]) + elif scheme_name == "ITRANS": + scheme_items.extend([ + # Vedic anusvara (just render as candrabindu) + ("\u0901", "{\\m+}"), + ]) elif scheme_name == "ISO": scheme_items.extend([ # Aytam @@ -355,7 +436,7 @@ def main(): # Anudatta ("\u0952", "\\"), ]) - elif scheme_name == "TAMIL": + elif scheme_name == "TAMIL_SUPERSCRIPTED": scheme_items.extend([ # Aytam ("\u0b83", "\u0b83"), @@ -382,6 +463,10 @@ def main(): ("\u092b\u093c", "f"), ]) + if scheme_name == "TAMIL_SUPERSCRIPTED": + scheme_name = "TAMIL" + elif scheme_name == "TIRHUTA_MAITHILI": + scheme_name = "TIRHUTA" buf.append(create_scheme_entry(scheme_name, scheme_items)) with open(CRATE_DIR / "src/autogen_schemes.rs", "w") as f: diff --git a/vidyut-lipi/src/autogen_schemes.rs b/vidyut-lipi/src/autogen_schemes.rs index d501a4c..e74e758 100644 --- a/vidyut-lipi/src/autogen_schemes.rs +++ b/vidyut-lipi/src/autogen_schemes.rs @@ -403,6 +403,104 @@ pub const BURMESE: &[(&str, &str)] = &[ ("ऱ", "ရ"), ]; +pub const CHAM: &[(&str, &str)] = &[ + ("अ", "ꨀ"), + ("आ", "ꨀꨩ"), + ("इ", "ꨁ"), + ("ई", "ꨁꨩ"), + ("उ", "ꨂ"), + ("ऊ", "ꨂꨩ"), + ("ऋ", "ꨣꨮ"), + ("ॠ", "ꨣꨮꨩ"), + ("ऌ", "ꨤꨮ"), + ("ॡ", "ꨤꨮꨩ"), + ("ए", "ꨃ"), + ("ऐ", "ꨄ"), + ("ओ", "ꨅ"), + ("औ", "ꨀꨯꨱ"), + ("ा", "ꨩ"), + ("ि", "ꨪ"), + ("ी", "ꨫ"), + ("ु", "ꨭ"), + ("ू", "ꨭꨩ"), + ("ृ", "ꨴꨮ"), + ("ॄ", "ꨴꨮꨩ"), + ("ॢ", "ꨵꨮ"), + ("ॣ", "ꨵꨮꨩ"), + ("े", "ꨯꨮ"), + ("ै", "ꨰ"), + ("ो", "ꨯ"), + ("ौ", "ꨯꨱ"), + ("ं", "ꩌ"), + ("ः", "ꩍ"), + ("ँ", "ꩃ"), + ("्", "ʾ"), + ("क", "ꨆ"), + ("ख", "ꨇ"), + ("ग", "ꨈ"), + ("घ", "ꨉ"), + ("ङ", "ꨋ"), + ("च", "ꨌ"), + ("छ", "ꨍ"), + ("ज", "ꨎ"), + ("झ", "ꨏ"), + ("ञ", "ꨑ"), + ("ट", "ꨓ"), + ("ठ", "ꨔ"), + ("ड", "ꨕ"), + ("ढ", "ꨖ"), + ("ण", "ꨘ"), + ("त", "ꨓ"), + ("थ", "ꨔ"), + ("द", "ꨕ"), + ("ध", "ꨖ"), + ("न", "ꨘ"), + ("प", "ꨚ"), + ("फ", "ꨜ"), + ("ब", "ꨝ"), + ("भ", "ꨞ"), + ("म", "ꨠ"), + ("य", "ꨢ"), + ("र", "ꨣ"), + ("ल", "ꨤ"), + ("व", "ꨥ"), + ("श", "ꨦ"), + ("ष", "ꨦ"), + ("स", "ꨧ"), + ("ह", "ꨨ"), + ("ळ", "ꨤ"), + ("क्ष", "ꩀꨦ"), + ("ज्ञ", "ꩄꨑ"), + ("०", "꩐"), + ("१", "꩑"), + ("२", "꩒"), + ("३", "꩓"), + ("४", "꩔"), + ("५", "꩕"), + ("६", "꩖"), + ("७", "꩗"), + ("८", "꩘"), + ("९", "꩙"), + ("ॐ", "ꨅꩌ"), + ("ऽ", "'"), + ("।", "꩝"), + ("॥", "꩞"), + ("‍", "‍"), + ("", ""), + ("॑", "॑"), + ("॒", "॒"), + ("ॅ", "ꨯꨮ"), + ("क़", "ꨆ"), + ("ख़", "ꨇ"), + ("ग़", "ꨈ"), + ("ज़", "ꨎ"), + ("ड़", "ꨕ"), + ("ढ़", "ꨖ"), + ("फ़", "ꨜ"), + ("य़", "ꨥ"), + ("ऱ", "ꨣ"), +]; + pub const DEVANAGARI: &[(&str, &str)] = &[ ("अ", "अ"), ("आ", "आ"), @@ -1042,6 +1140,184 @@ pub const KANNADA: &[(&str, &str)] = &[ ("᳚", "᳚"), ]; +pub const KHMER: &[(&str, &str)] = &[ + ("अ", "អ"), + ("आ", "អា"), + ("इ", "ឥ"), + ("ई", "ឦ"), + ("उ", "ឧ"), + ("ऊ", "ឩ"), + ("ऋ", "ឫ"), + ("ॠ", "ឬ"), + ("ऌ", "ឭ"), + ("ॡ", "ឮ"), + ("ऎ", ""), + ("ए", "ឯ"), + ("ऐ", "ឰ"), + ("ऒ", ""), + ("ओ", "ឱ"), + ("औ", "ឳ"), + ("ा", "ា"), + ("ि", "ិ"), + ("ी", "ី"), + ("ु", "ុ"), + ("ू", "ូ"), + ("ृ", "្ឫ"), + ("ॄ", "្ឬ"), + ("ॢ", "្ឭ"), + ("ॣ", "្ឮ"), + ("ॆ", ""), + ("े", "េ"), + ("ै", "ៃ"), + ("ॊ", ""), + ("ो", "ោ"), + ("ौ", "ៅ"), + ("ं", "ំ"), + ("ः", "ះ"), + ("ँ", "ំ"), + ("्", "៑"), + ("क", "ក"), + ("ख", "ខ"), + ("ग", "គ"), + ("घ", "ឃ"), + ("ङ", "ង"), + ("च", "ច"), + ("छ", "ឆ"), + ("ज", "ជ"), + ("झ", "ឈ"), + ("ञ", "ញ"), + ("ट", "ដ"), + ("ठ", "ឋ"), + ("ड", "ឌ"), + ("ढ", "ឍ"), + ("ण", "ណ"), + ("त", "ត"), + ("थ", "ថ"), + ("द", 
"ទ"), + ("ध", "ធ"), + ("न", "ន"), + ("प", "ប"), + ("फ", "ផ"), + ("ब", "ព"), + ("भ", "ភ"), + ("म", "ម"), + ("य", "យ"), + ("र", "រ"), + ("ल", "ល"), + ("व", "វ"), + ("श", "ឝ"), + ("ष", "ឞ"), + ("स", "ស"), + ("ह", "ហ"), + ("ळ", "ឡ"), + ("क्ष", "ក្ឞ"), + ("ज्ञ", "ជ្ញ"), + ("०", "០"), + ("१", "១"), + ("२", "២"), + ("३", "៣"), + ("४", "៤"), + ("५", "៥"), + ("६", "៦"), + ("७", "៧"), + ("८", "៨"), + ("९", "៩"), + ("ॐ", "ឱំ"), + ("ऽ", "ៜ"), + ("।", "។"), + ("॥", "៕"), + ("ॅ", "េ"), +]; + +pub const LAO: &[(&str, &str)] = &[ + ("अ", "ອະ"), + ("आ", "ອາ"), + ("इ", "ອິ"), + ("ई", "ອີ"), + ("उ", "ອຸ"), + ("ऊ", "ອູ"), + ("ऋ", "ຣຸ"), + ("ॠ", "ຣູ"), + ("ऌ", "ລຸ"), + ("ॡ", "ລູ"), + ("ऎ", ""), + ("ए", "ເອ"), + ("ऐ", "ໄອ"), + ("ऒ", ""), + ("ओ", "ໂອ"), + ("औ", "ເອົາ"), + ("ा", "າ"), + ("ि", "ິ"), + ("ी", "ີ"), + ("ु", "ຸ"), + ("ू", "ູ"), + ("ृ", "ຣຸ"), + ("ॄ", "ຣູ"), + ("ॢ", "ລຸ"), + ("ॣ", "ລູ"), + ("ॆ", ""), + ("े", "ເ"), + ("ै", "ໄ"), + ("ॊ", ""), + ("ो", "ໂ"), + ("ौ", "ເົາ"), + ("ं", "ງ"), + ("ः", "ຫ"), + ("ँ", "ງ"), + ("्", ""), + ("क", "ກະ"), + ("ख", "ຂະ"), + ("ग", "ຄະ"), + ("घ", "ຄະ"), + ("ङ", "ງະ"), + ("च", "ຈະ"), + ("छ", "ຈະ"), + ("ज", "ຊະ"), + ("झ", "ຊະ"), + ("ञ", "ຍະ"), + ("ट", "ຕະ"), + ("ठ", "ຖະ"), + ("ड", "ທະ"), + ("ढ", "ທະ"), + ("ण", "ນະ"), + ("त", "ຕະ"), + ("थ", "ຖະ"), + ("द", "ທະ"), + ("ध", "ທະ"), + ("न", "ນະ"), + ("प", "ປະ"), + ("फ", "ຜະ"), + ("ब", "ພະ"), + ("भ", "ພະ"), + ("म", "ມະ"), + ("य", "ຢະ"), + ("र", "ຣະ"), + ("ल", "ລະ"), + ("व", "ວະ"), + ("श", "ສະ"), + ("ष", "ສະ"), + ("स", "ສະ"), + ("ह", "ຫະ"), + ("ळ", "ລະ"), + ("क्ष", "ກສະ"), + ("ज्ञ", "ຊຍະ"), + ("०", "໐"), + ("१", "໑"), + ("२", "໒"), + ("३", "໓"), + ("४", "໔"), + ("५", "໕"), + ("६", "໖"), + ("७", "໗"), + ("८", "໘"), + ("९", "໙"), + ("ॐ", "ໂອງ"), + ("ऽ", "'"), + ("।", "।"), + ("॥", "॥"), + ("ॅ", "ແະ"), +]; + pub const MALAYALAM: &[(&str, &str)] = &[ ("अ", "അ"), ("आ", "ആ"), @@ -1143,6 +1419,184 @@ pub const MALAYALAM: &[(&str, &str)] = &[ ("᳚", "᳚"), ]; +pub const MODI: &[(&str, &str)] = &[ + ("अ", "𑘀"), + ("आ", "𑘁"), + ("इ", "𑘂"), + ("ई", "𑘃"), + ("उ", "𑘄"), + ("ऊ", "𑘅"), + ("ऋ", "𑘆"), + ("ॠ", "𑘇"), + ("ऌ", "𑘈"), + ("ॡ", "𑘉"), + ("ऎ", ""), + ("ए", "𑘊"), + ("ऐ", "𑘋"), + ("ऒ", ""), + ("ओ", "𑘌"), + ("औ", "𑘍"), + ("ा", "𑘰"), + ("ि", "𑘱"), + ("ी", "𑘲"), + ("ु", "𑘳"), + ("ू", "𑘴"), + ("ृ", "𑘵"), + ("ॄ", "𑘶"), + ("ॢ", "𑘷"), + ("ॣ", "𑘸"), + ("ॆ", ""), + ("े", "𑘹"), + ("ै", "𑘺"), + ("ॊ", ""), + ("ो", "𑘻"), + ("ौ", "𑘼"), + ("ं", "𑘽"), + ("ः", "𑘾"), + ("ँ", "𑘽"), + ("्", "𑘿"), + ("क", "𑘎"), + ("ख", "𑘏"), + ("ग", "𑘐"), + ("घ", "𑘑"), + ("ङ", "𑘒"), + ("च", "𑘓"), + ("छ", "𑘔"), + ("ज", "𑘕"), + ("झ", "𑘖"), + ("ञ", "𑘗"), + ("ट", "𑘘"), + ("ठ", "𑘙"), + ("ड", "𑘚"), + ("ढ", "𑘛"), + ("ण", "𑘜"), + ("त", "𑘝"), + ("थ", "𑘞"), + ("द", "𑘟"), + ("ध", "𑘠"), + ("न", "𑘡"), + ("प", "𑘢"), + ("फ", "𑘣"), + ("ब", "𑘤"), + ("भ", "𑘥"), + ("म", "𑘦"), + ("य", "𑘧"), + ("र", "𑘨"), + ("ल", "𑘩"), + ("व", "𑘪"), + ("श", "𑘫"), + ("ष", "𑘬"), + ("स", "𑘭"), + ("ह", "𑘮"), + ("ळ", "𑘯"), + ("क्ष", "𑘎𑘿𑘬"), + ("ज्ञ", "𑘕𑘿𑘗"), + ("०", "𑙐"), + ("१", "𑙑"), + ("२", "𑙒"), + ("३", "𑙓"), + ("४", "𑙔"), + ("५", "𑙕"), + ("६", "𑙖"), + ("७", "𑙗"), + ("८", "𑙘"), + ("९", "𑙙"), + ("ॐ", "𑘌𑘽"), + ("ऽ", "ऽ"), + ("।", "𑙁"), + ("॥", "𑙂"), + ("ॅ", "𑙀"), +]; + +pub const NEWA: &[(&str, &str)] = &[ + ("अ", "𑐀"), + ("आ", "𑐁"), + ("इ", "𑐂"), + ("ई", "𑐃"), + ("उ", "𑐄"), + ("ऊ", "𑐅"), + ("ऋ", "𑐆"), + ("ॠ", "𑐇"), + ("ऌ", "𑐈"), + ("ॡ", "𑐉"), + ("ऎ", ""), + ("ए", "𑐊"), + ("ऐ", "𑐋"), + ("ऒ", ""), + ("ओ", "𑐌"), + ("औ", "𑐍"), + ("ा", "𑐵"), + ("ि", "𑐶"), + ("ी", "𑐷"), + ("ु", "𑐸"), + ("ू", "𑐹"), + ("ृ", "𑐺"), + ("ॄ", "𑐻"), + 
("ॢ", "𑐼"), + ("ॣ", "𑐽"), + ("ॆ", ""), + ("े", "𑐾"), + ("ै", "𑐿"), + ("ॊ", ""), + ("ो", "𑑀"), + ("ौ", "𑑁"), + ("ं", "𑑄"), + ("ः", "𑑅"), + ("ँ", "𑑃"), + ("्", "𑑂"), + ("क", "𑐎"), + ("ख", "𑐏"), + ("ग", "𑐐"), + ("घ", "𑐑"), + ("ङ", "𑐒"), + ("च", "𑐔"), + ("छ", "𑐕"), + ("ज", "𑐖"), + ("झ", "𑐗"), + ("ञ", "𑐘"), + ("ट", "𑐚"), + ("ठ", "𑐛"), + ("ड", "𑐜"), + ("ढ", "𑐝"), + ("ण", "𑐞"), + ("त", "𑐟"), + ("थ", "𑐠"), + ("द", "𑐡"), + ("ध", "𑐢"), + ("न", "𑐣"), + ("प", "𑐥"), + ("फ", "𑐦"), + ("ब", "𑐧"), + ("भ", "𑐨"), + ("म", "𑐩"), + ("य", "𑐫"), + ("र", "𑐬"), + ("ल", "𑐮"), + ("व", "𑐰"), + ("श", "𑐱"), + ("ष", "𑐲"), + ("स", "𑐳"), + ("ह", "𑐴"), + ("ळ", "𑐮𑑆"), + ("क्ष", "𑐎𑑂𑐲"), + ("ज्ञ", "𑐖𑑂𑐘"), + ("०", "𑑐"), + ("१", "𑑑"), + ("२", "𑑒"), + ("३", "𑑓"), + ("४", "𑑔"), + ("५", "𑑕"), + ("६", "𑑖"), + ("७", "𑑗"), + ("८", "𑑘"), + ("९", "𑑙"), + ("ॐ", "𑑉"), + ("ऽ", "𑑇"), + ("।", "𑑋"), + ("॥", "𑑌"), + ("ॅ", "𑐾"), +]; + pub const ORIYA: &[(&str, &str)] = &[ ("अ", "ଅ"), ("आ", "ଆ"), @@ -1247,6 +1701,106 @@ pub const ORIYA: &[(&str, &str)] = &[ ("᳚", "᳚"), ]; +pub const SAURASHTRA: &[(&str, &str)] = &[ + ("अ", "ꢂ"), + ("आ", "ꢃ"), + ("इ", "ꢄ"), + ("ई", "ꢅ"), + ("उ", "ꢆ"), + ("ऊ", "ꢇ"), + ("ऋ", "ꢈ"), + ("ॠ", "ꢉ"), + ("ऌ", "ꢊ"), + ("ॡ", "ꢋ"), + ("ऎ", "ꢌ"), + ("ए", "ꢍ"), + ("ऐ", "ꢎ"), + ("ऒ", "ꢏ"), + ("ओ", "ꢐ"), + ("औ", "ꢑ"), + ("ऍ", "ꢌ"), + ("ऑ", "ꢃ"), + ("ा", "ꢵ"), + ("ि", "ꢶ"), + ("ी", "ꢷ"), + ("ु", "ꢸ"), + ("ू", "ꢹ"), + ("ृ", "ꢺ"), + ("ॄ", "ꢻ"), + ("ॢ", "ꢼ"), + ("ॣ", "ꢽ"), + ("ॆ", "ꢾ"), + ("े", "ꢿ"), + ("ै", "ꣀ"), + ("ॊ", "ꣁ"), + ("ो", "ꣂ"), + ("ौ", "ꣃ"), + ("ॅ", "ꢾ"), + ("ॉ", "ꢵ"), + ("ं", "ꢀ"), + ("ः", "ꢁ"), + ("ँ", "ꣅ"), + ("्", "꣄"), + ("क", "ꢒ"), + ("ख", "ꢓ"), + ("ग", "ꢔ"), + ("घ", "ꢕ"), + ("ङ", "ꢖ"), + ("च", "ꢗ"), + ("छ", "ꢘ"), + ("ज", "ꢙ"), + ("झ", "ꢚ"), + ("ञ", "ꢛ"), + ("ट", "ꢜ"), + ("ठ", "ꢝ"), + ("ड", "ꢞ"), + ("ढ", "ꢟ"), + ("ण", "ꢠ"), + ("त", "ꢡ"), + ("थ", "ꢢ"), + ("द", "ꢣ"), + ("ध", "ꢤ"), + ("न", "ꢥ"), + ("प", "ꢦ"), + ("फ", "ꢧ"), + ("ब", "ꢨ"), + ("भ", "ꢩ"), + ("म", "ꢪ"), + ("य", "ꢫ"), + ("र", "ꢬ"), + ("ल", "ꢭ"), + ("व", "ꢮ"), + ("श", "ꢯ"), + ("ष", "ꢰ"), + ("स", "ꢱ"), + ("ह", "ꢲ"), + ("ळ", "ꢳ"), + ("क्ष", "ꢒ꣄‍ꢰ"), + ("ज्ञ", "ꢙ꣄ꢛ"), + ("०", "꣐"), + ("१", "꣑"), + ("२", "꣒"), + ("३", "꣓"), + ("४", "꣔"), + ("५", "꣕"), + ("६", "꣖"), + ("७", "꣗"), + ("८", "꣘"), + ("९", "꣙"), + ("ॐ", "ꢐꢀ"), + ("ऽ", "ఽ"), + ("।", "꣎"), + ("॥", "꣏"), + ("क़", "ꢒ"), + ("ख़", "ꢓ"), + ("ग़", "ꢔ"), + ("ज़", "ꢙ"), + ("ड़", "ꢞ"), + ("ढ़", "ꢟ"), + ("फ़", "ꢧ"), + ("य़", "ꢫ"), +]; + pub const SHARADA: &[(&str, &str)] = &[ ("अ", "𑆃"), ("आ", "𑆄"), @@ -1541,10 +2095,10 @@ pub const TAMIL: &[(&str, &str)] = &[ ("ई", "ஈ"), ("उ", "உ"), ("ऊ", "ஊ"), - ("ऋ", "ரு'"), - ("ॠ", "ரூ'"), - ("ऌ", "லு'"), - ("ॡ", "லூ'"), + ("ऋ", "ருʼ"), + ("ॠ", "ரூʼ"), + ("ऌ", "லுʼ"), + ("ॡ", "லூʼ"), ("ऎ", "எ"), ("ए", "ஏ"), ("ऐ", "ஐ"), @@ -1556,43 +2110,44 @@ pub const TAMIL: &[(&str, &str)] = &[ ("ी", "ீ"), ("ु", "ு"), ("ू", "ூ"), - ("ृ", "்ரு'"), - ("ॄ", "்ரூ'"), - ("ॢ", "்லு'"), - ("ॣ", "்லூ'"), + ("ृ", "்ருʼ"), + ("ॄ", "்ரூʼ"), + ("ॢ", "்லுʼ"), + ("ॣ", "்லூʼ"), ("ॆ", "ெ"), ("े", "ே"), ("ै", "ை"), ("ॊ", "ொ"), ("ो", "ோ"), ("ौ", "ௌ"), - ("ं", "ம்"), - ("ँ", ""), + ("ं", "ம்ʼ"), + ("ः", "꞉"), + ("ँ", "ம்ˮ"), ("्", "்"), ("क", "க"), - ("ख", "க"), - ("ग", "க"), - ("घ", "க"), + ("ख", "க²"), + ("ग", "க³"), + ("घ", "க⁴"), ("ङ", "ங"), ("च", "ச"), - ("छ", "ச"), + ("छ", "ச²"), ("ज", "ஜ"), - ("झ", "ச"), + ("झ", "ஜ²"), ("ञ", "ஞ"), ("ट", "ட"), - ("ठ", "ட"), - ("ड", "ட"), - ("ढ", "ட"), + ("ठ", "ட²"), + ("ड", "ட³"), + ("ढ", "ட⁴"), ("ण", "ண"), ("त", "த"), - ("थ", "த"), - ("द", "த"), - ("ध", "த"), + ("थ", 
"த²"), + ("द", "த³"), + ("ध", "த⁴"), ("न", "ந"), ("प", "ப"), - ("फ", "ப"), - ("ब", "ப"), - ("भ", "ப"), + ("फ", "ப²"), + ("ब", "ப³"), + ("भ", "ப⁴"), ("म", "ம"), ("य", "ய"), ("र", "ர"), @@ -1605,21 +2160,22 @@ pub const TAMIL: &[(&str, &str)] = &[ ("ळ", "ள"), ("क्ष", "க்ஷ"), ("ज्ञ", "ஜ்ஞ"), - ("०", "௦"), - ("१", "௧"), - ("२", "௨"), - ("३", "௩"), - ("४", "௪"), - ("५", "௫"), - ("६", "௬"), - ("७", "௭"), - ("८", "௮"), - ("९", "௯"), + ("०", "0"), + ("१", "1"), + ("२", "2"), + ("३", "3"), + ("४", "4"), + ("५", "5"), + ("६", "6"), + ("७", "7"), + ("८", "8"), + ("९", "9"), ("ॐ", "ௐ"), - ("ऽ", "ऽ"), - ("।", "।"), - ("॥", "॥"), - ("क़", "ஃ'க"), + ("ऽ", "(அ)"), + ("।", "."), + ("॥", ".."), + ("ॅ", "ே"), + ("क़", "ஃʼக"), ("ख़", "ஃக²"), ("ग़", "ஃக³"), ("ज़", "ஃஜ"), @@ -1735,6 +2291,104 @@ pub const TELUGU: &[(&str, &str)] = &[ ("᳚", "᳚"), ]; +pub const THAI: &[(&str, &str)] = &[ + ("अ", "อ"), + ("आ", "อา"), + ("इ", "อิ"), + ("ई", "อี"), + ("उ", "อุ"), + ("ऊ", "อู"), + ("ऋ", "ฤ"), + ("ॠ", "ฤๅ"), + ("ऌ", "ฦ"), + ("ॡ", "ฦๅ"), + ("ऎ", "เอะ"), + ("ए", "เอ"), + ("ऐ", "ไอ"), + ("ऒ", "โอะ"), + ("ओ", "โอ"), + ("औ", "เอา"), + ("ा", "า"), + ("ि", "ิ"), + ("ी", "ี"), + ("ु", "ุ"), + ("ू", "ู"), + ("ृ", "ฺฤ"), + ("ॄ", "ฺฤๅ"), + ("ॢ", "ฺฦ"), + ("ॣ", "ฺฦๅ"), + ("ॆ", "เะ"), + ("े", "เ"), + ("ै", "ไ"), + ("ॊ", "โะ"), + ("ो", "โ"), + ("ौ", "เา"), + ("ं", "ํ"), + ("ः", "ห์"), + ("ँ", "ํ"), + ("्", "ฺ"), + ("क", "ก"), + ("ख", "ข"), + ("ग", "ค"), + ("घ", "ฆ"), + ("ङ", "ง"), + ("च", "จ"), + ("छ", "ฉ"), + ("ज", "ช"), + ("झ", "ฌ"), + ("ञ", "ญ"), + ("ट", "ฏ"), + ("ठ", "ฐ"), + ("ड", "ฑ"), + ("ढ", "ฒ"), + ("ण", "ณ"), + ("त", "ต"), + ("थ", "ถ"), + ("द", "ท"), + ("ध", "ธ"), + ("न", "น"), + ("प", "ป"), + ("फ", "ผ"), + ("ब", "พ"), + ("भ", "ภ"), + ("म", "ม"), + ("य", "ย"), + ("र", "ร"), + ("ल", "ล"), + ("व", "ว"), + ("श", "ศ"), + ("ष", "ษ"), + ("स", "ส"), + ("ह", "ห"), + ("ळ", "ฬ"), + ("क्ष", "กฺษ"), + ("ज्ञ", "ชฺญ"), + ("०", "๐"), + ("१", "๑"), + ("२", "๒"), + ("३", "๓"), + ("४", "๔"), + ("५", "๕"), + ("६", "๖"), + ("७", "๗"), + ("८", "๘"), + ("९", "๙"), + ("ॐ", "โอํ"), + ("ऽ", "'"), + ("।", "ฯ"), + ("॥", "๚"), + ("ॅ", "แะ"), + ("क़", "ก"), + ("ख़", "ข"), + ("ग़", "ค"), + ("ज़", "ซ"), + ("ड़", "ร"), + ("ढ़", "รฺห"), + ("फ़", "ฟ"), + ("य़", "ย"), + ("ऱ", "ร"), +]; + pub const TIBETAN: &[(&str, &str)] = &[ ("अ", "ཨ"), ("आ", "ཨཱ"), @@ -1766,7 +2420,7 @@ pub const TIBETAN: &[(&str, &str)] = &[ ("ं", "ཾ"), ("ः", "ཿ"), ("ँ", "ྃ"), - ("्", ""), + ("्", "྄"), ("क", "ཀ"), ("ख", "ཁ"), ("ग", "ག"), @@ -1795,7 +2449,7 @@ pub const TIBETAN: &[(&str, &str)] = &[ ("य", "ཡ"), ("र", "ར"), ("ल", "ལ"), - ("व", "བ"), + ("व", "ཝ"), ("श", "ཤ"), ("ष", "ཥ"), ("स", "ས"), @@ -1829,6 +2483,104 @@ pub const TIBETAN: &[(&str, &str)] = &[ ("ऱ", "ར༹"), ]; +pub const TIRHUTA: &[(&str, &str)] = &[ + ("अ", "𑒁"), + ("आ", "𑒂"), + ("इ", "𑒃"), + ("ई", "𑒄"), + ("उ", "𑒅"), + ("ऊ", "𑒆"), + ("ऋ", "𑒇"), + ("ॠ", "𑒈"), + ("ऌ", "𑒉"), + ("ॡ", "𑒊"), + ("ऎ", "𑒁𑒺"), + ("ए", "𑒋"), + ("ऐ", "𑒌"), + ("ऒ", "𑒁𑒽"), + ("ओ", "𑒍"), + ("औ", "𑒎"), + ("ा", "𑒰"), + ("ि", "𑒱"), + ("ी", "𑒲"), + ("ु", "𑒳"), + ("ू", "𑒴"), + ("ृ", "𑒵"), + ("ॄ", "𑒶"), + ("ॢ", "𑒷"), + ("ॣ", "𑒸"), + ("ॆ", "𑒺"), + ("े", "𑒹"), + ("ै", "𑒻"), + ("ॊ", "𑒽"), + ("ो", "𑒼"), + ("ौ", "𑒾"), + ("ं", "𑓀"), + ("ः", "𑓁"), + ("ँ", "𑒿"), + ("्", "𑓂"), + ("क", "𑒏"), + ("ख", "𑒐"), + ("ग", "𑒑"), + ("घ", "𑒒"), + ("ङ", "𑒓"), + ("च", "𑒔"), + ("छ", "𑒕"), + ("ज", "𑒖"), + ("झ", "𑒗"), + ("ञ", "𑒘"), + ("ट", "𑒙"), + ("ठ", "𑒚"), + ("ड", "𑒛"), + ("ढ", "𑒜"), + ("ण", "𑒝"), + ("त", "𑒞"), + ("थ", "𑒟"), + ("द", "𑒠"), + ("ध", "𑒡"), + ("न", "𑒢"), + ("प", "𑒣"), 
+ ("फ", "𑒤"), + ("ब", "𑒥"), + ("भ", "𑒦"), + ("म", "𑒧"), + ("य", "𑒨"), + ("र", "𑒩"), + ("ल", "𑒪"), + ("व", "𑒫"), + ("श", "𑒬"), + ("ष", "𑒭"), + ("स", "𑒮"), + ("ह", "𑒯"), + ("ळ", "𑒝𑓃"), + ("क्ष", "𑒏𑓂𑒭"), + ("ज्ञ", "𑒖𑓂𑒘"), + ("०", "𑓐"), + ("१", "𑓑"), + ("२", "𑓒"), + ("३", "𑓓"), + ("४", "𑓔"), + ("५", "𑓕"), + ("६", "𑓖"), + ("७", "𑓗"), + ("८", "𑓘"), + ("९", "𑓙"), + ("ॐ", "𑓇"), + ("ऽ", "𑓄"), + ("।", "।"), + ("॥", "॥"), + ("ॅ", "𑒹"), + ("क़", "𑒏𑓃"), + ("ख़", "𑒐𑓃"), + ("ग़", "𑒑𑓃"), + ("ज़", "𑒖𑓃"), + ("ड़", "𑒛𑓃"), + ("ढ़", "𑒜𑓃"), + ("फ़", "𑒤𑓃"), + ("य़", "𑒨𑓃"), + ("ऱ", "𑒩"), +]; + pub const BARAHA: &[(&str, &str)] = &[ ("अ", "a"), ("आ", "A"), @@ -2328,8 +3080,6 @@ pub const ISO: &[(&str, &str)] = &[ ("व़", "w"), ("ऽ", "`"), ("ँ", "ṁ"), - ("ख़", "ḵẖ"), - ("ख़", "ḵh"), ("व़", "ẉ"), ("ग़", "g̠ẖ"), ("꣡", "¹"), @@ -2477,6 +3227,7 @@ pub const ITRANS: &[(&str, &str)] = &[ ("।", "."), ("॥", ".."), ("ज़", "J"), + ("ँ", "{\\m+}"), ]; pub const SLP1: &[(&str, &str)] = &[ diff --git a/vidyut-lipi/src/detect.rs b/vidyut-lipi/src/detect.rs index ca1967f..a6d9675 100644 --- a/vidyut-lipi/src/detect.rs +++ b/vidyut-lipi/src/detect.rs @@ -1,3 +1,5 @@ +//! Utilities for detecting the `Scheme` used by some text. + use crate::scheme::Scheme; /// Detcts the scheme used by the given text. @@ -27,8 +29,8 @@ use crate::scheme::Scheme; /// /// `detect` analyzes the input string by applying various heuristic tests. For non-ASCII scripts, /// `detect` checks whether characters are in a specific unicode range. For ASCII scripts, `detect` -/// checks for bigrams and trigrams associated with specific encodings (for example, "R^i" is -/// indicative of ITRANS). Currently, `detect` returns the first match found and does not do any +/// checks for bigrams and trigrams associated with specific encodings. (For example, `R^i` is +/// indicative of ITRANS.) Currently, `detect` returns the first match found and does not do any /// kind of scoring, ranking, statistical modeling, etc. /// /// Our goal is to provide an implementation that is fast, small, and good enough. In the future, @@ -42,6 +44,15 @@ fn detect_inner(input: &str) -> Option { type Range = std::ops::RangeInclusive; + // These are Latin supplements for IAST, ISO-15919, etc. + // + // - https://unicode.org/charts/PDF/U0080.pdf + // - https://unicode.org/charts/PDF/U0100.pdf + // - https://unicode.org/charts/PDF/U1E00.pdf + const LATIN_1_SUPPLEMENT: Range = 0x0080..=0x00ff; + const LATIN_EXTENDED_A: Range = 0x0100..=0x017f; + const LATIN_EXTENDED: Range = 0x01e00..=0x01eff; + // These are ranges of Unicode code points as defined by unicode.org. To see the official spec // for each scheme, see the comments on `Scheme`. 
const DEVANAGARI: Range = 0x0900..=0x097f; @@ -57,23 +68,23 @@ fn detect_inner(input: &str) -> Option { const KANNADA: Range = 0x0c80..=0x0cff; const MALAYALAM: Range = 0x0d00..=0x0d7f; const SINHALA: Range = 0x0d80..=0x0dff; - // const TIBETAN: Range = 0x0f00..=0x0fff; + const THAI: Range = 0x0e00..=0x0e7f; + const TIBETAN: Range = 0x0f00..=0x0fff; const BURMESE: Range = 0x1000..=0x109f; + const KHMER: Range = 0x1780..=0x17ff; const BALINESE: Range = 0x1b00..=0x1b7f; + const SAURASHTRA: Range = 0xa880..=0xa8df; const JAVANESE: Range = 0xa980..=0xa9df; const BRAHMI: Range = 0x11000..=0x1107f; + const SHARADA: Range = 0x11180..=0x111df; const GRANTHA: Range = 0x11300..=0x1137f; const SIDDHAM: Range = 0x11580..=0x115ff; - - //https://unicode.org/charts/PDF/U0100.pdf - const LATIN_1_SUPPLEMENT: Range = 0x0080..=0x00ff; - //https://unicode.org/charts/PDF/U0100.pdf - const LATIN_EXTENDED_A: Range = 0x0100..=0x017f; - // https://unicode.org/charts/PDF/U1E00.pdf - const LATIN_EXTENDED: Range = 0x01e00..=0x01eff; + const NEWA: Range = 0x11400..=0x1147f; + const TIRHUTA: Range = 0x11480..=0x114df; + const MODI: Range = 0x11600..=0x1165f; // Wraps all of the ranges above. - const INDIC: Range = *DEVANAGARI.start()..=*SIDDHAM.end(); + const INDIC: Range = *DEVANAGARI.start()..=*MODI.end(); const ASCII: Range = 0..=0xff; for (i, c) in input.char_indices() { @@ -81,7 +92,7 @@ fn detect_inner(input: &str) -> Option { // Rust supports [range matching][1], but only if the range is "inlined" and not in a // const. But having a bunch of inlined hex ranges (as opposed to our consts above) seems - // unreadable, so just use an if-else change. + // unreadable, so just use an if-else chain. // // [1]: https://doc.rust-lang.org/book/ch18-03-pattern-syntax.html if LATIN_1_SUPPLEMENT.contains(&code) @@ -94,13 +105,12 @@ fn detect_inner(input: &str) -> Option { return Some(Iast); } } else if INDIC.contains(&code) { - let maybe = if DEVANAGARI.contains(&code) { - Some(Devanagari) - } else if DEVANAGARI_EXTENDED.contains(&code) { - Some(Devanagari) - } else if DEVANAGARI_EXTENDED_A.contains(&code) { - Some(Devanagari) - } else if VEDIC_EXTENSIONS.contains(&code) { + println!("Checking code {code:x} from char {c} at index {i}"); + let maybe = if DEVANAGARI.contains(&code) + || DEVANAGARI_EXTENDED.contains(&code) + || DEVANAGARI_EXTENDED_A.contains(&code) + || VEDIC_EXTENSIONS.contains(&code) + { Some(Devanagari) } else if BENGALI.contains(&code) { Some(Bengali) @@ -120,18 +130,34 @@ fn detect_inner(input: &str) -> Option { Some(Malayalam) } else if SINHALA.contains(&code) { Some(Sinhala) + } else if THAI.contains(&code) { + Some(Thai) + } else if TIBETAN.contains(&code) { + Some(Tibetan) } else if BURMESE.contains(&code) { Some(Burmese) + } else if KHMER.contains(&code) { + Some(Khmer) } else if BALINESE.contains(&code) { Some(Balinese) + } else if SAURASHTRA.contains(&code) { + Some(Saurashtra) } else if JAVANESE.contains(&code) { Some(Javanese) } else if BRAHMI.contains(&code) { Some(Brahmi) + } else if SHARADA.contains(&code) { + Some(Sharada) } else if GRANTHA.contains(&code) { Some(Grantha) } else if SIDDHAM.contains(&code) { Some(Siddham) + } else if NEWA.contains(&code) { + Some(Newa) + } else if TIRHUTA.contains(&code) { + Some(Tirhuta) + } else if MODI.contains(&code) { + Some(Modi) } else { None }; @@ -166,9 +192,9 @@ fn detect_inner(input: &str) -> Option { return Some(Itrans); } else if SLP1_ONLY_BIGRAMS.contains(&bigram) { return Some(Slp1); - } else if VELTHUIS_ONLY_BIGRAMS.contains(&bigram) { - return 
Some(Velthuis); - } else if bigram[0] == b'.' && b"mhnrltds".contains(&bigram[1]) { + } else if VELTHUIS_ONLY_BIGRAMS.contains(&bigram) + || bigram[0] == b'.' && b"mhnrltds".contains(&bigram[1]) + { return Some(Velthuis); } } @@ -200,23 +226,30 @@ mod tests { const TEST_CASES: &[(&str, Scheme)] = &[ // Indic // ----- - ("नारायणः", Devanagari), - ("নারাযণঃ", Bengali), - ("ਗੁਰਮੁਖੀ", Gurmukhi), - ("નારાયણઃ", Gujarati), - ("ନାରାଯଣଃ", Odia), - ("நாராயணஃ", Tamil), - ("నారాయణః", Telugu), - ("ನಾರಾಯಣಃ", Kannada), - ("നാരായണഃ", Malayalam), - ("නාරායණඃ", Sinhala), - // ("སཾ་སྐྲྀ་ཏ་མ྄", Tibetan), - ("သံသ်ကၖတမ်", Burmese), - ("ᬲᬂᬲ᭄ᬓᬺᬢᬫ᭄", Balinese), - ("ꦱꦁꦱ꧀ꦏꦽꦠꦩ꧀", Javanese), - ("𑀦𑀸𑀭𑀸𑀬𑀡𑀂", Brahmi), - ("𑌨𑌾𑌰𑌾𑌯𑌣𑌃", Grantha), - ("𑖭𑖽𑖭𑖿𑖎𑖴𑖝𑖦𑖿", Siddham), + ("ᬅᬕ᭄ᬦᬶᬫ᭄", Balinese), + ("অগ্নিম্", Bengali), + ("𑀅𑀕𑁆𑀦𑀺𑀫𑁆", Brahmi), + ("အဂ်နိမ်", Burmese), + ("अग्निम्", Devanagari), + ("𑌅𑌗𑍍𑌨𑌿𑌮𑍍", Grantha), + ("ਅਗ੍ਨਿਮ੍", Gurmukhi), + ("અગ્નિમ્", Gujarati), + ("ꦄꦒ꧀ꦤꦶꦩ꧀", Javanese), + ("ಅಗ್ನಿಮ್", Kannada), + ("អគ្និម៑", Khmer), + ("അഗ്നിമ്", Malayalam), + ("𑘀𑘐𑘿𑘡𑘱𑘦𑘿", Modi), + ("𑐀𑐐𑑂𑐣𑐶𑐩𑑂", Newa), + ("ଅଗ୍ନିମ୍", Odia), + ("ꢂꢔ꣄ꢥꢶꢪ꣄", Saurashtra), + ("𑆃𑆓𑇀𑆤𑆴𑆩𑇀", Sharada), + ("𑖀𑖐𑖿𑖡𑖰𑖦𑖿", Siddham), + ("අග්නිම්", Sinhala), + ("அக்³நிம்", Tamil), + ("అగ్నిమ్", Telugu), + ("ཨགིམ", Tibetan), + ("𑒁𑒑𑓂𑒢𑒱𑒧𑓂", Tirhuta), + ("อคฺนิมฺ", Thai), // IAST // ---- ("rāga", Iast), diff --git a/vidyut-lipi/src/lib.rs b/vidyut-lipi/src/lib.rs index 05f32b8..5d46976 100644 --- a/vidyut-lipi/src/lib.rs +++ b/vidyut-lipi/src/lib.rs @@ -1,4 +1,3 @@ -//! Hacky transliteration functions that other crates might need. #![doc = include_str!("../README.md")] #![deny(missing_docs)] #![deny(clippy::unwrap_used)] @@ -8,6 +7,7 @@ mod detect; mod lipika; mod mapping; mod numerals; +mod reshape; mod scheme; mod transliterate; mod unicode_norm; diff --git a/vidyut-lipi/src/lipika.rs b/vidyut-lipi/src/lipika.rs index 78c11fc..066b833 100644 --- a/vidyut-lipi/src/lipika.rs +++ b/vidyut-lipi/src/lipika.rs @@ -1,3 +1,5 @@ +//! Provides a convenient transliteration API for end users. + use crate::mapping::Mapping; use crate::scheme::Scheme; use crate::transliterate::transliterate; @@ -6,6 +8,11 @@ use crate::transliterate::transliterate; const CACHE_CAPACITY: usize = 10; /// A `Mapping` as stored in `Lipika`'s internal cache. +/// +/// While creating a `Mapping` is cheap, doing so repeatedly within an inner loop will add some +/// unnecessary overhead. So, cache common mappings so that callers can reuse them. Essentially, we +/// are memoizing creating a `Mapping`. +#[derive(Clone, Eq, PartialEq)] struct CachedMapping { /// A "timestamp" that represents when the mapping was last used. stamp: i32, @@ -46,6 +53,7 @@ struct CachedMapping { /// let original = lipika.transliterate(deva, detected, Scheme::HarvardKyoto); /// assert_eq!(original, "saMskRtam"); /// ``` +#[derive(Clone, Default, Eq, PartialEq)] pub struct Lipika { cache: Vec, // Indicates when a mapping was last used. @@ -72,7 +80,7 @@ impl Lipika { /// For details on the underrlying algorithm, see comments on the `transliterate` method. pub fn transliterate(&mut self, input: impl AsRef, from: Scheme, to: Scheme) -> String { let mapping = self.find_or_create_mapping(from, to); - transliterate(input.as_ref(), &mapping) + transliterate(input.as_ref(), mapping) } /// Finds an existing mapping to reuse, or creates one if absent. 
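
The `CachedMapping` comments above describe `Lipika`'s cache as a small least-recently-used store with an integer stamp acting as the clock. The sketch below shows that idea in isolation; it is a simplified illustration with made-up names, not the crate's actual `find_or_create_mapping`, and it assumes only `Mapping::new` plus `Scheme` being `Copy` and comparable, as the tests elsewhere in this diff already rely on.

```rust
use vidyut_lipi::{Mapping, Scheme};

/// Simplified stand-in for `Lipika`'s internal cache (illustrative names only).
struct TinyCache {
    /// ((from, to), last-used stamp, cached mapping); at most `CAPACITY` entries.
    entries: Vec<((Scheme, Scheme), i32, Mapping)>,
    /// Monotonically increasing "timestamp".
    clock: i32,
}

impl TinyCache {
    const CAPACITY: usize = 10;

    fn get_or_create(&mut self, from: Scheme, to: Scheme) -> &Mapping {
        self.clock += 1;
        let idx = match self
            .entries
            .iter()
            .position(|(key, _, _)| *key == (from, to))
        {
            Some(i) => {
                // Cache hit: refresh the entry's timestamp and reuse it.
                self.entries[i].1 = self.clock;
                i
            }
            None => {
                // Cache miss: evict the least recently used entry if full ...
                if self.entries.len() >= Self::CAPACITY {
                    if let Some(oldest) =
                        (0..self.entries.len()).min_by_key(|&i| self.entries[i].1)
                    {
                        self.entries.remove(oldest);
                    }
                }
                // ... then build and cache a fresh `Mapping`.
                self.entries
                    .push(((from, to), self.clock, Mapping::new(from, to)));
                self.entries.len() - 1
            }
        };
        &self.entries[idx].2
    }
}
```

With the capacity fixed at 10, a linear scan over a small `Vec` is a reasonable fit and avoids the overhead of a hash map for such a tiny working set.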
diff --git a/vidyut-lipi/src/mapping.rs b/vidyut-lipi/src/mapping.rs index c1e8571..5eee585 100644 --- a/vidyut-lipi/src/mapping.rs +++ b/vidyut-lipi/src/mapping.rs @@ -3,9 +3,30 @@ use crate::scheme::Scheme; use rustc_hash::{FxHashMap, FxHashSet}; +/// An output token, which we append to our output string when transliterating. +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct Token { + /// The text of this token. + pub text: String, + /// The token type. `kind` controls how this token combines with neighboring tokens. + pub kind: TokenKind, +} + +impl Token { + /// Creates a new `Token`. + pub fn new(text: String, kind: TokenKind) -> Self { + Self { text, kind } + } + + /// Returns whether this token represents a consonant. + pub fn is_consonant(&self) -> bool { + self.kind == TokenKind::Consonant + } +} + /// Models how a token behaves in relation to other tokens. #[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] -enum TokenType { +pub(crate) enum TokenKind { /// A consonant. A following vowel generally a vowel mark. Consonant, /// A vowel mark, which generally must follow a consonant. @@ -14,43 +35,45 @@ enum TokenType { Other, } -fn decide_token_type(s: &str) -> TokenType { - const MARK_AA: u32 = 0x093e; - const MARK_AU: u32 = 0x094c; - const MARK_L: u32 = 0x0962; - const MARK_LL: u32 = 0x0963; - const MARK_PRISHTAMATRA_E: u32 = 0x094e; - const MARK_AW: u32 = 0x094f; - - const CONS_KA: u32 = 0x0915; - const CONS_HA: u32 = 0x0939; - const CONS_QA: u32 = 0x0958; - const CONS_YYA: u32 = 0x095f; - const CONS_DDDA: u32 = 0x097e; - const CONS_BBA: u32 = 0x097f; - const NUKTA: u32 = 0x093c; - - if let Some(c) = s.chars().last() { - let code = c as u32; - if (code >= MARK_AA && code <= MARK_AU) - || code == MARK_PRISHTAMATRA_E - || code == MARK_AW - || code == MARK_L - || code == MARK_LL - { - TokenType::VowelMark - } else if (code >= CONS_KA && code <= CONS_HA) - || (code >= CONS_QA && code <= CONS_YYA) - || code == CONS_DDDA - || code == CONS_BBA - || code == NUKTA - { - TokenType::Consonant +impl TokenKind { + fn from_devanagari_key(s: &str) -> Self { + const MARK_AA: u32 = 0x093e; + const MARK_AU: u32 = 0x094c; + const MARK_L: u32 = 0x0962; + const MARK_LL: u32 = 0x0963; + const MARK_PRISHTAMATRA_E: u32 = 0x094e; + const MARK_AW: u32 = 0x094f; + + const CONS_KA: u32 = 0x0915; + const CONS_HA: u32 = 0x0939; + const CONS_QA: u32 = 0x0958; + const CONS_YYA: u32 = 0x095f; + const CONS_DDDA: u32 = 0x097e; + const CONS_BBA: u32 = 0x097f; + const NUKTA: u32 = 0x093c; + + if let Some(c) = s.chars().last() { + let code = c as u32; + if (MARK_AA..=MARK_AU).contains(&code) + || code == MARK_PRISHTAMATRA_E + || code == MARK_AW + || code == MARK_L + || code == MARK_LL + { + TokenKind::VowelMark + } else if (CONS_KA..=CONS_HA).contains(&code) + || (CONS_QA..=CONS_YYA).contains(&code) + || code == CONS_DDDA + || code == CONS_BBA + || code == NUKTA + { + TokenKind::Consonant + } else { + TokenKind::Other + } } else { - TokenType::Other + TokenKind::Other } - } else { - TokenType::Other } } @@ -174,8 +197,8 @@ impl OneWayMapping { let v = vals.first()?; out.push_str(v); - let token_type = decide_token_type(&deva_char); - if self.to_scheme.is_alphabet() && token_type == TokenType::Consonant { + let token_kind = TokenKind::from_devanagari_key(&deva_char); + if self.to_scheme.is_alphabet() && token_kind == TokenKind::Consonant { out.push('a'); } } @@ -192,6 +215,10 @@ impl OneWayMapping { } } + pub(crate) fn get(&self, key: &str) -> Option<&Vec> { + self.data.get(key) + } + #[allow(unused)] 
pub(crate) fn dump(&self) { let mut items: Vec<_> = self.data.iter().collect(); @@ -211,11 +238,10 @@ impl OneWayMapping { pub struct Mapping { pub(crate) from: Scheme, pub(crate) to: Scheme, - pub(crate) all: FxHashMap, + pub(crate) all: FxHashMap, pub(crate) marks: FxHashMap, pub(crate) input_virama: String, pub(crate) output_virama: String, - pub(crate) consonants: FxHashMap, pub(crate) len_longest_key: usize, pub(crate) numeral_to_int: FxHashMap, pub(crate) int_to_numeral: FxHashMap, @@ -230,7 +256,7 @@ impl Mapping { /// We start with two mappings: one from `A` to `X`, and one from `B` to `X`. Here `A` is our /// input scheme, `B` is our output scheme, and `X` is our intermediate representation. /// - /// If we reverse our `B to `X` mapping to get an `X` to `B` mapping, we can join these two + /// If we reverse our `B` to `X` mapping to get an `X` to `B` mapping, we can join these two /// mappings to get an `A` to `B` mapping. This approach is workable but needs extra support /// for these two cases: /// @@ -238,7 +264,7 @@ impl Mapping { /// where `|` is an SLP1 character and `ळ` is not defined in B. In this case, we /// transliterate `x` to scheme `B` then programmatically create a new `a --> b` mapping. /// - /// 2. A mapping `x --> b` without a corresponding `a --> b`. For example, consider "ळ --> |`, + /// 2. A mapping `x --> b` without a corresponding `a --> x`. For example, consider `ळ --> |`, /// where `|` is again an SLP1 character and `ळ` is not defined in A. In this case, we /// transliterate `x` to scheme `A` then programmatically create a new `a --> b` mapping. pub fn new(from: Scheme, to: Scheme) -> Mapping { @@ -250,16 +276,14 @@ impl Mapping { let mut all = FxHashMap::default(); let mut marks = FxHashMap::default(); - let mut consonants = FxHashMap::default(); let mut seen_b: FxHashSet<&str> = FxHashSet::default(); // Iterate over `from.token_pairs()` so that we maintain a predictable input order. for (deva_key, _) in from.token_pairs() { // But, use the values in `a_map` instead of the values from `token_pairs` so that we // pick up Unicode equivalents. - for a in a_map.data.get(*deva_key).expect("present") { - let token_type = decide_token_type(deva_key); - let bs = match b_map.data.get(*deva_key) { + for a in a_map.get(deva_key).expect("present") { + let bs = match b_map.get(deva_key) { Some(bs) => bs, None => continue, }; @@ -268,14 +292,9 @@ impl Mapping { None => continue, }; - match token_type { - TokenType::VowelMark => { - marks.insert(a.to_string(), b.to_string()); - } - TokenType::Consonant => { - consonants.insert(a.to_string(), b.to_string()); - } - TokenType::Other => (), + let token_kind = TokenKind::from_devanagari_key(deva_key); + if token_kind == TokenKind::VowelMark { + marks.insert(a.to_string(), b.to_string()); } // Insert only the first match seen. Consequences: @@ -285,15 +304,15 @@ impl Mapping { // // - If a sound has alternates, we store only the first. if !all.contains_key(a) { - all.insert(a.to_string(), b.to_string()); + all.insert(a.to_string(), Token::new(b.to_string(), token_kind)); seen_b.insert(b); } } } for (deva_key, a) in from.token_pairs() { - let token_type = decide_token_type(deva_key); - if !all.contains_key(*a) && b_map.data.get(*deva_key).is_none() { + let token_kind = TokenKind::from_devanagari_key(deva_key); + if !all.contains_key(*a) && b_map.get(deva_key).is_none() { // Mapping `a --> x` doesn't have a corresponding `x --> b`. // So, create one. 
let new_b = match b_map.transliterate_key(deva_key) { @@ -301,16 +320,10 @@ impl Mapping { None => continue, }; - match token_type { - TokenType::VowelMark => { - marks.insert(a.to_string(), new_b.clone()); - } - TokenType::Consonant => { - consonants.insert(a.to_string(), new_b.clone()); - } - TokenType::Other => (), + if token_kind == TokenKind::VowelMark { + marks.insert(a.to_string(), new_b.clone()); } - all.insert(a.to_string(), new_b); + all.insert(a.to_string(), Token::new(new_b, token_kind)); } } @@ -324,19 +337,13 @@ impl Mapping { None => continue, }; - let token_type = decide_token_type(deva_key); + let token_kind = TokenKind::from_devanagari_key(deva_key); if !new_a.is_empty() && !all.contains_key(&new_a) { - match token_type { - TokenType::VowelMark => { - marks.insert(new_a.clone(), b.to_string()); - } - TokenType::Consonant => { - consonants.insert(new_a.clone(), b.to_string()); - } - TokenType::Other => (), + if token_kind == TokenKind::VowelMark { + marks.insert(new_a.clone(), b.to_string()); } - all.insert(new_a, b.to_string()); + all.insert(new_a, Token::new(b.to_string(), token_kind)); } } @@ -351,7 +358,6 @@ impl Mapping { to, all, marks, - consonants, input_virama: a_map.virama, output_virama: b_map.virama, len_longest_key, @@ -370,7 +376,7 @@ impl Mapping { self.to } - pub(crate) fn get(&self, key: &str) -> Option<&String> { + pub(crate) fn get(&self, key: &str) -> Option<&Token> { self.all.get(key) } @@ -380,8 +386,8 @@ impl Mapping { items.sort_by(|x, y| x.0.cmp(y.0)); for (k, v) in items { let k_codes: Vec<_> = k.chars().map(|c| c as u32).collect(); - let v_codes: Vec<_> = v.chars().map(|c| c as u32).collect(); - println!("{k} ({k_codes:x?}) --> {v} ({v_codes:x?})"); + let v_codes: Vec<_> = v.text.chars().map(|c| c as u32).collect(); + println!("{k} ({k_codes:x?}) --> {} ({v_codes:x?})", v.text); } } } @@ -393,9 +399,9 @@ mod tests { #[test] fn test_decide_token_type() { - let is_mark = |c| decide_token_type(c) == TokenType::VowelMark; - let is_consonant = |c| decide_token_type(c) == TokenType::Consonant; - let is_other = |c| decide_token_type(c) == TokenType::Other; + let is_mark = |c| TokenKind::from_devanagari_key(c) == TokenKind::VowelMark; + let is_consonant = |c| TokenKind::from_devanagari_key(c) == TokenKind::Consonant; + let is_other = |c| TokenKind::from_devanagari_key(c) == TokenKind::Other; assert!(is_mark("\u{093e}")); assert!(is_mark("\u{093f}")); @@ -445,31 +451,36 @@ mod tests { #[test] fn test_mapping() { + let other = |x: &str| Token::new(x.to_string(), TokenKind::Other); + let mark = |x: &str| Token::new(x.to_string(), TokenKind::VowelMark); + let m = Mapping::new(Devanagari, Itrans); + assert_eq!(m.from(), Devanagari); assert_eq!(m.to(), Itrans); - let assert_has = |m: &Mapping, x: &str, y: &str| { - assert_eq!(m.get(x), Some(&y.to_string())); + let assert_has = |m: &Mapping, x: &str, y: &Token| { + assert_eq!(m.get(x).unwrap(), y); }; let m = Mapping::new(Devanagari, Itrans); - assert_has(&m, "आ", "A"); - assert_has(&m, "\u{093e}", "A"); - assert_has(&m, "ए", "e"); - assert_has(&m, "\u{0947}", "e"); + assert_has(&m, "आ", &other("A")); + assert_has(&m, "\u{093e}", &mark("A")); + assert_has(&m, "ए", &other("e")); + assert_has(&m, "\u{0947}", &mark("e")); let m = Mapping::new(Bengali, Itrans); - assert_has(&m, "\u{09be}", "A"); - assert_has(&m, "\u{09c7}", "e"); + assert_has(&m, "\u{09be}", &mark("A")); + assert_has(&m, "\u{09c7}", &mark("e")); } #[test] fn test_mapping_with_unicode_decompositions() { // Maps to NFD let m = Mapping::new(Velthuis, 
Devanagari); - assert_eq!(m.get("R").unwrap(), "\u{0921}\u{093c}"); - assert_eq!(m.get("Rh").unwrap(), "\u{0922}\u{093c}"); + let cons = |x: &str| Token::new(x.to_string(), TokenKind::Consonant); + assert_eq!(m.get("R").unwrap(), &cons("\u{0921}\u{093c}")); + assert_eq!(m.get("Rh").unwrap(), &cons("\u{0922}\u{093c}")); // Maps from NFD and composed let m = Mapping::new(Devanagari, Velthuis); @@ -484,8 +495,8 @@ mod tests { assert_eq!(velthuis.data.get("\u{0921}\u{093c}").unwrap(), &vec!["R"]); assert_eq!(velthuis.data.get("\u{095c}"), None); - assert_eq!(m.get("\u{0921}\u{093c}").unwrap(), "R"); - assert_eq!(m.get("\u{095c}").unwrap(), "R"); - assert_eq!(m.get("\u{095d}").unwrap(), "Rh"); + assert_eq!(m.get("\u{0921}\u{093c}").unwrap(), &cons("R")); + assert_eq!(m.get("\u{095c}").unwrap(), &cons("R")); + assert_eq!(m.get("\u{095d}").unwrap(), &cons("Rh")); } } diff --git a/vidyut-lipi/src/numerals.rs b/vidyut-lipi/src/numerals.rs index 063122e..f0739a7 100644 --- a/vidyut-lipi/src/numerals.rs +++ b/vidyut-lipi/src/numerals.rs @@ -22,11 +22,7 @@ fn grantha_to_decimal(buffer: &mut String, numeral: &str, digit_to_int: &DigitTo .flat_map(|digit| { let mut temp = [0u8; 4]; let digit_str = digit.encode_utf8(&mut temp); - if let Some(i) = digit_to_int.get(digit_str) { - Some(*i as u8) - } else { - None - } + digit_to_int.get(digit_str).map(|i| *i as u8) }) .collect(); // Pad so that we can iterate by chunks of `CHUNK_SIZE`. @@ -34,12 +30,12 @@ fn grantha_to_decimal(buffer: &mut String, numeral: &str, digit_to_int: &DigitTo ints.push(0); } // Reorder so most significant digit is first. - ints.iter().rev().map(|d| *d).collect() + ints.iter().rev().copied().collect() }; // Special case for 0. if ints.iter().all(|d| *d == 0) { - buffer.push_str("௦"); + buffer.push('௦'); return; } @@ -213,17 +209,16 @@ fn decimal_to_grantha(buffer: &mut String, grantha: &str, int_to_digit: &IntToDi /// Procedure: /// - If from `Grantha` or to `Grantha`, use Grantha-specific logic. /// - Otherwise, transliterate digit by digit. -pub fn transliterate_numeral(buffer: &mut String, numerals: &[char], mapping: &Mapping) { - let numeral: String = numerals.iter().collect(); +pub fn transliterate_numeral(buffer: &mut String, numeral: &str, mapping: &Mapping) { if mapping.from() == mapping.to() { // Leave the number unchanged. - buffer.push_str(&numeral); + buffer.push_str(numeral); } else if mapping.from() == Scheme::Grantha { // Convert to Grantha place notation. - decimal_to_grantha(buffer, &numeral, &mapping.int_to_numeral); + decimal_to_grantha(buffer, numeral, &mapping.int_to_numeral); } else if mapping.to() == Scheme::Grantha { // Convert from Grantha place notation. - grantha_to_decimal(buffer, &numeral, &mapping.numeral_to_int) + grantha_to_decimal(buffer, numeral, &mapping.numeral_to_int) } else { // For decimal-decimal, transliterate one char at a time. for glyph in numeral.chars().flat_map(|c| { @@ -231,7 +226,7 @@ pub fn transliterate_numeral(buffer: &mut String, numerals: &[char], mapping: &M let glyph_str = c.encode_utf8(&mut temp); mapping.all.get(glyph_str) }) { - buffer.push_str(&glyph); + buffer.push_str(&glyph.text); } } } diff --git a/vidyut-lipi/src/reshape.rs b/vidyut-lipi/src/reshape.rs new file mode 100644 index 0000000..dbd9dfa --- /dev/null +++ b/vidyut-lipi/src/reshape.rs @@ -0,0 +1,362 @@ +//! Utilities for reshaping text before and after transliteration. +//! +//! When transliterating simple schemes, we can process the input string in a single left-to-right +//! pass. 
Backtracking, if needed at all, is minimal and limited to single characters, such as the +//! virama. +//! +//! Some schemes, however, are complex enough that they are challenging to process in a single +//! left-to-right pass. Such schemes typically store characters in a byte order that differs from +//! their phonetic order. For example, the Tamil superscript sequence "கோ⁴" ("gho") stores its +//! numeric superscript (which describes the consonant) after the vowel sign. +//! +//! This module provides utilities for reshaping a piece of text both before transliteration (so +//! that the text's phonetic and byte orders match) and after transliteration (so that the final +//! string matches the conventions of the output scheme). +use crate::mapping::Mapping; +use crate::scheme::Scheme; +use std::borrow::Cow; + +/// "ra" consonant +const KHMER_LETTER_RO: char = '\u{179a}'; + +/// Special diacritic for repha. It follows (in byte order) the consonant it precedes (in +/// pronunciation order). +const KHMER_SIGN_ROBAT: char = '\u{17cc}'; + +/// Virama +const KHMER_SIGN_VIRIAM: char = '\u{17d1}'; + +/// Like virama, but indicates that next char should be subscripted. +const KHMER_SIGN_COENG: char = '\u{17d2}'; + +/// Used instead of space (' ') in tibetan +const TIBETAN_MARK_INTERSYLLABLIC_TSHEG: char = '\u{0f0b}'; + +/// Tibetan "ba" +const TIBETAN_LETTER_BA: char = '\u{0f56}'; + +/// Tibetan "va" (archaic) +const TIBETAN_LETTER_WA: char = '\u{0f5d}'; + +/// Unsure how to use this. For now, used during transliteration and removed afterward. +const TIBETAN_MARK_HALANTA: char = '\u{0f84}'; + +/// The second component of a voiced aspirate stop consonant (gha, jha, ...). +const TIBETAN_SUBJOINED_LETTER_HA: char = '\u{0fb7}'; + +fn is_khmer_consonant(c: char) -> bool { + // Range is defined in Khmer unicode spec. + ('\u{1780}'..='\u{17a2}').contains(&c) +} + +fn is_tamil_superscript(c: char) -> bool { + ['²', '³', '⁴'].contains(&c) +} + +/// Returns whether `c` denotes a Tamil marker that must precede the superscript sign. +fn is_tamil_preceding_mark(c: char) -> bool { + const TA_VOWEL_AA: char = '\u{0bbe}'; + const TA_VIRAMA: char = '\u{0bcd}'; + (TA_VOWEL_AA..=TA_VIRAMA).contains(&c) +} + +/// Returns whether `c` denotes a Thai vowel sign that must precede the consonant it modifies. +fn is_thai_preceding_vowel(c: char) -> bool { + // Range is defined in Thai unicode spec. + ('\u{0e40}'..='\u{0e44}').contains(&c) +} + +fn is_thai_consonant(c: char) -> bool { + // Range is defined in Thai unicode spec. Ignore THAI CHARACTER O ANG, which is used for + // independent vowels. + const THAI_CHARACTER_O_ANG: char = '\u{0e2d}'; + ('\u{0e01}'..='\u{0e2e}').contains(&c) && c != THAI_CHARACTER_O_ANG +} + +fn is_tibetan_r_l_vowel_mark(x: char, y: char) -> bool { + // (x is subjoined r OR l) AND (y is vowel sign aa OR i OR ii) + // + // The vowel sign condition is complex because the "original" code points are decomposed in + // practice. + matches!(x, '\u{0fb2}' | '\u{0fb3}') && matches!(y, '\u{0f71}' | '\u{0f80}' | '\u{0f81}') +} + +fn is_tibetan_standard_consonant(c: char) -> bool { + // Range is defined in the Tibetan unicode spec. + ('\u{0f40}'..='\u{0f6a}').contains(&c) +} + +/// Converts a Tibetan subjoined consonant to a standard consonant, or `None` if the input is not a +/// subjoined consonant. 
+fn to_tibetan_standard_consonant(c: char) -> Option { + if is_tibetan_subjoined_consonant(c) { + let code = c as u32; + char::from_u32(code - 0x0050) + } else { + None + } +} + +fn is_tibetan_subjoined_consonant(c: char) -> bool { + // Range is defined in the Tibetan unicode spec. + ('\u{0f90}'..='\u{0fb9}').contains(&c) +} + +/// Converts a Tibetan standard consonant to a subjoined consonant, or `None` if the input is not a +/// standard consonant. +fn to_tibetan_subjoined_consonant(c: char) -> Option { + if is_tibetan_standard_consonant(c) { + let code = c as u32; + char::from_u32(code + 0x0050) + } else { + None + } +} + +/// Returns the number of bytes taken by the first `num_chars` chars in `text`. +fn chars_to_byte_offset(text: &str, num_chars: usize) -> usize { + text.chars().take(num_chars).map(|c| c.len_utf8()).sum() +} + +pub fn reshape_before<'a>(input: &'a str, mapping: &Mapping) -> Cow<'a, str> { + match mapping.from() { + Scheme::Khmer => { + let mut ret = String::new(); + let mut i = 0; + let seek = |i, num_chars| i + chars_to_byte_offset(&input[i..], num_chars); + + // TODO: rewrite anusvara per Aksharamukha. + while i < input.len() { + let mut chars = input[i..].chars(); + let x = chars.next().expect("text remaining"); + let y = chars.next(); + + if x == KHMER_SIGN_COENG { + // COENG + (cons) --> VIRIAM + (cons) + if y.map_or(false, is_khmer_consonant) { + ret.push(KHMER_SIGN_VIRIAM); + i = seek(i, 1); + continue; + } + } else if is_khmer_consonant(x) { + // (cons) + ROBAT --> RO + VIRIAM + (cons) + if y == Some(KHMER_SIGN_ROBAT) { + ret.extend([KHMER_LETTER_RO, KHMER_SIGN_VIRIAM, x]); + i = seek(i, 2); + continue; + } + } + ret.push(x); + i = seek(i, 1); + } + Cow::Owned(ret) + } + Scheme::Tamil => { + let mut ret = String::new(); + let mut chars = input.chars(); + + // Move superscripts next to the consonants they modify. + while let Some(x) = chars.next() { + if is_tamil_preceding_mark(x) { + let y = chars.next(); + if y.map_or(false, is_tamil_superscript) { + y.map(|c| ret.push(c)); + ret.push(x); + } else { + ret.push(x); + y.map(|c| ret.push(c)); + } + } else { + ret.push(x) + } + } + Cow::Owned(ret) + } + Scheme::Thai => { + // Move certain Thai vowel signs right by one index. + // + // For Thai, a vowel mark that appears visually to the left of a consonant is stored + // logically before the consonant. + let mut ret = String::new(); + let mut chars = input.chars(); + while let Some(x) = chars.next() { + if is_thai_preceding_vowel(x) { + let y = chars.next(); + if let Some(y) = y { + if is_thai_consonant(y) { + ret.extend(&[y, x]); + } else { + ret.extend(&[x, y]); + } + } else { + ret.push(x); + } + } else { + ret.push(x); + } + } + Cow::Owned(ret) + } + Scheme::Tibetan => { + const TIBETAN_CATURTHA_HALF: &[char] = + &['\u{0f42}', '\u{0f4c}', '\u{0f51}', '\u{0f56}', '\u{0f5b}']; + + let seek = |i, n| i + chars_to_byte_offset(&input[i..], n); + + let mut ret = String::new(); + let mut i = 0; + while i < input.len() { + let mut chars = input[i..].chars(); + let x = chars.next().expect("text remaining"); + + if x == TIBETAN_MARK_INTERSYLLABLIC_TSHEG { + // tsheg --> space + ret.push(' '); + } else { + // (subjoined cons) --> virama + (cons) + let x_new = to_tibetan_standard_consonant(x); + if let Some(x_new) = x_new { + // Unwrap to dummy characters for simpler logic below. 
+ let w = ret.chars().last().unwrap_or('_'); + let y = chars.next().unwrap_or('_'); + let is_voiced_aspirated_consonant = + TIBETAN_CATURTHA_HALF.contains(&w) && x == TIBETAN_SUBJOINED_LETTER_HA; + + if is_voiced_aspirated_consonant || is_tibetan_r_l_vowel_mark(x, y) { + // But, not for voiced aspirated consonants, which we transliterate as + // single units. + // + // Nor for certain dependent vowel marks (SLP f, F, x, X), which we + // likewise transliterate as signle units. + ret.push(x); + } else { + ret.extend(&[TIBETAN_MARK_HALANTA, x_new]); + } + } else { + ret.push(x); + } + } + + i = seek(i, 1); + } + Cow::Owned(ret) + } + _ => Cow::Borrowed(input), + } +} + +pub fn reshape_after(output: String, mapping: &Mapping) -> String { + match mapping.to() { + Scheme::Khmer => { + let mut ret = String::new(); + let mut i = 0; + + let seek = |i, n| i + chars_to_byte_offset(&output[i..], n); + + // TODO: rewrite anusvara per Aksharamukha. + while i < output.len() { + let mut chars = output[i..].chars(); + let x = chars.next().expect("text remaining"); + let y = chars.next(); + + if x == KHMER_SIGN_VIRIAM { + // VIRIAM + (cons) --> COENG + (cons) + if y.map_or(false, is_khmer_consonant) { + ret.push(KHMER_SIGN_COENG); + i = seek(i, 1); + continue; + }; + } else if x == KHMER_LETTER_RO { + // RO + VIRIAM + (cons) --> (cons) + ROBAT + let z = chars.next(); + if y == Some(KHMER_SIGN_VIRIAM) && z.map_or(false, is_khmer_consonant) { + let z = z.expect("is consonant"); + ret.extend(&[z, KHMER_SIGN_ROBAT]); + i = seek(i, 3); + continue; + } + } + ret.push(x); + i = seek(i, 1) + } + ret + } + Scheme::Tamil => { + let mut ret = String::new(); + let mut chars = output.chars(); + + // Move superscripts after the marks they should follow. + while let Some(x) = chars.next() { + if is_tamil_superscript(x) { + let y = chars.next(); + if y.map_or(false, is_tamil_preceding_mark) { + y.map(|c| ret.push(c)); + ret.push(x); + } else { + ret.push(x); + y.map(|c| ret.push(c)); + } + } else { + ret.push(x) + } + } + ret + } + Scheme::Thai => { + let mut ret = String::new(); + for y in output.chars() { + if is_thai_preceding_vowel(y) { + if let Some(x) = ret.pop() { + if is_thai_consonant(x) { + ret.extend(&[y, x]); + } else { + ret.extend(&[x, y]); + } + } else { + ret.push(y); + } + } else { + ret.push(y); + } + } + ret + } + Scheme::Tibetan => { + let mut ret = String::new(); + let mut i = 0; + + let seek = |i, n| i + chars_to_byte_offset(&output[i..], n); + + while i < output.len() { + let mut chars = output[i..].chars(); + let x = chars.next().expect("text remaining"); + if x == ' ' { + // space --> tsheg + ret.push(TIBETAN_MARK_INTERSYLLABLIC_TSHEG); + i = seek(i, 1); + } else if x == TIBETAN_LETTER_WA { + // va --> ba + // + // This is for consistency with Aksharamukha. + ret.push(TIBETAN_LETTER_BA); + i = seek(i, 1); + } else if x == TIBETAN_MARK_HALANTA { + // virama + (cons) --> (subjoined cons) + let maybe_y = chars.next().and_then(to_tibetan_subjoined_consonant); + if let Some(y) = maybe_y { + ret.push(y); + i = seek(i, 2); + } else { + // Don't push halanta, per Aksharamukha. + i = seek(i, 1); + } + } else { + ret.push(x); + i = seek(i, 1); + } + } + ret + } + _ => output, + } +} diff --git a/vidyut-lipi/src/scheme.rs b/vidyut-lipi/src/scheme.rs index ba9283a..194881b 100644 --- a/vidyut-lipi/src/scheme.rs +++ b/vidyut-lipi/src/scheme.rs @@ -34,104 +34,149 @@ pub(crate) enum Coverage { pub enum Scheme { /// Balinese script. 
/// - /// https://unicode.org/charts/PDF/U1B00.pdf + /// Docs: Balinese, /// Bengali script. /// - /// https://unicode.org/charts/PDF/U0980.pdf + /// Docs: Bengali, /// Brahmi script. /// - /// https://unicode.org/charts/PDF/U11000.pdf + /// Docs: Brahmi, /// Burmese script. /// - /// https://unicode.org/charts/PDF/U1000.pdf + /// Docs: Burmese, + /// Cham script. + /// + /// + // Cham, + /// Devanagari script. /// - /// https://unicode.org/charts/PDF/U0900.pdf - /// https://unicode.org/charts/PDF/UA8E0.pdf (Devanagari Extended) - /// https://unicode.org/charts/PDF/U11B00.pdf (Devanagari Extended-A) - /// https://unicode.org/charts/PDF/U1CD0.pdf (Vedic Extensions) + /// Docs: + /// - + /// - (Devanagari Extended) + /// - (Devanagari Extended-A) + /// - (Vedic Extensions) Devanagari, /// Gujarati script. /// - /// https://unicode.org/charts/PDF/U0A80.pdf + /// Docs: Gujarati, /// Grantha script. /// - /// Documentation: - /// - http://www.unicode.org/charts/PDF/U11300.pdf - /// - https://unicode.org/L2/L2009/09372-grantha.pdf + /// Docs: + /// - + /// - Grantha, /// Gurmukhi script. /// - /// https://unicode.org/charts/PDF/U0A00.pdf + /// Docs: Gurmukhi, /// Javanese script. /// - /// https://unicode.org/charts/PDF/UA980.pdf + /// Docs: Javanese, /// Kannada script. /// - /// https://unicode.org/charts/PDF/U0C80.pdf + /// Docs: Kannada, + /// Khmer script. + /// + /// + Khmer, + /// Malayalam script. /// - /// https://unicode.org/charts/PDF/U0D00.pdf + /// Docs: Malayalam, + /// Modi script. + /// + /// + Modi, + + /// Lao script. + /// + /// Documentation: + /// - + /// - + // Lao, + + /// Newa script. + /// + /// + Newa, + /// Odia script. /// - /// https://unicode.org/charts/PDF/U0B00.pdf + /// Docs: Odia, + /// Saurashtra script. + /// + /// Docs: + Saurashtra, + /// Sharada script. /// - /// https://unicode.org/charts/PDF/U11180.pdf + /// Docs: Sharada, /// Siddham script. /// - /// https://unicode.org/charts/PDF/U11580.pdf + /// Docs: Siddham, /// Sinhala script. /// - /// https://unicode.org/charts/PDF/U0D80.pdf + /// Docs: Sinhala, /// Tamil script. /// - /// https://unicode.org/charts/PDF/U0B80.pdf + /// Docs: Tamil, + /// Telugu script. + /// + /// Docs: + Telugu, + + /// Thai script. + /// + /// + Thai, + /// Tibetan script. /// - /// https://unicode.org/charts/PDF/U0F00.pdf - // Tibetan, + /// **Status: buggy and partial.** + /// + /// Docs: + Tibetan, - /// Telugu script. + /// Tirhuta script. /// - /// https://unicode.org/charts/PDF/U0C00.pdf - Telugu, + /// Docs: + Tirhuta, /// Baraha transliteration. /// - /// Documentation: - /// - https://baraha.com/help//Keyboards/dev-phonetic.htm (Baraha North) - /// - https://baraha.com/help/special-symbols.htm + /// Docs: + /// - (Baraha North) + /// - BarahaSouth, /// Harvard-Kyoto transliteration. @@ -149,19 +194,25 @@ pub enum Scheme { /// TODO: find a free documentation link for ISO 15919. Iso15919, - /// ITRANS transliteration. + /// ITRANS 5.3 transliteration. + /// + /// Docs: + /// - https://www.aczoom.com/itrans/ (official ITRANS site for version 5.3) + /// - https://www.aczoom.com/itrans/html/dvng/node3.html (DEVNAG table) + /// - http://www.sanskritweb.net/itrans/itmanual2003.pdf (Itranslator 2003 manual) /// - /// https://www.aczoom.com/itrans/online/itrans6/itrans-tables-unicode.pdf + /// ITRANS appears in various versions, some of which conflict with each other. Version 5.3 + /// seems to be the most widely used, and it is supported by software like Itranslator 2003. Itrans, /// SLP1 transliteration. 
 ///
- /// https://www.sanskritlibrary.org/pub/SLP1LiesAppendixB.pdf
+ /// Docs: <https://www.sanskritlibrary.org/pub/SLP1LiesAppendixB.pdf>
 Slp1,

 /// Velthuis transliteration.
 ///
- /// https://mirrors.mit.edu/CTAN/language/devanagari/velthuis/doc/manual.pdf
+ /// Docs: <https://mirrors.mit.edu/CTAN/language/devanagari/velthuis/doc/manual.pdf>
 Velthuis,

 /// WX transliteration.
@@ -179,6 +230,7 @@ impl Scheme {
 use Scheme::*;
 const SCHEMES: &[Scheme] = &[
 Balinese,
+ BarahaSouth,
 Bengali,
 Brahmi,
 Burmese,
@@ -186,20 +238,27 @@ impl Scheme {
 Grantha,
 Gujarati,
 Gurmukhi,
- BarahaSouth,
 HarvardKyoto,
 Iast,
+ Iso15919,
 Itrans,
 Javanese,
 Kannada,
+ Khmer,
 Malayalam,
+ Modi,
+ Newa,
 Odia,
+ Saurashtra,
 Sharada,
 Siddham,
 Sinhala,
 Slp1,
 Tamil,
 Telugu,
+ Thai,
+ Tibetan,
+ Tirhuta,
 Velthuis,
 Wx,
 ];
@@ -214,20 +273,28 @@ impl Scheme {
 Scheme::Bengali => auto::BENGALI,
 Scheme::Brahmi => auto::BRAHMI,
 Scheme::Burmese => auto::BURMESE,
+ // Scheme::Cham => auto::CHAM,
 Scheme::Devanagari => auto::DEVANAGARI,
 Scheme::Gujarati => auto::GUJARATI,
 Scheme::Gurmukhi => auto::GURMUKHI,
 Scheme::Grantha => auto::GRANTHA,
 Scheme::Javanese => auto::JAVANESE,
 Scheme::Kannada => auto::KANNADA,
+ Scheme::Khmer => auto::KHMER,
+ // Scheme::Lao => auto::LAO,
 Scheme::Malayalam => auto::MALAYALAM,
+ Scheme::Modi => auto::MODI,
+ Scheme::Newa => auto::NEWA,
 Scheme::Odia => auto::ORIYA,
+ Scheme::Saurashtra => auto::SAURASHTRA,
 Scheme::Sharada => auto::SHARADA,
 Scheme::Siddham => auto::SIDDHAM,
 Scheme::Sinhala => auto::SINHALA,
 Scheme::Tamil => auto::TAMIL,
 Scheme::Telugu => auto::TELUGU,
- // Scheme::Tibetan => auto::TIBETAN,
+ Scheme::Thai => auto::THAI,
+ Scheme::Tibetan => auto::TIBETAN,
+ Scheme::Tirhuta => auto::TIRHUTA,
 Scheme::BarahaSouth => auto::BARAHA,
 Scheme::HarvardKyoto => auto::HK,
 Scheme::Iast => auto::IAST,
@@ -260,6 +327,7 @@ impl Scheme {
 Sinhala => u::SINHALA_NFD,
 Tamil => u::TAMIL_NFD,
 Telugu => u::TELUGU_NFD,
+ Tirhuta => u::TIRHUTA_NFD,
 Iast | Iso15919 => u::LATIN_NFD,
 _ => &[],
 }
@@ -292,9 +360,9 @@ impl Scheme {
 // Use an exhaustive match (no `_`) so that we explicitly account for all schemes.
 match self {
 // Abugidas are all `true`.
- Balinese | Bengali | Brahmi | Burmese | Devanagari | Gujarati | Gurmukhi | Grantha
- | Javanese | Kannada | Malayalam | Odia | Sharada | Siddham | Sinhala | Tamil
- | Telugu => true,
+ Balinese | Bengali | Brahmi | Burmese | Devanagari | Grantha | Gujarati | Gurmukhi
+ | Javanese | Kannada | Khmer | Malayalam | Modi | Newa | Odia | Saurashtra
+ | Sharada | Siddham | Sinhala | Tamil | Telugu | Thai | Tibetan | Tirhuta => true,

 // Alphabets are all `false`.
 BarahaSouth | HarvardKyoto | Iso15919 | Itrans | Iast | Slp1 | Velthuis | Wx => false,
@@ -332,7 +400,9 @@ impl Scheme {
 Javanese => Classical,
 Kannada => Classical,
 Malayalam => Classical,
+ Newa => Classical,
 Odia => Classical,
+ Saurashtra => Classical,
 Sharada => Classical,
 Sinhala => Classical,
 Telugu => Classical,
@@ -361,10 +431,11 @@ mod tests {
 //
 // Don't use `_`, as that would defeat the point of this test.
 match s {
- Devanagari | Balinese | Bengali | Tamil | Brahmi | Burmese | Grantha | Gujarati
- | Gurmukhi | Javanese | Odia | Sharada | Kannada | Malayalam | Siddham
- | Sinhala | Telugu | Itrans | HarvardKyoto | Slp1 | Velthuis | Iast | Wx
- | Iso15919 | BarahaSouth => {
+ Balinese | BarahaSouth | Bengali | Brahmi | Burmese | Devanagari | Grantha
+ | Gujarati | Gurmukhi | HarvardKyoto | Iast | Iso15919 | Itrans | Javanese
+ | Kannada | Khmer | Malayalam | Modi | Newa | Odia | Saurashtra | Sharada
+ | Siddham | Sinhala | Slp1 | Tamil | Telugu | Thai | Tibetan | Tirhuta
+ | Velthuis | Wx => {
 expected.push(*s);
 }
 }
@@ -397,6 +468,11 @@ mod tests {
 fn token_pairs_are_all_nfc() {
 for scheme in Scheme::iter() {
 for (key, value) in scheme.token_pairs() {
+ assert!(
+ !value.contains('\u{25cc}'),
+ "{value} contains the dreaded 25cc"
+ );
+
 let key_nfc: String = key.nfc().collect();
 let value_nfc: String = value.nfc().collect();
 assert_eq!(&key_nfc, key);
diff --git a/vidyut-lipi/src/transliterate.rs b/vidyut-lipi/src/transliterate.rs
index 9a36d4a..dbd55fb 100644
--- a/vidyut-lipi/src/transliterate.rs
+++ b/vidyut-lipi/src/transliterate.rs
@@ -1,193 +1,212 @@
 use crate::mapping::Mapping;
 use crate::numerals;
+use crate::reshape::{reshape_after, reshape_before};
 use crate::scheme::Scheme;

-/// Transliterates from an abugida.
-fn transliterate_from_abugida(input: &str, mapping: &Mapping) -> String {
- let chars: Vec<char> = input.chars().collect();
- let is_to_alpha = mapping.to.is_alphabet();
+/// Transliterates the input string with the provided `Mapping`.
+///
+/// For most use cases, we recommend using the API on `Lipika` instead.
+///
+/// ### Usage
+///
+/// ```
+/// use vidyut_lipi::{transliterate, Mapping, Scheme};
+///
+/// let mapping = Mapping::new(Scheme::HarvardKyoto, Scheme::Devanagari);
+/// let result = transliterate("saMskRtam", &mapping);
+/// assert_eq!(result, "संस्कृतम्");
+/// ```
+pub fn transliterate(input: impl AsRef<str>, mapping: &Mapping) -> String {
+ transliterate_inner(input.as_ref(), mapping)
+}
+
+/// Transliterates the input string with the provided `Mapping`.
+///
+/// ### Implementation
+///
+/// We iterate through `input` in one pass. We build `output` by repeatedly matching the prefix of
+/// the remaining text (`input[i..]`) against `mapping` and appending the best match to the output
+/// buffer.
+///
+/// For most scheme pairs, this simple procedure is sufficient. But some schemes are complex enough
+/// that further post-processing is necessary. In general, we post-process iteratively so that we
+/// can avoid making a second pass through the output string.
+fn transliterate_inner(input: &str, mapping: &Mapping) -> String {
+ let input = reshape_before(input, mapping);
+
+ let is_to_alphabet = mapping.to.is_alphabet();
+ let is_from_abugida = mapping.from.is_abugida();
+ let is_to_abugida = mapping.to.is_abugida();
+ let is_from_itrans = mapping.from == Scheme::Itrans;
+
 let uses_non_decimal =
 mapping.from.has_non_decimal_numerals() || mapping.to.has_non_decimal_numerals();

 let mut output = String::new();
 let mut i = 0;
- let mut key = String::new();
- let mut had_consonant = false;
- while i < chars.len() {
+ let mut had_virama = false;
+ while i < input.len() {
+ // Special case: Numerals that don't use decimal place notation are transliterated
+ // separately.
 if uses_non_decimal {
- let num_numerals = chars[i..]
- .iter()
- .take_while(|c| {
- let mut temp = [0u8; 4];
- let digit_str = c.encode_utf8(&mut temp);
- mapping.numeral_to_int.contains_key(digit_str)
- })
- .count();
- if num_numerals > 0 {
- numerals::transliterate_numeral(&mut output, &chars[i..i + num_numerals], mapping);
- i += num_numerals;
+ let next_i = find_end_of_numeral_span(mapping, input.as_ref(), i);
+ if let Some(next_i) = next_i {
+ debug_assert!(next_i > i, "next_i = {next_i}, i = {i}");
+ numerals::transliterate_numeral(&mut output, &input[i..next_i], mapping);
+ i = next_i;
 continue;
 }
 }

- let mut o: Option<&String> = None;
- let mut key_len_in_chars = 0;
+ // 1. Find the largest prefix of `input[i..]` that is defined in `mapping`.
+ //
+ // We must check for the *largest* match to distinguish between `b` and `bh`, `R` and `RR`,
+ // etc.
+ let mut token = None;
+ let mut key: &str = "";
+ let mut next_i = i;
 for len_key in (1..=mapping.len_longest_key).rev() {
- let j = std::cmp::min(i + len_key, chars.len());
- key.clear();
- key.extend(&chars[i..j]);
- key_len_in_chars = j - i;
+ // `nth` is 0-indexed.
+ let j = input[i..].char_indices().nth(len_key).map(|(i, _)| i);
+ next_i = if let Some(j) = j { i + j } else { input.len() };

- o = mapping.get(&key);
- if o.is_some() {
+ key = &input[i..next_i];
+ token = mapping.get(key);
+ if token.is_some() {
 break;
 }
 }
-
- match o {
- Some(s) => {
- if had_consonant
- && (mapping.marks.contains_key(&key) || key == mapping.input_virama)
+ debug_assert!(next_i > i, "next_i = {next_i}, i = {i}");
+
+ // 2. Append the mapped result, if it exists.
+ if let Some(token) = token {
+ // Abugidas and alphabets have distinct logic here, so keep their code neatly separate.
+ if is_from_abugida {
+ if output.ends_with('a')
+ && (mapping.marks.contains_key(key) || key == mapping.input_virama)
 {
- // Pop implicit "a" vowel.
+ // `key` maps to a token that blocks the default "a" vowel, so pop the "a" that
+ // we added in the previous iteration.
 output.pop();
 }
- output += s;
+ output += &token.text;

- if is_to_alpha && mapping.consonants.contains_key(&key) {
- // Add implicit "a" vowel.
+ if is_to_alphabet && token.is_consonant() {
+ // Add an implicit "a" vowel.
+ //
+ // (The next loop iteration might pop this "a" off of `output`.)
 output += "a";
- had_consonant = true;
 }
- }
- None => {
- output.push_str(&key);
- }
- }
-
- // Add length in *chars*, not in *bytes*. Otherwise we get weird output.
- debug_assert!(key_len_in_chars > 0);
- i += key_len_in_chars;
- }
-
- output
-}
-
-/// Transliterates from an alphabet.
-fn transliterate_from_alphabet(input: &str, mapping: &Mapping) -> String {
- let chars: Vec<char> = input.chars().collect();
- let is_to_abugida = mapping.to.is_abugida();
- let is_itrans = mapping.from == Scheme::Itrans;
-
- let mut output = String::new();
- let mut i = 0;
- let mut key = String::new();
- let mut had_consonant = false;
- while i < chars.len() {
- if mapping.to().has_non_decimal_numerals() {
- let num_numerals = chars[i..]
- .iter()
- .take_while(|c| {
- let mut temp = [0u8; 4];
- let digit_str = c.encode_utf8(&mut temp);
- mapping.numeral_to_int.contains_key(digit_str)
- })
- .count();
- if num_numerals > 0 {
- numerals::transliterate_numeral(&mut output, &chars[i..i + num_numerals], mapping);
- i += num_numerals;
- continue;
- }
- }
-
- let mut o: Option<&String> = None;
- let mut key_len_in_chars = 0;
- for len_key in (1..=mapping.len_longest_key).rev() {
- let j = std::cmp::min(i + len_key, chars.len());
- key.clear();
- key.extend(&chars[i..j]);
- key_len_in_chars = j - i;
-
- o = mapping.get(&key);
- if o.is_some() {
- break;
- }
- }
-
- match o {
- Some(o) => {
- if had_consonant {
- if let Some(mark) = mapping.marks.get(&key) {
- if is_to_abugida {
+ } else {
+ // Transliterate from alphabet
+ if had_virama && key == "a" {
+ // `key` is the default "a" vowel, so pop the virama that we added in the
+ // previous iteration.
+ output.pop();
+ had_virama = false;
+ } else {
+ let mut text = &token.text;
+ if had_virama {
+ if let Some(mark) = mapping.marks.get(key) {
 output.pop();
+ text = mark;
 }
- output += mark;
- } else if key == "a" && is_to_abugida {
- output.pop();
- } else {
- output += o;
 }
- } else {
- output += o;
- }
- had_consonant = mapping.consonants.contains_key(&key);
- if had_consonant && is_to_abugida {
- output += &mapping.output_virama;
+ output += text;
+
+ if token.is_consonant() && is_to_abugida {
+ // We have not seen a vowel mark yet, so push a virama for now.
+ //
+ // (The next loop iteration might pop this virama off of `output`.)
+ output += &mapping.output_virama;
+ had_virama = true;
+ }
 }
 }
- None => {
- // ITRANS: `\` skips the next character.
- if is_itrans && chars[i] == '\\' {
- if let Some(c) = chars.get(i + 1) {
- output.push(*c);
+ } else {
+ // ITRANS: `\` skips the next character.
+ if is_from_itrans {
+ let mut chars = input[i..].chars();
+ if chars.next() == Some('\\') {
+ i += 1;
+ if let Some(c) = chars.next() {
+ output.push(c);
+ i += c.len_utf8();
 }
- i += 2;
 continue;
 }
-
- // Use the original character as-is.
- output.push_str(&key);
- had_consonant = false;
 }
+
+ // Use the original character as-is.
+ output.push_str(key);
+ had_virama = false;
 }

- // Add length in *chars*, not in *bytes*. Otherwise we get weird output.
- debug_assert!(key_len_in_chars > 0);
- i += key_len_in_chars;
+ // Prepare for next loop.
+ i = next_i;
 }

- output
+
+ reshape_after(output, mapping)
 }

-/// Transliterates the input string with the provided `Mapping`.
-///
-/// For most use cases, we recommend using the API on `Lipika` instead.
+/// Finds the end byte of a sequence of numerals starting at byte offset `i`.
 ///
-/// ### Usage
-///
-/// ```
-/// use vidyut_lipi::{transliterate, Mapping, Scheme};
-///
-/// let mapping = Mapping::new(Scheme::HarvardKyoto, Scheme::Devanagari);
-/// let result = transliterate("saMskRtam", &mapping);
-/// assert_eq!(result, "संस्कृतम्");
-/// ```
-pub fn transliterate(input: impl AsRef<str>, mapping: &Mapping) -> String {
- if mapping.from.is_abugida() {
- transliterate_from_abugida(input.as_ref(), mapping)
+/// Returns one of:
+/// - `Some(j)` if `input[i..]` starts with a numeral.
+/// - `None` if `input[i..]` does not start with a numeral.
+fn find_end_of_numeral_span(mapping: &Mapping, input: &str, i: usize) -> Option<usize> {
+ let j = input[i..]
+ .char_indices()
+ .find(|(_, c)| {
+ let mut temp = [0u8; 4];
+ let digit_str = c.encode_utf8(&mut temp);
+ !mapping.numeral_to_int.contains_key(digit_str)
+ })
+ .map(|(i, _)| i);
+
+ if let Some(j) = j {
+ if j == 0 {
+ // input[i..] does not start with a number.
+ None
+ } else {
+ Some(i + j)
+ }
 } else {
- transliterate_from_alphabet(input.as_ref(), mapping)
+ // input[i..] is numeric until the end of the string.
+ Some(input.len())
 }
 }

 #[cfg(test)]
 mod tests {
 use super::*;
+ use Scheme::*;
+
+ #[test]
+ fn test_find_end_of_numeral_span() {
+ let m = Mapping::new(HarvardKyoto, Devanagari);
+
+ // Basic cases
+ assert_eq!(find_end_of_numeral_span(&m, "1a", 0), Some(1));
+ assert_eq!(find_end_of_numeral_span(&m, "a1a", 1), Some(2));
+
+ // Followed by non-numeric
+ assert_eq!(find_end_of_numeral_span(&m, "1a", 0), Some(1));
+
+ // Non-numeric
+ assert_eq!(find_end_of_numeral_span(&m, "a1", 0), None);
+
+ // Numeric until end of string
+ assert_eq!(find_end_of_numeral_span(&m, "1", 0), Some(1));
+ assert_eq!(find_end_of_numeral_span(&m, "10", 0), Some(2));
+ }
+
 /// For more detailed tests, see our integration test file.
 #[test]
 fn test_transliterate() {
- let mapping = Mapping::new(Scheme::HarvardKyoto, Scheme::Devanagari);
+ let mapping = Mapping::new(HarvardKyoto, Devanagari);
 let t = |s| transliterate(s, &mapping);
 assert_eq!(t("namaskRtya"), "नमस्कृत्य");
 }
diff --git a/vidyut-lipi/src/unicode_norm.rs b/vidyut-lipi/src/unicode_norm.rs
index d75738a..c2edbb8 100644
--- a/vidyut-lipi/src/unicode_norm.rs
+++ b/vidyut-lipi/src/unicode_norm.rs
@@ -33,8 +33,10 @@ pub const LATIN_NFD: &[(&str, &str)] = &[
 // C1 Controls and Latin-1 Supplement (https://unicode.org/charts/PDF/U0080.pdf)
 ("\u{00d1}", "N\u{0303}"), // Ñ
 ("\u{00e8}", "e\u{0300}"), // è
+ ("\u{00ea}", "e\u{0302}"), // ê
 ("\u{00f1}", "n\u{0303}"), // ñ
 ("\u{00f2}", "o\u{0300}"), // ò
+ ("\u{00f4}", "o\u{0302}"), // ô
 // Latin Extended-A (https://unicode.org/charts/PDF/U0100.pdf)
 ("\u{0100}", "A\u{0304}"), // Ā
 ("\u{0101}", "a\u{0304}"), // ā
@@ -50,6 +52,8 @@ pub const LATIN_NFD: &[(&str, &str)] = &[
 ("\u{015b}", "s\u{0301}"), // ś
 ("\u{016a}", "U\u{0304}"), // Ū
 ("\u{016b}", "u\u{0304}"), // ū
+ ("\u{017c}", "z\u{0307}"), // ż
+ ("\u{017e}", "z\u{030c}"), // ž
 // Latin Extended Additional (https://unicode.org/charts/PDF/U1E00.pdf)
 ("\u{1e0c}", "D\u{0323}"), // Ḍ
 ("\u{1e0d}", "d\u{0323}"), // ḍ
@@ -83,8 +87,12 @@ pub const LATIN_NFD: &[(&str, &str)] = &[
 ("\u{1e63}", "s\u{0323}"), // ṣ
 ("\u{1e6c}", "T\u{0323}"), // Ṭ
 ("\u{1e6d}", "t\u{0323}"), // ṭ
+ ("\u{1e89}", "w\u{0323}"), // ẉ
 ("\u{1e8e}", "Y\u{0307}"), // Ẏ
 ("\u{1e8f}", "y\u{0307}"), // ẏ
+ ("\u{1e93}", "z\u{0323}"), // ẓ
+ ("\u{1e95}", "z\u{0331}"), // ẕ
+ ("\u{1e96}", "h\u{0331}"), // ẖ
 ];

 /// NFD/NFC mapping for Devanagari.
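A note on the key-matching loop in `transliterate_inner` above: the search runs from `len_longest_key` downward because short keys are often prefixes of longer ones (`b`/`bh`, `R`/`RR`, and so on), and the longer key must win. The sketch below is a self-contained illustration of that greedy longest-prefix idea over a toy map. It is not the crate's implementation, which additionally handles implicit vowels, viramas, and non-decimal numerals as shown in the diff above.

```rust
use std::collections::HashMap;

/// Greedy longest-prefix replacement over a toy mapping. `max_key_chars`
/// plays the role of `Mapping::len_longest_key` (measured in chars).
fn replace_longest(input: &str, map: &HashMap<&str, &str>, max_key_chars: usize) -> String {
    let mut output = String::new();
    let mut i = 0;
    while i < input.len() {
        let mut matched = None;
        // Try the longest candidate first so that "bh" wins over "b".
        for len in (1..=max_key_chars).rev() {
            let end = input[i..]
                .char_indices()
                .nth(len)
                .map(|(j, _)| i + j)
                .unwrap_or(input.len());
            if let Some(value) = map.get(&input[i..end]) {
                matched = Some((end, *value));
                break;
            }
        }
        if let Some((end, value)) = matched {
            output.push_str(value);
            i = end;
        } else {
            // Pass unknown characters through unchanged.
            let c = input[i..].chars().next().unwrap();
            output.push(c);
            i += c.len_utf8();
        }
    }
    output
}

fn main() {
    let map = HashMap::from([("b", "B"), ("bh", "BH"), ("a", "A")]);
    // Greedy matching picks "bh" + "a", not "b" + (unmatched "h") + "a".
    assert_eq!(replace_longest("bha", &map, 2), "BHA");
}
```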
@@ -225,6 +233,13 @@ pub const TELUGU_NFD: &[(&str, &str)] = &[
 ("\u{0c48}", "\u{0c46}\u{0c56}"), // vowel sign ai
 ];

+/// Spec: https://www.unicode.org/charts/PDF/U11480.pdf
+pub const TIRHUTA_NFD: &[(&str, &str)] = &[
+ ("\u{114bb}", "\u{114b9}\u{114ba}"), // vowel sign ai
+ ("\u{114bc}", "\u{114b9}\u{114b0}"), // vowel sign o
+ ("\u{114be}", "\u{114b9}\u{114bd}"), // vowel sign au
+];
+
 #[allow(unused)]
 pub(crate) fn to_nfc(s: &str) -> String {
 let mut map = FxHashMap::default();
@@ -291,7 +306,7 @@ pub(crate) fn to_nfd(s: &str) -> String {
 if let Some(nfd) = map.get(char_str) {
 ret.push_str(nfd)
 } else {
- ret.push_str(&char_str);
+ ret.push_str(char_str);
 }
 }
 ret
diff --git a/vidyut-lipi/tests/basic.rs b/vidyut-lipi/tests/basic.rs
index b51e56f..b6ffa37 100644
--- a/vidyut-lipi/tests/basic.rs
+++ b/vidyut-lipi/tests/basic.rs
@@ -9,7 +9,20 @@ fn t(input: &str, from: Scheme, to: Scheme) -> String {
 fn assert_transliterate(input: &str, from: Scheme, to: Scheme, expected: &str) {
 let actual = t(input, from, to);
- assert_eq!(expected, actual, "t(\"{input}\", {from:?}, {to:?})");
+ let e_codes: Vec<_> = expected.chars().map(|c| c as u32).collect();
+ let a_codes: Vec<_> = actual.chars().map(|c| c as u32).collect();
+ assert_eq!(
+ *expected, actual,
+ "input: {input} ({from:?} --> {to:?})
+
+expected: {}
+ {e_codes:x?}
+
+actual: {}
+ {a_codes:x?}
+",
+ expected, actual
+ );
 }

 /// Transliterates all input strings against each other.
@@ -21,14 +34,28 @@ fn assert_two_way_pairwise(examples: &[(Scheme, &str)]) {
 // Also test the case where "from == to." In this case, we should return the original
 // input string as-is.
 let actual = t(input, *from, *to);
- assert_eq!(*expected, actual, "t(\"{input}\", {from:?}, {to:?})");
+ let e_codes: Vec<_> = expected.chars().map(|c| c as u32).collect();
+ let a_codes: Vec<_> = actual.chars().map(|c| c as u32).collect();
+ assert_eq!(
+ *expected, actual,
+ "input: {input} ({from:?} --> {to:?})
+
+expected: {}
+ {e_codes:x?}
+
+actual: {}
+ {a_codes:x?}
+",
+ expected, actual
+ );
 }
 }
 }

 /// Transliterates `reference` into each item in `examples`.
 ///
-/// Use this function is ideal if the transliteration is lossy.
+/// Use this function if transliteration is lossy from `reference` to the given `examples`. All
+/// `examples` are also round-tripped with each other through `assert_two_way_pairwise`.
fn assert_one_way_pairwise(reference: (Scheme, &str), examples: &[(Scheme, &str)]) { let from = reference.0; let input = reference.1; @@ -39,6 +66,8 @@ fn assert_one_way_pairwise(reference: (Scheme, &str), examples: &[(Scheme, &str) let actual = t(input, from, *to); assert_eq!(*expected, actual, "t(\"{input}\", {from:?}, {to:?})"); } + + assert_two_way_pairwise(examples); } // Sanskrit (Basic) @@ -60,16 +89,25 @@ fn sanskrit_independent_vowels() { (Bengali, "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ"), (Brahmi, "𑀅 𑀆 𑀇 𑀈 𑀉 𑀊 𑀋 𑀌 𑀍 𑀎 𑀏 𑀐 𑀑 𑀒"), (Burmese, "အ အာ ဣ ဤ ဥ ဦ ၒ ၓ ၔ ၕ ဧ အဲ ဩ ဪ"), + // (Cham, "ꨀ ꨀꨩ ꨁ ꨁꨩ ꨂ ꨂꨩ ꨣꨮ ꨣꨮꨩ ꨤꨮ ꨤꨮꨩ ꨃ ꨄ ꨅ ꨀꨯꨱ"), (Devanagari, "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ए ऐ ओ औ"), (Grantha, "𑌅 𑌆 𑌇 𑌈 𑌉 𑌊 𑌋 𑍠 𑌌 𑍡 𑌏 𑌐 𑌓 𑌔"), (Gujarati, "અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ એ ઐ ઓ ઔ"), (Javanese, "ꦄ ꦄꦴ ꦆ ꦇ ꦈ ꦈꦴ ꦉ ꦉꦴ ꦊ ꦋ ꦌ ꦍ ꦎ ꦎꦴ"), (Kannada, "ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಏ ಐ ಓ ಔ"), + (Khmer, "អ អា ឥ ឦ ឧ ឩ ឫ ឬ ឭ ឮ ឯ ឰ ឱ ឳ"), (Malayalam, "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ ഏ ഐ ഓ ഔ"), + (Modi, "𑘀 𑘁 𑘂 𑘃 𑘄 𑘅 𑘆 𑘇 𑘈 𑘉 𑘊 𑘋 𑘌 𑘍"), + (Newa, "𑐀 𑐁 𑐂 𑐃 𑐄 𑐅 𑐆 𑐇 𑐈 𑐉 𑐊 𑐋 𑐌 𑐍"), (Odia, "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ"), + (Saurashtra, "ꢂ ꢃ ꢄ ꢅ ꢆ ꢇ ꢈ ꢉ ꢊ ꢋ ꢍ ꢎ ꢐ ꢑ"), (Sharada, "𑆃 𑆄 𑆅 𑆆 𑆇 𑆈 𑆉 𑆊 𑆋 𑆌 𑆍 𑆎 𑆏 𑆐"), (Sinhala, "අ ආ ඉ ඊ උ ඌ ඍ ඎ ඏ ඐ ඒ ඓ ඕ ඖ"), + (Tamil, "அ ஆ இ ஈ உ ஊ ருʼ ரூʼ லுʼ லூʼ ஏ ஐ ஓ ஔ"), (Telugu, "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఏ ఐ ఓ ఔ"), + (Thai, "อ อา อิ อี อุ อู ฤ ฤๅ ฦ ฦๅ เอ ไอ โอ เอา"), + (Tirhuta, "𑒁 𑒂 𑒃 𑒄 𑒅 𑒆 𑒇 𑒈 𑒉 𑒊 𑒋 𑒌 𑒍 𑒎"), + (Tibetan, "ཨ་ཨཱ་ཨི་ཨཱི་ཨུ་ཨཱུ་རྀ་རཱྀ་ལྀ་ལཱྀ་ཨེ་ཨཻ་ཨོ་ཨཽ"), ]); // Scripts with no vocalic L @@ -82,7 +120,6 @@ fn sanskrit_independent_vowels() { assert_two_way_pairwise(&[ (Slp1, "a A i I u U e E o O"), (Gurmukhi, "ਅ ਆ ਇ ਈ ਉ ਊ ਏ ਐ ਓ ਔ"), - (Tamil, "அ ஆ இ ஈ உ ஊ ஏ ஐ ஓ ஔ"), ]); } @@ -114,16 +151,25 @@ fn sanskrit_dependent_vowels() { (Bengali, "ক কা কি কী কু কূ কৃ কৄ কৢ কৣ কে কৈ কো কৌ"), (Brahmi, "𑀓 𑀓𑀸 𑀓𑀺 𑀓𑀻 𑀓𑀼 𑀓𑀽 𑀓𑀾 𑀓𑀿 𑀓𑁀 𑀓𑁁 𑀓𑁂 𑀓𑁃 𑀓𑁄 𑀓𑁅"), (Burmese, "က ကာ ကိ ကီ ကု ကူ ကၖ ကၗ ကၘ ကၙ ကေ ကဲ ကော ကော်"), + // (Cham, "ꨆ ꨆꨩ ꨆꨪ ꨆꨫ ꨆꨭ ꨆꨭꨩ ꨆꨴꨮ ꨆꨴꨮꨩ ꨆꨵꨮ ꨆꨵꨮꨩ ꨆꨯꨮ ꨆꨰ ꨆꨯ ꨆꨯꨱ"), (Devanagari, "क का कि की कु कू कृ कॄ कॢ कॣ के कै को कौ"), (Grantha, "𑌕 𑌕𑌾 𑌕𑌿 𑌕𑍀 𑌕𑍁 𑌕𑍂 𑌕𑍃 𑌕𑍄 𑌕𑍢 𑌕𑍣 𑌕𑍇 𑌕𑍈 𑌕𑍋 𑌕𑍌"), (Gujarati, "ક કા કિ કી કુ કૂ કૃ કૄ કૢ કૣ કે કૈ કો કૌ"), (Javanese, "ꦏ ꦏꦴ ꦏꦶ ꦏꦷ ꦏꦸ ꦏꦹ ꦏꦽ ꦏ꧀ꦉꦴ ꦏ꧀ꦊ ꦏ꧀ꦋ ꦏꦺ ꦏꦻ ꦏꦺꦴ ꦏꦻꦴ"), (Kannada, "ಕ ಕಾ ಕಿ ಕೀ ಕು ಕೂ ಕೃ ಕೄ ಕೢ ಕೣ ಕೇ ಕೈ ಕೋ ಕೌ"), + (Khmer, "ក កា កិ កី កុ កូ ក្ឫ ក្ឬ ក្ឭ ក្ឮ កេ កៃ កោ កៅ"), (Malayalam, "ക കാ കി കീ കു കൂ കൃ കൄ കൢ കൣ കേ കൈ കോ കൌ"), + (Modi, "𑘎 𑘎𑘰 𑘎𑘱 𑘎𑘲 𑘎𑘳 𑘎𑘴 𑘎𑘵 𑘎𑘶 𑘎𑘷 𑘎𑘸 𑘎𑘹 𑘎𑘺 𑘎𑘻 𑘎𑘼"), + (Newa, "𑐎 𑐎𑐵 𑐎𑐶 𑐎𑐷 𑐎𑐸 𑐎𑐹 𑐎𑐺 𑐎𑐻 𑐎𑐼 𑐎𑐽 𑐎𑐾 𑐎𑐿 𑐎𑑀 𑐎𑑁"), (Odia, "କ କା କି କୀ କୁ କୂ କୃ କୄ କୢ କୣ କେ କୈ କୋ କୌ"), + (Saurashtra, "ꢒ ꢒꢵ ꢒꢶ ꢒꢷ ꢒꢸ ꢒꢹ ꢒꢺ ꢒꢻ ꢒꢼ ꢒꢽ ꢒꢿ ꢒꣀ ꢒꣂ ꢒꣃ"), (Sharada, "𑆑 𑆑𑆳 𑆑𑆴 𑆑𑆵 𑆑𑆶 𑆑𑆷 𑆑𑆸 𑆑𑆹 𑆑𑆺 𑆑𑆻 𑆑𑆼 𑆑𑆽 𑆑𑆾 𑆑𑆿"), (Sinhala, "ක කා කි කී කු කූ කෘ කෲ කෟ කෳ කේ කෛ කෝ කෞ"), + (Tamil, "க கா கி கீ கு கூ க்ருʼ க்ரூʼ க்லுʼ க்லூʼ கே கை கோ கௌ"), (Telugu, "క కా కి కీ కు కూ కృ కౄ కౢ కౣ కే కై కో కౌ"), + (Thai, "ก กา กิ กี กุ กู กฺฤ กฺฤๅ กฺฦ กฺฦๅ เก ไก โก เกา"), + (Tibetan, "ཀ་ཀཱ་ཀི་ཀཱི་ཀུ་ཀཱུ་ཀྲྀ་ཀྲཱྀ་ཀླྀ་ཀླཱྀ་ཀེ་ཀཻ་ཀོ་ཀཽ"), + (Tirhuta, "𑒏 𑒏𑒰 𑒏𑒱 𑒏𑒲 𑒏𑒳 𑒏𑒴 𑒏𑒵 𑒏𑒶 𑒏𑒷 𑒏𑒸 𑒏𑒹 𑒏𑒻 𑒏𑒼 𑒏𑒾"), ]); // Scripts without vocalic L @@ -136,7 +182,6 @@ fn sanskrit_dependent_vowels() { assert_two_way_pairwise(&[ (Slp1, "ka kA ki kI ku kU ke kE ko kO"), (Gurmukhi, "ਕ ਕਾ ਕਿ ਕੀ ਕੁ ਕੂ ਕੇ ਕੈ ਕੋ ਕੌ"), - (Tamil, "க கா கி கீ கு கூ கே கை கோ கௌ"), ]); } @@ -155,34 +200,48 @@ fn sanskrit_ayogavahas_etc() { (Balinese, "ᬅᬂ ᬅᬄ ᬅᬁ"), (Bengali, "অং অঃ অঁ"), (Brahmi, "𑀅𑀁 𑀅𑀂 𑀅𑀀"), + // (Cham, "ꨀꩌ ꨀꩍ ꨀꩃ"), (Devanagari, "अं अः अँ"), (Grantha, "𑌅𑌂 𑌅𑌃 𑌅𑌁"), (Gujarati, "અં અઃ અઁ"), (Javanese, "ꦄꦁ ꦄꦃ ꦄꦀ"), (Kannada, "ಅಂ ಅಃ ಅಁ"), (Malayalam, "അം അഃ അഁ"), + (Newa, "𑐀𑑄 𑐀𑑅 𑐀𑑃"), (Odia, 
"ଅଂ ଅଃ ଅଁ"), + (Saurashtra, "ꢂꢀ ꢂꢁ ꢂꣅ"), (Sharada, "𑆃𑆁 𑆃𑆂 𑆃𑆀"), (Siddham, "𑖀𑖽 𑖀𑖾 𑖀𑖼"), + (Tamil, "அம்ʼ அ꞉ அம்ˮ"), (Telugu, "అం అః అఁ"), + (Tibetan, "ཨཾ་ཨཿ་ཨྃ"), + (Tirhuta, "𑒁𑓀 𑒁𑓁 𑒁𑒿"), ]); // Scripts without a chandrabindu assert_one_way_pairwise( (Slp1, "aM aH a~"), - &[(Burmese, "အံ အး အံ"), (Sinhala, "අං අඃ අං")], + &[ + (Burmese, "အံ အး အံ"), + (Khmer, "អំ អះ អំ"), + (Modi, "𑘀𑘽 𑘀𑘾 𑘀𑘽"), + (Sinhala, "අං අඃ අං"), + (Thai, "อํ อห์ อํ"), + ], ); } #[test] fn sanskrit_consonants_non_vedic() { + let slp1 = "ka Ka ga Ga Na ca Ca ja Ja Ya wa Wa qa Qa Ra ta Ta da Da na pa Pa ba Ba ma ya ra la va Sa za sa ha"; + assert_two_way_pairwise(&[ (BarahaSouth, "ka kha ga gha ~ga cha Cha ja jha ~ja Ta Tha Da Dha Na ta tha da dha na pa pha ba bha ma ya ra la va sha Sha sa ha"), (HarvardKyoto, "ka kha ga gha Ga ca cha ja jha Ja Ta Tha Da Dha Na ta tha da dha na pa pha ba bha ma ya ra la va za Sa sa ha"), (Iast, "ka kha ga gha ṅa ca cha ja jha ña ṭa ṭha ḍa ḍha ṇa ta tha da dha na pa pha ba bha ma ya ra la va śa ṣa sa ha"), (Iso15919, "ka kha ga gha ṅa ca cha ja jha ña ṭa ṭha ḍa ḍha ṇa ta tha da dha na pa pha ba bha ma ya ra la va śa ṣa sa ha"), (Itrans, "ka kha ga gha ~Na cha Cha ja jha ~na Ta Tha Da Dha Na ta tha da dha na pa pha ba bha ma ya ra la va sha Sha sa ha"), - (Slp1, "ka Ka ga Ga Na ca Ca ja Ja Ya wa Wa qa Qa Ra ta Ta da Da na pa Pa ba Ba ma ya ra la va Sa za sa ha"), + (Slp1, slp1), (Velthuis, "ka kha ga gha \"na ca cha ja jha ~na .ta .tha .da .dha .na ta tha da dha na pa pha ba bha ma ya ra la va \"sa .sa sa ha"), (Wx, "ka Ka ga Ga fa ca Ca ja Ja Fa ta Ta da Da Na wa Wa xa Xa na pa Pa ba Ba ma ya ra la va Sa Ra sa ha"), // Indic @@ -194,13 +253,43 @@ fn sanskrit_consonants_non_vedic() { (Gujarati, "ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન પ ફ બ ભ મ ય ર લ વ શ ષ સ હ"), (Javanese, "ꦏ ꦑ ꦒ ꦓ ꦔ ꦕ ꦖ ꦗ ꦙ ꦚ ꦛ ꦜ ꦝ ꦞ ꦟ ꦠ ꦡ ꦢ ꦣ ꦤ ꦥ ꦦ ꦧ ꦨ ꦩ ꦪ ꦫ ꦭ ꦮ ꦯ ꦰ ꦱ ꦲ"), (Kannada, "ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ"), + (Khmer, "ក ខ គ ឃ ង ច ឆ ជ ឈ ញ ដ ឋ ឌ ឍ ណ ត ថ ទ ធ ន ប ផ ព ភ ម យ រ ល វ ឝ ឞ ស ហ"), (Malayalam, "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ"), + (Modi, "𑘎 𑘏 𑘐 𑘑 𑘒 𑘓 𑘔 𑘕 𑘖 𑘗 𑘘 𑘙 𑘚 𑘛 𑘜 𑘝 𑘞 𑘟 𑘠 𑘡 𑘢 𑘣 𑘤 𑘥 𑘦 𑘧 𑘨 𑘩 𑘪 𑘫 𑘬 𑘭 𑘮"), + (Newa, "𑐎 𑐏 𑐐 𑐑 𑐒 𑐔 𑐕 𑐖 𑐗 𑐘 𑐚 𑐛 𑐜 𑐝 𑐞 𑐟 𑐠 𑐡 𑐢 𑐣 𑐥 𑐦 𑐧 𑐨 𑐩 𑐫 𑐬 𑐮 𑐰 𑐱 𑐲 𑐳 𑐴"), (Odia, "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ"), + (Saurashtra, "ꢒ ꢓ ꢔ ꢕ ꢖ ꢗ ꢘ ꢙ ꢚ ꢛ ꢜ ꢝ ꢞ ꢟ ꢠ ꢡ ꢢ ꢣ ꢤ ꢥ ꢦ ꢧ ꢨ ꢩ ꢪ ꢫ ꢬ ꢭ ꢮ ꢯ ꢰ ꢱ ꢲ"), (Sharada, "𑆑 𑆒 𑆓 𑆔 𑆕 𑆖 𑆗 𑆘 𑆙 𑆚 𑆛 𑆜 𑆝 𑆞 𑆟 𑆠 𑆡 𑆢 𑆣 𑆤 𑆥 𑆦 𑆧 𑆨 𑆩 𑆪 𑆫 𑆬 𑆮 𑆯 𑆰 𑆱 𑆲"), (Siddham, "𑖎 𑖏 𑖐 𑖑 𑖒 𑖓 𑖔 𑖕 𑖖 𑖗 𑖘 𑖙 𑖚 𑖛 𑖜 𑖝 𑖞 𑖟 𑖠 𑖡 𑖢 𑖣 𑖤 𑖥 𑖦 𑖧 𑖨 𑖩 𑖪 𑖫 𑖬 𑖭 𑖮"), (Sinhala, "ක ඛ ග ඝ ඞ ච ඡ ජ ඣ ඤ ට ඨ ඩ ඪ ණ ත ථ ද ධ න ප ඵ බ භ ම ය ර ල ව ශ ෂ ස හ"), + (Tamil, "க க² க³ க⁴ ங ச ச² ஜ ஜ² ஞ ட ட² ட³ ட⁴ ண த த² த³ த⁴ ந ப ப² ப³ ப⁴ ம ய ர ல வ ஶ ஷ ஸ ஹ"), (Telugu, "క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న ప ఫ బ భ మ య ర ల వ శ ష స హ"), + (Thai, "ก ข ค ฆ ง จ ฉ ช ฌ ญ ฏ ฐ ฑ ฒ ณ ต ถ ท ธ น ป ผ พ ภ ม ย ร ล ว ศ ษ ส ห"), + (Tirhuta, "𑒏 𑒐 𑒑 𑒒 𑒓 𑒔 𑒕 𑒖 𑒗 𑒘 𑒙 𑒚 𑒛 𑒜 𑒝 𑒞 𑒟 𑒠 𑒡 𑒢 𑒣 𑒤 𑒥 𑒦 𑒧 𑒨 𑒩 𑒪 𑒫 𑒬 𑒭 𑒮 𑒯"), ]); + + // No distinction between ba / va + assert_one_way_pairwise( + (Slp1, slp1), + &[ + ( + Bengali, + "ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন প ফ ব ভ ম য র ল ব শ ষ স হ", + ), + ( + Tibetan, + "ཀ་ཁ་ག་གྷ་ང་ཙ་ཚ་ཛ་ཛྷ་ཉ་ཊ་ཋ་ཌ་ཌྷ་ཎ་ཏ་ཐ་ད་དྷ་ན་པ་ཕ་བ་བྷ་མ་ཡ་ར་ལ་བ་ཤ་ཥ་ས་ཧ", + ), + ], + ); + + // No distinction between Ta / ta + assert_one_way_pairwise( + (Slp1, slp1), + &[ + // (Cham, "ꨆ ꨇ ꨈ ꨉ ꨋ ꨌ ꨍ ꨎ ꨏ ꨑ ꨓ ꨔ ꨕ ꨖ ꨘ ꨓ ꨔ ꨕ ꨖ ꨘ ꨚ ꨜ ꨝ ꨞ ꨠ ꨢ ꨣ ꨤ ꨥ ꨦ ꨦ ꨧ ꨨ"), + ], + ); } #[test] @@ -214,41 +303,68 @@ fn sanskrit_symbols() { (Wx, "0 1 2 3 4 5 6 7 8 
9 . .. Z"), // Indic (Bengali, "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ । ॥ ঽ"), + // (Cham, "꩐ ꩑ ꩒ ꩓ ꩔ ꩕ ꩖ ꩗ ꩘ ꩙ ꩝ ꩞ '"), (Devanagari, "० १ २ ३ ४ ५ ६ ७ ८ ९ । ॥ ऽ"), (Grantha, "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ । ॥ 𑌽"), (Gujarati, "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ । ॥ ઽ"), (Gurmukhi, "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ । ॥ ऽ"), (Kannada, "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ । ॥ ಽ"), + (Khmer, "០ ១ ២ ៣ ៤ ៥ ៦ ៧ ៨ ៩ ។ ៕ ៜ"), (Malayalam, "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ । ॥ ഽ"), + (Modi, "𑙐 𑙑 𑙒 𑙓 𑙔 𑙕 𑙖 𑙗 𑙘 𑙙 𑙁 𑙂 ऽ"), + (Newa, "𑑐 𑑑 𑑒 𑑓 𑑔 𑑕 𑑖 𑑗 𑑘 𑑙 𑑋 𑑌 𑑇"), (Odia, "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ । ॥ ଽ"), + (Saurashtra, "꣐ ꣑ ꣒ ꣓ ꣔ ꣕ ꣖ ꣗ ꣘ ꣙ ꣎ ꣏ ఽ"), (Telugu, "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ । ॥ ఽ"), + (Thai, "๐ ๑ ๒ ๓ ๔ ๕ ๖ ๗ ๘ ๙ ฯ ๚ '"), + (Tibetan, "༠་༡་༢་༣་༤་༥་༦་༧་༨་༩་།་༎་྅"), + (Tirhuta, "𑓐 𑓑 𑓒 𑓓 𑓔 𑓕 𑓖 𑓗 𑓘 𑓙 । ॥ 𑓄"), ]); } #[test] fn sanskrit_basic_sentences() { + let slp1_text = concat!( + "nArAyaRaM namaskftya naraM cEva narottamam . ", + "devIM sarasvatIM cEva tato jayamudIrayet .. 1 .." + ); assert_two_way_pairwise(&[ - (BarahaSouth, "nArAyaNaM namaskRutya naraM chaiva narOttamam | dEvIM sarasvatIM chaiva tatO jayamudIyarEt || 1 ||",), - (HarvardKyoto, "nArAyaNaM namaskRtya naraM caiva narottamam . devIM sarasvatIM caiva tato jayamudIyaret .. 1 ..",), - (Iast, "nārāyaṇaṃ namaskṛtya naraṃ caiva narottamam . devīṃ sarasvatīṃ caiva tato jayamudīyaret .. 1 .."), - (Iso15919, "nārāyaṇaṁ namaskr̥tya naraṁ caiva narōttamam . dēvīṁ sarasvatīṁ caiva tatō jayamudīyarēt .. 1 .."), - (Itrans, "nArAyaNaM namaskRRitya naraM chaiva narottamam | devIM sarasvatIM chaiva tato jayamudIyaret || 1 ||"), - (Slp1, "nArAyaRaM namaskftya naraM cEva narottamam . devIM sarasvatIM cEva tato jayamudIyaret .. 1 .."), - (Velthuis, "naaraaya.na.m namask.rtya nara.m caiva narottamam | devii.m sarasvatii.m caiva tato jayamudiiyaret || 1 ||"), - (Wx, "nArAyaNaM namaskqwya naraM cEva narowwamam . xevIM sarasvawIM cEva wawo jayamuxIyarew .. 1 .."), + (BarahaSouth, "nArAyaNaM namaskRutya naraM chaiva narOttamam | dEvIM sarasvatIM chaiva tatO jayamudIrayEt || 1 ||",), + (HarvardKyoto, "nArAyaNaM namaskRtya naraM caiva narottamam . devIM sarasvatIM caiva tato jayamudIrayet .. 1 ..",), + (Iast, "nārāyaṇaṃ namaskṛtya naraṃ caiva narottamam . devīṃ sarasvatīṃ caiva tato jayamudīrayet .. 1 .."), + (Iso15919, "nārāyaṇaṁ namaskr̥tya naraṁ caiva narōttamam . dēvīṁ sarasvatīṁ caiva tatō jayamudīrayēt .. 1 .."), + (Itrans, "nArAyaNaM namaskRRitya naraM chaiva narottamam | devIM sarasvatIM chaiva tato jayamudIrayet || 1 ||"), + (Slp1, slp1_text), + (Velthuis, "naaraaya.na.m namask.rtya nara.m caiva narottamam | devii.m sarasvatii.m caiva tato jayamudiirayet || 1 ||"), + (Wx, "nArAyaNaM namaskqwya naraM cEva narowwamam . xevIM sarasvawIM cEva wawo jayamuxIrayew .. 
1 .."), // Indic - (Balinese, "ᬦᬵᬭᬵᬬᬡᬂ ᬦᬫᬲ᭄ᬓᬺᬢ᭄ᬬ ᬦᬭᬂ ᬘᬿᬯ ᬦᬭᭀᬢ᭄ᬢᬫᬫ᭄ ᭞ ᬤᬾᬯᬷᬂ ᬲᬭᬲ᭄ᬯᬢᬷᬂ ᬘᬿᬯ ᬢᬢᭀ ᬚᬬᬫᬸᬤᬷᬬᬭᬾᬢ᭄ ᭟ ᭑ ᭟"), - (Brahmi, "𑀦𑀸𑀭𑀸𑀬𑀡𑀁 𑀦𑀫𑀲𑁆𑀓𑀾𑀢𑁆𑀬 𑀦𑀭𑀁 𑀘𑁃𑀯 𑀦𑀭𑁄𑀢𑁆𑀢𑀫𑀫𑁆 𑁇 𑀤𑁂𑀯𑀻𑀁 𑀲𑀭𑀲𑁆𑀯𑀢𑀻𑀁 𑀘𑁃𑀯 𑀢𑀢𑁄 𑀚𑀬𑀫𑀼𑀤𑀻𑀬𑀭𑁂𑀢𑁆 𑁈 𑁧 𑁈"), - (Devanagari, "नारायणं नमस्कृत्य नरं चैव नरोत्तमम् । देवीं सरस्वतीं चैव ततो जयमुदीयरेत् ॥ १ ॥"), - (Grantha, "𑌨𑌾𑌰𑌾𑌯𑌣𑌂 𑌨𑌮𑌸𑍍𑌕𑍃𑌤𑍍𑌯 𑌨𑌰𑌂 𑌚𑍈𑌵 𑌨𑌰𑍋𑌤𑍍𑌤𑌮𑌮𑍍 । 𑌦𑍇𑌵𑍀𑌂 𑌸𑌰𑌸𑍍𑌵𑌤𑍀𑌂 𑌚𑍈𑌵 𑌤𑌤𑍋 𑌜𑌯𑌮𑍁𑌦𑍀𑌯𑌰𑍇𑌤𑍍 ॥ ௧ ॥"), - (Gujarati, "નારાયણં નમસ્કૃત્ય નરં ચૈવ નરોત્તમમ્ । દેવીં સરસ્વતીં ચૈવ તતો જયમુદીયરેત્ ॥ ૧ ॥"), - (Kannada, "ನಾರಾಯಣಂ ನಮಸ್ಕೃತ್ಯ ನರಂ ಚೈವ ನರೋತ್ತಮಮ್ । ದೇವೀಂ ಸರಸ್ವತೀಂ ಚೈವ ತತೋ ಜಯಮುದೀಯರೇತ್ ॥ ೧ ॥"), - (Malayalam, "നാരായണം നമസ്കൃത്യ നരം ചൈവ നരോത്തമമ് । ദേവീം സരസ്വതീം ചൈവ തതോ ജയമുദീയരേത് ॥ ൧ ॥"), - (Odia, "ନାରାଯଣଂ ନମସ୍କୃତ୍ଯ ନରଂ ଚୈଵ ନରୋତ୍ତମମ୍ । ଦେଵୀଂ ସରସ୍ଵତୀଂ ଚୈଵ ତତୋ ଜଯମୁଦୀଯରେତ୍ ॥ ୧ ॥"), - (Sharada, "𑆤𑆳𑆫𑆳𑆪𑆟𑆁 𑆤𑆩𑆱𑇀𑆑𑆸𑆠𑇀𑆪 𑆤𑆫𑆁 𑆖𑆽𑆮 𑆤𑆫𑆾𑆠𑇀𑆠𑆩𑆩𑇀 𑇅 𑆢𑆼𑆮𑆵𑆁 𑆱𑆫𑆱𑇀𑆮𑆠𑆵𑆁 𑆖𑆽𑆮 𑆠𑆠𑆾 𑆘𑆪𑆩𑆶𑆢𑆵𑆪𑆫𑆼𑆠𑇀 𑇆 𑇑 𑇆"), - (Siddham, "𑖡𑖯𑖨𑖯𑖧𑖜𑖽 𑖡𑖦𑖭𑖿𑖎𑖴𑖝𑖿𑖧 𑖡𑖨𑖽 𑖓𑖹𑖪 𑖡𑖨𑖺𑖝𑖿𑖝𑖦𑖦𑖿 𑗂 𑖟𑖸𑖪𑖱𑖽 𑖭𑖨𑖭𑖿𑖪𑖝𑖱𑖽 𑖓𑖹𑖪 𑖝𑖝𑖺 𑖕𑖧𑖦𑖲𑖟𑖱𑖧𑖨𑖸𑖝𑖿 𑗃 1 𑗃"), - (Telugu, "నారాయణం నమస్కృత్య నరం చైవ నరోత్తమమ్ । దేవీం సరస్వతీం చైవ తతో జయముదీయరేత్ ॥ ౧ ॥"), + (Balinese, "ᬦᬵᬭᬵᬬᬡᬂ ᬦᬫᬲ᭄ᬓᬺᬢ᭄ᬬ ᬦᬭᬂ ᬘᬿᬯ ᬦᬭᭀᬢ᭄ᬢᬫᬫ᭄ ᭞ ᬤᬾᬯᬷᬂ ᬲᬭᬲ᭄ᬯᬢᬷᬂ ᬘᬿᬯ ᬢᬢᭀ ᬚᬬᬫᬸᬤᬷᬭᬬᬾᬢ᭄ ᭟ ᭑ ᭟"), + (Brahmi, "𑀦𑀸𑀭𑀸𑀬𑀡𑀁 𑀦𑀫𑀲𑁆𑀓𑀾𑀢𑁆𑀬 𑀦𑀭𑀁 𑀘𑁃𑀯 𑀦𑀭𑁄𑀢𑁆𑀢𑀫𑀫𑁆 𑁇 𑀤𑁂𑀯𑀻𑀁 𑀲𑀭𑀲𑁆𑀯𑀢𑀻𑀁 𑀘𑁃𑀯 𑀢𑀢𑁄 𑀚𑀬𑀫𑀼𑀤𑀻𑀭𑀬𑁂𑀢𑁆 𑁈 𑁧 𑁈"), + (Devanagari, "नारायणं नमस्कृत्य नरं चैव नरोत्तमम् । देवीं सरस्वतीं चैव ततो जयमुदीरयेत् ॥ १ ॥"), + (Grantha, "𑌨𑌾𑌰𑌾𑌯𑌣𑌂 𑌨𑌮𑌸𑍍𑌕𑍃𑌤𑍍𑌯 𑌨𑌰𑌂 𑌚𑍈𑌵 𑌨𑌰𑍋𑌤𑍍𑌤𑌮𑌮𑍍 । 𑌦𑍇𑌵𑍀𑌂 𑌸𑌰𑌸𑍍𑌵𑌤𑍀𑌂 𑌚𑍈𑌵 𑌤𑌤𑍋 𑌜𑌯𑌮𑍁𑌦𑍀𑌰𑌯𑍇𑌤𑍍 ॥ ௧ ॥"), + (Gujarati, "નારાયણં નમસ્કૃત્ય નરં ચૈવ નરોત્તમમ્ । દેવીં સરસ્વતીં ચૈવ તતો જયમુદીરયેત્ ॥ ૧ ॥"), + (Kannada, "ನಾರಾಯಣಂ ನಮಸ್ಕೃತ್ಯ ನರಂ ಚೈವ ನರೋತ್ತಮಮ್ । ದೇವೀಂ ಸರಸ್ವತೀಂ ಚೈವ ತತೋ ಜಯಮುದೀರಯೇತ್ ॥ ೧ ॥"), + (Khmer, "នារាយណំ នមស្ក្ឫត្យ នរំ ចៃវ នរោត្តមម៑ ។ ទេវីំ សរស្វតីំ ចៃវ តតោ ជយមុទីរយេត៑ ៕ ១ ៕"), + (Malayalam, "നാരായണം നമസ്കൃത്യ നരം ചൈവ നരോത്തമമ് । ദേവീം സരസ്വതീം ചൈവ തതോ ജയമുദീരയേത് ॥ ൧ ॥"), + (Modi, "𑘡𑘰𑘨𑘰𑘧𑘜𑘽 𑘡𑘦𑘭𑘿𑘎𑘵𑘝𑘿𑘧 𑘡𑘨𑘽 𑘓𑘺𑘪 𑘡𑘨𑘻𑘝𑘿𑘝𑘦𑘦𑘿 𑙁 𑘟𑘹𑘪𑘲𑘽 𑘭𑘨𑘭𑘿𑘪𑘝𑘲𑘽 𑘓𑘺𑘪 𑘝𑘝𑘻 𑘕𑘧𑘦𑘳𑘟𑘲𑘨𑘧𑘹𑘝𑘿 𑙂 𑙑 𑙂"), + (Newa, "𑐣𑐵𑐬𑐵𑐫𑐞𑑄 𑐣𑐩𑐳𑑂𑐎𑐺𑐟𑑂𑐫 𑐣𑐬𑑄 𑐔𑐿𑐰 𑐣𑐬𑑀𑐟𑑂𑐟𑐩𑐩𑑂 𑑋 𑐡𑐾𑐰𑐷𑑄 𑐳𑐬𑐳𑑂𑐰𑐟𑐷𑑄 𑐔𑐿𑐰 𑐟𑐟𑑀 𑐖𑐫𑐩𑐸𑐡𑐷𑐬𑐫𑐾𑐟𑑂 𑑌 𑑑 𑑌"), + (Odia, "ନାରାଯଣଂ ନମସ୍କୃତ୍ଯ ନରଂ ଚୈଵ ନରୋତ୍ତମମ୍ । ଦେଵୀଂ ସରସ୍ଵତୀଂ ଚୈଵ ତତୋ ଜଯମୁଦୀରଯେତ୍ ॥ ୧ ॥"), + (Saurashtra, "ꢥꢵꢬꢵꢫꢠꢀ ꢥꢪꢱ꣄ꢒꢺꢡ꣄ꢫ ꢥꢬꢀ ꢗꣀꢮ ꢥꢬꣂꢡ꣄ꢡꢪꢪ꣄ ꣎ ꢣꢿꢮꢷꢀ ꢱꢬꢱ꣄ꢮꢡꢷꢀ ꢗꣀꢮ ꢡꢡꣂ ꢙꢫꢪꢸꢣꢷꢬꢫꢿꢡ꣄ ꣏ ꣑ ꣏"), + (Sharada, "𑆤𑆳𑆫𑆳𑆪𑆟𑆁 𑆤𑆩𑆱𑇀𑆑𑆸𑆠𑇀𑆪 𑆤𑆫𑆁 𑆖𑆽𑆮 𑆤𑆫𑆾𑆠𑇀𑆠𑆩𑆩𑇀 𑇅 𑆢𑆼𑆮𑆵𑆁 𑆱𑆫𑆱𑇀𑆮𑆠𑆵𑆁 𑆖𑆽𑆮 𑆠𑆠𑆾 𑆘𑆪𑆩𑆶𑆢𑆵𑆫𑆪𑆼𑆠𑇀 𑇆 𑇑 𑇆"), + (Siddham, "𑖡𑖯𑖨𑖯𑖧𑖜𑖽 𑖡𑖦𑖭𑖿𑖎𑖴𑖝𑖿𑖧 𑖡𑖨𑖽 𑖓𑖹𑖪 𑖡𑖨𑖺𑖝𑖿𑖝𑖦𑖦𑖿 𑗂 𑖟𑖸𑖪𑖱𑖽 𑖭𑖨𑖭𑖿𑖪𑖝𑖱𑖽 𑖓𑖹𑖪 𑖝𑖝𑖺 𑖕𑖧𑖦𑖲𑖟𑖱𑖨𑖧𑖸𑖝𑖿 𑗃 1 𑗃"), + (Tamil, "நாராயணம்ʼ நமஸ்க்ருʼத்ய நரம்ʼ சைவ நரோத்தமம் . தே³வீம்ʼ ஸரஸ்வதீம்ʼ சைவ ததோ ஜயமுதீ³ரயேத் .. 1 .."), + (Telugu, "నారాయణం నమస్కృత్య నరం చైవ నరోత్తమమ్ । దేవీం సరస్వతీం చైవ తతో జయముదీరయేత్ ॥ ౧ ॥"), + (Thai, "นารายณํ นมสฺกฺฤตฺย นรํ ไจว นโรตฺตมมฺ ฯ เทวีํ สรสฺวตีํ ไจว ตโต ชยมุทีรเยตฺ ๚ ๑ ๚"), + (Tirhuta, "𑒢𑒰𑒩𑒰𑒨𑒝𑓀 𑒢𑒧𑒮𑓂𑒏𑒵𑒞𑓂𑒨 𑒢𑒩𑓀 𑒔𑒻𑒫 𑒢𑒩𑒼𑒞𑓂𑒞𑒧𑒧𑓂 । 𑒠𑒹𑒫𑒲𑓀 𑒮𑒩𑒮𑓂𑒫𑒞𑒲𑓀 𑒔𑒻𑒫 𑒞𑒞𑒼 𑒖𑒨𑒧𑒳𑒠𑒲𑒩𑒨𑒹𑒞𑓂 ॥ 𑓑 ॥"), ]); + + // Non-reversible due to b/v and no virama. + assert_transliterate( + slp1_text, + Slp1, + Tibetan, + "ནཱརཱཡཎཾ་ནམསྐྲྀཏྱ་ནརཾ་ཙཻབ་ནརོཏྟམམ་།་དེབཱིཾ་སརསྭཏཱིཾ་ཙཻབ་ཏཏོ་ཛཡམུདཱིརཡེཏ་༎་༡་༎", + ); } // Sanskrit (Vedic) @@ -310,6 +426,7 @@ fn sanskrit_vedic_consonants() { (Kannada, "ಳ ಳ್ಹ"), (Malayalam, "ള ള്ഹ"), (Odia, "ଳ ଳ୍ହ"), + (Saurashtra, "ꢳ ꢳ꣄ꢲ"), (Sharada, "𑆭 𑆭𑇀𑆲"), (Siddham, "𑖩𑗀 𑖩𑗀𑖿𑖮"), (Sinhala, "ළ ළ්හ"), @@ -461,6 +578,53 @@ fn iso_unicode_variants() { } } +#[test] +fn itrans_alternates() { + fn assert_same(items: &[&str]) { + let mut lipika = Lipika::new(); + let default = items[0]; + let deva = lipika.transliterate(default, Itrans, Devanagari); + + for x in &items[1..] 
{ + let x_deva = lipika.transliterate(x, Itrans, Devanagari); + assert_eq!(deva, x_deva, "{default} ({deva}) != {x} ({x_deva})"); + } + } + + assert_same(&["A kA", "aa kaa"]); + assert_same(&["I kI", "ii kii", "ee kee"]); + assert_same(&["U kU", "uu kuu", "oo koo"]); + assert_same(&["RRi kRRi", "R^i kR^i"]); + assert_same(&["RRI kRRI", "R^I kR^I"]); + assert_same(&["LLi kLLi", "L^i kL^i"]); + assert_same(&["LLI kLLI", "L^I kL^I"]); + + // Anusvara and candrabindu + assert_same(&["aM", "a.m", "a.n"]); + assert_same(&["a.N", "a{\\m+}"]); + + // Consonants + assert_same(&["~Na", "N^a"]); + assert_same(&["ca", "cha"]); + assert_same(&["~na", "JNa"]); + // TODO: source for Ca? + assert_same(&["Cha", "chha", "Ca"]); + assert_same(&["va", "wa"]); + assert_same(&["Sha", "Sa", "shha"]); + assert_same(&["za", "Ja"]); + // TODO: L / ld? + + // Clusters + assert_same(&["kSha", "kSa", "kshha", "xa"]); + assert_same(&["j~na", "GYa", "dnya"]); + assert_same(&["OM", "AUM"]); + + // Punctuation + assert_same(&[".a", "~"]); + assert_same(&["|", "."]); + assert_same(&["||", ".."]); +} + #[test] fn itrans_zero_width_joiner() { assert_transliterate("bara_u", Itrans, Devanagari, "बरउ"); @@ -475,32 +639,61 @@ fn itrans_backslash_escape() { } #[test] -fn itrans_alternates() { - let assert_identical = |x, y| { - let mut lipika = Lipika::new(); - let deva_x = lipika.transliterate(x, Itrans, Devanagari); - let deva_y = lipika.transliterate(y, Itrans, Devanagari); - assert_eq!(deva_x, deva_y, "{x} ({deva_x}) != {y} ({deva_y})"); - }; - assert_identical("A I U RRi RRI LLi LLI", "aa ii uu R^i R^I L^i L^I"); - assert_identical( - "kA kI kU kRRi kRRI kLLi kLLI", - "kaa kii kuu kR^i kR^I kL^i kL^I", - ); - assert_identical("I U", "ee oo"); - assert_identical("kI kU", "kee koo"); - assert_identical("aM aM", "a.m a.n"); - assert_identical("~Na", "N^a"); - assert_identical("ca", "cha"); - assert_identical("Cha Cha", "Ca chha"); - assert_identical("va", "wa"); - assert_identical("Sha Sha", "Sa shha"); - assert_identical("kSha kSha kSha", "kSa kshha xa"); - assert_identical("j~na j~na", "GYa dnya"); - assert_identical("OM", "AUM"); - assert_identical(".a | ||", "~ . .."); - assert_identical("za", "Ja"); - // TODO: assert_identical("a{\\m+}", "a.h.N"); +fn khmer_sign_robat() { + assert_two_way_pairwise(&[(Slp1, "kra kara rka arka kara"), (Khmer, "ក្រ ករ ក៌ អក៌ ករ")]); +} + +#[test] +fn saurashtra_ksha_with_zwj() { + assert_two_way_pairwise(&[(Slp1, "kza"), (Saurashtra, "ꢒ꣄\u{200d}ꢰ")]); +} + +#[test] +fn tamil_superscript() { + assert_two_way_pairwise(&[ + ( + Slp1, + "ga gA gi gI gu gU gf gF gx gX ge gE go gO gaM gaH gIM gIH devI", + ), + ( + Tamil, + "க³ கா³ கி³ கீ³ கு³ கூ³ க்³ருʼ க்³ரூʼ க்³லுʼ க்³லூʼ கே³ கை³ கோ³ கௌ³ க³ம்ʼ க³꞉ கீ³ம்ʼ கீ³꞉ தே³வீ", + ), + ]); +} + +#[test] +fn thai_preceding_vowels() { + assert_two_way_pairwise(&[(Slp1, "ke kE ko kO"), (Thai, "เก ไก โก เกา")]); +} + +#[test] +fn tibetan_ba_va() { + // TODO: why is this change made? For now, enforce for consistency with Aksharamukha default + // behavior. + assert_transliterate("bava", Slp1, Tibetan, "བབ"); + assert_transliterate("nbanva", Slp1, Tibetan, "ནྦནྭ"); +} + +#[test] +fn tibetan_subjoined_consonants() { + assert_two_way_pairwise(&[(Slp1, "nGa"), (Tibetan, "ནྒྷ")]); + assert_two_way_pairwise(&[(Slp1, "kf kF kx kX"), (Tibetan, "ཀྲྀ་ཀྲཱྀ་ཀླྀ་ཀླཱྀ")]); + + assert_two_way_pairwise(&[ + ( + // Use "nba" instead of "nva" because Tibetan does not have a "v" character. 
+ Slp1, + concat!( + "nka nKa nga nGa nNa nca nCa nja nJa nYa nwa nWa nqa nQa nRa ", + "nta nTa nda nDa nna npa nPa nba nBa nma nya nra nla nba nSa nza nsa nha" + ), + ), + ( + Tibetan, + "ནྐ་ནྑ་ནྒ་ནྒྷ་ནྔ་ནྩ་ནྪ་ནྫ་ནྫྷ་ནྙ་ནྚ་ནྛ་ནྜ་ནྜྷ་ནྞ་ནྟ་ནྠ་ནྡ་ནྡྷ་ནྣ་ནྤ་ནྥ་ནྦ་ནྦྷ་ནྨ་ནྱ་ནྲ་ནླ་ནྦ་ནྴ་ནྵ་ནྶ་ནྷ", + ), + ]); } #[test] diff --git a/vidyut-lipi/www/static/vidyut-lipi-app.js b/vidyut-lipi/www/static/vidyut-lipi-app.js index c9bbc63..47b7a90 100644 --- a/vidyut-lipi/www/static/vidyut-lipi-app.js +++ b/vidyut-lipi/www/static/vidyut-lipi-app.js @@ -121,18 +121,26 @@ let schemes = [ Scheme.Bengali, Scheme.Brahmi, Scheme.Burmese, + Scheme.Cham, Scheme.Grantha, Scheme.Gujarati, Scheme.Gurmukhi, Scheme.Javanese, Scheme.Kannada, + Scheme.Khmer, Scheme.Malayalam, + Scheme.Modi, + Scheme.Newa, Scheme.Odia, + Scheme.Saurashtra, Scheme.Sharada, Scheme.Siddham, Scheme.Sinhala, Scheme.Tamil, Scheme.Telugu, + Scheme.Thai, + Scheme.Tibetan, + Scheme.Tirhuta, Scheme.BarahaSouth, Scheme.HarvardKyoto, @@ -150,18 +158,26 @@ let schemeNames = { [Scheme.Bengali]: "Bengali", [Scheme.Brahmi]: "Brahmi", [Scheme.Burmese]: "Burmese", + [Scheme.Cham]: "Cham", [Scheme.Grantha]: "Grantha", [Scheme.Gujarati]: "Gujarati", [Scheme.Gurmukhi]: "Gurmukhi", [Scheme.Javanese]: "Javanese", [Scheme.Kannada]: "Kannada", + [Scheme.Khmer]: "Khmer", [Scheme.Malayalam]: "Malayalam", + [Scheme.Modi]: "Modi", + [Scheme.Newa]: "Newa (Nepal Bhasa)", [Scheme.Odia]: "Odia", + [Scheme.Saurashtra]: "Saurashtra", [Scheme.Sharada]: "Sharada", [Scheme.Siddham]: "Siddham", [Scheme.Sinhala]: "Sinhala", [Scheme.Tamil]: "Tamil", [Scheme.Telugu]: "Telugu", + [Scheme.Thai]: "Thai", + [Scheme.Tibetan]: "Tibetan", + [Scheme.Tirhuta]: "Tirhuta", [Scheme.BarahaSouth]: "Baraha (Southern)", [Scheme.HarvardKyoto]: "Harvard-Kyoto",