Skip to content

Commit

Permalink
[lipi] Support NFC/NFD, Grantha numerals, etc.
Browse files Browse the repository at this point in the history
Features:
- Add support for Grantha numerals.
- Add support for working with Unicode equivalences in NFC/NFD. For
  details, see the `unicode_norm` module.

Bug fixes:
- Add support for Tamil aytam.
- Rename ISO 19519 to ISO 15919.
- Other minor mapping fixes.

Documentation:
- Add entry for `vidyut-lipi` to main `vidyut` README.
  • Loading branch information
akprasad committed Jan 23, 2024
1 parent a1e7ec9 commit a0e7546
Show file tree
Hide file tree
Showing 14 changed files with 1,434 additions and 325 deletions.
25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,16 @@ elsewhere.
For details, see the [vidyut-kosha README][vidyut-kosha].


### [`vidyut-lipi`][vidyut-lipi]

`vidyut-lipi` is an experimental Sanskrit transliteration library that also
supports many of the scripts used within the Indosphere. Our goal is to provide
a standard transliterator for the Sanskrit ecosystem that is easy to bind to
other programming languages.

For details, see the [vidyut-lipi README][vidyut-lipi].


### [`vidyut-prakriya`][vidyut-prakriya]

`vidyut-prakriya` generates Sanskrit words with their prakriyās (derivations)
Expand All @@ -157,6 +167,7 @@ For details, see the [vidyut-sandhi README][vidyut-sandhi].
[vidyut-chandas]: vidyut-chandas/README.md
[vidyut-cheda]: vidyut-cheda/README.md
[vidyut-kosha]: vidyut-kosha/README.md
[vidyut-lipi]: vidyut-lipi/README.md
[vidyut-prakriya]: vidyut-prakriya/README.md
[vidyut-sandhi]: vidyut-sandhi/README.md

Expand Down
3 changes: 3 additions & 0 deletions vidyut-lipi/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ console_error_panic_hook = "0.1.7"

[lib]
crate-type = ["cdylib", "rlib"]

[dev-dependencies]
unicode-normalization = "0.1.22"
85 changes: 64 additions & 21 deletions vidyut-lipi/scripts/create_schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import tomllib
import subprocess
import unicodedata
from pathlib import Path
from glob import glob
import shutil
Expand Down Expand Up @@ -66,6 +67,29 @@ def _sanitize(s: str) -> str:
return s.replace("\\", "\\\\").replace('"', '\\"')


def _to_deva_nfd(s: str) -> str:
overrides = {
"\u0958": "\u0915\u093c", # ka
"\u0959": "\u0916\u093c", # kha
"\u095a": "\u0917\u093c", # ga
"\u095b": "\u091c\u093c", # ja
"\u095c": "\u0921\u093c", # Da
"\u095d": "\u0922\u093c", # Dha
"\u095e": "\u092b\u093c", # pha
"\u095f": "\u092f\u093c", # ya
}
return overrides.get(s, s)

def to_unique(xs: list) -> list:
    """Return the items of `xs` with duplicates removed.

    The first occurrence of each item is kept and the relative order of the
    kept items is preserved. Items must be hashable.

    Args:
        xs: the items to deduplicate.

    Returns:
        A new list containing each distinct item of `xs` once.
    """
    # Dict keys are unique and remember insertion order, which gives an
    # order-preserving dedup without a hand-written loop.
    return list(dict.fromkeys(xs))


def _maybe_override(name: str, deva: str, raw: str) -> str | None:
overrides = {}

Expand Down Expand Up @@ -100,6 +124,7 @@ def _maybe_override(name: str, deva: str, raw: str) -> str | None:
overrides = {
"।": ".",
"॥": "..",
"क़": None,
}
elif name == "IAST":
overrides = {
Expand All @@ -110,6 +135,11 @@ def _maybe_override(name: str, deva: str, raw: str) -> str | None:
# candrabindu
"\u0901": "m̐",
}
elif name == "TAMIL":
overrides = {
# Visarga
"\u0903": None,
}
elif name == "VELTHUIS":
# These are part of the Velthuis spec but are errors in indic-transliteration.
overrides = {
Expand All @@ -135,13 +165,19 @@ def _maybe_override(name: str, deva: str, raw: str) -> str | None:
return overrides.get(deva, raw)


def create_scheme_str(name: str, items: list[tuple[str, str]]) -> str:
def create_scheme_entry(name: str, items: list[tuple[str, str]]) -> str:
buf = []
seen = set()

buf.append(f"pub const {name}: &[(&str, &str)] = &[")
for deva, raw in items:
deva = _sanitize(deva)
raw = _sanitize(raw)
deva = unicodedata.normalize('NFC', _sanitize(deva))
raw = unicodedata.normalize('NFC', _sanitize(raw))

if (deva, raw) in seen:
continue
seen.add((deva, raw))

buf.append(f' ("{deva}", "{raw}"),')
buf.append("];\n")

Expand Down Expand Up @@ -200,6 +236,8 @@ def main():
deva = raw_to_deva.get(raw_main)
if deva is None:
continue

deva = unicodedata.normalize('NFC', _sanitize(deva))
for alt in alts:
alt = _maybe_override(scheme_name, deva, alt)
if alt is not None:
Expand Down Expand Up @@ -231,6 +269,10 @@ def main():
assert isinstance(raw, str)
scheme_items.append((mark, raw))

scheme_items = [(_to_deva_nfd(x), _to_deva_nfd(y))
for (x, y) in scheme_items]
scheme_items = to_unique(scheme_items)

# Add svarita and anudatta for Brahmic scripts that use Devanagari accent marks.
if scheme_name in BRAHMIC_WITH_DEVA_ACCENTS:
scheme_items.extend([
Expand Down Expand Up @@ -273,16 +315,7 @@ def main():
("\u0948", "\u0947\u094e"),
("\u094b", "\u093e\u094e"),
("\u094c", "\u094b\u094e"),
# Consonants with nuqtas.
("\u0931", "\u0931"),
("\u0958", "\u0958"),
("\u0959", "\u0959"),
("\u095a", "\u095a"),
("\u095b", "\u095b"),
("\u095c", "\u095c"),
("\u095d", "\u095d"),
("\u095e", "\u095e"),
("\u095f", "\u095f"),

# Vedic accents
("\u1cd2", "\u1cd2"),
("\u1cda", "\u1cda"),
Expand All @@ -300,6 +333,11 @@ def main():
# AU (AA + AU length mark)
("\u094c", "\U00011347\U00011357"),
])
elif scheme_name == "ISO":
scheme_items.extend([
# Aytam
("\u0b83", "ḳ"),
])
elif scheme_name == "SINHALA":
# Sinhala chandrabindu is not supported in the fonts I tried, so
# use anusvara instead.
Expand All @@ -317,6 +355,11 @@ def main():
# Anudatta
("\u0952", "\\"),
])
elif scheme_name == "TAMIL":
scheme_items.extend([
# Aytam
("\u0b83", "\u0b83"),
])
elif scheme_name == "VELTHUIS":
scheme_items.extend([
# Virama
Expand All @@ -330,16 +373,16 @@ def main():
("\u0971", "#"),
# Consonants with nuqtas
("\u0931", "^r"),
("\u0958", "q"),
("\u0959", ".kh"),
("\u095a", ".g"),
("\u095b", "z"),
("\u095c", "R"),
("\u095d", "Rh"),
("\u095e", "f"),
("\u0915\u093c", "q"),
("\u0916\u093c", ".kh"),
("\u0917\u093c", ".g"),  # GA (U+0917) + nuqta; U+0957 is a vowel sign, not a consonant
("\u091c\u093c", "z"),
("\u0921\u093c", "R"),
("\u0922\u093c", "Rh"),
("\u092b\u093c", "f"),
])

buf.append(create_scheme_str(scheme_name, scheme_items))
buf.append(create_scheme_entry(scheme_name, scheme_items))

with open(CRATE_DIR / "src/autogen_schemes.rs", "w") as f:
f.write("\n".join(buf))
Expand Down
Loading

0 comments on commit a0e7546

Please sign in to comment.