Skip to content

Commit

Permalink
[lipi] Standardize API and add more features
Browse files Browse the repository at this point in the history
Features:
- Add `detect` function
- Add `Lipika` API and a simple LRU cache

Schemes:
- Add support for Balinese, Burmese, Javanese, and Sharada
- Add support for WX and ISO 19519
- Rename Oriya to Odia

Documentation:
- Add more doc comments and expand README

UI:
- Add cheat sheet and some other quality-of-life features

Cleanup:
- Rename `app.js` files for clarity and ease of deployment
  • Loading branch information
akprasad committed Dec 27, 2023
1 parent 60f5f95 commit 4c4e22c
Show file tree
Hide file tree
Showing 20 changed files with 2,189 additions and 658 deletions.
5 changes: 3 additions & 2 deletions src/bin/eval_cheda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use vidyut_cheda::dcs;
use vidyut_cheda::Result;
use vidyut_cheda::{Chedaka, Config, Token};
use vidyut_kosha::morph::*;
use vidyut_lipi::{transliterate, Scheme};
use vidyut_lipi::{transliterate, Mapping, Scheme};

#[derive(Parser, Debug)]
#[command(author, version, about)]
Expand Down Expand Up @@ -59,7 +59,8 @@ impl AddAssign for Stats {
}

fn to_slp1(text: &str) -> String {
transliterate(text, Scheme::Iast, Scheme::Slp1)
let m = Mapping::new(Scheme::Iast, Scheme::Slp1);
transliterate(text, &m)
}

/// Converts a word's semantics into a short human-readable code, which we use for comparisons.
Expand Down
5 changes: 3 additions & 2 deletions src/bin/train_cheda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use vidyut_cheda::model::State;
use vidyut_cheda::Result;
use vidyut_cheda::{Config, Token};
use vidyut_kosha::morph::*;
use vidyut_lipi::{transliterate, Scheme};
use vidyut_lipi::{transliterate, Mapping, Scheme};

#[derive(Parser, Debug)]
#[command(author, version, about)]
Expand Down Expand Up @@ -56,7 +56,8 @@ struct Statistics {
}

fn to_slp1(text: &str) -> String {
transliterate(text, Scheme::Iast, Scheme::Slp1)
let m = Mapping::new(Scheme::Iast, Scheme::Slp1);
transliterate(text, &m)
}

fn process_sentence(tokens: &[Token], s: &mut Statistics) {
Expand Down
5 changes: 3 additions & 2 deletions vidyut-cheda/src/dcs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ use crate::errors::{Error, Result};
use crate::segmenting::Token;
use compact_str::CompactString;
use vidyut_kosha::morph::*;
use vidyut_lipi::{transliterate, Scheme};
use vidyut_lipi::{transliterate, Mapping, Scheme};

fn to_slp1(text: &str) -> String {
transliterate(text, Scheme::Iast, Scheme::Slp1)
let mapping = Mapping::new(Scheme::Iast, Scheme::Slp1);
transliterate(text, &mapping)
}

/// Convert DCS semantics to Vidyut semantics.
Expand Down
2 changes: 1 addition & 1 deletion vidyut-lipi/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ debugger:
./scripts/run-debugger.sh

test:
cargo nextest run --no-fail-fast --status-level=fail
cargo nextest run --no-fail-fast --status-level=fail && cargo test --doc
75 changes: 68 additions & 7 deletions vidyut-lipi/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<div align="center">
<h1><code>vidyut-lipi</code></h1>
<p><i>A fast Indic transliterator</i></p>
<p><i>A fast Sanskrit transliterator</i></p>
</div>

`vidyut-lipi` is an experimental Sanskrit transliteration library that also
Expand Down Expand Up @@ -65,14 +65,75 @@ projects, which have each been highly influential in our work on `vidyut-lipi`.
Usage
-----

For simple use cases that aren't very performance-sensitive, we recommend using
`vidyut-lipi` like so:
We recommend using `vidyut-lipi` through our `Lipika` API:

```rust
use vidyut_lipi::{Scheme, transliterate};
use vidyut_lipi::{Lipika, Scheme};

let result = transliterate("devO", Scheme::Slp1, Scheme::Iast);
assert_eq!(result, "devau");
// `Lipika` must be `mut` since its method calls mutate its internal cache.
let mut lipika = Lipika::new();

let result = lipika.transliterate("saMskRtam", Scheme::HarvardKyoto, Scheme::Devanagari);
assert_eq!(result, "संस्कृतम्");
```

You can also use `detect` to detect which `Scheme` a piece of text might be using:

```rust
use vidyut_lipi::{Lipika, Scheme, detect};

let some_text = "संस्कृतम्";
let detected = detect(&some_text).unwrap_or(Scheme::HarvardKyoto);

let mut lipika = Lipika::new();
let result = lipika.transliterate(some_text, detected, Scheme::HarvardKyoto);
assert_eq!(result, "saMskRtam");
```

For a list of all available `Scheme`s, you can use `Scheme::iter()`:

```rust
use vidyut_lipi::{Lipika, Scheme, detect};

let mut lipika = Lipika::new();
for scheme in Scheme::iter() {
let result = lipika.transliterate("saMskRtam", Scheme::HarvardKyoto, *scheme);
println!("{:15} {result}", format!("{:?}", scheme));
}
```

As of 2023-12-26, this code prints the following:

```text
Balinese ᬲᬂᬲ᭄ᬓᬺᬢᬫ᭄
Bengali সংস্কৃতম্
Brahmi 𑀲𑀁𑀲𑁆𑀓𑀾𑀢𑀫𑁆
Burmese သံသ်ကၖတမ်
Devanagari संस्कृतम्
Grantha 𑌸𑌂𑌸𑍍𑌕𑍃𑌤𑌮𑍍
Gujarati સંસ્કૃતમ્
Gurmukhi ਸਂਸ੍ਕਤਮ੍
HarvardKyoto saMskRtam
Iast saṃskṛtam
Itrans saMskRRitam
Javanese ꦱꦁꦱ꧀ꦏꦽꦠꦩ꧀
Kannada ಸಂಸ್ಕೃತಮ್
Malayalam സംസ്കൃതമ്
Odia ସଂସ୍କୃତମ୍
Sharada 𑆱𑆁𑆱𑇀𑆑𑆸𑆠𑆩𑇀
Sinhala සංස්කෘතම්
Slp1 saMskftam
Tamil ஸம்ஸ்க்ரு'தம்
Telugu సంస్కృతమ్
Velthuis sa.msk.rtam
```

We are still stabilizing our API and will share more examples here soon.
`Lipika` is a thin wrapper over the `transliterate` function. We recommend
`Lipika` because it handles some bookkeeping and caching on your behalf, but if
you want more precise control, you can use `transliterate` directly like so:

```rust
use vidyut_lipi::{transliterate, Mapping, Scheme};

let mapping = Mapping::new(Scheme::HarvardKyoto, Scheme::Devanagari);
let result = transliterate("saMskRtam", &mapping);
assert_eq!(result, "संस्कृतम्");
```
31 changes: 21 additions & 10 deletions vidyut-lipi/scripts/create_schemes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
"""Create schemes for vidyut-lipi and writes them to `src/schemes.rs`.
"""Create schemes for vidyut-lipi and writes them to `src/autogen_schemes.rs`.
We create these mappings by modifying the data in the `common_maps` dir from
the indic-transliteration project.
Expand Down Expand Up @@ -32,25 +32,31 @@
}

ALLOWED = {
"BALINESE",
"BENGALI",
"BRAHMI",
"BURMESE",
"DEVANAGARI",
"GUJARATI",
"GURMUKHI",
"GRANTHA",
"JAVANESE",
"KANNADA",
"MALAYALAM",
"ORIYA",
"SHARADA",
"SINHALA",
"TAMIL",
"TELUGU",
"TIBETAN",

"HK",
"IAST",
"ISO",
"ITRANS",
"SLP1",
"VELTHUIS",
"WX",
}


Expand All @@ -59,12 +65,12 @@ def _sanitize(s: str) -> str:


def _maybe_override(name: str, deva: str, raw: str) -> str | None:
if name == "BRAHMI":
if deva == "\u0946":
# short e mark
return None
if deva == "\u094a":
# short o mark
if name in {"BRAHMI", "BALINESE", "BURMESE", "TIBETAN"}:
if deva in {"\u0946", "\u094a", "\u090e", "\u0912"}:
# - short e mark
# - short o mark
# - short e vowel
# - short o vowel
return None
elif name == "HK":
if raw == "|":
Expand Down Expand Up @@ -148,11 +154,16 @@ def main():
if deva is None:
continue
for alt in alts:
assert isinstance(deva, str)
assert isinstance(alt, str)
alt = _maybe_override(scheme_name, deva, alt)
if alt is not None:
scheme_items.append((deva, alt))
mark = VOWEL_TO_MARK.get(deva)
if mark:
assert isinstance(mark, str)
for alt in alts:
alt = _maybe_override(scheme_name, mark, alt)
if alt is not None:
scheme_items.append((mark, alt))
else:
for deva, raw in data[category].items():
assert isinstance(deva, str)
Expand All @@ -173,7 +184,7 @@ def main():

buf.append(create_scheme_str(scheme_name, scheme_items))

with open(CRATE_DIR / "src/schemes.rs", "w") as f:
with open(CRATE_DIR / "src/autogen_schemes.rs", "w") as f:
f.write("\n".join(buf))

print("Cleaning up ...")
Expand Down
Loading

0 comments on commit 4c4e22c

Please sign in to comment.