Skip to content

Commit

Permalink
Add canonicalize method to LocaleCanonicalizer (#747)
Browse files Browse the repository at this point in the history
Add canonicalize method to LocaleCanonicalizer
  • Loading branch information
dminor authored Jun 7, 2021
1 parent 3420be9 commit ffd520f
Show file tree
Hide file tree
Showing 18 changed files with 10,696 additions and 15 deletions.
14 changes: 14 additions & 0 deletions components/locale_canonicalizer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,20 @@ assert_eq!(lc.minimize(&mut locale), CanonicalizationResult::Unmodified);
assert_eq!(locale.to_string(), "zh");
```

```rust
use icu_locale_canonicalizer::{CanonicalizationResult, LocaleCanonicalizer};
use icu_locid::Locale;

let provider = icu_testdata::get_provider();
let lc = LocaleCanonicalizer::new(&provider)
.expect("create failed");

let mut locale : Locale = "ja-Latn-fonipa-hepburn-heploc".parse()
.expect("parse failed");
assert_eq!(lc.canonicalize(&mut locale), CanonicalizationResult::Modified);
assert_eq!(locale.to_string(), "ja-Latn-alalc97-fonipa");
```

[`ICU4X`]: ../icu/index.html
[`CLDR`]: http://cldr.unicode.org/
[`UTS #35: Unicode LDML 3. Likely Subtags`]: https://www.unicode.org/reports/tr35/#Likely_Subtags
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
[
"cka",
"cze",
"gfx",
"sgn-BR",
"sgn-DD",
"tam",
"und-aaland",
"nob-bokmal",
"no-nynorsk",
"und-Qaai",
"en-554",
"en-084",
"art-lojban",
"zh-guoyu",
"zh-hakka",
"zh-xiang",
"aar-x-private",
"heb-x-private",
"ces",
"hy-arevela",
"hy-arevmda",
"cel-gaulish",
"ja-latn-hepburn-heploc",
"ja-Latn-fonipa-hepburn-heploc",
"und-Armn-SU",
"sh",
"sh-Cyrl",
"cnr",
"cnr-BA",
"ru-SU",
"ru-810",
"en-SU",
"en-810",
"und-SU",
"und-810",
"und-Latn-SU",
"und-Latn-810",
"hy-SU",
"hy-810",
"und-Armn-SU",
"und-Armn-810",
"sr-CS",
"sr-Latn-CS",
"sr-Cyrl-CS",
"az-NT",
"sl-t-sl-rozaj-biske-1994",
"DE-T-M0-DIN-K0-QWERTZ",
"en-t-m0-true",
"en-t-iw",
"und-u-rg-no23",
"und-u-rg-cn11",
"und-u-rg-cz10a",
"und-u-rg-fra",
"und-u-rg-frg",
"und-u-rg-lud",
"und-NO-u-rg-no23",
"und-CN-u-rg-cn11",
"und-CZ-u-rg-cz10a",
"und-FR-u-rg-fra",
"und-FR-u-rg-frg",
"und-u-rg-lud",
"und-u-sd-no23",
"und-u-sd-cn11",
"und-u-sd-cz10a",
"und-u-sd-fra",
"hy-arevela",
"hy-Armn-arevela",
"hy-AM-arevela",
"hy-arevela-fonipa",
"hy-fonipa-arevela",
"hy-arevmda",
"hy-Armn-arevmda",
"hy-AM-arevmda",
"hy-arevmda-fonipa",
"hy-fonipa-arevmda",
"ja-Latn-hepburn-heploc",
"ja-Latn-JP-hepburn-heploc",
"sv-aaland",
"el-polytoni",
"ja-Latn-alalc97-hepburn-heploc",
"ja-Latn-hepburn-alalc97-heploc",
"ja-Latn-hepburn-heploc-alalc97",
"ja-Latn-heploc-hepburn",
"ja-Latn-heploc",
"ja-Latn-aaland-heploc",
"ja-Latn-heploc-polytoni"
]
63 changes: 62 additions & 1 deletion components/locale_canonicalizer/benches/locale_canonicalizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,62 @@ use criterion::{criterion_group, criterion_main, Criterion};
use icu_locale_canonicalizer::LocaleCanonicalizer;
use icu_locid::Locale;

/// Benchmarks `LocaleCanonicalizer::canonicalize` against the locale strings in the
/// `uncanonicalized-locales.json` fixture.
///
/// Two measurements are registered in the `canonicalize` group:
/// * `create` parses every fixture string into a `Locale`; it is the baseline.
/// * `create+canonicalize` does the same parsing and then canonicalizes each locale,
///   so the difference between the two approximates the canonicalization cost.
fn canonicalize_bench(c: &mut Criterion) {
    let provider = icu_testdata::get_provider();
    let lc = LocaleCanonicalizer::new(&provider).unwrap();

    let mut group = c.benchmark_group("canonicalize");

    let path = "./benches/fixtures/uncanonicalized-locales.json";
    let data: Vec<String> = helpers::read_fixture(path).expect("Failed to read a fixture");

    group.bench_function("create", |b| {
        b.iter(|| {
            let _: Vec<Locale> = data.iter().map(|s| s.parse().unwrap()).collect();
        })
    });

    group.bench_function("create+canonicalize", |b| {
        b.iter(|| {
            // The locales are parsed afresh on every iteration, so they can be
            // canonicalized in place; cloning each one first would only add copy
            // overhead that the "create" baseline does not account for.
            let mut locales: Vec<Locale> = data.iter().map(|s| s.parse().unwrap()).collect();
            for locale in locales.iter_mut() {
                lc.canonicalize(locale);
            }
        })
    });

    group.finish();
}

/// Benchmarks `LocaleCanonicalizer::canonicalize` against the locale strings in the
/// `locales.json` fixture.
///
/// Two measurements are registered in the `canonicalize-noop` group:
/// * `create` parses every fixture string into a `Locale`; it is the baseline.
/// * `create+canonicalize` does the same parsing and then calls the canonicalizer on
///   each locale.
fn canonicalize_noop_bench(c: &mut Criterion) {
    let provider = icu_testdata::get_provider();
    let lc = LocaleCanonicalizer::new(&provider).unwrap();

    let mut group = c.benchmark_group("canonicalize-noop");

    // None of these locales require canonicalization, so this measures the cost of calling
    // the canonicalizer on locales that will not be modified.
    let path = "./benches/fixtures/locales.json";
    let data: Vec<String> = helpers::read_fixture(path).expect("Failed to read a fixture");

    group.bench_function("create", |b| {
        b.iter(|| {
            let _: Vec<Locale> = data.iter().map(|s| s.parse().unwrap()).collect();
        })
    });

    group.bench_function("create+canonicalize", |b| {
        b.iter(|| {
            // The locales are parsed afresh on every iteration, so they can be passed
            // to the canonicalizer directly; cloning each one first would only add
            // copy overhead that the "create" baseline does not account for.
            let mut locales: Vec<Locale> = data.iter().map(|s| s.parse().unwrap()).collect();
            for locale in locales.iter_mut() {
                lc.canonicalize(locale);
            }
        })
    });

    group.finish();
}

fn maximize_bench(c: &mut Criterion) {
let provider = icu_testdata::get_provider();
let lc = LocaleCanonicalizer::new(&provider).unwrap();
Expand All @@ -30,5 +86,10 @@ fn maximize_bench(c: &mut Criterion) {
group.finish();
}

criterion_group!(benches, maximize_bench);
// Collect the benchmark functions into the `benches` group, which is handed to
// `criterion_main!` below as the benchmark entry point.
criterion_group!(
benches,
canonicalize_bench,
canonicalize_noop_bench,
maximize_bench
);
criterion_main!(benches);
14 changes: 14 additions & 0 deletions components/locale_canonicalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,20 @@
//! assert_eq!(locale.to_string(), "zh");
//! ```
//!
//! ```
//! use icu_locale_canonicalizer::{CanonicalizationResult, LocaleCanonicalizer};
//! use icu_locid::Locale;
//!
//! let provider = icu_testdata::get_provider();
//! let lc = LocaleCanonicalizer::new(&provider)
//! .expect("create failed");
//!
//! let mut locale : Locale = "ja-Latn-fonipa-hepburn-heploc".parse()
//! .expect("parse failed");
//! assert_eq!(lc.canonicalize(&mut locale), CanonicalizationResult::Modified);
//! assert_eq!(locale.to_string(), "ja-Latn-alalc97-fonipa");
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
//! [`CLDR`]: http://cldr.unicode.org/
//! [`UTS #35: Unicode LDML 3. Likely Subtags`]: https://www.unicode.org/reports/tr35/#Likely_Subtags
Expand Down
Loading

0 comments on commit ffd520f

Please sign in to comment.