Skip to content

Commit

Permalink
Added POSIX locale support (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
pasabanov committed Sep 27, 2024
1 parent 218c71c commit ecfb35d
Show file tree
Hide file tree
Showing 3 changed files with 360 additions and 2 deletions.
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ exclude = [
]

[dependencies]
language-tags = "0.3.2"
language-tags = { version = "0.3.2", optional = true }

[features]
bcp47 = ["language-tags"]
posix = []
default = ["bcp47", "posix"]

[profile.dev]
# Compilation
Expand Down
6 changes: 5 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,8 @@
//! You should have received a copy of the GNU Lesser General Public License
//! along with this program. If not, see <https://www.gnu.org/licenses/>.

pub mod bcp47;
#[cfg(feature = "bcp47")]
pub mod bcp47;

#[cfg(feature = "posix")]
pub mod posix;
349 changes: 349 additions & 0 deletions src/posix.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
//! locale-match is a small library for matching locales.
//! Copyright (C) © 2024 Petr Alexandrovich Sabanov
//!
//! This program is free software: you can redistribute it and/or modify
//! it under the terms of the GNU Lesser General Public License as published by
//! the Free Software Foundation, either version 3 of the License, or
//! (at your option) any later version.
//!
//! This program is distributed in the hope that it will be useful,
//! but WITHOUT ANY WARRANTY; without even the implied warranty of
//! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//! GNU Lesser General Public License for more details.
//!
//! You should have received a copy of the GNU Lesser General Public License
//! along with this program. If not, see <https://www.gnu.org/licenses/>.

/// Finds the best matching locale from a list of available locales based on a list of user locales.
///
/// The function expects locales to be ASCII-encoded POSIX locales of the form
/// `language[_territory][.codeset][@modifier]`, but does not validate them.
///
/// User locales are tried in priority order. For each user locale, every available locale whose
/// language equals the user's language (compared ASCII case-insensitively) is scored by how many
/// of the remaining parts also match, with earlier parts weighing more: a matching territory is
/// worth more than a matching codeset and modifier together, and a matching codeset is worth more
/// than a matching modifier.
/// A part contributes to the score only when it is present in both locales and equal (ASCII
/// case-insensitively). A part that is absent from either locale adds nothing, so an absent part
/// matches every value of that part equally well.
///
/// If multiple available locales have the same score, the function selects the one that appears
/// earlier in the list of available locales.
/// If no available locale matches the language of a user locale, the function moves to the
/// next user locale in the list.
/// If no matches are found for any user locale, the function returns [`None`].
///
/// Locales are not validated. A malformed locale can only be selected when its language part
/// equals the language part of a user locale. Locales whose language part is empty (for example
/// `""` or `"_US"`) never match anything.
///
/// # Arguments
///
/// * `available_locales` - An iterator over locale strings representing the available locales.
///   These locales should be ordered by priority, meaning that a locale appearing earlier in this
///   list is considered more preferable for the program.
/// * `user_locales` - An iterator over locale strings representing the user locales to match
///   against. These locales should also be ordered by priority, meaning that a locale appearing
///   earlier in this list is considered more desirable for the user.
///
/// # Returns
///
/// Returns an `Option<&str>` borrowing the best matching entry of `available_locales`.
/// If multiple available locales match the same user locale with equal score, the one that appears
/// earlier in the list of available locales is chosen.
/// If no match is found, [`None`] is returned.
///
/// The returned locale is guaranteed to EXACTLY match one of the available locales.
/// For example, `best_matching_locale(["EN"].iter(), ["en"].iter())` will return `Some("EN")`.
///
/// # Examples
///
/// ```
/// use locale_match::posix::best_matching_locale;
///
///
/// let available_locales = vec!["en_US", "en_GB", "ru_UA", "fr_FR", "it"];
/// let user_locales = vec!["ru_RU", "ru", "en_US", "en"];
///
/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter());
///
/// // "ru_UA" is the best match for the highest-priority user locale "ru_RU"
/// assert_eq!(best_match, Some("ru_UA"));
///
///
/// let available_locales = vec!["en", "pt_BR", "pt_PT", "es"];
/// let user_locales = vec!["pt", "en"];
///
/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter());
///
/// // "pt_BR" is the first best match for the highest-priority user locale "pt"
/// assert_eq!(best_match, Some("pt_BR"));
///
///
/// let available_locales = vec!["fr", "fr_FR", "fr_CA.UTF-8"];
/// let user_locales = vec!["fr.UTF-8"];
///
/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter());
///
/// // Empty territory in "fr.UTF-8" matches any territory, e.g. "CA"
/// assert_eq!(best_match, Some("fr_CA.UTF-8"));
/// ```
pub fn best_matching_locale<'a, 'b, T1, T2>(available_locales: impl Iterator<Item = &'a T1>, user_locales: impl Iterator<Item = &'b T2>) -> Option<&'a str>
where
    T1: AsRef<str> + 'a,
    T2: AsRef<str> + 'b
{
    // Parse the available locales once; they are re-scanned for every user locale.
    let available_parsed_locales = available_locales
        .map(|l| PosixLocale::parse(l.as_ref()))
        .collect::<Vec<PosixLocale>>();

    user_locales
        .map(|locale| PosixLocale::parse(locale.as_ref()))
        // An empty language is not a language: without this guard "" would match "" and
        // "_US" would select "_GB".
        .filter(|user_locale| !user_locale.language.is_empty())
        .find_map(|user_locale|
            available_parsed_locales.iter()
                .rev() // max_by_key keeps the LAST maximum, so reverse to prefer the earliest entry
                .filter(|aval_locale| aval_locale.language.eq_ignore_ascii_case(user_locale.language))
                .max_by_key(|aval_locale| {
                    // Each weight exceeds the sum of all lighter ones, so a more significant
                    // part always dominates any combination of less significant parts.
                    let mut score = 0;
                    for (aval, user, weight) in [
                        (aval_locale.territory, user_locale.territory, 4),
                        (aval_locale.codeset, user_locale.codeset, 2),
                        (aval_locale.modifier, user_locale.modifier, 1),
                    ] {
                        // Parts that differ, or that are missing on either side, add nothing.
                        if let (Some(a), Some(u)) = (aval, user) {
                            if a.eq_ignore_ascii_case(u) {
                                score += weight;
                            }
                        }
                    }
                    score
                })
        )
        .map(|aval_locale| aval_locale.locale)
}

/// Borrowed view of a POSIX locale string split into its components.
///
/// The expected shape is `language[_territory][.codeset][@modifier]`, as described in
/// [The Open Group Base Specifications Issue 8 - 8. Environment Variables](https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap08.html).
struct PosixLocale<'a> {
    /// The complete input string, exactly as it was passed to [`PosixLocale::parse`].
    locale: &'a str,
    /// Everything before the first `_`; without one, before the first `.`; without that,
    /// before the first `@`; otherwise the whole string.
    language: &'a str,
    /// The text following the first `_`, or `None` when there is no such valid range.
    territory: Option<&'a str>,
    /// The text following the first `.`, or `None` when there is no such valid range.
    codeset: Option<&'a str>,
    /// The text following the first `@`, or `None` when there is no `@`.
    modifier: Option<&'a str>,
}

impl<'a> PosixLocale<'a> {
    const TERRITORY_DELIMITER: char = '_';
    const CODESET_DELIMITER: char = '.';
    const MODIFIER_DELIMITER: char = '@';

    /// Splits a POSIX locale string into a `PosixLocale` without validating it.
    ///
    /// The `locale` string should be in the form `language[_territory][.codeset][@modifier]`.
    ///
    /// Each delimiter is located by its first occurrence in the whole string, independently of
    /// the others. Input whose delimiters appear out of order therefore produces overlapping
    /// or missing parts rather than an error.
    fn parse(locale: &'a str) -> Self {
        // Index of the first occurrence of `delimiter`, or `fallback` when it is absent.
        let boundary = |delimiter: char, fallback: usize| -> usize {
            match locale.find(delimiter) {
                Some(index) => index,
                None => fallback,
            }
        };

        // Resolve the marks right-to-left so that a missing delimiter collapses onto the next one.
        let modifier_mark = boundary(Self::MODIFIER_DELIMITER, locale.len());
        let codeset_mark = boundary(Self::CODESET_DELIMITER, modifier_mark);
        let territory_mark = boundary(Self::TERRITORY_DELIMITER, codeset_mark);

        let (language, _) = locale.split_at(territory_mark);
        // `str::get` yields `None` for inverted or out-of-bounds ranges, which is what happens
        // when the corresponding delimiter is absent.
        let territory = locale.get(territory_mark + 1..codeset_mark);
        let codeset = locale.get(codeset_mark + 1..modifier_mark);
        let modifier = locale.get(modifier_mark + 1..);

        Self { locale, language, territory, codeset, modifier }
    }
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_best_matching_locale() {
    // Each row is (available locales, user locales, expected best match).
    let cases: &[(&[&str], &[&str], Option<&str>)] = &[
        // One best match
        (&["en_US", "ru_RU"], &["ru", "en"], Some("ru_RU")),
        (&["en_US", "ru_RU"], &["en", "ru"], Some("en_US")),
        (&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["ru_RU", "ru", "en_US", "en"], Some("ru_UA")),
        (&["ru_RU", "sq_AL", "eu_ES"], &["en_US", "en", "sq_XK", "sq"], Some("sq_AL")),
        (&["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], &["fr", "fr_FR", "ml", "si", "id", "ku_IQ"], Some("ku_TR")),
        (&["st_LS", "sn_ZW", "en_US"], &["zu_ZA", "st_ZA", "en"], Some("st_LS")),

        // Multiple best matches
        (&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["en_US", "en", "ru_RU", "ru"], Some("en_US")),
        (&["en", "pt_BR", "pt_PT", "es"], &["pt", "en"], Some("pt_BR")),
        (&["ku_TR", "ku_IQ", "ku_IR"], &["ku", "en"], Some("ku_TR")),
        (&["en_US", "ru_RU", "mn_CN", "sn_ZW", "en", "ru", "mn_MN", "sn"], &["mn", "ru", "en", "sn"], Some("mn_CN")),

        // Identical
        (&["en"], &["en"], Some("en")),
        (&["en_US"], &["en_US"], Some("en_US")),
        (&["en_US", "ru_RU"], &["en_US", "ru_RU"], Some("en_US")),
        (&["st_LS", "sn_ZW", "en_US"], &["st_LS", "sn_ZW", "en_US"], Some("st_LS")),
        (&["ku_TR", "ku_IQ", "ku_IR"], &["ku_TR", "ku_IQ", "ku_IR"], Some("ku_TR")),
        (&["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], &["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], Some("lv_LV")),

        // More complicated cases
        (&["en_US", "ru_RU.UTF-8"], &["ru", "en"], Some("ru_RU.UTF-8")),
        (&["en_US", "ru.UTF-8", "ru_RU.UTF-8"], &["ru.UTF-8", "en"], Some("ru.UTF-8")),
        (&["en_US", "ru_RU.UTF-8", "ru.UTF-8"], &["ru.UTF-8", "en"], Some("ru_RU.UTF-8")),
        (&["en_US", "ru.UTF-8@dict", "ru_UA"], &["ru_UA.UTF-8@dict", "en"], Some("ru_UA")),
        (&["en_US@dict", "ru_RU"], &["en", "ru"], Some("en_US@dict")),
        (&["en_US.CP1252", "en_GB.UTF-8", "ru_UA@icase", "fr_FR@euro", "it.UTF-8"], &["ru_RU.KOI8-R", "ru@icase", "en_US.UTF-8", "en.CP1252"], Some("ru_UA@icase")),
        (&["fr", "fr_FR", "fr_CA.UTF-8"], &["fr.UTF-8"], Some("fr_CA.UTF-8")),
        (&["en", "pt_BR@dict", "pt_PT@icase", "es"], &["pt.CP1252@euro", "en.UTF-8@dict"], Some("pt_BR@dict")),
        (&["en_US", "ru_RU", "mn_CN.UTF-8", "sn_ZW", "en", "ru", "mn_MN@dict", "sn"], &["mn.UTF-8@dict", "ru", "en", "sn"], Some("mn_CN.UTF-8")),

        // One available locale
        (&["kk"], &["en", "en_US", "fr_FR", "fr", "it", "pt", "ru_RU", "es_ES", "kk_KZ"], Some("kk")),

        // One user locale
        (&["en", "en_US", "fr_FR", "fr", "it", "pt", "ru_RU", "es_ES", "kk_KZ", "pt"], &["pt_PT"], Some("pt")),

        // Not found
        (&["en", "en_US", "fr_FR", "fr", "it", "pt", "es_ES", "kk_KZ", "pt"], &["ru"], None),
        (&["en", "en_US", "fr_FR", "fr", "pt"], &["id"], None),
        (&["ru", "be", "uk", "kk"], &["en"], None),

        // Empty available locales
        (&[], &["en", "fr", "it", "pt"], None),

        // Empty user locales
        (&["en", "fr", "it", "pt"], &[], None),

        // Both lists empty
        (&[], &[], None),

        // Malformed
        (&[" en"], &["en"], None),
        (&["?ru"], &["ru"], None),
        (&["ruRU"], &["ru"], None),

        // Repeating
        (&["en", "en", "en", "en"], &["ru_RU", "ru", "en_US", "en"], Some("en")),
        (&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["kk", "ru", "pt", "ru"], Some("ru_UA")),

        // Littered
        (&["!!!!!!", "qwydgn12i6i", "ЖЖяяЖяЬЬЬ", "en_US", "!*&^^&*", "qweqweqweqwe_qweqwe", "ru_RU", "@@", "@"], &["ru", "en"], Some("ru_RU")),
        (&["", "", "", "zh", "", "", "", "", "", "he", "", ""], &["he", "", "", "zh"], Some("he")),

        // Special characters
        (&["sq\0", "ru_RU", "sq_AL", "eu_ES"], &["en_US", "en", "sq_XK", "sq"], Some("sq_AL")),
        (&["\0", "\x01\x02\x03\x04", "sq\0", "ru_RU", "sq_AL", "eu_ES"], &["en_US", "\x06", "en", "sq_XK", "sq", "\0"], Some("sq_AL")),

        // Various letter cases
        (&["EN"], &["en"], Some("EN")),
        (&["En"], &["EN"], Some("En")),
        (&["Ru_rU"], &["en", "ru"], Some("Ru_rU")),
        (&["rU_rU"], &["en", "Ru"], Some("rU_rU")),
        (&["EN.Utf-8"], &["en.UTF-8"], Some("EN.Utf-8")),
        (&["En@dIcT"], &["EN_us"], Some("En@dIcT")),
        (&["ru_ru.utf-8@icase"], &["en", "RU_RU.UTF-8@ICASE"], Some("ru_ru.utf-8@icase")),
        (&["fr_FR.CP1252@euRO"], &["FR", "en"], Some("fr_FR.CP1252@euRO")),
    ];

    for &(available, user, expected) in cases {
        let actual = best_matching_locale(available.iter(), user.iter());
        // The message identifies the failing row, since every row shares this assertion line.
        assert_eq!(actual, expected, "available: {:?}, user: {:?}", available, user);
    }
}

#[test]
#[allow(non_snake_case)] // The test is named after the `PosixLocale` type it exercises.
fn test_PosixLocale() {

    /// Parses `locale` and checks that it splits into
    /// `(language, territory, codeset, modifier)` as given in `parts`.
    fn assert_parts(locale: &str, parts: (&str, Option<&str>, Option<&str>, Option<&str>)) {
        let posix_locale = PosixLocale::parse(locale);
        assert_eq!(posix_locale.locale, locale);
        assert_eq!(posix_locale.language, parts.0);
        assert_eq!(posix_locale.territory, parts.1);
        assert_eq!(posix_locale.codeset, parts.2);
        assert_eq!(posix_locale.modifier, parts.3);
    }

    // Language only
    assert_parts("en", ("en", None, None, None));
    assert_parts("ru", ("ru", None, None, None));
    assert_parts("fr", ("fr", None, None, None));

    // Language and territory
    assert_parts("en_US", ("en", Some("US"), None, None));
    assert_parts("ru_RU", ("ru", Some("RU"), None, None));
    assert_parts("fr_FR", ("fr", Some("FR"), None, None));

    // Language and codeset
    assert_parts("en.UTF-8", ("en", None, Some("UTF-8"), None));
    assert_parts("ru.KOI8-R", ("ru", None, Some("KOI8-R"), None));
    assert_parts("fr.CP1252", ("fr", None, Some("CP1252"), None));

    // Language and modifier
    assert_parts("en@dict", ("en", None, None, Some("dict")));
    assert_parts("ru@icase", ("ru", None, None, Some("icase")));
    assert_parts("fr@euro", ("fr", None, None, Some("euro")));

    // Language, territory and codeset
    assert_parts("en_US.UTF-8", ("en", Some("US"), Some("UTF-8"), None));
    assert_parts("ru_RU.KOI8-R", ("ru", Some("RU"), Some("KOI8-R"), None));
    assert_parts("fr_FR.CP1252", ("fr", Some("FR"), Some("CP1252"), None));

    // Language, territory and modifier
    assert_parts("en_US@dict", ("en", Some("US"), None, Some("dict")));
    assert_parts("ru_RU@icase", ("ru", Some("RU"), None, Some("icase")));
    assert_parts("fr_FR@euro", ("fr", Some("FR"), None, Some("euro")));

    // Language, codeset and modifier
    assert_parts("en.UTF-8@dict", ("en", None, Some("UTF-8"), Some("dict")));
    assert_parts("ru.KOI8-R@icase", ("ru", None, Some("KOI8-R"), Some("icase")));
    assert_parts("fr.CP1252@euro", ("fr", None, Some("CP1252"), Some("euro")));

    // Language, territory, codeset and modifier
    assert_parts("en_US.UTF-8@dict", ("en", Some("US"), Some("UTF-8"), Some("dict")));
    assert_parts("ru_RU.KOI8-R@icase", ("ru", Some("RU"), Some("KOI8-R"), Some("icase")));
    assert_parts("fr_FR.CP1252@euro", ("fr", Some("FR"), Some("CP1252"), Some("euro")));

    // Various letter cases
    assert_parts("EN", ("EN", None, None, None));
    assert_parts("Ru", ("Ru", None, None, None));
    assert_parts("fR", ("fR", None, None, None));
    assert_parts("eN_us.Utf-8", ("eN", Some("us"), Some("Utf-8"), None));
    assert_parts("RU_ru.koi8-R", ("RU", Some("ru"), Some("koi8-R"), None));
    assert_parts("Fr_Fr.Cp1252", ("Fr", Some("Fr"), Some("Cp1252"), None));
    assert_parts("en_us.utf-8@DICT", ("en", Some("us"), Some("utf-8"), Some("DICT")));
    assert_parts("RU_RU.KOI8-R@Icase", ("RU", Some("RU"), Some("KOI8-R"), Some("Icase")));
    assert_parts("fR_fR.cP1252@eUrO", ("fR", Some("fR"), Some("cP1252"), Some("eUrO")));

    // Empty
    assert_parts("", ("", None, None, None));

    // Whitespace
    assert_parts(" ", (" ", None, None, None));
    assert_parts(" ", (" ", None, None, None));
    assert_parts("\t", ("\t", None, None, None));
    assert_parts("\n", ("\n", None, None, None));
    assert_parts("\n \t\t\n \n\t \t\t\n\n\t", ("\n \t\t\n \n\t \t\t\n\n\t", None, None, None));

    // Litter
    assert_parts("!!!", ("!!!", None, None, None));
    assert_parts("12345", ("12345", None, None, None));
    assert_parts("+-+-", ("+-+-", None, None, None));

    // Malformed
    assert_parts("!!!_9999.UUU@()()", ("!!!", Some("9999"), Some("UUU"), Some("()()")));
    assert_parts("12_123.1234@12345", ("12", Some("123"), Some("1234"), Some("12345")));
    assert_parts("+-+-@+-+-", ("+-+-", None, None, Some("+-+-")));

    // Wrong order EXPECTED TO BE BROKEN
    // The parser finds each delimiter by its first occurrence in the whole string, so parts
    // given out of order overlap or go missing. These rows pin that behaviour.
    assert_parts("lang.codeset_region@modifier", ("lang.codeset", None, Some("codeset_region"), Some("modifier")));
    assert_parts("lang@modifier.codeset_region", ("lang@modifier.codeset", None, None, Some("modifier.codeset_region")));
    assert_parts("lang_region@modifier.codeset", ("lang", Some("region@modifier"), None, Some("modifier.codeset")));
    assert_parts("lang.codeset@modifier_region", ("lang.codeset@modifier", None, Some("codeset"), Some("modifier_region")));
    assert_parts("lang@modifier_region.codeset", ("lang@modifier", Some("region"), None, Some("modifier_region.codeset")));

    // Parts missing
    assert_parts("_.@", ("", Some(""), Some(""), Some("")));
    assert_parts("_US.UTF-8@dict", ("", Some("US"), Some("UTF-8"), Some("dict")));
    assert_parts("ru_.KOI8-R@icase", ("ru", Some(""), Some("KOI8-R"), Some("icase")));
    assert_parts("fr_FR.@euro", ("fr", Some("FR"), Some(""), Some("euro")));
    assert_parts("de_DE.ISO-8859-1@", ("de", Some("DE"), Some("ISO-8859-1"), Some("")));

    // Special characters
    assert_parts("\0", ("\0", None, None, None));
    assert_parts("\0_\0.\0@\0", ("\0", Some("\0"), Some("\0"), Some("\0")));
    assert_parts("\0\x01\x02\x03", ("\0\x01\x02\x03", None, None, None));
    assert_parts("\x03\x02\x01", ("\x03\x02\x01", None, None, None));
}
}

0 comments on commit ecfb35d

Please sign in to comment.