Added POSIX locale support (#1)

pasabanov · Sep 27, 2024 · ecfb35d · ecfb35d
1 parent 218c71c
commit ecfb35d
Show file tree

Hide file tree

Showing 3 changed files with 360 additions and 2 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -26,7 +26,12 @@ exclude = [
 ]
 
 [dependencies]
-language-tags = "0.3.2"
+language-tags = { version = "0.3.2", optional = true }
+
+[features]
+bcp47 = ["language-tags"]
+posix = []
+default = ["bcp47", "posix"]
 
 [profile.dev]
 # Compilation

diff --git a/src/lib.rs b/src/lib.rs
@@ -14,4 +14,8 @@
 //! You should have received a copy of the GNU Lesser General Public License
 //! along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-pub mod bcp47;
+#[cfg(feature = "bcp47")]
+pub mod bcp47;
+
+#[cfg(feature = "posix")]
+pub mod posix;
diff --git a/src/posix.rs b/src/posix.rs
@@ -0,0 +1,349 @@
+//! locale-match is a small library for matching locales.
+//! Copyright (C) © 2024  Petr Alexandrovich Sabanov
+//!
+//! This program is free software: you can redistribute it and/or modify
+//! it under the terms of the GNU Lesser General Public License as published by
+//! the Free Software Foundation, either version 3 of the License, or
+//! (at your option) any later version.
+//!
+//! This program is distributed in the hope that it will be useful,
+//! but WITHOUT ANY WARRANTY; without even the implied warranty of
+//! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//! GNU Lesser General Public License for more details.
+//!
+//! You should have received a copy of the GNU Lesser General Public License
+//! along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+/// Finds the best matching locale from a list of available locales based on a list of user locales.
+/// The function expects locales to be valid POSIX locales, but does not validate them.
+/// The function expects locales to be encoded with ASCII.
+///
+/// The function compares user locales to available locales to find the best match.
+/// For each user locale, it iterates through the available locales and, for those with a matching
+/// primary language, calculates a score based on how closely each available locale matches the user
+/// locale.
+/// The score calculation gives higher priority to matching more significant parts of the locale
+/// (i.e., earlier segments in the locale string).
+/// If a subtag is empty, it is considered to match equally well with any subtag from the same
+/// category.
+///
+/// If multiple available locales have the same score, the function selects the one that appears
+/// earlier in the list of available locales.
+/// If no available locale matches the primary language of a user locale, the function moves to the
+/// next user locale in the list.
+/// If no matches are found for any user locale, the function returns [`None`].
+///
+/// Malformed locales are ignored.
+///
+/// # Arguments
+///
+/// * `available_locales` - An iterator over locale strings representing the available locales.
+///   These locales should be ordered by priority, meaning that a locale appearing earlier in this
+///   list is considered more preferable for the program.
+/// * `user_locales` - An iterator over locale strings representing the user locales to match
+///   against. These locales should also be ordered by priority, meaning that a locale appearing
+///   earlier in this list is considered more desirable for the user.
+///
+/// # Returns
+///
+/// Returns an [`Option<String>`] containing the string representation of the best matching locale.
+/// If multiple available locales match the same user locale with equal score, the one that appears
+/// earlier in the list of available locales is chosen.
+/// If no match is found, [`None`] is returned.
+///
+/// The returned locale is guaranteed to EXACTLY match one of the available locales.
+/// For example, `best_matching_locale(&["EN"].iter(), &["en"].iter())` will return `Some("EN")`.
+///
+/// # Examples
+///
+/// ```
+/// use locale_match::posix::best_matching_locale;
+///
+///
+/// let available_locales = vec!["en_US", "en_GB", "ru_UA", "fr_FR", "it"];
+/// let user_locales = vec!["ru_RU", "ru", "en_US", "en"];
+///
+/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter());
+///
+/// // "ru_UA" is the best match for the highest-priority user locale "ru_RU"
+/// assert_eq!(best_match, Some("ru_UA"));
+///
+///
+/// let available_locales = vec!["en", "pt_BR", "pt_PT", "es"];
+/// let user_locales = vec!["pt", "en"];
+///
+/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter());
+///
+/// // "pt_BR" is the first best match for the highest-priority user locale "pt"
+/// assert_eq!(best_match, Some("pt_BR"));
+///
+///
+/// let available_locales = vec!["fr", "fr_FR", "fr_CA.UTF-8"];
+/// let user_locales = vec!["fr.UTF-8"];
+///
+/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter());
+///
+/// // Empty territory in "fr.UTF-8" matches any territory, e.g. "CA"
+/// assert_eq!(best_match, Some("fr_CA.UTF-8"));
+/// ```
+pub fn best_matching_locale<'a, 'b, T1, T2>(available_locales: impl Iterator<Item = &'a T1>, user_locales: impl Iterator<Item = &'b T2>) -> Option<&'a str>
+where
+	T1: AsRef<str> + 'a,
+	T2: AsRef<str> + 'b
+{
+	let available_parsed_locales = available_locales
+		.map(|l| PosixLocale::parse(l.as_ref()))
+		.collect::<Vec<PosixLocale>>();
+
+	user_locales
+		.map(|locale| PosixLocale::parse(locale.as_ref()))
+		.find_map(|user_locale|
+			available_parsed_locales.iter()
+				.rev() // For max_by_key to return the first locale with max score
+				.filter(|aval_locale| aval_locale.language.eq_ignore_ascii_case(user_locale.language))
+				.max_by_key(|aval_locale| {
+					let mut score = 0;
+					for (aval, user, weight) in [
+						(aval_locale.territory, user_locale.territory, 4),
+						(aval_locale.codeset,   user_locale.codeset,   2),
+						(aval_locale.modifier,  user_locale.modifier,  1),
+					] {
+						match (aval, user) {
+							(Some(a), Some(u)) if a.eq_ignore_ascii_case(u) => score += weight,
+							_ => {} // Ignore if both are None
+						}
+					}
+					score
+				})
+		)
+		.map(|aval_locale| aval_locale.locale)
+}
+
+/// A POSIX locale as described in [The Open Group Base Specifications Issue 8 - 8. Environment Variables](https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap08.html).
+struct PosixLocale<'a> {
+	locale: &'a str,
+	language: &'a str,
+	territory: Option<&'a str>,
+	codeset: Option<&'a str>,
+	modifier: Option<&'a str>,
+}
+
+impl<'a> PosixLocale<'a> {
+	const TERRITORY_DELIMITER: char = '_';
+	const CODESET_DELIMITER: char = '.';
+	const MODIFIER_DELIMITER: char = '@';
+
+	/// Parse a POSIX locale string into a `PosixLocale`.
+	///
+	/// The `locale` string should be in the form `language[_territory][.codeset][@modifier]`.
+	fn parse(locale: &'a str) -> Self {
+		let codeset_end = locale.find(Self::MODIFIER_DELIMITER).unwrap_or(locale.len());
+		let territory_end = locale.find(Self::CODESET_DELIMITER).unwrap_or(codeset_end);
+		let language_end = locale.find(Self::TERRITORY_DELIMITER).unwrap_or(territory_end);
+		Self {
+			locale,
+			language: &locale[..language_end],
+			territory: locale.get(language_end + 1..territory_end),
+			codeset: locale.get(territory_end + 1..codeset_end),
+			modifier: locale.get(codeset_end + 1..)
+		}
+	}
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn test_best_matching_locale() {
+
+		fn assert_best_match(available_locales: &[&str], user_locales: &[&str], expected: Option<&str>) {
+			assert_eq!(best_matching_locale(available_locales.iter(), user_locales.iter()).as_deref(), expected);
+		}
+
+		// One best match
+		assert_best_match(&["en_US", "ru_RU"], &["ru", "en"], Some("ru_RU"));
+		assert_best_match(&["en_US", "ru_RU"], &["en", "ru"], Some("en_US"));
+		assert_best_match(&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["ru_RU", "ru", "en_US", "en"], Some("ru_UA"));
+		assert_best_match(&["ru_RU", "sq_AL", "eu_ES"], &["en_US", "en", "sq_XK", "sq"], Some("sq_AL"));
+		assert_best_match(&["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], &["fr", "fr_FR", "ml", "si", "id", "ku_IQ"], Some("ku_TR"));
+		assert_best_match(&["st_LS", "sn_ZW", "en_US"], &["zu_ZA", "st_ZA", "en"], Some("st_LS"));
+
+		// Multiple best matches
+		assert_best_match(&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["en_US", "en", "ru_RU", "ru"], Some("en_US"));
+		assert_best_match(&["en", "pt_BR", "pt_PT", "es"], &["pt", "en"], Some("pt_BR"));
+		assert_best_match(&["ku_TR", "ku_IQ", "ku_IR"], &["ku", "en"], Some("ku_TR"));
+		assert_best_match(&["en_US", "ru_RU", "mn_CN", "sn_ZW", "en", "ru", "mn_MN", "sn"], &["mn", "ru", "en", "sn"], Some("mn_CN"));
+
+		// Identical
+		assert_best_match(&["en"], &["en"], Some("en"));
+		assert_best_match(&["en_US"], &["en_US"], Some("en_US"));
+		assert_best_match(&["en_US", "ru_RU"], &["en_US", "ru_RU"], Some("en_US"));
+		assert_best_match(&["st_LS", "sn_ZW", "en_US"], &["st_LS", "sn_ZW", "en_US"], Some("st_LS"));
+		assert_best_match(&["ku_TR", "ku_IQ", "ku_IR"], &["ku_TR", "ku_IQ", "ku_IR"], Some("ku_TR"));
+		assert_best_match(&["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], &["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], Some("lv_LV"));
+
+		// More complicated cases
+		assert_best_match(&["en_US", "ru_RU.UTF-8"], &["ru", "en"], Some("ru_RU.UTF-8"));
+		assert_best_match(&["en_US", "ru.UTF-8", "ru_RU.UTF-8"], &["ru.UTF-8", "en"], Some("ru.UTF-8"));
+		assert_best_match(&["en_US", "ru_RU.UTF-8", "ru.UTF-8"], &["ru.UTF-8", "en"], Some("ru_RU.UTF-8"));
+		assert_best_match(&["en_US", "ru.UTF-8@dict", "ru_UA"], &["ru_UA.UTF-8@dict", "en"], Some("ru_UA"));
+		assert_best_match(&["en_US@dict", "ru_RU"], &["en", "ru"], Some("en_US@dict"));
+		assert_best_match(&["en_US.CP1252", "en_GB.UTF-8", "ru_UA@icase", "fr_FR@euro", "it.UTF-8"], &["ru_RU.KOI8-R", "ru@icase", "en_US.UTF-8", "en.CP1252"], Some("ru_UA@icase"));
+		assert_best_match(&["fr", "fr_FR", "fr_CA.UTF-8"], &["fr.UTF-8"], Some("fr_CA.UTF-8"));
+		assert_best_match(&["en", "pt_BR@dict", "pt_PT@icase", "es"], &["pt.CP1252@euro", "en.UTF-8@dict"], Some("pt_BR@dict"));
+		assert_best_match(&["en_US", "ru_RU", "mn_CN.UTF-8", "sn_ZW", "en", "ru", "mn_MN@dict", "sn"], &["mn.UTF-8@dict", "ru", "en", "sn"], Some("mn_CN.UTF-8"));
+
+		// One available locale
+		assert_best_match(&["kk"], &["en", "en_US", "fr_FR", "fr", "it", "pt", "ru_RU", "es_ES", "kk_KZ"], Some("kk"));
+
+		// One user locale
+		assert_best_match(&["en", "en_US", "fr_FR", "fr", "it", "pt", "ru_RU", "es_ES", "kk_KZ", "pt"], &["pt_PT"], Some("pt"));
+
+		// Not found
+		assert_best_match(&["en", "en_US", "fr_FR", "fr", "it", "pt", "es_ES", "kk_KZ", "pt"], &["ru"], None);
+		assert_best_match(&["en", "en_US", "fr_FR", "fr", "pt"], &["id"], None);
+		assert_best_match(&["ru", "be", "uk", "kk"], &["en"], None);
+
+		// Empty available locales
+		assert_best_match(&[], &["en", "fr", "it", "pt"], None);
+
+		// Empty user locales
+		assert_best_match(&["en", "fr", "it", "pt"], &[], None);
+
+		// Both lists empty
+		assert_best_match(&[], &[], None);
+
+		// Malformed
+		assert_best_match(&[" en"], &["en"], None);
+		assert_best_match(&["?ru"], &["ru"], None);
+		assert_best_match(&["ruRU"], &["ru"], None);
+
+		// Repeating
+		assert_best_match(&["en", "en", "en", "en"], &["ru_RU", "ru", "en_US", "en"], Some("en"));
+		assert_best_match(&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["kk", "ru", "pt", "ru"], Some("ru_UA"));
+
+		// Littered
+		assert_best_match(&["!!!!!!", "qwydgn12i6i", "ЖЖяяЖяЬЬЬ", "en_US", "!*&^^&*", "qweqweqweqwe_qweqwe", "ru_RU", "@@", "@"], &["ru", "en"], Some("ru_RU"));
+		assert_best_match(&["", "", "", "zh", "", "", "", "", "", "he", "", ""], &["he", "", "", "zh"], Some("he"));
+
+		// Special characters
+		assert_best_match(&["sq\0", "ru_RU", "sq_AL", "eu_ES"], &["en_US", "en", "sq_XK", "sq"], Some("sq_AL"));
+		assert_best_match(&["\0", "\x01\x02\x03\x04", "sq\0", "ru_RU", "sq_AL", "eu_ES"], &["en_US", "\x06", "en", "sq_XK", "sq", "\0"], Some("sq_AL"));
+
+		// Various letter cases
+		assert_best_match(&["EN"], &["en"], Some("EN"));
+		assert_best_match(&["En"], &["EN"], Some("En"));
+		assert_best_match(&["Ru_rU"], &["en", "ru"], Some("Ru_rU"));
+		assert_best_match(&["rU_rU"], &["en", "Ru"], Some("rU_rU"));
+		assert_best_match(&["EN.Utf-8"], &["en.UTF-8"], Some("EN.Utf-8"));
+		assert_best_match(&["En@dIcT"], &["EN_us"], Some("En@dIcT"));
+		assert_best_match(&["ru_ru.utf-8@icase"], &["en", "RU_RU.UTF-8@ICASE"], Some("ru_ru.utf-8@icase"));
+		assert_best_match(&["fr_FR.CP1252@euRO"], &["FR", "en"], Some("fr_FR.CP1252@euRO"));
+	}
+
+	#[test]
+	#[allow(non_snake_case)]
+	fn test_PosixLocale() {
+
+		fn assert_parts(locale: &str, parts: (&str, Option<&str>, Option<&str>, Option<&str>)) {
+			let posix_locale = PosixLocale::parse(locale);
+			assert_eq!(posix_locale.locale, locale);
+			assert_eq!(posix_locale.language, parts.0);
+			assert_eq!(posix_locale.territory, parts.1);
+			assert_eq!(posix_locale.codeset, parts.2);
+			assert_eq!(posix_locale.modifier, parts.3);
+		}
+
+		// Language only
+		assert_parts("en", ("en", None, None, None));
+		assert_parts("ru", ("ru", None, None, None));
+		assert_parts("fr", ("fr", None, None, None));
+
+		// Language and territory
+		assert_parts("en_US", ("en", Some("US"), None, None));
+		assert_parts("ru_RU", ("ru", Some("RU"), None, None));
+		assert_parts("fr_FR", ("fr", Some("FR"), None, None));
+
+		// Language and codeset
+		assert_parts("en.UTF-8", ("en", None, Some("UTF-8"), None));
+		assert_parts("ru.KOI8-R", ("ru", None, Some("KOI8-R"), None));
+		assert_parts("fr.CP1252", ("fr", None, Some("CP1252"), None));
+
+		// Language and modifier
+		assert_parts("en@dict", ("en", None, None, Some("dict")));
+		assert_parts("ru@icase", ("ru", None, None, Some("icase")));
+		assert_parts("fr@euro", ("fr", None, None, Some("euro")));
+
+		// Language, territory and codeset
+		assert_parts("en_US.UTF-8", ("en", Some("US"), Some("UTF-8"), None));
+		assert_parts("ru_RU.KOI8-R", ("ru", Some("RU"), Some("KOI8-R"), None));
+		assert_parts("fr_FR.CP1252", ("fr", Some("FR"), Some("CP1252"), None));
+
+		// Language, territory and modifier
+		assert_parts("en_US@dict", ("en", Some("US"), None, Some("dict")));
+		assert_parts("ru_RU@icase", ("ru", Some("RU"), None, Some("icase")));
+		assert_parts("fr_FR@euro", ("fr", Some("FR"), None, Some("euro")));
+
+		// Language, codeset and modifier
+		assert_parts("en.UTF-8@dict", ("en", None, Some("UTF-8"), Some("dict")));
+		assert_parts("ru.KOI8-R@icase", ("ru", None, Some("KOI8-R"), Some("icase")));
+		assert_parts("fr.CP1252@euro", ("fr", None, Some("CP1252"), Some("euro")));
+
+		// Language, territory, codeset and modifier
+		assert_parts("en_US.UTF-8@dict", ("en", Some("US"), Some("UTF-8"), Some("dict")));
+		assert_parts("ru_RU.KOI8-R@icase", ("ru", Some("RU"), Some("KOI8-R"), Some("icase")));
+		assert_parts("fr_FR.CP1252@euro", ("fr", Some("FR"), Some("CP1252"), Some("euro")));
+
+		// Various letter cases
+		assert_parts("EN", ("EN", None, None, None));
+		assert_parts("Ru", ("Ru", None, None, None));
+		assert_parts("fR", ("fR", None, None, None));
+		assert_parts("eN_us.Utf-8", ("eN", Some("us"), Some("Utf-8"), None));
+		assert_parts("RU_ru.koi8-R", ("RU", Some("ru"), Some("koi8-R"), None));
+		assert_parts("Fr_Fr.Cp1252", ("Fr", Some("Fr"), Some("Cp1252"), None));
+		assert_parts("en_us.utf-8@DICT", ("en", Some("us"), Some("utf-8"), Some("DICT")));
+		assert_parts("RU_RU.KOI8-R@Icase", ("RU", Some("RU"), Some("KOI8-R"), Some("Icase")));
+		assert_parts("fR_fR.cP1252@eUrO", ("fR", Some("fR"), Some("cP1252"), Some("eUrO")));
+
+		// Empty
+		assert_parts("", ("", None, None, None));
+
+		// Whitespace
+		assert_parts(" ", (" ", None, None, None));
+		assert_parts("  ", ("  ", None, None, None));
+		assert_parts("\t", ("\t", None, None, None));
+		assert_parts("\n", ("\n", None, None, None));
+		assert_parts("\n  \t\t\n \n\t  \t\t\n\n\t", ("\n  \t\t\n \n\t  \t\t\n\n\t", None, None, None));
+
+		// Litter
+		assert_parts("!!!", ("!!!", None, None, None));
+		assert_parts("12345", ("12345", None, None, None));
+		assert_parts("+-+-", ("+-+-", None, None, None));
+
+		// Malformed
+		assert_parts("!!!_9999.UUU@()()", ("!!!", Some("9999"), Some("UUU"), Some("()()")));
+		assert_parts("12_123.1234@12345", ("12", Some("123"), Some("1234"), Some("12345")));
+		assert_parts("+-+-@+-+-", ("+-+-", None, None, Some("+-+-")));
+
+		// Wrong order EXPECTED TO BE BROKEN
+		assert_parts("lang.codeset_region@modifier", ("lang.codeset", None, Some("codeset_region"), Some("modifier")));
+		assert_parts("[email protected]_region", ("[email protected]", None, None, Some("modifier.codeset_region")));
+		assert_parts("[email protected]", ("lang", Some("region@modifier"), None, Some("modifier.codeset")));
+		assert_parts("lang.codeset@modifier_region", ("lang.codeset@modifier", None, Some("codeset"), Some("modifier_region")));
+		assert_parts("lang@modifier_region.codeset", ("lang@modifier", Some("region"), None, Some("modifier_region.codeset")));
+
+		// Parts missing
+		assert_parts("_.@", ("", Some(""), Some(""), Some("")));
+		assert_parts("_US.UTF-8@dict", ("", Some("US"), Some("UTF-8"), Some("dict")));
+		assert_parts("ru_.KOI8-R@icase", ("ru", Some(""), Some("KOI8-R"), Some("icase")));
+		assert_parts("fr_FR.@euro", ("fr", Some("FR"), Some(""), Some("euro")));
+		assert_parts("de_DE.ISO-8859-1@", ("de", Some("DE"), Some("ISO-8859-1"), Some("")));
+
+		// Special characters
+		assert_parts("\0", ("\0", None, None, None));
+		assert_parts("\0_\0.\0@\0", ("\0", Some("\0"), Some("\0"), Some("\0")));
+		assert_parts("\0\x01\x02\x03", ("\0\x01\x02\x03", None, None, None));
+		assert_parts("\x03\x02\x01", ("\x03\x02\x01", None, None, None));
+	}
+}