From d4d6f2554a6fa5b79d2bc4d56988c60814401c41 Mon Sep 17 00:00:00 2001 From: Mike Barry Date: Thu, 24 Mar 2022 20:26:25 -0400 Subject: [PATCH 1/3] name:latin improvements --- .../basemap/util/LanguageUtils.java | 27 +++-- .../basemap/util/LanguageUtilsTest.java | 103 +++++++++++++++--- .../planetiler/util/Translations.java | 4 + 3 files changed, 108 insertions(+), 26 deletions(-) diff --git a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java index 2757107e44..142f9e8109 100644 --- a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java +++ b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java @@ -42,6 +42,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE import java.util.HashMap; import java.util.Map; import java.util.Set; +import java.util.function.Predicate; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -54,8 +55,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE */ public class LanguageUtils { - private static final Pattern NONLATIN = Pattern - .compile("[^\\x{0000}-\\x{024f}\\x{1E00}-\\x{1EFF}\\x{0300}-\\x{036f}\\x{0259}]"); + // Name tags that should be eligible for finding a latin name. + // See https://wiki.openstreetmap.org/wiki/Multilingual_names + // and https://github.com/onthegomap/planetiler/issues/86 + private static final Predicate VALID_NAME_TAGS = + Pattern.compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_][a-z]{2,})?(-[a-z]{2})?$", Pattern.CASE_INSENSITIVE) + .asMatchPredicate(); + + // Match strings that only contain latin characters. + // and https://github.com/onthegomap/planetiler/issues/86 + private static final Predicate ONLY_LATIN = Pattern + .compile("^[\\P{IsLetter}[\\p{IsLetter}&&\\p{IsLatin}]]+$") + .asMatchPredicate(); + private static final Pattern LETTER = Pattern.compile("[A-Za-zÀ-ÖØ-öø-ÿĀ-ɏ]+"); private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])"); private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)"); @@ -73,7 +85,7 @@ private static String string(Object obj) { } static boolean containsOnlyLatinCharacters(String string) { - return string != null && !NONLATIN.matcher(string).find(); + return string != null && ONLY_LATIN.test(string); } private static String transliteratedName(Map tags) { @@ -128,7 +140,8 @@ public static Map getNames(Map tags, Translation boolean isLatin = containsOnlyLatinCharacters(name); String latin = isLatin ? name : - Stream.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags)) + Stream + .concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags)) .filter(LanguageUtils::containsOnlyLatinCharacters) .findFirst().orElse(null); if (latin == null && translations != null && translations.getShouldTransliterate()) { @@ -160,12 +173,8 @@ public static Map getNames(Map tags, Translation private static Stream getAllNameTranslationsBesidesEnglishAndGerman(Map tags) { return tags.entrySet().stream() - .filter(e -> { - String key = e.getKey(); - return key.startsWith("name:") && !EN_DE_NAME_KEYS.contains(key); - }) + .filter(e -> !EN_DE_NAME_KEYS.contains(e.getKey()) && VALID_NAME_TAGS.test(e.getKey())) .map(Map.Entry::getValue) .map(LanguageUtils::string); } - } diff --git a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java index 20caacd58e..6b0a18f04d 100644 --- a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java +++ b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java @@ -13,6 +13,7 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.ValueSource; public class LanguageUtilsTest { @@ -59,7 +60,7 @@ public void testSimpleExample() { "é, true", "éś, true", "ɏə, true", - "ɐ, false", + "ɐ, true", "ᵿἀ, false", "Ḁỿ, true", "\u02ff\u0370, false", @@ -95,26 +96,94 @@ public void testRemoveNonLatin(String in, String out) { } @ParameterizedTest - @CsvSource({ - "name, a, true", - "name:en, a, true", - "int_name, a, true", - "name:fr, a, true", - "name:es, a, true", - "name:pt, a, true", - "name:de, a, true", - "name:ar, ِغَّ, false", - "name:it, a, true", - "name:jp, ア, false", - "name:jp-Latn, a, true", - "name:jp_rm, a, true", + @ValueSource(strings = { + // OSM tags that SHOULD be eligible for name:latin feature in the output + "name:en", + "int_name", + "name:fr", + "name:es", + "name:pt", + "name:de", + "name:ar", + "name:it", + "name:ko-Latn", + "name:be-tarask", + // https://wiki.openstreetmap.org/wiki/Multilingual_names#Japan + "name:ja", + "name:ja-Latn", + "name:ja_rm", + "name:ja_kana", + // https://wiki.openstreetmap.org/wiki/Multilingual_names#China + "name:zh-CN", + "name:zh-hant-CN", + "name:zh_pinyin", + "name:zh_zhuyin", + "name:zh-Latn-tongyong", + "name:zh-Latn-pinyin", + "name:zh-Latn-wadegiles", + "name:yue-Latn-jyutping", + // https://wiki.openstreetmap.org/wiki/Multilingual_names#France + "name:fr", + "name:br", + "name:oc", + "name:vls", + "name:frp", + "name:gcf", + "name:gsw", }) - public void testLatinFallbacks(String key, String value, boolean use) { - assertEquals(use ? value : null, LanguageUtils.getNames(Map.of( - key, value + public void testLatinFallbacks(String key) { + assertEquals("a", LanguageUtils.getNames(Map.of( + key, "a" + ), translations).get("name:latin")); + assertNull(LanguageUtils.getNames(Map.of( + key, "ア" + ), translations).get("name:latin")); + assertNull(LanguageUtils.getNames(Map.of( + key, "غ" ), translations).get("name:latin")); } + @ParameterizedTest + @ValueSource(strings = { + // OSM tags that should NOT be eligible for name:latin feature in the output + "name:signed", + "name:prefix", + "name:abbreviation", + "name:source", + "name:full", + "name:adjective", + "name:proposed", + "name:pronunciation", + "name:etymology", + "name:etymology:wikidata", + "name:etymology:wikipedia", + "name:etymology:right", + "name:etymology:left", + "name:genitive", + }) + public void testNoLatinFallback(String key) { + assertSubmap(Map.of( + "name", "Branch Hill–Loveland Road", + "name_en", "Branch Hill–Loveland Road", + "name_de", "Branch Hill–Loveland Road", + "name:latin", "Branch Hill–Loveland Road", + "name_int", "Branch Hill–Loveland Road" + ), LanguageUtils.getNames(Map.of( + "name", "Branch Hill–Loveland Road", + key, "Q22133584;Q843993" + ), translations)); + assertSubmap(Map.of( + "name", "日", + "name_en", "日", + "name_de", "日", + "name:latin", "rì", + "name_int", "rì" + ), LanguageUtils.getNames(Map.of( + "name", "日", + key, "other" // don't use this latin string with invalid name keys + ), translations)); + } + @ParameterizedTest @CsvSource({ "キャンパス, kyanpasu", diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java index 1fb3d3bc24..b367554afb 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java @@ -88,6 +88,10 @@ public Translations setShouldTransliterate(boolean shouldTransliterate) { return this; } + /** Returns true if {@code language} is in the set of language translations to use. */ + public boolean careAboutLanguage(String language) { + return languageSet.contains(language); + } /** A source of name translations. */ public interface TranslationProvider { From 480a35cfcc0e5a2a209d5f0820957bc73b7b65a0 Mon Sep 17 00:00:00 2001 From: Mike Barry Date: Thu, 24 Mar 2022 20:36:18 -0400 Subject: [PATCH 2/3] improve latin letter regex --- .../planetiler/basemap/util/LanguageUtils.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java index 142f9e8109..fdc3a3e916 100644 --- a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java +++ b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java @@ -54,21 +54,22 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * openmaptiles-tools. */ public class LanguageUtils { + // See https://github.com/onthegomap/planetiler/issues/86 // Name tags that should be eligible for finding a latin name. // See https://wiki.openstreetmap.org/wiki/Multilingual_names - // and https://github.com/onthegomap/planetiler/issues/86 private static final Predicate VALID_NAME_TAGS = Pattern.compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_][a-z]{2,})?(-[a-z]{2})?$", Pattern.CASE_INSENSITIVE) .asMatchPredicate(); // Match strings that only contain latin characters. - // and https://github.com/onthegomap/planetiler/issues/86 private static final Predicate ONLY_LATIN = Pattern .compile("^[\\P{IsLetter}[\\p{IsLetter}&&\\p{IsLatin}]]+$") .asMatchPredicate(); - private static final Pattern LETTER = Pattern.compile("[A-Za-zÀ-ÖØ-öø-ÿĀ-ɏ]+"); + // Match only latin letters + private static final Pattern LATIN_LETTER = Pattern.compile("[\\p{IsLetter}&&\\p{IsLatin}]+"); + private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])"); private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)"); private static final Pattern WHITESPACE = Pattern.compile("\\s+"); @@ -96,7 +97,7 @@ static String removeLatinCharacters(String name) { if (name == null) { return null; } - var matcher = LETTER.matcher(name); + var matcher = LATIN_LETTER.matcher(name); if (matcher.find()) { String result = matcher.replaceAll(""); // if the name was " ( Date: Fri, 25 Mar 2022 05:16:54 -0400 Subject: [PATCH 3/3] allow region codes and x-extension's on localized names --- .../com/onthegomap/planetiler/basemap/util/LanguageUtils.java | 3 ++- .../onthegomap/planetiler/basemap/util/LanguageUtilsTest.java | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java index fdc3a3e916..b5f44f0019 100644 --- a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java +++ b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java @@ -59,7 +59,8 @@ public class LanguageUtils { // Name tags that should be eligible for finding a latin name. // See https://wiki.openstreetmap.org/wiki/Multilingual_names private static final Predicate VALID_NAME_TAGS = - Pattern.compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_][a-z]{2,})?(-[a-z]{2})?$", Pattern.CASE_INSENSITIVE) + Pattern + .compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_](x-)?[a-z]{2,})?(-([a-z]{2}|[0-9]{3}))?$", Pattern.CASE_INSENSITIVE) .asMatchPredicate(); // Match strings that only contain latin characters. diff --git a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java index 6b0a18f04d..7b8e7f4dab 100644 --- a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java +++ b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java @@ -99,6 +99,8 @@ public void testRemoveNonLatin(String in, String out) { @ValueSource(strings = { // OSM tags that SHOULD be eligible for name:latin feature in the output "name:en", + "name:en-US", + "name:en-010", "int_name", "name:fr", "name:es", @@ -124,6 +126,7 @@ public void testRemoveNonLatin(String in, String out) { "name:yue-Latn-jyutping", // https://wiki.openstreetmap.org/wiki/Multilingual_names#France "name:fr", + "name:fr-x-gallo", "name:br", "name:oc", "name:vls",