diff --git a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java
index 2757107e44..b5f44f0019 100644
--- a/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java
+++ b/planetiler-basemap/src/main/java/com/onthegomap/planetiler/basemap/util/LanguageUtils.java
@@ -42,6 +42,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
+import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@@ -53,10 +54,23 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* openmaptiles-tools.
*/
public class LanguageUtils {
+ // See https://github.com/onthegomap/planetiler/issues/86
+
+ // Tags eligible for finding a latin name: "name:" + 2-3 letter language, then optional 4-letter script, variant, region.
+ // Matching ignores case. See https://wiki.openstreetmap.org/wiki/Multilingual_names
+ private static final Predicate<String> VALID_NAME_TAGS =
+ Pattern
+ .compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_](x-)?[a-z]{2,})?(-([a-z]{2}|[0-9]{3}))?$", Pattern.CASE_INSENSITIVE)
+ .asMatchPredicate();
+
+ // Match non-empty strings whose letters are all latin; non-letters (digits, spaces, punctuation) are allowed.
+ private static final Predicate<String> ONLY_LATIN = Pattern
+ .compile("^[\\P{IsLetter}[\\p{IsLetter}&&\\p{IsLatin}]]+$")
+ .asMatchPredicate();
+
+ // Match runs of one or more latin letters
+ private static final Pattern LATIN_LETTER = Pattern.compile("[\\p{IsLetter}&&\\p{IsLatin}]+");
- private static final Pattern NONLATIN = Pattern
- .compile("[^\\x{0000}-\\x{024f}\\x{1E00}-\\x{1EFF}\\x{0300}-\\x{036f}\\x{0259}]");
- private static final Pattern LETTER = Pattern.compile("[A-Za-zÀ-ÖØ-öø-ÿĀ-ɏ]+");
private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])");
private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)");
private static final Pattern WHITESPACE = Pattern.compile("\\s+");
@@ -73,7 +87,7 @@ private static String string(Object obj) {
}
static boolean containsOnlyLatinCharacters(String string) {
- return string != null && !NONLATIN.matcher(string).find();
+ return string != null && ONLY_LATIN.test(string); // NOTE(review): null -> false; "" is now false too (ONLY_LATIN needs 1+ chars) whereas the old find()-based check returned true for "" - confirm intended
}
private static String transliteratedName(Map tags) {
@@ -84,7 +98,7 @@ static String removeLatinCharacters(String name) {
if (name == null) {
return null;
}
- var matcher = LETTER.matcher(name);
+ var matcher = LATIN_LETTER.matcher(name);
if (matcher.find()) {
String result = matcher.replaceAll("");
// if the name was " ( getNames(Map tags, Translation
boolean isLatin = containsOnlyLatinCharacters(name);
String latin = isLatin ? name :
- Stream.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags))
+ Stream
+ .concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags))
.filter(LanguageUtils::containsOnlyLatinCharacters)
.findFirst().orElse(null);
if (latin == null && translations != null && translations.getShouldTransliterate()) {
@@ -160,12 +175,8 @@ public static Map getNames(Map tags, Translation
private static Stream getAllNameTranslationsBesidesEnglishAndGerman(Map tags) {
return tags.entrySet().stream()
- .filter(e -> {
- String key = e.getKey();
- return key.startsWith("name:") && !EN_DE_NAME_KEYS.contains(key);
- })
+ .filter(e -> !EN_DE_NAME_KEYS.contains(e.getKey()) && VALID_NAME_TAGS.test(e.getKey())) // skip keys in EN_DE_NAME_KEYS and keep only keys matching VALID_NAME_TAGS (replaces the old plain "name:" prefix check)
.map(Map.Entry::getValue)
.map(LanguageUtils::string);
}
-
}
diff --git a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java
index 20caacd58e..7b8e7f4dab 100644
--- a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java
+++ b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/LanguageUtilsTest.java
@@ -13,6 +13,7 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.ValueSource;
public class LanguageUtilsTest {
@@ -59,7 +60,7 @@ public void testSimpleExample() {
"é, true",
"éś, true",
"ɏə, true",
- "ɐ, false",
+ "ɐ, true",
"ᵿἀ, false",
"Ḁỿ, true",
"\u02ff\u0370, false",
@@ -95,26 +96,97 @@ public void testRemoveNonLatin(String in, String out) {
}
@ParameterizedTest
- @CsvSource({
- "name, a, true",
- "name:en, a, true",
- "int_name, a, true",
- "name:fr, a, true",
- "name:es, a, true",
- "name:pt, a, true",
- "name:de, a, true",
- "name:ar, ِغَّ, false",
- "name:it, a, true",
- "name:jp, ア, false",
- "name:jp-Latn, a, true",
- "name:jp_rm, a, true",
+ @ValueSource(strings = {
+ // OSM tag keys whose value SHOULD be eligible as the source of name:latin in the output
+ "name:en",
+ "name:en-US",
+ "name:en-010",
+ "int_name",
+ "name:fr",
+ "name:es",
+ "name:pt",
+ "name:de",
+ "name:ar",
+ "name:it",
+ "name:ko-Latn",
+ "name:be-tarask",
+ // https://wiki.openstreetmap.org/wiki/Multilingual_names#Japan
+ "name:ja",
+ "name:ja-Latn",
+ "name:ja_rm",
+ "name:ja_kana",
+ // https://wiki.openstreetmap.org/wiki/Multilingual_names#China
+ "name:zh-CN",
+ "name:zh-hant-CN",
+ "name:zh_pinyin",
+ "name:zh_zhuyin",
+ "name:zh-Latn-tongyong",
+ "name:zh-Latn-pinyin",
+ "name:zh-Latn-wadegiles",
+ "name:yue-Latn-jyutping",
+ // https://wiki.openstreetmap.org/wiki/Multilingual_names#France
+ "name:fr", // NOTE(review): duplicates the "name:fr" entry earlier in this list, so that case runs twice
+ "name:fr-x-gallo",
+ "name:br",
+ "name:oc",
+ "name:vls",
+ "name:frp",
+ "name:gcf",
+ "name:gsw",
})
- public void testLatinFallbacks(String key, String value, boolean use) {
- assertEquals(use ? value : null, LanguageUtils.getNames(Map.of(
- key, value
+ public void testLatinFallbacks(String key) {
+ assertEquals("a", LanguageUtils.getNames(Map.of( // a latin value under this key is expected to come back as name:latin
+ key, "a"
+ ), translations).get("name:latin"));
+ assertNull(LanguageUtils.getNames(Map.of( // a non-latin value under this key is expected to yield no name:latin
+ key, "ア"
+ ), translations).get("name:latin"));
+ assertNull(LanguageUtils.getNames(Map.of( // same expectation for a second non-latin value
+ key, "غ"
), translations).get("name:latin"));
}
+ @ParameterizedTest
+ @ValueSource(strings = {
+ // OSM tag keys that start with "name:" but should NOT be eligible as the source of name:latin in the output
+ "name:signed",
+ "name:prefix",
+ "name:abbreviation",
+ "name:source",
+ "name:full",
+ "name:adjective",
+ "name:proposed",
+ "name:pronunciation",
+ "name:etymology",
+ "name:etymology:wikidata",
+ "name:etymology:wikipedia",
+ "name:etymology:right",
+ "name:etymology:left",
+ "name:genitive",
+ })
+ public void testNoLatinFallback(String key) {
+ assertSubmap(Map.of( // case 1: "name" is already latin, so every expected output value is the name itself
+ "name", "Branch Hill–Loveland Road",
+ "name_en", "Branch Hill–Loveland Road",
+ "name_de", "Branch Hill–Loveland Road",
+ "name:latin", "Branch Hill–Loveland Road",
+ "name_int", "Branch Hill–Loveland Road"
+ ), LanguageUtils.getNames(Map.of(
+ "name", "Branch Hill–Loveland Road",
+ key, "Q22133584;Q843993"
+ ), translations));
+ assertSubmap(Map.of( // case 2: "name" is not latin; name:latin is expected to be "rì", not the value stored under the ineligible key
+ "name", "日",
+ "name_en", "日",
+ "name_de", "日",
+ "name:latin", "rì",
+ "name_int", "rì"
+ ), LanguageUtils.getNames(Map.of(
+ "name", "日",
+ key, "other" // latin value under an ineligible key: must not be picked up as name:latin
+ ), translations));
+ }
+
@ParameterizedTest
@CsvSource({
"キャンパス, kyanpasu",
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java
index 1fb3d3bc24..b367554afb 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Translations.java
@@ -88,6 +88,10 @@ public Translations setShouldTransliterate(boolean shouldTransliterate) {
return this;
}
+ /** Returns {@code true} if {@code language} is in {@code languageSet}, the set of language translations to use. */
+ public boolean careAboutLanguage(String language) { // NOTE(review): no caller is visible in this patch - confirm it is used
+ return languageSet.contains(language);
+ }
/** A source of name translations. */
public interface TranslationProvider {