From a4b8694437290e5cfed29349507657598ec75b51 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 5 Jun 2021 20:52:44 +0200 Subject: [PATCH 1/7] add classification terms to the database Adds OSM key and value of the main tag as a special string to the database and makes the string searchable via the collector. This is an incompatible change to the index structure. --- es/index_settings.json | 6 +++++ es/mappings.json | 9 +++++++ src/main/java/de/komoot/photon/Constants.java | 1 + src/main/java/de/komoot/photon/Utils.java | 27 +++++++++++++++++++ .../elasticsearch/DatabaseProperties.java | 2 +- 5 files changed, 44 insertions(+), 1 deletion(-) diff --git a/es/index_settings.json b/es/index_settings.json index 33748ea35..60fea3a52 100644 --- a/es/index_settings.json +++ b/es/index_settings.json @@ -61,6 +61,12 @@ "lowercase", "preserving_word_delimiter"], "tokenizer": "standard" + }, + "search_classification": { + "filter": [ + "lowercase" + ], + "tokenizer": "whitespace" } }, "tokenizer": { diff --git a/es/mappings.json b/es/mappings.json index 396933c7f..441ca1be1 100644 --- a/es/mappings.json +++ b/es/mappings.json @@ -76,6 +76,15 @@ "importance": { "type": "float" }, + "classification": { + "type": "text", + "index": "true", + "analyzer": "keyword", + "search_analyzer": "search_classification", + "copy_to": [ + "collector.default" + ] + }, "name": { "properties": { "alt": { diff --git a/src/main/java/de/komoot/photon/Constants.java b/src/main/java/de/komoot/photon/Constants.java index e87cb79f4..8bcc02a93 100644 --- a/src/main/java/de/komoot/photon/Constants.java +++ b/src/main/java/de/komoot/photon/Constants.java @@ -31,4 +31,5 @@ public class Constants { public static final String OSM_KEY = "osm_key"; public static final String OSM_VALUE = "osm_value"; public static final String OBJECT_TYPE = "object_type"; + public static final String CLASSIFICATION = "classification"; } diff --git a/src/main/java/de/komoot/photon/Utils.java 
b/src/main/java/de/komoot/photon/Utils.java index f099cb0eb..7625492a4 100644 --- a/src/main/java/de/komoot/photon/Utils.java +++ b/src/main/java/de/komoot/photon/Utils.java @@ -33,6 +33,11 @@ public static XContentBuilder convert(PhotonDoc doc, String[] languages, String[ .field(Constants.OBJECT_TYPE, atype == null ? "locality" : atype.getName()) .field(Constants.IMPORTANCE, doc.getImportance()); + String classification = buildClassificationString(doc.getTagKey(), doc.getTagValue()); + if (classification != null) { + builder.field(Constants.CLASSIFICATION, classification); + } + if (doc.getCentroid() != null) { builder.startObject("coordinate") .field("lat", doc.getCentroid().getY()) @@ -200,4 +205,26 @@ public static String stripNonDigits( } return sb.toString(); } + + public static String buildClassificationString(String key, String value) { + if ("place".equals(key) || "building".equals(key)) { + return null; + } + + if ("highway".equals(key) + && ("unclassified".equals(value) || "residential".equals(value))) { + return null; + } + + for (char c : value.toCharArray()) { + if (!(c == '_' + || ((c >= 'a') && (c <= 'z')) + || ((c >= 'A') && (c <= 'Z')) + || ((c >= '0') && (c <= '9')))) { + return null; + } + } + + return "tpfld" + value.replaceAll("_", "").toLowerCase() + "clsfld" + key.replaceAll("_", "").toLowerCase(); + } } diff --git a/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java b/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java index 6cab85113..aa11d251e 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java +++ b/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java @@ -26,7 +26,7 @@ public class DatabaseProperties { * changes in an incompatible way. If it is alredy at the next released * version, increase the dev version. 
*/ - private static final String DATABASE_VERSION = "0.3.4-0"; + private static final String DATABASE_VERSION = "0.3.6-0"; public static final String PROPERTY_DOCUMENT_ID = "DATABASE_PROPERTIES"; private static final String BASE_FIELD = "document_properties"; From 2dfd9684aa96689bdebab226b5b233e244629622 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 5 Jun 2021 21:20:28 +0200 Subject: [PATCH 2/7] add search by classification to query builder --- .../photon/query/PhotonQueryBuilder.java | 6 +- .../query/QueryByClassificationTest.java | 63 +++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 src/test/java/de/komoot/photon/query/QueryByClassificationTest.java diff --git a/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java b/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java index 629d3072d..d3fb3bf78 100644 --- a/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java +++ b/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java @@ -117,6 +117,7 @@ private PhotonQueryBuilder(String query, String language, List languages query4QueryBuilder.must(QueryBuilders.boolQuery() .should(nameNgramQuery) .should(QueryBuilders.matchQuery("housenumber", query).analyzer("standard")) + .should(QueryBuilders.matchQuery("classification", query).boost(0.1f)) .minimumShouldMatch("1")); } @@ -128,8 +129,9 @@ private PhotonQueryBuilder(String query, String language, List languages // Weigh the resulting score by importance. Use a linear scale function that ensures that the weight // never drops to 0 and cancels out the ES score. 
finalQueryWithoutTagFilterBuilder = QueryBuilders.functionScoreQuery(query4QueryBuilder, new FilterFunctionBuilder[]{ - new FilterFunctionBuilder(ScoreFunctionBuilders.linearDecayFunction("importance", "1.0", "0.6")) - }); + new FilterFunctionBuilder(ScoreFunctionBuilders.linearDecayFunction("importance", "1.0", "0.6")), + new FilterFunctionBuilder(QueryBuilders.matchQuery("classification", query), ScoreFunctionBuilders.weightFactorFunction(0.1f)) + }).scoreMode(ScoreMode.SUM); // Filter for later: records that have a housenumber and no name must only appear when the housenumber matches. queryBuilderForTopLevelFilter = QueryBuilders.boolQuery() diff --git a/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java new file mode 100644 index 000000000..603d89e50 --- /dev/null +++ b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java @@ -0,0 +1,63 @@ +package de.komoot.photon.query; + +import com.google.common.collect.ImmutableMap; +import de.komoot.photon.*; +import org.elasticsearch.action.get.GetResponse; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.SearchType; +import org.elasticsearch.index.query.QueryBuilder; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Collections; + +import static org.junit.Assert.*; + + +public class QueryByClassificationTest extends ESBaseTester { + private int testDocId = 10000; + + @Before + public void setup() throws IOException { + setUpES(); + } + + private PhotonDoc createDoc(String key, String value) { + ImmutableMap nameMap = ImmutableMap.of("name", "curliflower"); + + ++testDocId; + return new PhotonDoc(testDocId, "W", testDocId, key, value).names(nameMap); + } + + private SearchResponse search(String query) { + QueryBuilder builder = PhotonQueryBuilder.builder(query, "en", Collections.singletonList("en"), false).buildQuery(); + 
return getClient().prepareSearch("photon") + .setSearchType(SearchType.QUERY_THEN_FETCH) + .setQuery(builder) + .execute() + .actionGet(); + } + + @Test + public void testQueryByClassificationString() { + Importer instance = makeImporter(); + instance.add(createDoc("amenity", "restaurant")); + instance.finish(); + refresh(); + + String class_term = Utils.buildClassificationString("amenity", "restaurant"); + + assertNotNull(class_term); + + GetResponse response = getById(testDocId); + + String classification = (String) response.getSource().get(Constants.CLASSIFICATION); + assertEquals(classification, class_term); + + SearchResponse result = search(class_term + " curli"); + + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } +} From a49017239f027c7414108203aacc2fb9f2d19326 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 8 Jun 2021 13:22:40 +0200 Subject: [PATCH 3/7] add loading of synonym file --- src/main/java/de/komoot/photon/App.java | 2 +- .../de/komoot/photon/CommandLineArgs.java | 3 + .../photon/elasticsearch/IndexSettings.java | 116 ++++++++++++++++++ .../komoot/photon/elasticsearch/Server.java | 10 +- .../query/QueryByClassificationTest.java | 65 +++++++++- 5 files changed, 183 insertions(+), 13 deletions(-) diff --git a/src/main/java/de/komoot/photon/App.java b/src/main/java/de/komoot/photon/App.java index 3ba382da4..33868a6c3 100644 --- a/src/main/java/de/komoot/photon/App.java +++ b/src/main/java/de/komoot/photon/App.java @@ -65,7 +65,7 @@ public static void main(String[] rawArgs) throws Exception { // Working on an existing installation. // Update the index settings in case there are any changes. 
- esServer.updateIndexSettings(); + esServer.updateIndexSettings(args.getSynonymFile()); esClient.admin().cluster().prepareHealth().setWaitForYellowStatus().get(); if (args.isNominatimUpdate()) { diff --git a/src/main/java/de/komoot/photon/CommandLineArgs.java b/src/main/java/de/komoot/photon/CommandLineArgs.java index d7a7bb6f5..2c7eb51be 100644 --- a/src/main/java/de/komoot/photon/CommandLineArgs.java +++ b/src/main/java/de/komoot/photon/CommandLineArgs.java @@ -35,6 +35,9 @@ public class CommandLineArgs { @Parameter(names = "-extra-tags", description = "additional tags to save for each place") private String extraTags = ""; + @Parameter(names = "-synonym-file", description = "file with synonym and classification terms") + private String synonymFile = null; + @Parameter(names = "-json", description = "import nominatim database and dump it to a json like files in (useful for developing)") private String jsonDump = null; diff --git a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java index 84f1a65f8..3ace2d0a8 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java +++ b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java @@ -1,11 +1,17 @@ package de.komoot.photon.elasticsearch; +import de.komoot.photon.Utils; import org.elasticsearch.client.Client; import org.elasticsearch.common.xcontent.XContentType; +import org.json.JSONArray; import org.json.JSONObject; import org.json.JSONTokener; +import java.io.FileReader; +import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; /** * Encapsulates the ES index settings for the photon index. Adds functions to @@ -41,6 +47,83 @@ public IndexSettings setShards(Integer numShards) { return this; } + + /** + * Add query-time synonyms and classification terms from a file. + * + * Synonyms need to be supplied in a simple text file with one synonym entry per line. 
+ * Synonyms need to be comma-separated. Only single-term synonyms are supported at this + * time. Spaces in the synonym list are considered a syntax error. + * + * @param synonymFile File containing the synonyms. + * + * @return This object for chaining. + */ + public IndexSettings setSynonymFile(String synonymFile) throws IOException { + if (synonymFile == null) { + return this; + } + + JSONObject synonymConfig = new JSONObject(new JSONTokener(new FileReader(synonymFile))); + + setSearchTimeSynonyms(synonymConfig.optJSONArray("search_synonyms")); + setClassificationTerms(synonymConfig.optJSONArray("classification_terms")); + + return this; + } + + public IndexSettings setSearchTimeSynonyms(JSONArray synonyms) { + if (synonyms != null) { + insertSynonymFilter("extra_synonyms", synonyms); + } + + return this; + } + + public IndexSettings setClassificationTerms(JSONArray terms) { + if (terms == null) { + return this; + } + + JSONArray synonyms = new JSONArray(); + for (int i = 0; i < terms.length(); i++) { + JSONObject descr = terms.getJSONObject(i); + + String classString = Utils.buildClassificationString(descr.getString("key"), descr.getString("value")).toLowerCase(); + + if (classString != null) { + JSONArray jsonTerms = descr.getJSONArray("terms"); + List termList = new ArrayList<>(); + for (int j = 0; j < jsonTerms.length(); j++) { + String term = jsonTerms.getString(j).toLowerCase(); + if (term.length() > 1) { + // Each term expands either to itself or the classification term. + synonyms.put(term + " => " + term + "," + classString); + } + } + } + } + + insertSynonymFilter("classification_synonyms", synonyms); + + return this; + } + + private void insertSynonymFilter(String filterName, JSONArray synonyms) { + if (!synonyms.isEmpty()) { + // Create a filter for the synonyms. 
+ JSONObject filters = (JSONObject) settings.optQuery("/analysis/filter"); + if (filters == null) { + throw new RuntimeException("Analyser update: cannot find filter definition"); + } + filters.put(filterName, new JSONObject().put("type", "synonym").put("synonyms", synonyms)); + + // add synonym filter to the search analyzers + insertJsonArrayAfter("/analysis/analyzer/search_ngram", "filter", "lowercase", filterName); + insertJsonArrayAfter("/analysis/analyzer/search_raw", "filter", "lowercase", filterName); + } + } + /** * Create a new index using the current index settings. * @@ -65,4 +148,37 @@ public void updateIndex(Client client, String indexName) { client.admin().indices().prepareUpdateSettings(PhotonIndex.NAME).setSettings(settings.toString(), XContentType.JSON).execute().actionGet(); client.admin().indices().prepareOpen(PhotonIndex.NAME).execute().actionGet(); } + + /** + * Insert the given value into the array after the string given by positionString. + * If the position string is not found, throws a runtime error. + * + * @param jsonPointer Path description of the array to insert into. + * @param positionString Marker string after which to insert. + * @param value Value to insert. + */ + private void insertJsonArrayAfter(String jsonPointer, String field, String positionString, String value) { + JSONObject parent = (JSONObject) settings.optQuery(jsonPointer); + JSONArray array = parent == null ? null : parent.optJSONArray(field); + if (array == null) { + throw new RuntimeException("Analyser update: cannot find JSON array at" + jsonPointer); + } + + // We can't just insert items, so build a new array instead. 
+ JSONArray new_array = new JSONArray(); + boolean done = false; + for (int i = 0; i < array.length(); i++) { + new_array.put(array.get(i)); + if (!done && positionString.equals(array.getString(i))) { + new_array.put(value); + done = true; + } + } + + if (!done) { + throw new RuntimeException("Analyser update: cannot find position string " + positionString); + } + + parent.put(field, new_array); + } } diff --git a/src/main/java/de/komoot/photon/elasticsearch/Server.java b/src/main/java/de/komoot/photon/elasticsearch/Server.java index 9cdf45474..d281f4ab4 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/Server.java +++ b/src/main/java/de/komoot/photon/elasticsearch/Server.java @@ -1,13 +1,11 @@ package de.komoot.photon.elasticsearch; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.SystemUtils; import org.elasticsearch.client.Client; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.transport.InetSocketTransportAddress; -import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.IndexNotFoundException; import org.elasticsearch.node.InternalSettingsPreparer; import org.elasticsearch.node.Node; @@ -15,16 +13,12 @@ import org.elasticsearch.plugins.Plugin; import org.elasticsearch.transport.Netty4Plugin; import org.elasticsearch.transport.client.PreBuiltTransportClient; -import org.json.JSONArray; -import org.json.JSONObject; import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.net.InetSocketAddress; import java.net.URISyntaxException; import java.net.URL; -import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.StandardCopyOption; import java.util.Arrays; @@ -178,14 +172,14 @@ public DatabaseProperties recreateIndex(String[] languages) throws IOException { return dbProperties; } - public void updateIndexSettings() { + public 
void updateIndexSettings(String synonymFile) throws IOException { // Load the settings from the database to make sure it is at the right // version. If the version is wrong, we should not be messing with the // index. DatabaseProperties dbProperties = new DatabaseProperties(); dbProperties.loadFromDatabase(getClient()); - loadIndexSettings().updateIndex(getClient(), PhotonIndex.NAME); + loadIndexSettings().setSynonymFile(synonymFile).updateIndex(getClient(), PhotonIndex.NAME); // Sanity check: legacy databases don't save the languages, so there is no way to update // the mappings consistently. diff --git a/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java index 603d89e50..d91cb6e2a 100644 --- a/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java +++ b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java @@ -2,10 +2,14 @@ import com.google.common.collect.ImmutableMap; import de.komoot.photon.*; +import de.komoot.photon.elasticsearch.IndexSettings; +import de.komoot.photon.elasticsearch.PhotonIndex; import org.elasticsearch.action.get.GetResponse; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.search.SearchType; import org.elasticsearch.index.query.QueryBuilder; +import org.json.JSONArray; +import org.json.JSONObject; import org.junit.Before; import org.junit.Test; @@ -23,8 +27,8 @@ public void setup() throws IOException { setUpES(); } - private PhotonDoc createDoc(String key, String value) { - ImmutableMap nameMap = ImmutableMap.of("name", "curliflower"); + private PhotonDoc createDoc(String key, String value, String name) { + ImmutableMap nameMap = ImmutableMap.of("name", name); ++testDocId; return new PhotonDoc(testDocId, "W", testDocId, key, value).names(nameMap); @@ -42,7 +46,7 @@ private SearchResponse search(String query) { @Test public void testQueryByClassificationString() { Importer instance = 
makeImporter(); - instance.add(createDoc("amenity", "restaurant")); + instance.add(createDoc("amenity", "restaurant", "curliflower")); instance.finish(); refresh(); @@ -51,7 +55,6 @@ public void testQueryByClassificationString() { assertNotNull(class_term); GetResponse response = getById(testDocId); - String classification = (String) response.getSource().get(Constants.CLASSIFICATION); assertEquals(classification, class_term); @@ -60,4 +63,58 @@ public void testQueryByClassificationString() { assertTrue(result.getHits().getTotalHits() > 0); assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); } + + @Test + public void testQueryByClassificationSynonym() { + Importer instance = makeImporter(); + instance.add(createDoc("amenity", "restaurant", "curliflower")); + instance.finish(); + refresh(); + + JSONArray terms = new JSONArray() + .put(new JSONObject() + .put("key", "amenity") + .put("value", "restaurant") + .put("terms", new JSONArray().put("pub").put("kneipe")) + ); + new IndexSettings().setClassificationTerms(terms).updateIndex(getClient(), PhotonIndex.NAME); + getClient().admin().cluster().prepareHealth().setWaitForYellowStatus().get(); + + SearchResponse result = search("pub curli"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + + + result = search("curliflower kneipe"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } + + + @Test + public void testSynonymDoNotInterfereWithWords() { + Importer instance = makeImporter(); + instance.add(createDoc("amenity", "restaurant", "airport")); + instance.add(createDoc("aeroway", "terminal", "Houston")); + instance.finish(); + refresh(); + + JSONArray terms = new JSONArray() + .put(new JSONObject() + .put("key", "aeroway") + .put("value", "terminal") + .put("terms", new JSONArray().put("airport")) + ); + new 
IndexSettings().setClassificationTerms(terms).updateIndex(getClient(), PhotonIndex.NAME); + getClient().admin().cluster().prepareHealth().setWaitForYellowStatus().get(); + + SearchResponse result = search("airport"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId - 1), result.getHits().getHits()[0].getId()); + + + result = search("airport houston"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } } From e6343cbf9760b8efb25f15fa2bb80e046ff5c29d Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 11 Jun 2021 17:05:40 +0200 Subject: [PATCH 4/7] support using the same term for different key/value pairs --- .../photon/elasticsearch/IndexSettings.java | 18 ++++++---- .../query/QueryByClassificationTest.java | 34 +++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java index 3ace2d0a8..122455624 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java +++ b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java @@ -10,8 +10,7 @@ import java.io.FileReader; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * Encapsulates the ES index settings for the photon index. Adds functions to @@ -85,7 +84,8 @@ public IndexSettings setClassificationTerms(JSONArray terms) { return this; } - JSONArray synonyms = new JSONArray(); + // Collect for each term in the list the possible classification expansions. 
+ Map> collector = new HashMap<>(); for (int i = 0; i < terms.length(); i++) { JSONObject descr = terms.getJSONObject(i); @@ -93,17 +93,21 @@ public IndexSettings setClassificationTerms(JSONArray terms) { if (classString != null) { JSONArray jsonTerms = descr.getJSONArray("terms"); - List termList = new ArrayList<>(); for (int j = 0; j < jsonTerms.length(); j++) { - String term = jsonTerms.getString(j).toLowerCase(); + String term = jsonTerms.getString(j).toLowerCase().trim(); + if (term.length() > 1) { - // Each term expands either to itself or the classification term. - synonyms.put(term + " => " + term + "," + classString); + collector.computeIfAbsent(term, k -> new HashSet<>()).add(classString); } } } } + // Create the final list of synonyms. A term can expand to any classificator or not at all. + JSONArray synonyms = new JSONArray(); + collector.forEach((term, classificators) -> + synonyms.put(term + " => " + term + "," + String.join(",", classificators))); + insertSynonymFilter("classification_synonyms", synonyms); return this; diff --git a/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java index d91cb6e2a..609082aee 100644 --- a/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java +++ b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java @@ -117,4 +117,38 @@ public void testSynonymDoNotInterfereWithWords() { assertTrue(result.getHits().getTotalHits() > 0); assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); } + + @Test + public void testSameSynonymForDifferentTags() { + Importer instance = makeImporter(); + instance.add(createDoc("railway", "halt", "Newtown")); + instance.add(createDoc("railway", "station", "King's Cross")); + instance.finish(); + refresh(); + + JSONArray terms = new JSONArray() + .put(new JSONObject() + .put("key", "railway") + .put("value", "station") + .put("terms", new 
JSONArray().put("Station")) + ).put(new JSONObject() + .put("key", "railway") + .put("value", "halt") + .put("terms", new JSONArray().put("Station").put("Stop")) + ); + new IndexSettings().setClassificationTerms(terms).updateIndex(getClient(), PhotonIndex.NAME); + getClient().admin().cluster().prepareHealth().setWaitForYellowStatus().get(); + + SearchResponse result = search("Station newtown"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId - 1), result.getHits().getHits()[0].getId()); + + result = search("newtown stop"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId - 1), result.getHits().getHits()[0].getId()); + + result = search("king's cross Station"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } } From af16f48ddb7a6bddd87745c938a6d3629c6ddf90 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 11 Jun 2021 17:42:04 +0200 Subject: [PATCH 5/7] do not allow multi-word classification terms Multi-word synonyms are not supported properly in ES5. They cause wrong word boundary indexes. 
--- .../java/de/komoot/photon/elasticsearch/IndexSettings.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java index 122455624..1d55c853f 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java +++ b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java @@ -95,6 +95,9 @@ public IndexSettings setClassificationTerms(JSONArray terms) { JSONArray jsonTerms = descr.getJSONArray("terms"); for (int j = 0; j < jsonTerms.length(); j++) { String term = jsonTerms.getString(j).toLowerCase().trim(); + if (term.indexOf(' ') >= 0) { + throw new RuntimeException("Syntax error in synonym file: only single word classification terms allowed."); + } if (term.length() > 1) { collector.computeIfAbsent(term, k -> new HashSet<>()).add(classString); From c273422b06e94831c8f299cb07cf9d95e2c198bd Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 14 Jun 2021 09:02:00 +0200 Subject: [PATCH 6/7] add documentation for synonym feature --- docs/synonyms.md | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 docs/synonyms.md diff --git a/docs/synonyms.md b/docs/synonyms.md new file mode 100644 index 000000000..0b7b805df --- /dev/null +++ b/docs/synonyms.md @@ -0,0 +1,83 @@ +# Using Synonyms and Classification Terms + +Photon has built-in support for using custom query-time synonyms and +special phrases for searching a place by its type. This document explains +how to configure this feature. + +## Configuration + +Synonyms and classification terms are configured with a JSON file which can +be added to a Photon server instance using the command line parameter +`-synonym-file`. Synonyms are a run-time feature. Handing in a synonym list +at import time has no effect. 
The list of synonyms in use can simply be +changed by restarting the Photon server with a different synonym list (or +with none at all, if you want to completely disable the feature again). + +Here is a simple example of a synonym configuration file: + +``` +{ + "search_synonyms": [ + "first,1st", + "second,2nd" + ], + "classification_terms": [ + { + "key": "aeroway", + "value": "aerodrome", + "terms": ["airport", "airfield"] + }, + { + "key": "railway", + "value": "station", + "terms": ["station"] + } + ] +} +``` + +The file has two main sections: `search_synonyms` allows for simple synonym +replacements in the query. `classification_terms` defines descriptive terms +for an OSM key/value pair. + +## Synonyms + +The `search_synonyms` section must contain a list of synonym replacements. +Each entry contains a comma-separated list of terms that may be replaced with each +other in the query. Only single-word terms are allowed. That means the terms +must contain neither spaces nor hyphens or the like.[^1] + +[^1]: This is a restriction of ElasticSearch 5. Synonym replacement does not + create correct term positions when multi-word synonyms are involved. + +## Classification Terms + +The second section `classification_terms` defines a list of OSM key/value +pairs with their descriptive terms. `place` and `building` may not be used as +keys. Neither will `highway=residential` nor `highway=unclassified` work. +There may be multiple entries for the same key/value pair (for example, +if you have extra entries for each supported language). + +The classification terms can help improve search when the type of an object +is used in the query but does not appear in the name. For example, with the +configuration given above, a query of "Berlin Station" will find a railway +station which in OpenStreetMap has the name "Berlin" and also one with +the name "Berlin Hauptbahnhof". + +Classification terms do not enable searching for objects of a certain type. 
+"Station London" will not get you all railway stations in London but a +railway station _named_ London. + +## Usage Advice + +Use synonyms and classification terms sparingly and only if you can be +reasonably sure that they will target the intended part of the address. +Short or frequent terms can have unexpected side-effects and worsen the +search results. For example, it might sound like a good idea to use synonyms +to handle the abbreviation from 'Saint' to 'St'. The problem here is that +'St' is also used as an abbreviation for 'Street'. So all searches that +involve a 'Street' will suddenly also search for places containing 'Saint'. + +Do not create synonyms for terms that are used as classification terms. +Photon will not complain but again there might be unintended side effects. + From 0ac89a56899902b3a947695166c9c3820b6b4ec2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 25 Jun 2021 16:06:50 +0200 Subject: [PATCH 7/7] also need synonyms for classification index The index only works if the classification search terms are converted to the special class/type terms at search time. --- src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java index 1d55c853f..2135fe963 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java +++ b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java @@ -112,6 +112,7 @@ public IndexSettings setClassificationTerms(JSONArray terms) { synonyms.put(term + " => " + term + "," + String.join(",", classificators))); insertSynonymFilter("classification_synonyms", synonyms); + insertJsonArrayAfter("/analysis/analyzer/search_classification", "filter", "lowercase", "classification_synonyms"); return this; }