Merge pull request #585 from lonvia/classification-terms-II

Synonym list + classification terms
komoot · Jan 3, 2022 · 32ad992 · 32ad992
2 parents acf393a + 0ac89a5
commit 32ad992
Show file tree

Hide file tree

Showing 12 changed files with 415 additions and 12 deletions.
diff --git a/docs/synonyms.md b/docs/synonyms.md
@@ -0,0 +1,83 @@
+# Using Synonyms and Classification Terms
+
+Photon has built-in support for using custom query-time synonyms and
+special phrases for searching a place by its type. This document explains
+how to configure this feature.
+
+## Configuration
+
+Synonyms and classification terms are configured with a JSON file which can
+be added to a Photon server instance using the command line parameter
+`-synonym-file`. Synonyms are a run-time feature. Handing in a synonym list
+at import time has no effect. The list of synonyms in use can simply be
+changed by restarting the Photon server with a different synonym list (or
+without one, if you want to completely disable the feature again).
+
+Here is a simple example of a synonym configuration file:
+
+```
+{
+  "search_synonyms": [
+    "first,1st",
+    "second,2nd"
+  ],
+  "classification_terms": [
+    {
+      "key": "aeroway",
+      "value": "aerodrome",
+      "terms": ["airport", "airfield"]
+    },
+    {
+      "key": "railway",
+      "value": "station",
+      "terms": ["station"]
+    }
+  ]
+}
+```
+
+The file has two main sections: `search_synonyms` allows for simple synonym
+replacements in the query. `classification_terms` defines descriptive terms
+for an OSM key/value pair.
+
+## Synonyms
+
+The `search_synonyms` section must contain a list of synonym replacements.
+Each entry contains a comma-separated list of terms that may be replaced with
+each other in the query. Only single-word terms are allowed. That means the
+terms must neither contain spaces nor hyphens or the like.[^1]
+
+[^1]: This is a restriction of ElasticSearch 5. Synonym replacement does not
+      create correct term positions when multi-word synonyms are involved.
+
+## Classification Terms
+
+The second section `classification_terms` defines a list of OSM key/value
+pairs with their descriptive terms. `place` and `building` may not be used as
+keys. Neither will `highway=residential` nor `highway=unclassified` work.
+There may be multiple entries for the same key/value pair (for example,
+if you have extra entries for each supported language).
+
+The classification terms can help improve search when the type of an object
+is used in the query but does not appear in the name. For example, with the
+configuration given above a query of "Berlin Station" will find a railway
+station which in OpenStreetMap has the name "Berlin" and also one with
+the name "Berlin Hauptbahnhof".
+
+Classification terms do not enable searching for objects of a certain type.
+"Station London" will not get you all railway stations in London but a
+railway station _named_ London.
+
+## Usage Advice
+
+Use synonyms and classification terms sparingly and only if you can be
+reasonably sure that they will target the intended part of the address.
+Short or frequent terms can have unexpected side-effects and worsen the
+search results. For example, it might sound like a good idea to use synonyms
+to handle the abbreviation from 'Saint' to 'St'. The problem here is that
+'St' is also used as an abbreviation for 'Street'. So all searches that
+involve a 'Street' will suddenly also search for places containing 'Saint'.
+
+Do not create synonyms for terms that are used as classification terms.
+Photon will not complain but again there might be unintended side effects.
+
diff --git a/es/index_settings.json b/es/index_settings.json
@@ -61,6 +61,12 @@
 					"lowercase",
 					"preserving_word_delimiter"],
 				"tokenizer": "standard" 
+			},
+			"search_classification": {
+				"filter": [
+					"lowercase"
+				],
+				"tokenizer": "whitespace"
 			}
 		},
 		"tokenizer": {

diff --git a/es/mappings.json b/es/mappings.json
@@ -76,6 +76,15 @@
 			"importance": {
 				"type": "float"
 			},
+			"classification": {
+				"type": "text",
+				"index": "true",
+				"analyzer": "keyword",
+				"search_analyzer": "search_classification",
+				"copy_to": [
+					"collector.default"
+				]
+			},
 			"name": {
 				"properties": {
 					"alt": {

diff --git a/src/main/java/de/komoot/photon/App.java b/src/main/java/de/komoot/photon/App.java
@@ -65,7 +65,7 @@ public static void main(String[] rawArgs) throws Exception {
 
             // Working on an existing installation.
             // Update the index settings in case there are any changes.
-            esServer.updateIndexSettings();
+            esServer.updateIndexSettings(args.getSynonymFile());
             esClient.admin().cluster().prepareHealth().setWaitForYellowStatus().get();
 
             if (args.isNominatimUpdate()) {

diff --git a/src/main/java/de/komoot/photon/CommandLineArgs.java b/src/main/java/de/komoot/photon/CommandLineArgs.java
@@ -35,6 +35,9 @@ public class CommandLineArgs {
     @Parameter(names = "-extra-tags", description = "additional tags to save for each place")
     private String extraTags = "";
 
+    @Parameter(names = "-synonym-file", description = "file with synonym and classification terms")
+    private String synonymFile = null;
+
     @Parameter(names = "-json", description = "import nominatim database and dump it to a json like files in (useful for developing)")
     private String jsonDump = null;
 

diff --git a/src/main/java/de/komoot/photon/Constants.java b/src/main/java/de/komoot/photon/Constants.java
@@ -31,4 +31,5 @@ public class Constants {
     public static final String OSM_KEY = "osm_key";
     public static final String OSM_VALUE = "osm_value";
     public static final String OBJECT_TYPE = "object_type";
+    public static final String CLASSIFICATION = "classification";
 }
diff --git a/src/main/java/de/komoot/photon/Utils.java b/src/main/java/de/komoot/photon/Utils.java
@@ -33,6 +33,11 @@ public static XContentBuilder convert(PhotonDoc doc, String[] languages, String[
                 .field(Constants.OBJECT_TYPE, atype == null ? "locality" : atype.getName())
                 .field(Constants.IMPORTANCE, doc.getImportance());
 
+        String classification = buildClassificationString(doc.getTagKey(), doc.getTagValue());
+        if (classification != null) {
+            builder.field(Constants.CLASSIFICATION, classification);
+        }
+
         if (doc.getCentroid() != null) {
             builder.startObject("coordinate")
                     .field("lat", doc.getCentroid().getY())
@@ -200,4 +205,26 @@ public static String stripNonDigits(
         }
         return sb.toString();
     }
+
+    public static String buildClassificationString(String key, String value) {
+        if ("place".equals(key) || "building".equals(key)) {
+            return null;
+        }
+
+        if ("highway".equals(key)
+            && ("unclassified".equals(value) || "residential".equals(value))) {
+            return null;
+        }
+
+        for (char c : value.toCharArray()) {
+            if (!(c == '_'
+                  || ((c >= 'a') && (c <= 'z'))
+                  || ((c >= 'A') && (c <= 'Z'))
+                  || ((c >= '0') && (c <= '9')))) {
+                return null;
+            }
+        }
+
+        return "tpfld" + value.replaceAll("_", "").toLowerCase() + "clsfld" + key.replaceAll("_", "").toLowerCase();
+    }
 }
diff --git a/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java b/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java
@@ -26,7 +26,7 @@ public class DatabaseProperties {
      * changes in an incompatible way. If it is already at the next released
      * version, increase the dev version.
      */
-    private static final String DATABASE_VERSION = "0.3.4-0";
+    private static final String DATABASE_VERSION = "0.3.6-0";
     public static final String PROPERTY_DOCUMENT_ID = "DATABASE_PROPERTIES";
 
     private static final String BASE_FIELD = "document_properties";

diff --git a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java
@@ -1,11 +1,16 @@
 package de.komoot.photon.elasticsearch;
 
+import de.komoot.photon.Utils;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.xcontent.XContentType;
+import org.json.JSONArray;
 import org.json.JSONObject;
 import org.json.JSONTokener;
 
+import java.io.FileReader;
+import java.io.IOException;
 import java.io.InputStream;
+import java.util.*;
 
 /**
  * Encapsulates the ES index settings for the photon index. Adds functions to
@@ -41,6 +46,92 @@ public IndexSettings setShards(Integer numShards) {
         return this;
     }
 
+
+    /**
+     * Add query-time synonyms and classification terms from a file.
+     *
+     * The file must contain a single JSON object. Its optional array
+     * "search_synonyms" is handed to {@link #setSearchTimeSynonyms(JSONArray)},
+     * its optional array "classification_terms" is handed to
+     * {@link #setClassificationTerms(JSONArray)}.
+     *
+     * @param synonymFile Name of the JSON file with the synonym configuration.
+     *                    When null, the settings are left untouched.
+     *
+     * @return This object for chaining.
+     *
+     * @throws IOException if the file cannot be opened or closed.
+     */
+    public IndexSettings setSynonymFile(String synonymFile) throws IOException {
+        if (synonymFile == null) {
+            return this;
+        }
+
+        JSONObject synonymConfig;
+        // Close the reader as soon as parsing is done so the file handle is not leaked.
+        try (FileReader reader = new FileReader(synonymFile)) {
+            synonymConfig = new JSONObject(new JSONTokener(reader));
+        }
+
+        setSearchTimeSynonyms(synonymConfig.optJSONArray("search_synonyms"));
+        setClassificationTerms(synonymConfig.optJSONArray("classification_terms"));
+
+        return this;
+    }
+
+    /**
+     * Install the given synonym entries as the query-time filter "extra_synonyms".
+     *
+     * @param synonyms List of synonym entries, may be null in which case
+     *                 nothing is changed.
+     *
+     * @return This object for chaining.
+     */
+    public IndexSettings setSearchTimeSynonyms(JSONArray synonyms) {
+        if (synonyms == null) {
+            return this;
+        }
+
+        insertSynonymFilter("extra_synonyms", synonyms);
+        return this;
+    }
+
+    /**
+     * Add classification terms as query-time synonyms.
+     *
+     * Each entry of the list must be a JSON object with the string members
+     * "key" and "value" and an array member "terms". At query time every term
+     * is expanded to itself plus the classification tokens of all key/value
+     * pairs it is listed for.
+     *
+     * Entries whose key/value pair has no classification token are skipped,
+     * as are terms with fewer than two characters. When no usable term is
+     * left, the settings are not changed.
+     *
+     * @param terms List of classification term descriptions, may be null in
+     *              which case nothing is changed.
+     *
+     * @return This object for chaining.
+     *
+     * @throws RuntimeException if a term contains a space.
+     */
+    public IndexSettings setClassificationTerms(JSONArray terms) {
+        if (terms == null) {
+            return this;
+        }
+
+        // Collect for each term in the list the possible classification expansions.
+        // Sorted containers keep the generated settings identical between runs.
+        Map<String, Set<String>> collector = new TreeMap<>();
+        for (int i = 0; i < terms.length(); i++) {
+            JSONObject descr = terms.getJSONObject(i);
+
+            // The result is null for pairs without a classification token
+            // (e.g. place=*), so it must be checked before it is used.
+            String classString = Utils.buildClassificationString(descr.getString("key"), descr.getString("value"));
+            if (classString == null) {
+                continue;
+            }
+
+            JSONArray jsonTerms = descr.getJSONArray("terms");
+            for (int j = 0; j < jsonTerms.length(); j++) {
+                String term = jsonTerms.getString(j).toLowerCase(Locale.ROOT).trim();
+                if (term.indexOf(' ') >= 0) {
+                    throw new RuntimeException("Syntax error in synonym file: only single word classification terms allowed.");
+                }
+
+                if (term.length() > 1) {
+                    collector.computeIfAbsent(term, k -> new TreeSet<>()).add(classString);
+                }
+            }
+        }
+
+        // Without any usable term no filter gets defined. The analyzer must not
+        // reference "classification_synonyms" in that case.
+        if (collector.isEmpty()) {
+            return this;
+        }
+
+        // Create the final list of synonyms. A term can expand to any classificator or not at all.
+        JSONArray synonyms = new JSONArray();
+        collector.forEach((term, classificators) ->
+            synonyms.put(term + " => " + term + "," + String.join(",", classificators)));
+
+        insertSynonymFilter("classification_synonyms", synonyms);
+        insertJsonArrayAfter("/analysis/analyzer/search_classification", "filter", "lowercase", "classification_synonyms");
+
+        return this;
+    }
+
+    /**
+     * Define a filter of type "synonym" in the settings and add it to the
+     * search analyzers "search_ngram" and "search_raw".
+     *
+     * Nothing happens when the synonym list is empty.
+     *
+     * @param filterName Name under which the filter is stored in "/analysis/filter".
+     * @param synonyms   Synonym entries for the filter, must not be null.
+     */
+    private void insertSynonymFilter(String filterName, JSONArray synonyms) {
+        if (synonyms.isEmpty()) {
+            return;
+        }
+
+        // Define the filter itself.
+        JSONObject filters = (JSONObject) settings.optQuery("/analysis/filter");
+        if (filters == null) {
+            throw new RuntimeException("Analyser update: cannot find filter definition");
+        }
+        JSONObject synonymFilter = new JSONObject();
+        synonymFilter.put("type", "synonym");
+        synonymFilter.put("synonyms", synonyms);
+        filters.put(filterName, synonymFilter);
+
+        // Hook the new filter into the search analyzers, right after lowercasing.
+        String[] analyzers = {"/analysis/analyzer/search_ngram", "/analysis/analyzer/search_raw"};
+        for (String analyzer : analyzers) {
+            insertJsonArrayAfter(analyzer, "filter", "lowercase", filterName);
+        }
+    }
+
     /**
      * Create a new index using the current index settings.
      *
@@ -65,4 +156,37 @@ public void updateIndex(Client client, String indexName) {
         client.admin().indices().prepareUpdateSettings(PhotonIndex.NAME).setSettings(settings.toString(), XContentType.JSON).execute().actionGet();
         client.admin().indices().prepareOpen(PhotonIndex.NAME).execute().actionGet();
     }
+
+    /**
+     * Insert the given value into a JSON array after the string given by positionString.
+     *
+     * The array is looked up as member {@code field} of the object that
+     * {@code jsonPointer} points to in the settings. The value is inserted
+     * after the first occurrence of the position string only.
+     *
+     * @param jsonPointer    Path description of the object containing the array.
+     * @param field          Name of the array within that object.
+     * @param positionString Marker string after which to insert.
+     * @param value          Value to insert.
+     *
+     * @throws RuntimeException if the array or the position string cannot be found.
+     */
+    private void insertJsonArrayAfter(String jsonPointer, String field, String positionString, String value) {
+        JSONObject parent = (JSONObject) settings.optQuery(jsonPointer);
+        JSONArray array = parent == null ? null : parent.optJSONArray(field);
+        if (array == null) {
+            throw new RuntimeException("Analyser update: cannot find JSON array at " + jsonPointer);
+        }
+
+        // We can't just insert items, so build a new array instead.
+        JSONArray newArray = new JSONArray();
+        boolean done = false;
+        for (int i = 0; i < array.length(); i++) {
+            Object item = array.get(i);
+            newArray.put(item);
+            // Compare with equals() so that entries which are not strings are
+            // simply passed over instead of raising an exception.
+            if (!done && positionString.equals(item)) {
+                newArray.put(value);
+                done = true;
+            }
+        }
+
+        if (!done) {
+            throw new RuntimeException("Analyser update: cannot find position string " + positionString);
+        }
+
+        parent.put(field, newArray);
+    }
 }
diff --git a/src/main/java/de/komoot/photon/elasticsearch/Server.java b/src/main/java/de/komoot/photon/elasticsearch/Server.java
@@ -1,30 +1,24 @@
 package de.komoot.photon.elasticsearch;
 
 import lombok.extern.slf4j.Slf4j;
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.SystemUtils;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.client.transport.TransportClient;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.transport.InetSocketTransportAddress;
-import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.index.IndexNotFoundException;
 import org.elasticsearch.node.InternalSettingsPreparer;
 import org.elasticsearch.node.Node;
 import org.elasticsearch.node.NodeValidationException;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.transport.Netty4Plugin;
 import org.elasticsearch.transport.client.PreBuiltTransportClient;
-import org.json.JSONArray;
-import org.json.JSONObject;
 
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.InetSocketAddress;
 import java.net.URISyntaxException;
 import java.net.URL;
-import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.StandardCopyOption;
 import java.util.Arrays;
@@ -178,14 +172,14 @@ public DatabaseProperties recreateIndex(String[] languages) throws IOException {
         return dbProperties;
     }
 
-    public void updateIndexSettings() {
+    public void updateIndexSettings(String synonymFile) throws IOException {
         // Load the settings from the database to make sure it is at the right
         // version. If the version is wrong, we should not be messing with the
         // index.
         DatabaseProperties dbProperties = new DatabaseProperties();
         dbProperties.loadFromDatabase(getClient());
 
-        loadIndexSettings().updateIndex(getClient(), PhotonIndex.NAME);
+        loadIndexSettings().setSynonymFile(synonymFile).updateIndex(getClient(), PhotonIndex.NAME);
 
         // Sanity check: legacy databases don't save the languages, so there is no way to update
         //               the mappings consistently.