From aa3e764e49a76cb6d8f9fb2466aee42bf7e98b8f Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 29 Mar 2016 14:41:44 +0200 Subject: [PATCH 01/15] update analyzers to work with https://github.com/pelias/schema/pull/109 --- query/autocomplete_defaults.js | 6 +++--- query/reverse_defaults.js | 2 +- query/search_defaults.js | 2 +- .../autocomplete_linguistic_final_token.js | 6 +++--- .../fixture/autocomplete_linguistic_focus.js | 8 ++++---- ...utocomplete_linguistic_focus_null_island.js | 8 ++++---- .../autocomplete_linguistic_multiple_tokens.js | 10 +++++----- .../fixture/autocomplete_linguistic_only.js | 6 +++--- .../autocomplete_linguistic_with_admin.js | 8 ++++---- test/unit/fixture/search_boundary_country.js | 2 +- test/unit/fixture/search_full_address.js | 2 +- test/unit/fixture/search_linguistic_bbox.js | 2 +- test/unit/fixture/search_linguistic_focus.js | 2 +- .../fixture/search_linguistic_focus_bbox.js | 2 +- .../search_linguistic_focus_null_island.js | 2 +- test/unit/fixture/search_linguistic_only.js | 2 +- .../unit/fixture/search_linguistic_viewport.js | 2 +- .../search_linguistic_viewport_min_diagonal.js | 2 +- test/unit/fixture/search_partial_address.js | 2 +- test/unit/fixture/search_regions_address.js | 2 +- test/unit/query/autocomplete.js | 18 +++++++++--------- 21 files changed, 48 insertions(+), 48 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba52a049b..da0791ef5 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -20,12 +20,12 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasPhrase', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 100, - 'phrase:analyzer': 'peliasPhrase', - 'phrase:field': 'phrase.default', + 'phrase:analyzer': 'peliasQueryFullToken', + 'phrase:field': 'name.default', 'phrase:boost': 1, 'phrase:slop': 2, diff --git a/query/reverse_defaults.js b/query/reverse_defaults.js index 306efaac2..06ad64002 100644 --- a/query/reverse_defaults.js +++ b/query/reverse_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/query/search_defaults.js b/query/search_defaults.js index ea0dc87ff..3c26f4dc7 100644 --- a/query/search_defaults.js +++ b/query/search_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index fbe800529..e100206b1 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one', 'type': 'phrase', @@ -45,7 +45,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_focus.js b/test/unit/fixture/autocomplete_linguistic_focus.js index 4f722b84c..bcb18d0de 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus.js +++ b/test/unit/fixture/autocomplete_linguistic_focus.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -64,7 +64,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -89,7 +89,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js index d9c04fd13..65a3146dc 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js +++ b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -64,7 +64,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -89,7 +89,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 9018fdab6..db6c4fc4b 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -6,8 +6,8 @@ module.exports = { 'bool': { 'must': [{ 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, 'slop': 2, @@ -18,7 +18,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'three', 'type': 'phrase', @@ -31,7 +31,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two three', 'type': 'phrase', @@ -56,7 +56,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two three', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_only.js b/test/unit/fixture/autocomplete_linguistic_only.js index 24b89ad96..e4fe20ee4 100644 --- a/test/unit/fixture/autocomplete_linguistic_only.js +++ b/test/unit/fixture/autocomplete_linguistic_only.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -45,7 +45,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 245f64514..59e77c0c3 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -7,8 +7,8 @@ module.exports = { 'must': [ { 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, 'slop': 2, @@ -86,7 +86,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two', 'type': 'phrase', @@ -114,7 +114,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'one two', 'type': 'phrase', diff --git a/test/unit/fixture/search_boundary_country.js b/test/unit/fixture/search_boundary_country.js index 4bf453153..b84dd0e91 100644 --- a/test/unit/fixture/search_boundary_country.js +++ b/test/unit/fixture/search_boundary_country.js @@ -18,7 +18,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } } diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 8a8290abd..570e5eca4 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -9,7 +9,7 @@ module.exports = { 'match': { 'name.default': { 'query': '123 main st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1 } } diff --git a/test/unit/fixture/search_linguistic_bbox.js b/test/unit/fixture/search_linguistic_bbox.js index 5bb5907cf..e9368bd5c 100644 --- a/test/unit/fixture/search_linguistic_bbox.js +++ b/test/unit/fixture/search_linguistic_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_focus.js b/test/unit/fixture/search_linguistic_focus.js index 5d03d66db..c495243a1 100644 --- a/test/unit/fixture/search_linguistic_focus.js +++ b/test/unit/fixture/search_linguistic_focus.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_bbox.js b/test/unit/fixture/search_linguistic_focus_bbox.js index 96fe92f6a..365b37d8d 100644 --- a/test/unit/fixture/search_linguistic_focus_bbox.js +++ b/test/unit/fixture/search_linguistic_focus_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_null_island.js b/test/unit/fixture/search_linguistic_focus_null_island.js index 0924475d5..a9e49a06c 100644 --- a/test/unit/fixture/search_linguistic_focus_null_island.js +++ b/test/unit/fixture/search_linguistic_focus_null_island.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_only.js b/test/unit/fixture/search_linguistic_only.js index 58c05826a..bf1056f9f 100644 --- a/test/unit/fixture/search_linguistic_only.js +++ b/test/unit/fixture/search_linguistic_only.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasQueryPartialToken' } } }], diff --git a/test/unit/fixture/search_linguistic_viewport.js b/test/unit/fixture/search_linguistic_viewport.js index be76ab056..bcd39af2f 100644 --- a/test/unit/fixture/search_linguistic_viewport.js +++ b/test/unit/fixture/search_linguistic_viewport.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js index cf44d0d80..2d1d3e2dd 100644 --- a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js +++ b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 6c4174b64..aa45ca686 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': 'soho grand', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1 } } diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index e0c05f3c7..821270516 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': '1 water st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasQueryPartialToken', 'boost': 1 } } diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index dc973ddcf..5ea2182da 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -20,7 +20,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_only'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_only'); t.end(); }); @@ -30,9 +30,9 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_multiple_tokens.js'); + var expected = require('../fixture/autocomplete_linguistic_multiple_tokens'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_multiple_tokens'); t.end(); }); @@ -47,9 +47,9 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_with_admin.js'); + var expected = require('../fixture/autocomplete_linguistic_with_admin'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_with_admin'); t.end(); }); @@ -62,9 +62,9 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_final_token.js'); + var expected = require('../fixture/autocomplete_linguistic_final_token'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_final_token'); t.end(); }); @@ -78,7 +78,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus'); t.end(); }); @@ -92,7 +92,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus_null_island'); t.end(); }); }; From 3a789b4a933e4636bd47206ca9c96412d37104b4 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 7 Apr 2016 12:39:18 +0200 Subject: [PATCH 02/15] increase autocomplete 'phrase:slop' setting from 2->3 --- query/autocomplete_defaults.js | 2 +- test/unit/fixture/autocomplete_linguistic_multiple_tokens.js | 2 +- test/unit/fixture/autocomplete_linguistic_with_admin.js | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba52a049b..cd45b62d7 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -27,7 +27,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'phrase:analyzer': 'peliasPhrase', 'phrase:field': 'phrase.default', 'phrase:boost': 1, - 'phrase:slop': 2, + 'phrase:slop': 3, 'focus:function': 'linear', 'focus:offset': '0km', diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 9018fdab6..eaf01ee69 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -10,7 +10,7 @@ module.exports = { 'analyzer': 'peliasPhrase', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 245f64514..8f2edc44f 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -11,7 +11,7 @@ module.exports = { 'analyzer': 'peliasPhrase', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } From e40c9ef32623f78e8a3405779eba1e84287d8091 Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 15 Apr 2016 15:21:53 +0200 Subject: [PATCH 03/15] increase focus weight from 10->40 and simplify population/popularity subview --- query/autocomplete.js | 7 ++++--- query/autocomplete_defaults.js | 2 +- query/view/pop_subquery.js | 16 ++++++++++++++++ .../autocomplete_linguistic_final_token.js | 10 ++-------- .../fixture/autocomplete_linguistic_focus.js | 12 +++--------- .../autocomplete_linguistic_focus_null_island.js | 12 +++--------- .../autocomplete_linguistic_multiple_tokens.js | 10 ++-------- .../unit/fixture/autocomplete_linguistic_only.js | 10 ++-------- .../autocomplete_linguistic_with_admin.js | 10 ++-------- .../autocomplete_with_source_filtering.js | 10 ++-------- 10 files changed, 37 insertions(+), 62 deletions(-) create mode 100644 query/view/pop_subquery.js diff --git a/query/autocomplete.js b/query/autocomplete.js index ffc573967..0416163d9 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -9,7 +9,8 @@ var views = { ngrams_strict: require('./view/ngrams_strict'), focus_selected_layers: require('./view/focus_selected_layers'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), - phrase_first_tokens_only: require('./view/phrase_first_tokens_only') + phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), + pop_subquery: require('./view/pop_subquery') }; //------------------------------ @@ -38,8 +39,8 @@ query.score( peliasQuery.view.admin('neighbourhood') ); // scoring boost query.score( views.focus_selected_layers( views.ngrams_strict ) ); -query.score( peliasQuery.view.popularity( views.ngrams_strict ) ); -query.score( peliasQuery.view.population( views.ngrams_strict ) ); +query.score( peliasQuery.view.popularity( views.pop_subquery ) ); +query.score( peliasQuery.view.population( views.pop_subquery ) ); // non-scoring hard filters query.filter( peliasQuery.view.sources ); diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index da0791ef5..cacc8297c 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -33,7 +33,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'focus:offset': '0km', 'focus:scale': '250km', 'focus:decay': 0.5, - 'focus:weight': 10, + 'focus:weight': 40, 'function_score:score_mode': 'avg', 'function_score:boost_mode': 'multiply', diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js new file mode 100644 index 000000000..bde1492bd --- /dev/null +++ b/query/view/pop_subquery.js @@ -0,0 +1,16 @@ + +var peliasQuery = require('pelias-query'); + +/** + Population / Popularity subquery +**/ + +module.exports = function( vs ){ + + var view = peliasQuery.view.ngrams( vs ); + + view.match['name.default'].analyzer = 'peliasQueryFullToken'; + delete view.match['name.default'].boost; + + return view; +}; diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index e100206b1..fc431c776 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_focus.js b/test/unit/fixture/autocomplete_linguistic_focus.js index bcb18d0de..430d43c9d 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus.js +++ b/test/unit/fixture/autocomplete_linguistic_focus.js @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js index 65a3146dc..9a4afc05e 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js +++ b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index db6c4fc4b..26e5fa868 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -31,11 +31,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -56,11 +53,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_only.js b/test/unit/fixture/autocomplete_linguistic_only.js index e4fe20ee4..4360f4d7e 100644 --- a/test/unit/fixture/autocomplete_linguistic_only.js +++ b/test/unit/fixture/autocomplete_linguistic_only.js @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 59e77c0c3..b183bf774 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -86,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -114,11 +111,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_with_source_filtering.js b/test/unit/fixture/autocomplete_with_source_filtering.js index 5f3bcf07f..075eb6d46 100644 --- a/test/unit/fixture/autocomplete_with_source_filtering.js +++ b/test/unit/fixture/autocomplete_with_source_filtering.js @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, From 25ab63c3b37c40c59fd309dda566562a2572da5f Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 21 Apr 2016 17:08:46 +0200 Subject: [PATCH 04/15] change search analyzer to be more similar to what we had before the autocomplete_refactor milestone --- query/search_defaults.js | 2 +- test/unit/fixture/search_boundary_country.js | 2 +- test/unit/fixture/search_full_address.js | 2 +- test/unit/fixture/search_linguistic_bbox.js | 2 +- test/unit/fixture/search_linguistic_focus.js | 2 +- test/unit/fixture/search_linguistic_focus_bbox.js | 2 +- test/unit/fixture/search_linguistic_focus_null_island.js | 2 +- test/unit/fixture/search_linguistic_only.js | 2 +- test/unit/fixture/search_linguistic_viewport.js | 2 +- test/unit/fixture/search_linguistic_viewport_min_diagonal.js | 2 +- test/unit/fixture/search_partial_address.js | 2 +- test/unit/fixture/search_regions_address.js | 2 +- test/unit/fixture/search_with_source_filtering.js | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/query/search_defaults.js b/query/search_defaults.js index 3c26f4dc7..b0f8b119f 100644 --- a/query/search_defaults.js +++ b/query/search_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasQueryPartialToken', + 'ngram:analyzer': 'peliasIndexOneEdgeGram', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/test/unit/fixture/search_boundary_country.js b/test/unit/fixture/search_boundary_country.js index b84dd0e91..71965df41 100644 --- a/test/unit/fixture/search_boundary_country.js +++ b/test/unit/fixture/search_boundary_country.js @@ -18,7 +18,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } } diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 570e5eca4..172d439fa 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -9,7 +9,7 @@ module.exports = { 'match': { 'name.default': { 'query': '123 main st', - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_linguistic_bbox.js b/test/unit/fixture/search_linguistic_bbox.js index e9368bd5c..6afe7be6d 100644 --- a/test/unit/fixture/search_linguistic_bbox.js +++ b/test/unit/fixture/search_linguistic_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus.js b/test/unit/fixture/search_linguistic_focus.js index c495243a1..da3e8fb39 100644 --- a/test/unit/fixture/search_linguistic_focus.js +++ b/test/unit/fixture/search_linguistic_focus.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_bbox.js b/test/unit/fixture/search_linguistic_focus_bbox.js index 365b37d8d..d5042c0f5 100644 --- a/test/unit/fixture/search_linguistic_focus_bbox.js +++ b/test/unit/fixture/search_linguistic_focus_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_null_island.js b/test/unit/fixture/search_linguistic_focus_null_island.js index a9e49a06c..b99febea4 100644 --- a/test/unit/fixture/search_linguistic_focus_null_island.js +++ b/test/unit/fixture/search_linguistic_focus_null_island.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_only.js b/test/unit/fixture/search_linguistic_only.js index bf1056f9f..a564a4c17 100644 --- a/test/unit/fixture/search_linguistic_only.js +++ b/test/unit/fixture/search_linguistic_only.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_viewport.js b/test/unit/fixture/search_linguistic_viewport.js index bcd39af2f..b85d83225 100644 --- a/test/unit/fixture/search_linguistic_viewport.js +++ b/test/unit/fixture/search_linguistic_viewport.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js index 2d1d3e2dd..e6b50ac6d 100644 --- a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js +++ b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index aa45ca686..6810de543 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': 'soho grand', - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index 821270516..bf3f3dcef 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': '1 water st', - 'analyzer': 'peliasQueryPartialToken', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_with_source_filtering.js b/test/unit/fixture/search_with_source_filtering.js index 4aedeb047..18ee13a3b 100644 --- a/test/unit/fixture/search_with_source_filtering.js +++ b/test/unit/fixture/search_with_source_filtering.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasQueryPartialToken' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], From 01a3233a7ba78d42bba013bce92e952144f6d34c Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 12:15:18 +0200 Subject: [PATCH 05/15] add a view to boost exact matches --- package.json | 2 +- query/autocomplete.js | 4 +- query/view/boost_exact_matches.js | 48 ++++++ ...autocomplete_linguistic_multiple_tokens.js | 14 +- .../autocomplete_linguistic_with_admin.js | 11 ++ .../autocomplete_single_character_street.js | 147 ++++++++++++++++++ test/unit/query/autocomplete.js | 17 ++ 7 files changed, 240 insertions(+), 3 deletions(-) create mode 100644 query/view/boost_exact_matches.js create mode 100644 test/unit/fixture/autocomplete_single_character_street.js diff --git a/package.json b/package.json index 8e325f5dc..f70cb57bd 100644 --- a/package.json +++ b/package.json @@ -68,7 +68,7 @@ "precommit-hook": "^3.0.0", "proxyquire": "^1.4.0", "tap-dot": "1.0.5", - "tape": "^4.4.0" + "tape": "^4.5.1" }, "pre-commit": [ "lint", diff --git a/query/autocomplete.js b/query/autocomplete.js index 0416163d9..d64151ae1 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -10,7 +10,8 @@ var views = { focus_selected_layers: require('./view/focus_selected_layers'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), - pop_subquery: require('./view/pop_subquery') + pop_subquery: require('./view/pop_subquery'), + boost_exact_matches: require('./view/boost_exact_matches') }; //------------------------------ @@ -38,6 +39,7 @@ query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); // scoring boost +query.score( views.boost_exact_matches ); query.score( views.focus_selected_layers( views.ngrams_strict ) ); query.score( peliasQuery.view.popularity( views.pop_subquery ) ); query.score( peliasQuery.view.population( views.pop_subquery ) ); diff --git a/query/view/boost_exact_matches.js b/query/view/boost_exact_matches.js new file mode 100644 index 000000000..8cf575f47 --- /dev/null +++ b/query/view/boost_exact_matches.js @@ -0,0 +1,48 @@ + +var peliasQuery = require('pelias-query'), + searchDefaults = require('../search_defaults'); + +/** + This view (unfortunately) requires autocomplete to use the phrase.* index. + + ideally we wouldn't need to use this, but at time of writing we are unable + to distinguish between 'complete tokens' and 'grams' in the name.* index. + + this view was introduced in order to score exact matches higher than partial + matches, without it we find results such as "Clayton Avenue" appearing first + in the results list for the query "Clay Av". + + the view uses some of the values from the 'search_defaults.js' file to add an + additional 'SHOULD' condition which scores exact matches slighly higher + than partial matches. +**/ + +module.exports = function( vs ){ + + // make a copy of the variables so we don't interfere with the values + // passed to other views. + var vsCopy = new peliasQuery.Vars( vs.export() ); + + // copy phrase:* values from search defaults + vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); + vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); + + // split the 'input:name' on whitespace + var name = vs.var('input:name').get(), + tokens = name.split(' '); + + // if the query is incomplete then we need to remove + // the final (incomplete) token as it will not match + // tokens in the phrase.* index. + if( !vs.var('input:name:isComplete').get() ){ + tokens.pop(); + } + + // no valid tokens to use, fail now, don't render this view. + if( tokens.length < 1 ){ return null; } + + // set 'input:name' to be only the fully completed characters + vsCopy.var('input:name').set( tokens.join(' ') ); + + return peliasQuery.view.phrase( vsCopy ); +}; diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 1c1b13c0a..d0465b043 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -26,7 +26,19 @@ module.exports = { } } }], - 'should':[{ + 'should':[ + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'one two' + } + } + }, + { 'function_score': { 'query': { 'match': { diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index d10073435..e3a62df2f 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -81,6 +81,17 @@ module.exports = { } } }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'one two' + } + } + }, { 'function_score': { 'query': { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js new file mode 100644 index 000000000..e992cc58d --- /dev/null +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -0,0 +1,147 @@ + +module.exports = { + 'query': { + 'filtered': { + 'query': { + 'bool': { + 'must': [{ + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'type': 'phrase', + 'boost': 1, + 'slop': 3, + 'query': 'k road' + } + } + }], + 'should':[ + { + 'match': { + 'address_parts.street': { + 'query': 'k road', + 'boost': 5, + 'analyzer': 'peliasStreet' + } + } + }, { + 'match': { + 'parent.country': { + 'query': 'laird', + 'boost': 800, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region_a': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.county': { + 'query': 'laird', + 'boost': 400, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.localadmin': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.locality': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.neighbourhood': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'k road' + } + } + }, + { + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'popularity', + 'missing': 1 + }, + 'weight': 1 + }] + } + },{ + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'population', + 'missing': 1 + }, + 'weight': 3 + }] + } + }] + } + } + } + }, + 'sort': [ '_score' ], + 'size': 20, + 'track_scores': true +}; diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index e08034a8d..72cfb5f29 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -107,6 +107,23 @@ module.exports.tests.query = function(test, common) { t.deepEqual(compiled, expected, 'valid autocomplete query with source filtering'); t.end(); }); + + test('single character street address', function(t) { + var query = generate({ + text: 'k road, laird', + parsed_text: { + name: 'k road', + street: 'k road', + regions: [ 'laird' ] + } + }); + + var compiled = JSON.parse( JSON.stringify( query ) ); + var expected = require('../fixture/autocomplete_single_character_street'); + + t.deepEqual(compiled, expected, 'autocomplete_single_character_street'); + t.end(); + }); }; module.exports.all = function (tape, common) { From ca0c51b0fde45c3f863a516bc923bc122c68f7ee Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 12:22:41 +0200 Subject: [PATCH 06/15] don't strip single digits from query --- query/autocomplete.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index d64151ae1..fec0a80b7 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -71,7 +71,8 @@ function generateQuery( clean ){ // - to a 2gram index when using 'type:phrase' or 'operator:and' will // - result in a complete failure of the query. // 2. trim leading and trailing whitespace. - var text = clean.text.replace(/( .$)/g,'').trim(); + // note: single digit grams are now being produced in the name.* index + var text = clean.text.replace(/( [^0-9]$)/g,'').trim(); // if the input parser has run and suggested a 'parsed_text.name' to use. if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ From b862fc88339d5680d1ee82c0cfeb8d8bee5a6c1a Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 13:03:16 +0200 Subject: [PATCH 07/15] refactor pop_subquery to be config driven --- query/view/pop_subquery.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js index bde1492bd..d18b9963b 100644 --- a/query/view/pop_subquery.js +++ b/query/view/pop_subquery.js @@ -9,7 +9,7 @@ module.exports = function( vs ){ var view = peliasQuery.view.ngrams( vs ); - view.match['name.default'].analyzer = 'peliasQueryFullToken'; + view.match['name.default'].analyzer = vs.var('phrase:analyzer'); delete view.match['name.default'].boost; return view; From 2398f05f8d96c3003f16a2124ff89961869a834c Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 25 Apr 2016 13:33:08 +0200 Subject: [PATCH 08/15] fix borough matching for both autocomplete and search endpoints --- query/autocomplete.js | 1 + query/autocomplete_defaults.js | 4 ++++ query/search.js | 1 + query/text_parser.js | 1 + .../autocomplete_linguistic_with_admin.js | 9 +++++++++ .../autocomplete_single_character_street.js | 8 ++++++++ test/unit/fixture/search_full_address.js | 8 ++++++++ test/unit/fixture/search_partial_address.js | 8 ++++++++ test/unit/fixture/search_regions_address.js | 8 ++++++++ test/unit/query/search.js | 18 +++++++++--------- 10 files changed, 57 insertions(+), 9 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index fec0a80b7..6da4d5694 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -34,6 +34,7 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 9c432cb9c..8f46ce8d4 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -82,6 +82,10 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:neighbourhood:field': 'parent.neighbourhood', 'admin:neighbourhood:boost': 200, + 'admin:borough:analyzer': 'peliasAdmin', + 'admin:borough:field': 'parent.borough', + 'admin:borough:boost': 800, + 'popularity:field': 'popularity', 'popularity:modifier': 'log1p', 'popularity:max_boost': 20, diff --git a/query/search.js b/query/search.js index 9f0a792cd..77fcb3f5b 100644 --- a/query/search.js +++ b/query/search.js @@ -30,6 +30,7 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/query/text_parser.js b/query/text_parser.js index d19465eb3..914a7f43c 100644 --- a/query/text_parser.js +++ b/query/text_parser.js @@ -9,6 +9,7 @@ or postalcode because we should only try to match those when we're sure that's w */ var adminFields = placeTypes.concat([ 'region_a', + 'borough' ]); /** diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index e3a62df2f..a0b07025c 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -54,6 +54,15 @@ module.exports = { } } }, + { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 800, + 'query': 'three' + } + } + }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index e992cc58d..77264f8ea 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -56,6 +56,14 @@ module.exports = { 'analyzer': 'peliasAdmin' } } + }, { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 800, + 'query': 'laird' + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index 172d439fa..dfd64e345 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -139,6 +139,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 6810de543..746899b74 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -107,6 +107,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'new york', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index bf3f3dcef..0a8b199da 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -123,6 +123,14 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } + }, { + 'match': { + 'parent.borough': { + 'query': 'manhattan', + 'boost': vs['admin:borough:boost'], + 'analyzer': vs['admin:borough:analyzer'] + } + } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/query/search.js b/test/unit/query/search.js index 426eb2bcc..a2bb8e2f8 100644 --- a/test/unit/query/search.js +++ b/test/unit/query/search.js @@ -25,7 +25,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_bbox'); t.end(); }); @@ -42,7 +42,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_bbox'); t.end(); }); @@ -55,7 +55,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_only'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_only'); t.end(); }); @@ -69,7 +69,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus'); t.end(); }); @@ -86,7 +86,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_viewport'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_viewport'); t.end(); }); @@ -119,7 +119,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_null_island'); t.end(); }); @@ -134,7 +134,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_full_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_full_address'); t.end(); }); @@ -149,7 +149,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_partial_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_partial_address'); t.end(); }); @@ -164,7 +164,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_regions_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_regions_address'); t.end(); }); From da4c66653871c2ee5b8d82783efbfbf16fc629fc Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 14:34:55 +0200 Subject: [PATCH 09/15] reduce admin:borough:boost from 800->600 --- query/autocomplete_defaults.js | 2 +- test/unit/fixture/autocomplete_linguistic_with_admin.js | 2 +- test/unit/fixture/autocomplete_single_character_street.js | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 8f46ce8d4..08e33aebb 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -84,7 +84,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:borough:analyzer': 'peliasAdmin', 'admin:borough:field': 'parent.borough', - 'admin:borough:boost': 800, + 'admin:borough:boost': 600, 'popularity:field': 'popularity', 'popularity:modifier': 'log1p', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index a0b07025c..7cb51eea9 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -58,7 +58,7 @@ module.exports = { 'match': { 'parent.borough': { 'analyzer': 'peliasAdmin', - 'boost': 800, + 'boost': 600, 'query': 'three' } } diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 77264f8ea..f89e84935 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -60,7 +60,7 @@ module.exports = { 'match': { 'parent.borough': { 'analyzer': 'peliasAdmin', - 'boost': 800, + 'boost': 600, 'query': 'laird' } } From 9dbed08884897cfd7ac8ec4ba9715ba1dd3170ff Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 14:36:29 +0200 Subject: [PATCH 10/15] remove duplicate entry for borough --- query/text_parser.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/query/text_parser.js b/query/text_parser.js index 914a7f43c..00e607240 100644 --- a/query/text_parser.js +++ b/query/text_parser.js @@ -8,8 +8,7 @@ when we can't identify parts of an address. This shouldn't contain fields like c or postalcode because we should only try to match those when we're sure that's what they are. */ var adminFields = placeTypes.concat([ - 'region_a', - 'borough' + 'region_a' ]); /** From e093a09a8d6a78656ea5a9920f5a0383a1b3d630 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 14:47:06 +0200 Subject: [PATCH 11/15] remove search related improvements from this PR --- query/search.js | 1 - test/unit/fixture/search_full_address.js | 8 -------- test/unit/fixture/search_partial_address.js | 8 -------- test/unit/fixture/search_regions_address.js | 8 -------- 4 files changed, 25 deletions(-) diff --git a/query/search.js b/query/search.js index 77fcb3f5b..9f0a792cd 100644 --- a/query/search.js +++ b/query/search.js @@ -30,7 +30,6 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); -query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index dfd64e345..172d439fa 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -139,14 +139,6 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } - }, { - 'match': { - 'parent.borough': { - 'query': 'new york', - 'boost': vs['admin:borough:boost'], - 'analyzer': vs['admin:borough:analyzer'] - } - } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 746899b74..6810de543 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -107,14 +107,6 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } - }, { - 'match': { - 'parent.borough': { - 'query': 'new york', - 'boost': vs['admin:borough:boost'], - 'analyzer': vs['admin:borough:analyzer'] - } - } }, { 'match': { 'parent.localadmin': { diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index 0a8b199da..bf3f3dcef 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -123,14 +123,6 @@ module.exports = { 'analyzer': vs['admin:county:analyzer'] } } - }, { - 'match': { - 'parent.borough': { - 'query': 'manhattan', - 'boost': vs['admin:borough:boost'], - 'analyzer': vs['admin:borough:analyzer'] - } - } }, { 'match': { 'parent.localadmin': { From ee73774c899010d9de62c1d08d380a383e9c90cf Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 28 Apr 2016 19:48:52 +0200 Subject: [PATCH 12/15] add tokenizer, refactor how we determine if a token is 'complete' or 'incomplete' --- query/autocomplete.js | 29 +- query/view/boost_exact_matches.js | 14 +- query/view/ngrams_last_token_only.js | 17 +- query/view/phrase_first_tokens_only.js | 25 +- query/view/pop_subquery.js | 19 +- sanitiser/_tokenizer.js | 95 ++++ sanitiser/autocomplete.js | 1 + .../autocomplete_linguistic_final_token.js | 18 +- test/unit/query/autocomplete.js | 40 +- test/unit/run.js | 1 + test/unit/sanitiser/_tokenizer.js | 425 ++++++++++++++++++ test/unit/sanitiser/autocomplete.js | 5 +- 12 files changed, 616 insertions(+), 73 deletions(-) create mode 100644 sanitiser/_tokenizer.js create mode 100644 test/unit/sanitiser/_tokenizer.js diff --git a/query/autocomplete.js b/query/autocomplete.js index 6da4d5694..50f6da290 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -63,31 +63,24 @@ function generateQuery( clean ){ vs.var( 'sources', clean.sources ); } - // mark the name as incomplete (user has not yet typed a comma) - vs.var( 'input:name:isComplete', false ); - - // perform some operations on 'clean.text': - // 1. if there is a space followed by a single char, remove them. - // - this is required as the index uses 2grams and sending 1grams - // - to a 2gram index when using 'type:phrase' or 'operator:and' will - // - result in a complete failure of the query. - // 2. trim leading and trailing whitespace. - // note: single digit grams are now being produced in the name.* index - var text = clean.text.replace(/( [^0-9]$)/g,'').trim(); + // pass the input tokens to the views so they can choose which tokens + // are relevant for their specific function. + if( check.array( clean.tokens ) ){ + vs.var( 'input:name:tokens', clean.tokens ); + vs.var( 'input:name:tokens_complete', clean.tokens_complete ); + vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete ); + } + + // input text + vs.var( 'input:name', clean.text ); // if the input parser has run and suggested a 'parsed_text.name' to use. if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ - // mark the name as complete (user has already typed a comma) - vs.var( 'input:name:isComplete', true ); - // use 'parsed_text.name' instead of 'clean.text'. - text = clean.parsed_text.name; + vs.var( 'input:name', clean.parsed_text.name ); } - // input text - vs.var( 'input:name', text ); - // focus point if( check.number(clean['focus.point.lat']) && check.number(clean['focus.point.lon']) ){ diff --git a/query/view/boost_exact_matches.js b/query/view/boost_exact_matches.js index 8cf575f47..9af56cfb3 100644 --- a/query/view/boost_exact_matches.js +++ b/query/view/boost_exact_matches.js @@ -27,19 +27,11 @@ module.exports = function( vs ){ vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); - // split the 'input:name' on whitespace - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // if the query is incomplete then we need to remove - // the final (incomplete) token as it will not match - // tokens in the phrase.* index. - if( !vs.var('input:name:isComplete').get() ){ - tokens.pop(); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); // no valid tokens to use, fail now, don't render this view. - if( tokens.length < 1 ){ return null; } + if( !tokens || tokens.length < 1 ){ return null; } // set 'input:name' to be only the fully completed characters vsCopy.var('input:name').set( tokens.join(' ') ); diff --git a/query/view/ngrams_last_token_only.js b/query/view/ngrams_last_token_only.js index 3e3315f7a..2665c2940 100644 --- a/query/view/ngrams_last_token_only.js +++ b/query/view/ngrams_last_token_only.js @@ -8,9 +8,6 @@ var peliasQuery = require('pelias-query'), eg. if the input was "100 foo str", then 'input:name' would only be 'str' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. - code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -18,19 +15,17 @@ var peliasQuery = require('pelias-query'), module.exports = function( vs ){ - // Totally disable this view when bool value 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ return null; } + // get a copy of the *tokens_incomplete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_incomplete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable - var name = vs.var('input:name').get(); - // set the 'name' variable in the copy to only the last token - vsCopy.var('input:name').set( name.substr( name.lastIndexOf(' ')+1 ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return ngrams_strict( vsCopy ); diff --git a/query/view/phrase_first_tokens_only.js b/query/view/phrase_first_tokens_only.js index b047b30f4..7ab4539be 100644 --- a/query/view/phrase_first_tokens_only.js +++ b/query/view/phrase_first_tokens_only.js @@ -7,9 +7,6 @@ var peliasQuery = require('pelias-query'); eg. if the input was "100 foo str", then 'input:name' would only be '100 foo' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. - code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -17,27 +14,17 @@ var peliasQuery = require('pelias-query'); module.exports = function( vs ){ - // Don't mutate the name variable when 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ - // return the view rendered using the original vars - return peliasQuery.view.phrase( vs ); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable and split in to tokens - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // single token only, abort (we don't want the *last* token) - // return null here will completely disable the view. - if( tokens.length < 2 ){ return null; } - // set the 'name' variable in the copy to all but the last token - vsCopy.var('input:name').set( name.substr( 0, name.lastIndexOf(' ') ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return peliasQuery.view.phrase( vsCopy ); diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js index d18b9963b..f29191fc6 100644 --- a/query/view/pop_subquery.js +++ b/query/view/pop_subquery.js @@ -1,5 +1,6 @@ -var peliasQuery = require('pelias-query'); +var peliasQuery = require('pelias-query'), + check = require('check-types'); /** Population / Popularity subquery @@ -12,5 +13,21 @@ module.exports = function( vs ){ view.match['name.default'].analyzer = vs.var('phrase:analyzer'); delete view.match['name.default'].boost; + // only use complete tokens against the phase index (where possible). + var completeTokens = vs.var('input:name:tokens_complete').get(), + incompleteTokens = vs.var('input:name:tokens_incomplete').get(); + + // if the tokenizer has run (autocomplete only) then we will combine the + // 'complete' tokens with the 'incomplete' tokens, the resuting array differs + // slightly from the 'input:name:tokens' array as some tokens might have been + // removed in the process; such as single grams which are not present in then + // ngrams index. + if( check.array( completeTokens ) && check.array( incompleteTokens ) ){ + var combined = completeTokens.concat( incompleteTokens ); + if( combined.length ){ + view.match['name.default'].query = combined.join(' '); + } + } + return view; }; diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js new file mode 100644 index 000000000..7b8e234c6 --- /dev/null +++ b/sanitiser/_tokenizer.js @@ -0,0 +1,95 @@ + +var check = require('check-types'); + +/** + simplified version of the elaticsearch tokenizer, used in order to + be able to detect which tokens are 'complete' (user has finished typing them) + or 'incomplete' (the user has possibly only typed part of the token). + + note: we don't need to strip punctuation as that will be handled on the + elasticsearch side, so sending a token such as 'st.' is not an issue, these + tokens should *not* be modified as the anaylsis can use the punctuation to + infer meaning. + + note: this sanitizer should run *after* the '_text' sanitizer so it can + use the output of clean.parsed_text where available. +**/ +function sanitize( raw, clean ){ + + // error & warning messages + var messages = { errors: [], warnings: [] }; + + // this is the string we will use for analysis + var text = clean.text; + + // a boolean to track whether the input parser successfully ran; or not. + var inputParserRanSuccessfully = false; + + // if the text parser has run then we only tokenize the 'name' section + // of the 'parsed_text' object, ignoring the 'admin' parts. + if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ + inputParserRanSuccessfully = true; + text = clean.parsed_text.name; // use this string instead + } + + // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. + clean.tokens = []; + clean.tokens_complete = []; + clean.tokens_incomplete = []; + + // sanity check that the text is valid. + if( check.nonEmptyString( text ) ){ + + // split according to the regex used in the elasticsearch tokenizer + // see: https://github.com/pelias/schema/blob/master/settings.js + // see: settings.analysis.tokenizer.peliasNameTokenizer + clean.tokens = text + .split(/[\s,\\\/]+/) // split on delimeters + .filter(function(el){return el;}); // remove empty elements + } + + /** + the following section splits the tokens in to two arrays called + 'tokens_complete' and 'tokens_incomplete'. + + it also strips any tokens from 'tokens_incomplete' which might not + match the ngrams index (such as single grams not stored in the index). + **/ + + // split the tokens in to 'complete' and 'incomplete'. + if( clean.tokens.length ){ + + // if all the tokens are complete, simply copy them from clean.tokens + if( inputParserRanSuccessfully ){ + + // all these tokens are complete! + clean.tokens_complete = clean.tokens.slice(); + + // user hasn't finished typing yet + } else { + + // make a copy of the tokens and remove the last element + var tokensCopy = clean.tokens.slice(), + lastToken = tokensCopy.pop(); + + // set all but the last token as 'complete' + clean.tokens_complete = tokensCopy; + + /** + if the last token is a single non-numeric character then we must discard it. + + at time of writing, single non-numeric ngrams are not stored in the index, + sending them as part of the query would result in 0 documents being returned. + **/ + if( lastToken && ( lastToken.length > 1 || lastToken.match(/[0-9]/) ) ){ + clean.tokens_incomplete = [ lastToken ]; + } + } + + } + + return messages; +} + +// export function +module.exports = sanitize; diff --git a/sanitiser/autocomplete.js b/sanitiser/autocomplete.js index f96989564..8ab6fd9c7 100644 --- a/sanitiser/autocomplete.js +++ b/sanitiser/autocomplete.js @@ -4,6 +4,7 @@ var sanitizeAll = require('../sanitiser/sanitizeAll'), sanitizers = { singleScalarParameters: require('../sanitiser/_single_scalar_parameters'), text: require('../sanitiser/_text'), + tokenizer: require('../sanitiser/_tokenizer'), size: require('../sanitiser/_size')(10, 10, 10), layers: require('../sanitiser/_targets')('layers', type_mapping.layer_mapping), sources: require('../sanitiser/_targets')('sources', type_mapping.source_mapping), diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index fc431c776..b4cc33d2f 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -7,15 +7,25 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', + 'boost': 1, + 'slop': 3, 'query': 'one', - 'type': 'phrase', - 'operator': 'and' + 'type': 'phrase' } } }], 'should':[{ + 'match': { + 'phrase.default': { + 'analyzer': 'peliasPhrase', + 'boost': 1, + 'slop': 3, + 'query': 'one', + 'type': 'phrase' + } + } + },{ 'function_score': { 'query': { 'match': { diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index 72cfb5f29..bb368fc95 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -13,7 +13,10 @@ module.exports.tests.interface = function(test, common) { module.exports.tests.query = function(test, common) { test('valid lingustic-only autocomplete', function(t) { var query = generate({ - text: 'test' + text: 'test', + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -25,7 +28,10 @@ module.exports.tests.query = function(test, common) { test('valid lingustic autocomplete with 3 tokens', function(t) { var query = generate({ - text: 'one two three' + text: 'one two three', + tokens: ['one','two','three'], + tokens_complete: ['one','two'], + tokens_incomplete: ['three'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -42,7 +48,10 @@ module.exports.tests.query = function(test, common) { name: 'one two', regions: [ 'one two', 'three' ], admin_parts: 'three' - } + }, + tokens: ['one','two'], + tokens_complete: ['one','two'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -57,7 +66,10 @@ module.exports.tests.query = function(test, common) { // note: if 1 grams are enabled at a later date, remove this behaviour. test('valid lingustic autocomplete final token', function(t) { var query = generate({ - text: 'one t' + text: 'one t', + tokens: ['one','t'], + tokens_complete: ['one'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -71,7 +83,10 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 29.49136, - 'focus.point.lon': -82.50622 + 'focus.point.lon': -82.50622, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -85,7 +100,10 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 0, - 'focus.point.lon': 0 + 'focus.point.lon': 0, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -98,7 +116,10 @@ module.exports.tests.query = function(test, common) { test('valid sources filter', function(t) { var query = generate({ 'text': 'test', - 'sources': ['test_source'] + 'sources': ['test_source'], + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -115,7 +136,10 @@ module.exports.tests.query = function(test, common) { name: 'k road', street: 'k road', regions: [ 'laird' ] - } + }, + tokens: ['k', 'road'], + tokens_complete: ['k', 'road'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); diff --git a/test/unit/run.js b/test/unit/run.js index 94d9ebb35..1a6f7a904 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -46,6 +46,7 @@ var tests = [ require('./sanitiser/_sources'), require('./sanitiser/_sources_and_layers'), require('./sanitiser/_text'), + require('./sanitiser/_tokenizer'), require('./sanitiser/_deprecate_quattroshapes'), require('./src/backend'), require('./sanitiser/autocomplete'), diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js new file mode 100644 index 000000000..a7c6ced4a --- /dev/null +++ b/test/unit/sanitiser/_tokenizer.js @@ -0,0 +1,425 @@ +var sanitiser = require('../../../sanitiser/_tokenizer'); + +module.exports.tests = {}; + +module.exports.tests.sanity_checks = function(test, common) { + test('clean.text not set', function(t) { + + var clean = {}; // clean.text not set + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.text not a string', function(t) { + + var clean = { text: {} }; // clean.text not a string + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('empty string', function(t) { + + var clean = { text: '' }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) { + + var clean = { parsed_text: { text: {} } }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.text', function(t) { + + var clean = { parsed_text: { name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.space_delimiter = function(test, common) { + test('space delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('space delimiter - multiple spaces / other whitespace', function(t) { + + var clean = { text: ' 30 west \t26th \nstreet new york ' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.comma_delimiter = function(test, common) { + test('comma delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street, new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('comma delimiter - multiple commas', function(t) { + + var clean = { text: ',30 west 26th street,,, new york,' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.forward_slash_delimiter = function(test, common) { + test('forward slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street/133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('forward slash - multiple slashes', function(t) { + + var clean = { text: '/Bedell Street//133rd Avenue/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.final_token_single_gram = function(test, common) { + test('final token single gram - numeric', function(t) { + + var clean = { text: 'grolmanstrasse 1' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + '1' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + '1' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('final token single gram - non-numeric', function(t) { + + var clean = { text: 'grolmanstrasse a' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + 'a' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token removed! + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.back_slash_delimiter = function(test, common) { + test('back slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street\\133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('back slash - multiple slashes', function(t) { + + var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.mixed_delimiter = function(test, common) { + test('mixed delimiters', function(t) { + + var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + function test(name, testFunction) { + return tape('SANITISER _tokenizer: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; diff --git a/test/unit/sanitiser/autocomplete.js b/test/unit/sanitiser/autocomplete.js index 26bf9afb4..186cb4b67 100644 --- a/test/unit/sanitiser/autocomplete.js +++ b/test/unit/sanitiser/autocomplete.js @@ -4,7 +4,10 @@ module.exports.tests = {}; module.exports.tests.sanitisers = function(test, common) { test('check sanitiser list', function (t) { - var expected = ['singleScalarParameters', 'text', 'size', 'layers', 'sources', 'sources_and_layers', 'private', 'geo_autocomplete' ]; + var expected = [ + 'singleScalarParameters', 'text', 'tokenizer', 'size', 'layers', 'sources', + 'sources_and_layers', 'private', 'geo_autocomplete' + ]; t.deepEqual(Object.keys(autocomplete.sanitiser_list), expected); t.end(); }); From 05240626fd4e0b798df86d8f82b618509c44ddba Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 29 Apr 2016 16:56:33 +0200 Subject: [PATCH 13/15] handle addressit case where parsed_text.street is produced and parsed_text.name is not --- sanitiser/_tokenizer.js | 21 ++++++++++++++++++-- test/unit/sanitiser/_tokenizer.js | 32 +++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js index 7b8e234c6..3312ea056 100644 --- a/sanitiser/_tokenizer.js +++ b/sanitiser/_tokenizer.js @@ -27,9 +27,26 @@ function sanitize( raw, clean ){ // if the text parser has run then we only tokenize the 'name' section // of the 'parsed_text' object, ignoring the 'admin' parts. - if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ + if( clean.hasOwnProperty('parsed_text') ) { inputParserRanSuccessfully = true; - text = clean.parsed_text.name; // use this string instead + + // parsed_text.name is set, this is the highest priority, use this string + if( clean.parsed_text.hasOwnProperty('name') ){ + text = clean.parsed_text.name; // use this string instead + } + + // else handle the case where parsed_text.street was produced but + // no parsed_text.name is produced. + // additionally, handle the case where parsed_text.number is present + // note: the addressit module may also produce parsed_text.unit info + // for now, we discard that information as we don't have an appropriate + else if( clean.parsed_text.hasOwnProperty('street') ){ + text = [ + clean.parsed_text.number, + clean.parsed_text.street + ].filter(function(el){return el;}) + .join(' '); // remove empty elements + } } // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js index a7c6ced4a..8837d4abf 100644 --- a/test/unit/sanitiser/_tokenizer.js +++ b/test/unit/sanitiser/_tokenizer.js @@ -81,6 +81,38 @@ module.exports.tests.sanity_checks = function(test, common) { t.deepEquals(messages.errors, [], 'no errors'); t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + }); + test('favor clean.parsed_text street data over clean.text', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over all other variables + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); }); }; From 1c9af40f3cc274cfe77e33716501f2c4a0a26f7a Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 29 Apr 2016 17:16:44 +0200 Subject: [PATCH 14/15] remove query.tokens_complete and query.tokens_incomplete from geoJSON --- middleware/geocodeJSON.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/middleware/geocodeJSON.js b/middleware/geocodeJSON.js index 80d69e869..414d216b8 100644 --- a/middleware/geocodeJSON.js +++ b/middleware/geocodeJSON.js @@ -16,7 +16,7 @@ function setup(peliasConfig, basePath) { config: peliasConfig || require('pelias-config').generate().api, basePath: basePath || '/' }; - + function middleware(req, res, next) { return convertToGeocodeJSON(req, res, next, opts); } @@ -56,6 +56,10 @@ function convertToGeocodeJSON(req, res, next, opts) { // Helpful for debugging and understanding how the input impacts results. res.body.geocoding.query = req.clean; + // remove arrays produced by the tokenizer (only intended to be used internally). + delete res.body.geocoding.query.tokens_complete; + delete res.body.geocoding.query.tokens_incomplete; + // OPTIONAL. Warnings and errors. addMessages(req, 'warnings', res.body.geocoding); addMessages(req, 'errors', res.body.geocoding); From 979aab1ac3231cf598ebcf0df1c4d6f2fee26fa4 Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 29 Apr 2016 19:10:52 +0200 Subject: [PATCH 15/15] ensure that problematic single grams are removed from the query --- query/autocomplete.js | 15 ++++++++++----- query/view/pop_subquery.js | 16 ---------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index 50f6da290..6d5863a85 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -74,11 +74,16 @@ function generateQuery( clean ){ // input text vs.var( 'input:name', clean.text ); - // if the input parser has run and suggested a 'parsed_text.name' to use. - if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ - - // use 'parsed_text.name' instead of 'clean.text'. - vs.var( 'input:name', clean.parsed_text.name ); + // if the tokenizer has run then we set 'input:name' to as the combination of the + // 'complete' tokens with the 'incomplete' tokens, the resuting array differs + // slightly from the 'input:name:tokens' array as some tokens might have been + // removed in the process; such as single grams which are not present in then + // ngrams index. + if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){ + var combined = clean.tokens_complete.concat( clean.tokens_incomplete ); + if( combined.length ){ + vs.var( 'input:name', combined.join(' ') ); + } } // focus point diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js index f29191fc6..724b773f8 100644 --- a/query/view/pop_subquery.js +++ b/query/view/pop_subquery.js @@ -13,21 +13,5 @@ module.exports = function( vs ){ view.match['name.default'].analyzer = vs.var('phrase:analyzer'); delete view.match['name.default'].boost; - // only use complete tokens against the phase index (where possible). - var completeTokens = vs.var('input:name:tokens_complete').get(), - incompleteTokens = vs.var('input:name:tokens_incomplete').get(); - - // if the tokenizer has run (autocomplete only) then we will combine the - // 'complete' tokens with the 'incomplete' tokens, the resuting array differs - // slightly from the 'input:name:tokens' array as some tokens might have been - // removed in the process; such as single grams which are not present in then - // ngrams index. - if( check.array( completeTokens ) && check.array( incompleteTokens ) ){ - var combined = completeTokens.concat( incompleteTokens ); - if( combined.length ){ - view.match['name.default'].query = combined.join(' '); - } - } - return view; };