diff --git a/middleware/geocodeJSON.js b/middleware/geocodeJSON.js index 80d69e869..414d216b8 100644 --- a/middleware/geocodeJSON.js +++ b/middleware/geocodeJSON.js @@ -16,7 +16,7 @@ function setup(peliasConfig, basePath) { config: peliasConfig || require('pelias-config').generate().api, basePath: basePath || '/' }; - + function middleware(req, res, next) { return convertToGeocodeJSON(req, res, next, opts); } @@ -56,6 +56,10 @@ function convertToGeocodeJSON(req, res, next, opts) { // Helpful for debugging and understanding how the input impacts results. res.body.geocoding.query = req.clean; + // remove arrays produced by the tokenizer (only intended to be used internally). + delete res.body.geocoding.query.tokens_complete; + delete res.body.geocoding.query.tokens_incomplete; + // OPTIONAL. Warnings and errors. addMessages(req, 'warnings', res.body.geocoding); addMessages(req, 'errors', res.body.geocoding); diff --git a/package.json b/package.json index 8e325f5dc..f70cb57bd 100644 --- a/package.json +++ b/package.json @@ -68,7 +68,7 @@ "precommit-hook": "^3.0.0", "proxyquire": "^1.4.0", "tap-dot": "1.0.5", - "tape": "^4.4.0" + "tape": "^4.5.1" }, "pre-commit": [ "lint", diff --git a/query/autocomplete.js b/query/autocomplete.js index ffc573967..6d5863a85 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -9,7 +9,9 @@ var views = { ngrams_strict: require('./view/ngrams_strict'), focus_selected_layers: require('./view/focus_selected_layers'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), - phrase_first_tokens_only: require('./view/phrase_first_tokens_only') + phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), + pop_subquery: require('./view/pop_subquery'), + boost_exact_matches: require('./view/boost_exact_matches') }; //------------------------------ @@ -32,14 +34,16 @@ query.score( peliasQuery.view.admin('country_a') ); query.score( peliasQuery.view.admin('region') ); query.score( 
peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin('county') ); +query.score( peliasQuery.view.admin('borough') ); query.score( peliasQuery.view.admin('localadmin') ); query.score( peliasQuery.view.admin('locality') ); query.score( peliasQuery.view.admin('neighbourhood') ); // scoring boost +query.score( views.boost_exact_matches ); query.score( views.focus_selected_layers( views.ngrams_strict ) ); -query.score( peliasQuery.view.popularity( views.ngrams_strict ) ); -query.score( peliasQuery.view.population( views.ngrams_strict ) ); +query.score( peliasQuery.view.popularity( views.pop_subquery ) ); +query.score( peliasQuery.view.population( views.pop_subquery ) ); // non-scoring hard filters query.filter( peliasQuery.view.sources ); @@ -59,29 +63,28 @@ function generateQuery( clean ){ vs.var( 'sources', clean.sources ); } - // mark the name as incomplete (user has not yet typed a comma) - vs.var( 'input:name:isComplete', false ); - - // perform some operations on 'clean.text': - // 1. if there is a space followed by a single char, remove them. - // - this is required as the index uses 2grams and sending 1grams - // - to a 2gram index when using 'type:phrase' or 'operator:and' will - // - result in a complete failure of the query. - // 2. trim leading and trailing whitespace. - var text = clean.text.replace(/( .$)/g,'').trim(); - - // if the input parser has run and suggested a 'parsed_text.name' to use. - if( clean.hasOwnProperty('parsed_text') && clean.parsed_text.hasOwnProperty('name') ){ - - // mark the name as complete (user has already typed a comma) - vs.var( 'input:name:isComplete', true ); - - // use 'parsed_text.name' instead of 'clean.text'. - text = clean.parsed_text.name; + // pass the input tokens to the views so they can choose which tokens + // are relevant for their specific function. 
+ if( check.array( clean.tokens ) ){ + vs.var( 'input:name:tokens', clean.tokens ); + vs.var( 'input:name:tokens_complete', clean.tokens_complete ); + vs.var( 'input:name:tokens_incomplete', clean.tokens_incomplete ); } // input text - vs.var( 'input:name', text ); + vs.var( 'input:name', clean.text ); + + // if the tokenizer has run then we set 'input:name' to as the combination of the + // 'complete' tokens with the 'incomplete' tokens, the resuting array differs + // slightly from the 'input:name:tokens' array as some tokens might have been + // removed in the process; such as single grams which are not present in then + // ngrams index. + if( check.array( clean.tokens_complete ) && check.array( clean.tokens_incomplete ) ){ + var combined = clean.tokens_complete.concat( clean.tokens_incomplete ); + if( combined.length ){ + vs.var( 'input:name', combined.join(' ') ); + } + } // focus point if( check.number(clean['focus.point.lat']) && diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba52a049b..08e33aebb 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -20,20 +20,20 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasPhrase', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 100, - 'phrase:analyzer': 'peliasPhrase', - 'phrase:field': 'phrase.default', + 'phrase:analyzer': 'peliasQueryFullToken', + 'phrase:field': 'name.default', 'phrase:boost': 1, - 'phrase:slop': 2, + 'phrase:slop': 3, 'focus:function': 'linear', 'focus:offset': '0km', 'focus:scale': '250km', 'focus:decay': 0.5, - 'focus:weight': 10, + 'focus:weight': 40, 'function_score:score_mode': 'avg', 'function_score:boost_mode': 'multiply', @@ -82,6 +82,10 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:neighbourhood:field': 'parent.neighbourhood', 'admin:neighbourhood:boost': 200, + 
'admin:borough:analyzer': 'peliasAdmin', + 'admin:borough:field': 'parent.borough', + 'admin:borough:boost': 600, + 'popularity:field': 'popularity', 'popularity:modifier': 'log1p', 'popularity:max_boost': 20, diff --git a/query/reverse_defaults.js b/query/reverse_defaults.js index 306efaac2..06ad64002 100644 --- a/query/reverse_defaults.js +++ b/query/reverse_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasQueryPartialToken', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/query/search_defaults.js b/query/search_defaults.js index 89aca7d69..281d25aee 100644 --- a/query/search_defaults.js +++ b/query/search_defaults.js @@ -20,7 +20,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'boundary:rect:type': 'indexed', 'boundary:rect:_cache': true, - 'ngram:analyzer': 'peliasOneEdgeGram', + 'ngram:analyzer': 'peliasIndexOneEdgeGram', 'ngram:field': 'name.default', 'ngram:boost': 1, diff --git a/query/text_parser.js b/query/text_parser.js index d19465eb3..00e607240 100644 --- a/query/text_parser.js +++ b/query/text_parser.js @@ -8,7 +8,7 @@ when we can't identify parts of an address. This shouldn't contain fields like c or postalcode because we should only try to match those when we're sure that's what they are. */ var adminFields = placeTypes.concat([ - 'region_a', + 'region_a' ]); /** diff --git a/query/view/boost_exact_matches.js b/query/view/boost_exact_matches.js new file mode 100644 index 000000000..9af56cfb3 --- /dev/null +++ b/query/view/boost_exact_matches.js @@ -0,0 +1,40 @@ + +var peliasQuery = require('pelias-query'), + searchDefaults = require('../search_defaults'); + +/** + This view (unfortunately) requires autocomplete to use the phrase.* index. 
+ + ideally we wouldn't need to use this, but at time of writing we are unable + to distinguish between 'complete tokens' and 'grams' in the name.* index. + + this view was introduced in order to score exact matches higher than partial + matches, without it we find results such as "Clayton Avenue" appearing first + in the results list for the query "Clay Av". + + the view uses some of the values from the 'search_defaults.js' file to add an + additional 'SHOULD' condition which scores exact matches slightly higher + than partial matches. +**/ + +module.exports = function( vs ){ + + // make a copy of the variables so we don't interfere with the values + // passed to other views. + var vsCopy = new peliasQuery.Vars( vs.export() ); + + // copy phrase:* values from search defaults + vsCopy.var('phrase:analyzer').set(searchDefaults['phrase:analyzer']); + vsCopy.var('phrase:field').set(searchDefaults['phrase:field']); + + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } + + // set 'input:name' to be only the fully completed characters + vsCopy.var('input:name').set( tokens.join(' ') ); + + return peliasQuery.view.phrase( vsCopy ); +}; diff --git a/query/view/ngrams_last_token_only.js b/query/view/ngrams_last_token_only.js index 3e3315f7a..2665c2940 100644 --- a/query/view/ngrams_last_token_only.js +++ b/query/view/ngrams_last_token_only.js @@ -8,9 +8,6 @@ var peliasQuery = require('pelias-query'), eg. if the input was "100 foo str", then 'input:name' would only be 'str' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. 
- code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. @@ -18,19 +15,17 @@ var peliasQuery = require('pelias-query'), module.exports = function( vs ){ - // Totally disable this view when bool value 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ return null; } + // get a copy of the *tokens_incomplete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_incomplete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable - var name = vs.var('input:name').get(); - // set the 'name' variable in the copy to only the last token - vsCopy.var('input:name').set( name.substr( name.lastIndexOf(' ')+1 ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return ngrams_strict( vsCopy ); diff --git a/query/view/phrase_first_tokens_only.js b/query/view/phrase_first_tokens_only.js index b047b30f4..7ab4539be 100644 --- a/query/view/phrase_first_tokens_only.js +++ b/query/view/phrase_first_tokens_only.js @@ -7,9 +7,6 @@ var peliasQuery = require('pelias-query'); eg. if the input was "100 foo str", then 'input:name' would only be '100 foo' note: it is assumed that the rest of the input is matched using another view. - there is an additional flag 'input:name:isComplete' used to disable this view - selectively, see that section for more info. - code notes: this view makes a copy of the $vs object in order to change their values without mutating the original values, which may be expected in their unaltered form by other views. 
@@ -17,27 +14,17 @@ var peliasQuery = require('pelias-query'); module.exports = function( vs ){ - // Don't mutate the name variable when 'input:name:isComplete' is true. - // This is the case when the user has typed a comma, so we can assume - // that the 'name' part of the query is now complete. - if( vs.var('input:name:isComplete').get() ){ - // return the view rendered using the original vars - return peliasQuery.view.phrase( vs ); - } + // get a copy of the *complete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_complete').get(); + + // no valid tokens to use, fail now, don't render this view. + if( !tokens || tokens.length < 1 ){ return null; } // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); - // get the input 'name' variable and split in to tokens - var name = vs.var('input:name').get(), - tokens = name.split(' '); - - // single token only, abort (we don't want the *last* token) - // return null here will completely disable the view. 
- if( tokens.length < 2 ){ return null; } - // set the 'name' variable in the copy to all but the last token - vsCopy.var('input:name').set( name.substr( 0, name.lastIndexOf(' ') ) ); + vsCopy.var('input:name').set( tokens.join(' ') ); // return the view rendered using the copy return peliasQuery.view.phrase( vsCopy ); diff --git a/query/view/pop_subquery.js b/query/view/pop_subquery.js new file mode 100644 index 000000000..724b773f8 --- /dev/null +++ b/query/view/pop_subquery.js @@ -0,0 +1,17 @@ + +var peliasQuery = require('pelias-query'), + check = require('check-types'); + +/** + Population / Popularity subquery +**/ + +module.exports = function( vs ){ + + var view = peliasQuery.view.ngrams( vs ); + + view.match['name.default'].analyzer = vs.var('phrase:analyzer'); + delete view.match['name.default'].boost; + + return view; +}; diff --git a/sanitiser/_tokenizer.js b/sanitiser/_tokenizer.js new file mode 100644 index 000000000..3312ea056 --- /dev/null +++ b/sanitiser/_tokenizer.js @@ -0,0 +1,112 @@ + +var check = require('check-types'); + +/** + simplified version of the elaticsearch tokenizer, used in order to + be able to detect which tokens are 'complete' (user has finished typing them) + or 'incomplete' (the user has possibly only typed part of the token). + + note: we don't need to strip punctuation as that will be handled on the + elasticsearch side, so sending a token such as 'st.' is not an issue, these + tokens should *not* be modified as the anaylsis can use the punctuation to + infer meaning. + + note: this sanitizer should run *after* the '_text' sanitizer so it can + use the output of clean.parsed_text where available. +**/ +function sanitize( raw, clean ){ + + // error & warning messages + var messages = { errors: [], warnings: [] }; + + // this is the string we will use for analysis + var text = clean.text; + + // a boolean to track whether the input parser successfully ran; or not. 
+ var inputParserRanSuccessfully = false; + + // if the text parser has run then we only tokenize the 'name' section + // of the 'parsed_text' object, ignoring the 'admin' parts. + if( clean.hasOwnProperty('parsed_text') ) { + inputParserRanSuccessfully = true; + + // parsed_text.name is set, this is the highest priority, use this string + if( clean.parsed_text.hasOwnProperty('name') ){ + text = clean.parsed_text.name; // use this string instead + } + + // else handle the case where parsed_text.street was produced but + // no parsed_text.name is produced. + // additionally, handle the case where parsed_text.number is present + // note: the addressit module may also produce parsed_text.unit info + // for now, we discard that information as we don't have an appropriate + else if( clean.parsed_text.hasOwnProperty('street') ){ + text = [ + clean.parsed_text.number, + clean.parsed_text.street + ].filter(function(el){return el;}) + .join(' '); // remove empty elements + } + } + + // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. + clean.tokens = []; + clean.tokens_complete = []; + clean.tokens_incomplete = []; + + // sanity check that the text is valid. + if( check.nonEmptyString( text ) ){ + + // split according to the regex used in the elasticsearch tokenizer + // see: https://github.com/pelias/schema/blob/master/settings.js + // see: settings.analysis.tokenizer.peliasNameTokenizer + clean.tokens = text + .split(/[\s,\\\/]+/) // split on delimeters + .filter(function(el){return el;}); // remove empty elements + } + + /** + the following section splits the tokens in to two arrays called + 'tokens_complete' and 'tokens_incomplete'. + + it also strips any tokens from 'tokens_incomplete' which might not + match the ngrams index (such as single grams not stored in the index). + **/ + + // split the tokens in to 'complete' and 'incomplete'. 
+ if( clean.tokens.length ){ + + // if all the tokens are complete, simply copy them from clean.tokens + if( inputParserRanSuccessfully ){ + + // all these tokens are complete! + clean.tokens_complete = clean.tokens.slice(); + + // user hasn't finished typing yet + } else { + + // make a copy of the tokens and remove the last element + var tokensCopy = clean.tokens.slice(), + lastToken = tokensCopy.pop(); + + // set all but the last token as 'complete' + clean.tokens_complete = tokensCopy; + + /** + if the last token is a single non-numeric character then we must discard it. + + at time of writing, single non-numeric ngrams are not stored in the index, + sending them as part of the query would result in 0 documents being returned. + **/ + if( lastToken && ( lastToken.length > 1 || lastToken.match(/[0-9]/) ) ){ + clean.tokens_incomplete = [ lastToken ]; + } + } + + } + + return messages; +} + +// export function +module.exports = sanitize; diff --git a/sanitiser/autocomplete.js b/sanitiser/autocomplete.js index f96989564..8ab6fd9c7 100644 --- a/sanitiser/autocomplete.js +++ b/sanitiser/autocomplete.js @@ -4,6 +4,7 @@ var sanitizeAll = require('../sanitiser/sanitizeAll'), sanitizers = { singleScalarParameters: require('../sanitiser/_single_scalar_parameters'), text: require('../sanitiser/_text'), + tokenizer: require('../sanitiser/_tokenizer'), size: require('../sanitiser/_size')(10, 10, 10), layers: require('../sanitiser/_targets')('layers', type_mapping.layer_mapping), sources: require('../sanitiser/_targets')('sources', type_mapping.source_mapping), diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index fbe800529..b4cc33d2f 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -7,24 +7,31 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 
'analyzer': 'peliasQueryFullToken', + 'boost': 1, + 'slop': 3, 'query': 'one', - 'type': 'phrase', - 'operator': 'and' + 'type': 'phrase' } } }], 'should':[{ + 'match': { + 'phrase.default': { + 'analyzer': 'peliasPhrase', + 'boost': 1, + 'slop': 3, + 'query': 'one', + 'type': 'phrase' + } + } + },{ 'function_score': { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +52,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_focus.js b/test/unit/fixture/autocomplete_linguistic_focus.js index 4f722b84c..430d43c9d 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus.js +++ b/test/unit/fixture/autocomplete_linguistic_focus.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git 
a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js index d9c04fd13..9a4afc05e 100644 --- a/test/unit/fixture/autocomplete_linguistic_focus_null_island.js +++ b/test/unit/fixture/autocomplete_linguistic_focus_null_island.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,7 +20,7 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -40,7 +40,7 @@ module.exports = { 'decay': 0.5 } }, - 'weight': 10 + 'weight': 40 }], 'score_mode': 'avg', 'boost_mode': 'multiply', @@ -64,11 +64,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -89,11 +86,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 9018fdab6..d0465b043 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -6,11 +6,11 @@ module.exports = { 'bool': { 'must': [{ 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } @@ -18,7 +18,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 
'query': 'three', 'type': 'phrase', @@ -26,16 +26,25 @@ module.exports = { } } }], - 'should':[{ + 'should':[ + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'one two' + } + } + }, + { 'function_score': { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -56,11 +65,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two three', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_only.js b/test/unit/fixture/autocomplete_linguistic_only.js index 24b89ad96..4360f4d7e 100644 --- a/test/unit/fixture/autocomplete_linguistic_only.js +++ b/test/unit/fixture/autocomplete_linguistic_only.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 'phrase', @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 245f64514..7cb51eea9 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -7,11 +7,11 @@ module.exports = { 'must': [ { 'match': { - 'phrase.default': { - 
'analyzer': 'peliasPhrase', + 'name.default': { + 'analyzer': 'peliasQueryFullToken', 'type': 'phrase', 'boost': 1, - 'slop': 2, + 'slop': 3, 'query': 'one two' } } @@ -54,6 +54,15 @@ module.exports = { } } }, + { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 600, + 'query': 'three' + } + } + }, { 'match': { 'parent.localadmin': { @@ -81,16 +90,24 @@ module.exports = { } } }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'one two' + } + } + }, { 'function_score': { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -114,11 +131,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'one two', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js new file mode 100644 index 000000000..f89e84935 --- /dev/null +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -0,0 +1,155 @@ + +module.exports = { + 'query': { + 'filtered': { + 'query': { + 'bool': { + 'must': [{ + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'type': 'phrase', + 'boost': 1, + 'slop': 3, + 'query': 'k road' + } + } + }], + 'should':[ + { + 'match': { + 'address_parts.street': { + 'query': 'k road', + 'boost': 5, + 'analyzer': 'peliasStreet' + } + } + }, { + 'match': { + 'parent.country': { + 'query': 'laird', + 'boost': 800, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.region_a': { + 'query': 'laird', + 'boost': 600, + 'analyzer': 'peliasAdmin' + } + } + }, 
{ + 'match': { + 'parent.county': { + 'query': 'laird', + 'boost': 400, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.borough': { + 'analyzer': 'peliasAdmin', + 'boost': 600, + 'query': 'laird' + } + } + }, { + 'match': { + 'parent.localadmin': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.locality': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, { + 'match': { + 'parent.neighbourhood': { + 'query': 'laird', + 'boost': 200, + 'analyzer': 'peliasAdmin' + } + } + }, + { + 'match': { + 'phrase.default': { + 'analyzer' : 'peliasPhrase', + 'type' : 'phrase', + 'boost' : 1, + 'slop' : 3, + 'query' : 'k road' + } + } + }, + { + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'popularity', + 'missing': 1 + }, + 'weight': 1 + }] + } + },{ + 'function_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'k road', + } + } + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'population', + 'missing': 1 + }, + 'weight': 3 + }] + } + }] + } + } + } + }, + 'sort': [ '_score' ], + 'size': 20, + 'track_scores': true +}; diff --git a/test/unit/fixture/autocomplete_with_source_filtering.js b/test/unit/fixture/autocomplete_with_source_filtering.js index 22c12a5d8..075eb6d46 100644 --- a/test/unit/fixture/autocomplete_with_source_filtering.js +++ b/test/unit/fixture/autocomplete_with_source_filtering.js @@ -7,7 +7,7 @@ module.exports = { 'must': [{ 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', + 'analyzer': 'peliasQueryPartialToken', 'boost': 100, 'query': 'test', 'type': 
'phrase', @@ -20,11 +20,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, @@ -45,11 +42,8 @@ module.exports = { 'query': { 'match': { 'name.default': { - 'analyzer': 'peliasPhrase', - 'boost': 100, + 'analyzer': 'peliasQueryFullToken', 'query': 'test', - 'type': 'phrase', - 'operator': 'and' } } }, diff --git a/test/unit/fixture/search_boundary_country.js b/test/unit/fixture/search_boundary_country.js index 4bf453153..71965df41 100644 --- a/test/unit/fixture/search_boundary_country.js +++ b/test/unit/fixture/search_boundary_country.js @@ -18,7 +18,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } } diff --git a/test/unit/fixture/search_full_address.js b/test/unit/fixture/search_full_address.js index bef0a7b00..dfd64e345 100644 --- a/test/unit/fixture/search_full_address.js +++ b/test/unit/fixture/search_full_address.js @@ -9,7 +9,7 @@ module.exports = { 'match': { 'name.default': { 'query': '123 main st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_linguistic_bbox.js b/test/unit/fixture/search_linguistic_bbox.js index 5bb5907cf..6afe7be6d 100644 --- a/test/unit/fixture/search_linguistic_bbox.js +++ b/test/unit/fixture/search_linguistic_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus.js b/test/unit/fixture/search_linguistic_focus.js index 5d03d66db..da3e8fb39 100644 --- a/test/unit/fixture/search_linguistic_focus.js +++ b/test/unit/fixture/search_linguistic_focus.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 
'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_bbox.js b/test/unit/fixture/search_linguistic_focus_bbox.js index 96fe92f6a..d5042c0f5 100644 --- a/test/unit/fixture/search_linguistic_focus_bbox.js +++ b/test/unit/fixture/search_linguistic_focus_bbox.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_focus_null_island.js b/test/unit/fixture/search_linguistic_focus_null_island.js index 0924475d5..b99febea4 100644 --- a/test/unit/fixture/search_linguistic_focus_null_island.js +++ b/test/unit/fixture/search_linguistic_focus_null_island.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_only.js b/test/unit/fixture/search_linguistic_only.js index 58c05826a..a564a4c17 100644 --- a/test/unit/fixture/search_linguistic_only.js +++ b/test/unit/fixture/search_linguistic_only.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/fixture/search_linguistic_viewport.js b/test/unit/fixture/search_linguistic_viewport.js index be76ab056..b85d83225 100644 --- a/test/unit/fixture/search_linguistic_viewport.js +++ b/test/unit/fixture/search_linguistic_viewport.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js index cf44d0d80..e6b50ac6d 100644 --- 
a/test/unit/fixture/search_linguistic_viewport_min_diagonal.js +++ b/test/unit/fixture/search_linguistic_viewport_min_diagonal.js @@ -7,7 +7,7 @@ module.exports = { { 'match': { 'name.default': { - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1, 'query': 'test' } diff --git a/test/unit/fixture/search_partial_address.js b/test/unit/fixture/search_partial_address.js index 290d28e54..746899b74 100644 --- a/test/unit/fixture/search_partial_address.js +++ b/test/unit/fixture/search_partial_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': 'soho grand', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_regions_address.js b/test/unit/fixture/search_regions_address.js index 343dfc43e..0a8b199da 100644 --- a/test/unit/fixture/search_regions_address.js +++ b/test/unit/fixture/search_regions_address.js @@ -10,7 +10,7 @@ module.exports = { 'match': { 'name.default': { 'query': '1 water st', - 'analyzer': 'peliasOneEdgeGram', + 'analyzer': 'peliasIndexOneEdgeGram', 'boost': 1 } } diff --git a/test/unit/fixture/search_with_source_filtering.js b/test/unit/fixture/search_with_source_filtering.js index 593eac5b6..18ee13a3b 100644 --- a/test/unit/fixture/search_with_source_filtering.js +++ b/test/unit/fixture/search_with_source_filtering.js @@ -9,7 +9,7 @@ module.exports = { 'name.default': { 'query': 'test', 'boost': 1, - 'analyzer': 'peliasOneEdgeGram' + 'analyzer': 'peliasIndexOneEdgeGram' } } }], diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index f4b44ffab..bb368fc95 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -13,25 +13,31 @@ module.exports.tests.interface = function(test, common) { module.exports.tests.query = function(test, common) { test('valid lingustic-only autocomplete', function(t) { var query = generate({ - text: 'test' + text: 'test', + tokens: 
['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_only'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_only'); t.end(); }); test('valid lingustic autocomplete with 3 tokens', function(t) { var query = generate({ - text: 'one two three' + text: 'one two three', + tokens: ['one','two','three'], + tokens_complete: ['one','two'], + tokens_incomplete: ['three'] }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_multiple_tokens.js'); + var expected = require('../fixture/autocomplete_linguistic_multiple_tokens'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_multiple_tokens'); t.end(); }); @@ -42,13 +48,16 @@ module.exports.tests.query = function(test, common) { name: 'one two', regions: [ 'one two', 'three' ], admin_parts: 'three' - } + }, + tokens: ['one','two'], + tokens_complete: ['one','two'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_with_admin.js'); + var expected = require('../fixture/autocomplete_linguistic_with_admin'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_with_admin'); t.end(); }); @@ -57,13 +66,16 @@ module.exports.tests.query = function(test, common) { // note: if 1 grams are enabled at a later date, remove this behaviour. 
test('valid lingustic autocomplete final token', function(t) { var query = generate({ - text: 'one t' + text: 'one t', + tokens: ['one','t'], + tokens_complete: ['one'], + tokens_incomplete: [] }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/autocomplete_linguistic_final_token.js'); + var expected = require('../fixture/autocomplete_linguistic_final_token'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_final_token'); t.end(); }); @@ -71,13 +83,16 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 29.49136, - 'focus.point.lon': -82.50622 + 'focus.point.lon': -82.50622, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus'); t.end(); }); @@ -85,20 +100,26 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'test', 'focus.point.lat': 0, - 'focus.point.lon': 0 + 'focus.point.lon': 0, + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/autocomplete_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid autocomplete query'); + t.deepEqual(compiled, expected, 'autocomplete_linguistic_focus_null_island'); t.end(); }); test('valid sources filter', function(t) { var query = generate({ 'text': 'test', - 'sources': ['test_source'] + 'sources': ['test_source'], + tokens: ['test'], + tokens_complete: [], + tokens_incomplete: ['test'] }); var compiled = JSON.parse( JSON.stringify( query ) ); @@ -107,6 +128,26 @@ module.exports.tests.query = function(test, common) { 
t.deepEqual(compiled, expected, 'valid autocomplete query with source filtering'); t.end(); }); + + test('single character street address', function(t) { + var query = generate({ + text: 'k road, laird', + parsed_text: { + name: 'k road', + street: 'k road', + regions: [ 'laird' ] + }, + tokens: ['k', 'road'], + tokens_complete: ['k', 'road'], + tokens_incomplete: [] + }); + + var compiled = JSON.parse( JSON.stringify( query ) ); + var expected = require('../fixture/autocomplete_single_character_street'); + + t.deepEqual(compiled, expected, 'autocomplete_single_character_street'); + t.end(); + }); }; module.exports.all = function (tape, common) { diff --git a/test/unit/query/search.js b/test/unit/query/search.js index 426eb2bcc..a2bb8e2f8 100644 --- a/test/unit/query/search.js +++ b/test/unit/query/search.js @@ -25,7 +25,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_bbox'); t.end(); }); @@ -42,7 +42,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_bbox'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_bbox'); t.end(); }); @@ -55,7 +55,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_only'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_only'); t.end(); }); @@ -69,7 +69,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus'); - t.deepEqual(compiled, 
expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus'); t.end(); }); @@ -86,7 +86,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_viewport'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_viewport'); t.end(); }); @@ -119,7 +119,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_null_island'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_linguistic_focus_null_island'); t.end(); }); @@ -134,7 +134,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_full_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_full_address'); t.end(); }); @@ -149,7 +149,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_partial_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_partial_address'); t.end(); }); @@ -164,7 +164,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_regions_address'); - t.deepEqual(compiled, expected, 'valid search query'); + t.deepEqual(compiled, expected, 'search_regions_address'); t.end(); }); diff --git a/test/unit/run.js b/test/unit/run.js index 94d9ebb35..1a6f7a904 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -46,6 +46,7 @@ var tests = [ require('./sanitiser/_sources'), require('./sanitiser/_sources_and_layers'), 
require('./sanitiser/_text'), + require('./sanitiser/_tokenizer'), require('./sanitiser/_deprecate_quattroshapes'), require('./src/backend'), require('./sanitiser/autocomplete'), diff --git a/test/unit/sanitiser/_tokenizer.js b/test/unit/sanitiser/_tokenizer.js new file mode 100644 index 000000000..8837d4abf --- /dev/null +++ b/test/unit/sanitiser/_tokenizer.js @@ -0,0 +1,457 @@ +var sanitiser = require('../../../sanitiser/_tokenizer'); + +module.exports.tests = {}; + +module.exports.tests.sanity_checks = function(test, common) { + test('clean.text not set', function(t) { + + var clean = {}; // clean.text not set + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.text not a string', function(t) { + + var clean = { text: {} }; // clean.text not a string + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('empty string', function(t) { + + var clean = { text: '' }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('clean.parsed_text set but 
clean.parsed_text.name invalid', function(t) { + + var clean = { parsed_text: { text: {} } }; + var messages = sanitiser({}, clean); + + // no tokens produced + t.deepEquals(clean.tokens, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [], 'no tokens'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.text', function(t) { + + var clean = { parsed_text: { name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text street data over clean.text', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over clean.text + t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { + + var clean = { parsed_text: { number: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; + var messages = sanitiser({}, clean); + + // favor clean.parsed_text.name over all other variables + 
t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.space_delimiter = function(test, common) { + test('space delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('space delimiter - multiple spaces / other whitespace', function(t) { + + var clean = { text: ' 30 west \t26th \nstreet new york ' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.comma_delimiter = function(test, common) { + test('comma 
delimiter - simple', function(t) { + + var clean = { text: '30 west 26th street, new york' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('comma delimiter - multiple commas', function(t) { + + var clean = { text: ',30 west 26th street,,, new york,' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + '30', + 'west', + '26th', + 'street', + 'new', + 'york' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + '30', + 'west', + '26th', + 'street', + 'new' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'york' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.forward_slash_delimiter = function(test, common) { + test('forward slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street/133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + 
t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('forward slash - multiple slashes', function(t) { + + var clean = { text: '/Bedell Street//133rd Avenue/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'Bedell', + 'Street', + '133rd' + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.final_token_single_gram = function(test, common) { + test('final token single gram - numeric', function(t) { + + var clean = { text: 'grolmanstrasse 1' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + '1' + ], 'tokens produced'); + + // all but last token marked as 'complete' + t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token marked as 'incomplete' + t.deepEquals(clean.tokens_incomplete, [ + '1' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('final token single gram - non-numeric', function(t) { + + var clean = { text: 'grolmanstrasse a' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'grolmanstrasse', + 'a' + ], 'tokens produced'); + + // all but last token marked as 'complete' + 
t.deepEquals(clean.tokens_complete, [ + 'grolmanstrasse', + ], 'tokens produced'); + + // last token removed! + t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.back_slash_delimiter = function(test, common) { + test('back slash delimiter - simple', function(t) { + + var clean = { text: 'Bedell Street\\133rd Avenue' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('back slash - multiple slashes', function(t) { + + var clean = { text: '\\Bedell Street\\\\133rd Avenue\\' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.tests.mixed_delimiter = function(test, common) { + test('mixed delimiters', function(t) { + + var clean = { text: ',/Bedell Street\\, \n\t ,\\//133rd Avenue, /\n/' }; + var messages = sanitiser({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, [ + 'Bedell', + 'Street', + '133rd', + 'Avenue' + ], 'tokens produced'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + function test(name, testFunction) { + return tape('SANITISER _tokenizer: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + 
module.exports.tests[testCase](test, common); + } +}; diff --git a/test/unit/sanitiser/autocomplete.js b/test/unit/sanitiser/autocomplete.js index 26bf9afb4..186cb4b67 100644 --- a/test/unit/sanitiser/autocomplete.js +++ b/test/unit/sanitiser/autocomplete.js @@ -4,7 +4,10 @@ module.exports.tests = {}; module.exports.tests.sanitisers = function(test, common) { test('check sanitiser list', function (t) { - var expected = ['singleScalarParameters', 'text', 'size', 'layers', 'sources', 'sources_and_layers', 'private', 'geo_autocomplete' ]; + var expected = [ + 'singleScalarParameters', 'text', 'tokenizer', 'size', 'layers', 'sources', + 'sources_and_layers', 'private', 'geo_autocomplete' + ]; t.deepEqual(Object.keys(autocomplete.sanitiser_list), expected); t.end(); });