Skip to content

Commit

Permalink
Watch out for both upper and lowercase dotted I
Browse files Browse the repository at this point in the history
This issue has some more info.
osmlab/name-suggestion-index#8261

Don't know whether this letter is used by osm-community-index communities,
but we might as well all use the same simplify.js code
  • Loading branch information
bhousel committed Jun 13, 2023
1 parent bed76e7 commit b7d3346
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 2 deletions.
1 change: 1 addition & 0 deletions index.mjs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
export { resolveStrings } from './lib/resolve_strings.js';
export { simplify } from './lib/simplify.js';
5 changes: 3 additions & 2 deletions lib/simplify.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// External
import diacritics from 'diacritics';

// remove spaces, punctuation, diacritics
Expand All @@ -8,8 +9,8 @@ export function simplify(str) {
return diacritics.remove(
str
.replace(/&/g, 'and')
.replace(/İ/ig, 'i')
.replace(/[\s\-=_!"#%'*{},.\/:;?\(\)\[\]@\\$\^*+<>«»~`’\u00a1\u00a7\u00b6\u00b7\u00bf\u037e\u0387\u055a-\u055f\u0589\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d\u07f7-\u07f9\u0830-\u083e\u085e\u0964\u0965\u0970\u0af0\u0df4\u0e4f\u0e5a\u0e5b\u0f04-\u0f12\u0f14\u0f85\u0fd0-\u0fd4\u0fd9\u0fda\u104a-\u104f\u10fb\u1360-\u1368\u166d\u166e\u16eb-\u16ed\u1735\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u1805\u1807-\u180a\u1944\u1945\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-\u1b60\u1bfc-\u1bff\u1c3b-\u1c3f\u1c7e\u1c7f\u1cc0-\u1cc7\u1cd3\u200b-\u200f\u2016\u2017\u2020-\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2d70\u2e00\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e16\u2e18\u2e19\u2e1b\u2e1e\u2e1f\u2e2a-\u2e2e\u2e30-\u2e39\u3001-\u3003\u303d\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uaaf0\uaaf1\uabeb\ufe10-\ufe16\ufe19\ufe30\ufe45\ufe46\ufe49-\ufe4c\ufe50-\ufe52\ufe54-\ufe57\ufe5f-\ufe61\ufe68\ufe6a\ufe6b\ufeff\uff01-\uff03\uff05-\uff07\uff0a\uff0c\uff0e\uff0f\uff1a\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65]+/g,'')
.replace(/(İ|i̇)/ig, 'i') // see name-suggestion-index#5017, #8261
.replace(/[\s\-=_!"#%'*{},.\/:;?\(\)\[\]@\\$\^*+<>«»~`’\u00a1\u00a7\u00b6\u00b7\u00bf\u037e\u0387\u055a-\u055f\u0589\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d\u07f7-\u07f9\u0830-\u083e\u085e\u0964\u0965\u0970\u0af0\u0df4\u0e4f\u0e5a\u0e5b\u0f04-\u0f12\u0f14\u0f85\u0fd0-\u0fd4\u0fd9\u0fda\u104a-\u104f\u10fb\u1360-\u1368\u166d\u166e\u16eb-\u16ed\u1735\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u1805\u1807-\u180a\u1944\u1945\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-\u1b60\u1bfc-\u1bff\u1c3b-\u1c3f\u1c7e\u1c7f\u1cc0-\u1cc7\u1cd3\u2000-\u206f\u2cf9-\u2cfc\u2cfe\u2cff\u2d70\u2e00-\u2e7f\u3001-\u3003\u303d\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uaaf0\uaaf1\uabeb\ufe10-\ufe16\ufe19\ufe30\ufe45\ufe46\ufe49-\ufe4c\ufe50-\ufe52\ufe54-\ufe57\ufe5f-\ufe61\ufe68\ufe6a\ufe6b\ufeff\uff01-\uff03\uff05-\uff07\uff0a\uff0c\uff0e\uff0f\uff1a\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65]+/g,'')
.toLowerCase()
);
}
63 changes: 63 additions & 0 deletions tests/simplify.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import { test } from 'tap';
import { simplify } from '../index.mjs';

// Test suite for `simplify`: one tap subtest per normalization rule.
test('simplify', t => {

  // Registers a subtest called `label` that asserts, in order, that
  // simplify(input) strictly equals `expected` for every [input, expected] pair.
  const check = (label, pairs) => {
    t.test(label, st => {
      for (const [input, expected] of pairs) {
        st.equal(simplify(input), expected);
      }
      st.end();
    });
  };

  check('lowercases', [
    ['Aldo', 'aldo'],
  ]);

  check('replaces diacritics', [
    ['André', 'andre'],
  ]);

  check('removes spaces', [
    ['Jimmy Choo', 'jimmychoo'],
  ]);

  check('removes various dashes', [
    ['PTV - Metropolitan', 'ptvmetropolitan'],   // hyphen
    ['PTV – Metropolitan', 'ptvmetropolitan'],   // en dash (U+2013)
    ['PTV — Metropolitan', 'ptvmetropolitan'],   // em dash (U+2014)
    ['PTV ― Metropolitan', 'ptvmetropolitan'],   // horizontal bar (U+2015)
  ]);

  check('removes unprintable unicode (like RTL/LTR marks, zero width space, zero width nonjoiner)', [
    ['\u200FJim\u200Bmy\u200CChoo\u200E', 'jimmychoo'],
  ]);

  check('removes punctuation', [
    ['K+K Schuh-Center', 'kkschuhcenter'],
  ]);

  check('replaces & with and', [
    ['Johnston & Murphy', 'johnstonandmurphy'],
  ]);

  check('replaces ß (eszett) with ss', [
    ['Beßon', 'besson'],
  ]);

  // Examples come from name-suggestion-index #5017 and #8261.
  check('replaces İ (0130) or i̇ (0069 0307) with i', [
    ['İnşaat', 'insaat'],
    ['i̇nşaat', 'insaat'],
  ]);

  // Written out by hand so the first call really passes no argument at all.
  t.test('returns empty string if no input', st => {
    st.equal(simplify(), '');
    st.equal(simplify(null), '');
    st.equal(simplify({}), '');
    st.end();
  });

  t.end();
});

0 comments on commit b7d3346

Please sign in to comment.