diff --git a/docs/faq/search.ipynb b/docs/faq/search.ipynb index c048624f3..80a9104e1 100644 --- a/docs/faq/search.ipynb +++ b/docs/faq/search.ipynb @@ -49,6 +49,16 @@ "bt.CellType.import_source()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a95bcef6", + "metadata": {}, + "outputs": [], + "source": [ + "ln.ULabel(name=\"cat[*_*]\").save()" + ] + }, { "cell_type": "markdown", "id": "799b528e", @@ -97,6 +107,34 @@ " assert query in top_record.name.lower() or query in top_record.synonyms.lower()" ] }, + { + "cell_type": "markdown", + "id": "00427176", + "metadata": {}, + "source": [ + "Check escaping of special characters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cea360c6", + "metadata": {}, + "outputs": [], + "source": [ + "assert len(ln.ULabel.search(\"cat[\")) == 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04f1e84a", + "metadata": {}, + "outputs": [], + "source": [ + "assert len(ln.ULabel.search(\"*_*\")) == 1" + ] + }, { "cell_type": "markdown", "id": "f4e41c9b", diff --git a/lamindb/_record.py b/lamindb/_record.py index 76307f064..2f47566b0 100644 --- a/lamindb/_record.py +++ b/lamindb/_record.py @@ -1,6 +1,7 @@ from __future__ import annotations import builtins +import re from functools import reduce from typing import TYPE_CHECKING, NamedTuple @@ -316,6 +317,7 @@ def _search( string = string[:n_80_pct] string = string.strip() + string_escape = re.escape(string) exact_lookup = Exact if case_sensitive else IExact regex_lookup = Regex if case_sensitive else IRegex @@ -334,28 +336,28 @@ def _search( exact_rank = Cast(exact_expr, output_field=IntegerField()) * 200 ranks.append(exact_rank) # exact synonym - synonym_expr = regex_lookup(field_expr, rf"(?:^|.*\|){string}(?:\|.*|$)") + synonym_expr = regex_lookup(field_expr, rf"(?:^|.*\|){string_escape}(?:\|.*|$)") synonym_rank = Cast(synonym_expr, output_field=IntegerField()) * 200 ranks.append(synonym_rank) # match as sub-phrase sub_expr = regex_lookup( - field_expr, rf"(?:^|.*[ \|\.,;:]){string}(?:[ \|\.,;:].*|$)" + field_expr, rf"(?:^|.*[ \|\.,;:]){string_escape}(?:[ \|\.,;:].*|$)" ) sub_rank = Cast(sub_expr, output_field=IntegerField()) * 10 ranks.append(sub_rank) # startswith and avoid matching string with " " on the right # mostly for truncated startswith_expr = regex_lookup( - field_expr, rf"(?:^|.*\|){string}[^ ]*(?:\|.*|$)" + field_expr, rf"(?:^|.*\|){string_escape}[^ ]*(?:\|.*|$)" ) startswith_rank = Cast(startswith_expr, output_field=IntegerField()) * 8 ranks.append(startswith_rank) # match as sub-phrase from the left, mostly for truncated - right_expr = regex_lookup(field_expr, rf"(?:^|.*[ \|]){string}.*") + right_expr = regex_lookup(field_expr, rf"(?:^|.*[ \|]){string_escape}.*") right_rank = Cast(right_expr, output_field=IntegerField()) * 2 ranks.append(right_rank) # match as sub-phrase from the right - left_expr = regex_lookup(field_expr, rf".*{string}(?:$|[ \|\.,;:].*)") + left_expr = regex_lookup(field_expr, rf".*{string_escape}(?:$|[ \|\.,;:].*)") left_rank = Cast(left_expr, output_field=IntegerField()) * 2 ranks.append(left_rank) # simple contains filter