Skip to content

Commit

Permalink
🐛 Fix the error on special chars in search strings (#2261)
Browse files (browse the repository at this point in the history)
  • Loading branch information
Koncopd authored Dec 6, 2024
1 parent f68b8b6 commit 93e3499
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 5 deletions.
38 changes: 38 additions & 0 deletions docs/faq/search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@
"bt.CellType.import_source()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a95bcef6",
"metadata": {},
"outputs": [],
"source": [
"ln.ULabel(name=\"cat[*_*]\").save()"
]
},
{
"cell_type": "markdown",
"id": "799b528e",
Expand Down Expand Up @@ -97,6 +107,34 @@
" assert query in top_record.name.lower() or query in top_record.synonyms.lower()"
]
},
{
"cell_type": "markdown",
"id": "00427176",
"metadata": {},
"source": [
"Check escaping of special characters."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cea360c6",
"metadata": {},
"outputs": [],
"source": [
"assert len(ln.ULabel.search(\"cat[\")) == 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "04f1e84a",
"metadata": {},
"outputs": [],
"source": [
"assert len(ln.ULabel.search(\"*_*\")) == 1"
]
},
{
"cell_type": "markdown",
"id": "f4e41c9b",
Expand Down
12 changes: 7 additions & 5 deletions lamindb/_record.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import builtins
import re
from functools import reduce
from typing import TYPE_CHECKING, NamedTuple

Expand Down Expand Up @@ -316,6 +317,7 @@ def _search(
string = string[:n_80_pct]

string = string.strip()
string_escape = re.escape(string)

exact_lookup = Exact if case_sensitive else IExact
regex_lookup = Regex if case_sensitive else IRegex
Expand All @@ -334,28 +336,28 @@ def _search(
exact_rank = Cast(exact_expr, output_field=IntegerField()) * 200
ranks.append(exact_rank)
# exact synonym
synonym_expr = regex_lookup(field_expr, rf"(?:^|.*\|){string}(?:\|.*|$)")
synonym_expr = regex_lookup(field_expr, rf"(?:^|.*\|){string_escape}(?:\|.*|$)")
synonym_rank = Cast(synonym_expr, output_field=IntegerField()) * 200
ranks.append(synonym_rank)
# match as sub-phrase
sub_expr = regex_lookup(
field_expr, rf"(?:^|.*[ \|\.,;:]){string}(?:[ \|\.,;:].*|$)"
field_expr, rf"(?:^|.*[ \|\.,;:]){string_escape}(?:[ \|\.,;:].*|$)"
)
sub_rank = Cast(sub_expr, output_field=IntegerField()) * 10
ranks.append(sub_rank)
# startswith and avoid matching string with " " on the right
# mostly for truncated
startswith_expr = regex_lookup(
field_expr, rf"(?:^|.*\|){string}[^ ]*(?:\|.*|$)"
field_expr, rf"(?:^|.*\|){string_escape}[^ ]*(?:\|.*|$)"
)
startswith_rank = Cast(startswith_expr, output_field=IntegerField()) * 8
ranks.append(startswith_rank)
# match as sub-phrase from the left, mostly for truncated
right_expr = regex_lookup(field_expr, rf"(?:^|.*[ \|]){string}.*")
right_expr = regex_lookup(field_expr, rf"(?:^|.*[ \|]){string_escape}.*")
right_rank = Cast(right_expr, output_field=IntegerField()) * 2
ranks.append(right_rank)
# match as sub-phrase from the right
left_expr = regex_lookup(field_expr, rf".*{string}(?:$|[ \|\.,;:].*)")
left_expr = regex_lookup(field_expr, rf".*{string_escape}(?:$|[ \|\.,;:].*)")
left_rank = Cast(left_expr, output_field=IntegerField()) * 2
ranks.append(left_rank)
# simple contains filter
Expand Down

0 comments on commit 93e3499

Please sign in to comment.