feat: Search functionality - Backend #91

Merged · 33 commits · Oct 4, 2022
Changes from 26 commits

Commits (33)
e82e7d6
Add new paths
aadarsh-ram Sep 11, 2022
595e2c0
Merge branch 'main' into update-add-delete-paths
aadarsh-ram Sep 11, 2022
38c0a88
Change None to empty list
aadarsh-ram Sep 12, 2022
ecb3e59
Merge branch 'update-add-delete-paths' of https://github.com/openfood…
aadarsh-ram Sep 12, 2022
aa52b37
Add requested changes
aadarsh-ram Sep 13, 2022
06cedbc
Change query style
aadarsh-ram Sep 13, 2022
e3d6d51
Add search (id index only)
aadarsh-ram Sep 23, 2022
610a763
Merge branch 'main' into search-backend
aadarsh-ram Sep 23, 2022
6ac910c
Minor changes
aadarsh-ram Sep 23, 2022
3fb3274
Merge branch 'search-backend' of https://github.com/openfoodfacts/tax…
aadarsh-ram Sep 23, 2022
23eba18
Changes for passing tests
aadarsh-ram Sep 23, 2022
531b1e8
Change line max limit for flake8
aadarsh-ram Sep 23, 2022
4e9085e
Remove whitespace
aadarsh-ram Sep 23, 2022
d9b9eac
Change import order
aadarsh-ram Sep 23, 2022
bcc9a8b
Add sys import
aadarsh-ram Sep 23, 2022
cde0d4f
Black required changes
aadarsh-ram Sep 23, 2022
fb85e89
Add indexes for tags
aadarsh-ram Sep 24, 2022
dced5cb
Remove whitespace
aadarsh-ram Sep 24, 2022
a72cc6e
Sort imports
aadarsh-ram Sep 24, 2022
e266e58
Update import order
aadarsh-ram Sep 24, 2022
64510d1
Add black reformatted file
aadarsh-ram Sep 24, 2022
45f87e5
Change requirements.txt
aadarsh-ram Sep 24, 2022
c8a3346
Revert to default
aadarsh-ram Sep 28, 2022
15c256c
Change test-neo4j.json
aadarsh-ram Sep 28, 2022
41bc497
Change comment
aadarsh-ram Sep 28, 2022
137d690
Add query for normalized strings
aadarsh-ram Sep 29, 2022
ec8f6ec
Change normalizing function
aadarsh-ram Sep 29, 2022
ccb7141
Interchange var names
aadarsh-ram Sep 29, 2022
0f011b8
Remove normalizing hyphens
aadarsh-ram Oct 2, 2022
7659775
Merge branch 'main' into search-backend
aadarsh-ram Oct 2, 2022
ab71042
Change index config for hyphens
aadarsh-ram Oct 4, 2022
643c967
Merge branch 'search-backend' of https://github.com/openfoodfacts/tax…
aadarsh-ram Oct 4, 2022
911491a
Remove trailing whitespace
aadarsh-ram Oct 4, 2022
7 changes: 6 additions & 1 deletion backend/editor/api.py
@@ -14,7 +14,7 @@

# DB helper imports
from .entries import initialize_db, shutdown_db
from .entries import get_all_nodes, get_nodes, get_children, get_parents, get_label
from .entries import get_all_nodes, get_nodes, get_children, get_parents, get_label, full_text_search
from .entries import update_nodes, update_node_children
from .entries import create_node, add_node_to_end, add_node_to_beginning, delete_node
#------------------------------------------------------------------------#
@@ -187,6 +187,11 @@ async def findFooter(response: Response):
footer = list(result)
return footer[0]

@app.get("/search")
async def searchNode(response: Response, query: str):
result = full_text_search(query)
return result

# Post methods

@app.post("/nodes")
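As a quick check of the new /search route, here is a minimal usage sketch with the requests library; the base URL is an assumption for a local dev setup, adjust it to yours.

# Minimal sketch, not part of the diff: query the new /search endpoint.
import requests

BASE_URL = "http://localhost:8080"  # assumption: local address of the taxonomy editor backend

response = requests.get(f"{BASE_URL}/search", params={"query": "yogurt"})
response.raise_for_status()
print(response.json())  # node ids returned by full_text_search, best matches first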
48 changes: 48 additions & 0 deletions backend/editor/entries.py
@@ -4,6 +4,7 @@
import re
from neo4j import GraphDatabase # Interface with Neo4J
from . import settings # Neo4J settings
from normalizer import normalizing # Normalizing tags

def initialize_db():
"""
@@ -242,3 +243,50 @@ def update_node_children(entry, new_children_ids):
result = session.run(query, {"id": entry, "child": child})

return result

def full_text_search(text):
"""
Helper function used for searching a taxonomy
"""
# Escape special characters
normalized_text = re.sub(r"([^A-Za-z0-9 _])", r"\\\1", text)
normalized_id_text = re.sub(r"([^A-Za-z0-9 _])", r"\\\1", normalizing(text))

text_query_exact = "*" + normalized_text + "*"
text_query_fuzzy = normalized_text + "~"
text_id_query_exact = "*" + normalized_id_text + "*"
text_id_query_fuzzy = normalized_id_text + "~"
params = {
"text_query_fuzzy" : text_query_fuzzy,
"text_query_exact" : text_query_exact,
"text_id_query_fuzzy" : text_id_query_fuzzy,
"text_id_query_exact" : text_id_query_exact
}

# Run both a fuzzy search and a wildcard ("*") search on the two indexes
# Fuzzy matches are weighted higher, since they indicate closer matches to the query
query = """
CALL {
CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_fuzzy)
yield node, score as score_
return node, score_ * 3 as score
UNION
CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_fuzzy)
yield node, score as score_
return node, score_ * 5 as score
UNION
CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_exact)
yield node, score as score_
return node, score_ as score
UNION
CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_exact)
yield node, score as score_
return node, score_ as score
}
with node.id as node, score
RETURN node, avg(score) as score
ORDER BY score DESC
"""
result = [record["node"] for record in session.run(query, params)]
return result
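For reference, a standalone sketch of the Lucene query strings built above; the normalized id is hand-written here instead of calling normalizing(), so the values are illustrative only.

# Sketch of the query-string construction used by full_text_search.
import re

text = "fake meat?"
escaped_text = re.sub(r"([^A-Za-z0-9 _])", r"\\\1", text)      # escapes "?" -> "fake meat\?"
escaped_id = re.sub(r"([^A-Za-z0-9 _])", r"\\\1", "fake-meat")  # "fake-meat" stands in for normalizing(text)

queries = {
    "text_query_exact": "*" + escaped_text + "*",   # wildcard match on the tag index
    "text_query_fuzzy": escaped_text + "~",         # Lucene fuzzy match on the tag index
    "text_id_query_exact": "*" + escaped_id + "*",  # wildcard match on the id index
    "text_id_query_fuzzy": escaped_id + "~",        # Lucene fuzzy match on the id index
}
print(queries)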
30 changes: 30 additions & 0 deletions backend/editor/normalizer.py
@@ -0,0 +1,30 @@
import re
import unicodedata
import unidecode

def normalizing(line, lang="default"):
"""normalize a string depending of the language code lang"""
line = unicodedata.normalize("NFC", line)

# remove accents
if lang in ["fr", "ca", "es", "it", "nl", "pt", "sk", "en"]:
line = re.sub(r"[¢£¤¥§©ª®°²³µ¶¹º¼½¾×‰€™]", "-", line)
line = unidecode.unidecode(line)

# lower case, except if the language is in the exception list (currently empty)
if lang not in []:
line = line.lower()

# change unwanted characters to "-"
line = re.sub(r"[\u0000-\u0027\u200b]", "-", line)
line = re.sub(r"&\w+;", "-", line)
line = re.sub(
r"[\s!\"#\$%&'()*+,\/:;<=>?@\[\\\]^_`{\|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×ˆ˜–—‘’‚“”„†‡•…‰‹›€™\t]", # noqa: E501
"-",
line,
)

# collapse and strip excess "-"
line = re.sub(r"-+", "-", line)
line = line.strip("-")
return line
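A quick illustration of normalizing() on values from the sample taxonomy; the bare import assumes the snippet is run from backend/editor/.

# Example outputs of normalizing(); expected results shown in comments.
from normalizer import normalizing

print(normalizing("Passion fruit yogurts"))           # -> "passion-fruit-yogurts"
print(normalizing("yaourts à la banane", lang="fr"))  # accents stripped: "yaourts-a-la-banane"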
11 changes: 11 additions & 0 deletions backend/sample/test-neo4j.json
@@ -35,8 +35,10 @@
"main_language": "en",
"tags_en":["yogurts", "yoghurts"],
"tags_ids_en":["yogurts", "yoghurts"],
"tags_en_str":"yogurts yoghurts",
"tags_fr": ["yaourts", "yoghourts", "yogourts"],
"tags_ids_fr": ["yaourts", "yoghourts", "yogourts"],
"tags_fr_str":"yaourts yoghourts yogourts",
"preceding_lines" : [],
"src_position": 9
},
@@ -46,8 +48,10 @@
"main_language": "en",
"tags_en":["banana yogurts"],
"tags_ids_en":["banana-yogurts"],
"tags_en_str":"banana yogurts",
"tags_fr": ["yaourts à la banane"],
"tags_ids_fr": ["yaourts-banane"],
"tags_fr_str":"yaourts à la banane",
"preceding_lines" : [],
"src_position": 12
},
@@ -57,8 +61,10 @@
"main_language": "en",
"tags_en":["Passion fruit yogurts"],
"tags_ids_en":["passion-fruit-yogurts"],
"tags_en_str":"Passion fruit yogurts",
"tags_fr": ["yaourts au fruit de la passion"],
"tags_ids_fr": ["yaourts-fruit-passion"],
"tags_fr_str":"yaourts au fruit de la passion",
"preceding_lines" : [],
"src_position": 16
},
@@ -68,6 +74,7 @@
"main_language": "fr",
"tags_fr": ["yaourts au fruit de la passion allégés"],
"tags_ids_fr": ["yaourts-fruit-passion-alleges"],
"tags_fr_str":"yaourts au fruit de la passion allégés",
"preceding_lines" : [],
"src_position": 20
},
@@ -77,6 +84,7 @@
"main_language": "en",
"tags_en": ["meat"],
"tags_ids_en": ["meat"],
"tags_en_str": "meat",
"prop_vegan_en": "no",
"prop_carbon_footprint_fr_foodges_value_fr": "10",
"preceding_lines": ["# meat", ""],
@@ -88,6 +96,7 @@
"main_language": "en",
"tags_en": ["fake-meat"],
"tags_ids_en": ["fake-meat"],
"tags_en_str":"fake-meat",
"prop_vegan_en": "yes",
"preceding_lines" : [],
"src_position": 29
@@ -98,6 +107,7 @@
"main_language": "en",
"tags_en": ["fake-stuff"],
"tags_ids_en": ["fake-stuff"],
"tags_en_str":"fake-stuff",
"preceding_lines" : [],
"src_position": 33
},
@@ -107,6 +117,7 @@
"main_language": "en",
"tags_en": ["fake-duck-meat"],
"tags_ids_en": ["fake-duck-meat"],
"tags_en_str":"fake-duck-meat",
"preceding_lines" : [],
"src_position": 35
},
34 changes: 21 additions & 13 deletions parser/openfoodfacts_taxonomy_parser/parser.py
@@ -1,7 +1,9 @@
import logging
import re
import sys
import unicodedata

import iso639
import unidecode
from neo4j import GraphDatabase

@@ -58,11 +60,7 @@ def create_node(self, data):
entry_query += " SET n." + key + " = $" + key + "\n"

query = id_query + entry_query + position_query
self.session.run(
query,
data,
is_before=self.is_before,
)
self.session.run(query, data, is_before=self.is_before)

def normalized_filename(self, filename):
"""add the .txt extension if it is missing in the filename"""
@@ -323,6 +321,7 @@ def harvest(self, filename):
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
data["tags_" + lang] = tags_list
data["tags_" + lang + "_str"] = " ".join(tags_list)
data["tags_ids_" + lang] = tagsids_list
else:
# property definition
@@ -388,11 +387,7 @@ def create_previous_link(self):
id_previous,
)
elif not relation[0]:
logging.error(
"link not created between %s and %s",
id,
id_previous,
)
logging.error("link not created between %s and %s", id, id_previous)

def parent_search(self):
"""Get the parent and the child to link"""
@@ -423,18 +418,31 @@ def delete_used_properties(self):
query = "MATCH (n) SET n.is_before = null, n.parents = null"
self.session.run(query)

def create_fulltext_index(self):
query = """CREATE FULLTEXT INDEX nodeSearchIds FOR (n:ENTRY) ON EACH [n.id]"""
self.session.run(query)

language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""]
tags_prefixed_lc = ["n.tags_" + lc + "_str" for lc in language_codes]
tags_prefixed_lc = ", ".join(tags_prefixed_lc)
query = (
f"""CREATE FULLTEXT INDEX nodeSearchTags FOR (n:ENTRY) ON EACH [{tags_prefixed_lc}]"""
)
self.session.run(query)

def __call__(self, filename):
"""process the file"""
self.create_nodes(filename)
self.create_child_link()
self.create_previous_link()
self.create_fulltext_index()
# self.delete_used_properties()


if __name__ == "__main__":
import sys

logging.basicConfig(filename="parser.log", encoding="utf-8", level=logging.INFO)
logging.basicConfig(
handlers=[logging.FileHandler(filename="parser.log", encoding="utf-8")], level=logging.INFO
)
filename = sys.argv[1] if len(sys.argv) > 1 else "test"
parse = Parser()
parse(filename)
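To see what the generated nodeSearchTags index covers, the property list can be rebuilt on its own; this reuses the iso-639 package pinned below, and the printed values are only a preview.

# Standalone sketch of the property list fed into the nodeSearchTags index.
import iso639

language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""]
tags_prefixed_lc = ", ".join("n.tags_" + lc + "_str" for lc in language_codes)
query = f"CREATE FULLTEXT INDEX nodeSearchTags FOR (n:ENTRY) ON EACH [{tags_prefixed_lc}]"
print(len(language_codes))  # one tags_<lc>_str property per two-letter language code
print(query[:120] + "...")  # preview of the generated Cypher statement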
3 changes: 1 addition & 2 deletions parser/requirements-test.txt
@@ -7,5 +7,4 @@ py==1.11.0
pyparsing==3.0.9
pytest==7.1.2
pytz==2022.1
tomli==2.0.1
Unidecode==1.3.4
tomli==2.0.1
3 changes: 2 additions & 1 deletion parser/requirements.txt
@@ -1,3 +1,4 @@
neo4j==4.4.5
pytz==2022.1
Unidecode==1.3.4
Unidecode==1.3.4
iso-639==0.4.5