fix(backend,parser): remove stopwords from normalized tags on node update #369

Merged
merged 3 commits into from Feb 1, 2024
Changes from 2 commits
30 changes: 28 additions & 2 deletions backend/editor/entries.py
@@ -60,13 +60,14 @@ async def create_node(self, label, entry, main_language_code):
        """
        params = {"id": entry}
        query = [f"""CREATE (n:{self.project_name}:{label})\n"""]
+       stopwords = await self.get_stopwords_dict()

        # Build all basic keys of a node
        if label == "ENTRY":
            # Normalizing new canonical tag
            language_code, canonical_tag = entry.split(":", 1)
            normalised_canonical_tag = parser_utils.normalize_text(
-               canonical_tag, main_language_code
+               canonical_tag, main_language_code, stopwords=stopwords
            )

            # Reconstructing and updating the node ID
@@ -438,6 +439,28 @@ async def get_children(self, entry):
        result = await get_current_transaction().run(query, {"id": entry})
        return await async_list(result)

+   async def get_stopwords_dict(self):
+       """
+       Helper function to get all stopwords in a taxonomy, in the form of a
+       dictionary where the keys are the language codes and the values are
+       the stopwords in the corresponding language
+       """
+       query = f"""
+           MATCH (s:{self.project_name}:STOPWORDS)
+           WITH keys(s) AS properties, s
+           UNWIND properties AS property
+           WITH s, property
+           WHERE property STARTS WITH 'tags_ids'
+           RETURN property AS tags_ids_lc, s[property] AS stopwords
+       """
+       result = await get_current_transaction().run(query)
+       records = await async_list(result)
+       stopwords_dict = {}
+       for record in records:
+           language_code = record["tags_ids_lc"].split("_")[-1]
+           stopwords_dict[language_code] = record["stopwords"]
+       return stopwords_dict

    async def update_node(self, label, entry, new_node):
        """
        Helper function used for updating a node with given id and label
@@ -471,14 +494,17 @@ async def update_node(self, label, entry, new_node):

        # Adding normalized tags ids corresponding to entry tags
        normalised_new_node = {}
+       stopwords = await self.get_stopwords_dict()
        for key in set(new_node.keys()) - deleted_keys:
            if key.startswith("tags_"):
                if "_ids_" not in key:
                    keys_language_code = key.split("_", 1)[1]
                    normalised_value = []
                    for value in new_node[key]:
                        normalised_value.append(
-                           parser_utils.normalize_text(value, keys_language_code)
+                           parser_utils.normalize_text(
+                               value, keys_language_code, stopwords=stopwords
+                           )
                        )
                    normalised_new_node[key] = new_node[key]
                    normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
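As a rough illustration of what this change does on update (the input values and stopwords below are hypothetical):

```python
# Hypothetical input to update_node for an ENTRY node:
new_node = {"tags_fr": ["huile de tournesol", "huile de colza"]}
# Assume the taxonomy's STOPWORDS node yields:
stopwords = {"fr": ["de"]}

# After the normalization loop above, the node carries both the raw tags
# and their stopword-free ids:
# normalised_new_node["tags_fr"]     == ["huile de tournesol", "huile de colza"]
# normalised_new_node["tags_ids_fr"] == ["huile-tournesol", "huile-colza"]
```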
19 changes: 3 additions & 16 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -98,35 +98,22 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
            line_count += 1
        yield line_count, ""  # to end the last entry if not ended

-   def _remove_stopwords(self, lc: str, words: str) -> str:
-       """Remove the stopwords that were read at the beginning of the file"""
-       # First check if this language has stopwords
-       if lc in self.stopwords:
-           words_to_remove = self.stopwords[lc]
-           new_words = []
-           for word in words.split("-"):
-               if word not in words_to_remove:
-                   new_words.append(word)
-           return ("-").join(new_words)
-       else:
-           return words
-
    def _add_line(self, line: str) -> str:
        """
        Get a normalized string but keeping the language code "lc:",
        used for id and parent tag
        """
        lc, line = line.split(":", 1)
        new_line = lc + ":"
-       new_line += self._remove_stopwords(lc, normalize_text(line, lc))
+       new_line += normalize_text(line, lc, stopwords=self.stopwords)
        return new_line

    def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
        """Get the language code "lc" and a list of normalized values"""
        lc, line = line.split(":", 1)
        new_line: list[str] = []
        for word in line.split(","):
-           new_line.append(self._remove_stopwords(lc, normalize_text(word, lc)))
+           new_line.append(normalize_text(word, lc, stopwords=self.stopwords))
        return lc, new_line

    def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
@@ -291,7 +278,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                tagsids_list = []
                for word in line.split(","):
                    tags_list.append(word.strip())
-                   word_normalized = self._remove_stopwords(lang, normalize_text(word, lang))
+                   word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
                    if word_normalized not in tagsids_list:
                        # in case 2 normalized synonyms are the same
                        tagsids_list.append(word_normalized)
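A quick sketch of the parser-side effect, calling `normalize_text` directly (the stopword list is assumed, as if it had been read from a `stopwords:fr:` line of the taxonomy file):

```python
from openfoodfacts_taxonomy_parser.utils import normalize_text

# Assumed stopwords, as the parser would have collected them into self.stopwords
stopwords = {"fr": ["de"]}

# _add_line keeps the "lc:" prefix and normalizes the rest; stopword removal
# is now folded into normalize_text itself:
lc, value = "fr:Huile de tournesol".split(":", 1)
assert lc + ":" + normalize_text(value, lc, stopwords=stopwords) == "fr:huile-tournesol"
```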
11 changes: 10 additions & 1 deletion parser/openfoodfacts_taxonomy_parser/utils.py
@@ -4,7 +4,7 @@
import unidecode


-def normalize_text(line: str, lang="default", char="-"):
+def normalize_text(line: str, lang="default", char="-", stopwords={}):
"""Normalize a string depending on the language code"""
line = unicodedata.normalize("NFC", line)

@@ -29,6 +29,15 @@ def normalize_text(line: str, lang="default", char="-"):
    # Removing excess "-"
    line = re.sub(r"-+", char, line)
    line = line.strip(char)
+
+   # Remove stopwords
+   if lang in stopwords:
+       stopwords = stopwords[lang]
+       line_surrounded_by_char = char + line + char
+       for stopword in stopwords:
+           line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
+       line = line_surrounded_by_char[1:-1]

    return line
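Walking through the new branch with a concrete (hypothetical) input shows why the line is padded with `char` before the replacements: padding guarantees that every stopword occurrence, including at the start or end of the line, is delimited by `char` on both sides.

```python
# Hypothetical walk-through of the stopword-removal block above:
line = "huile-de-tournesol"        # state after the earlier normalization steps
char = "-"
lang_stopwords = ["de"]            # stopwords["fr"], assumed for the example

padded = char + line + char        # "-huile-de-tournesol-"
for stopword in lang_stopwords:
    padded = padded.replace(char + stopword + char, char)
print(padded[1:-1])                # "huile-tournesol"
```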

