fix(backend,parser): remove stopwords from normalized tags on node update #369

Merged
merged 3 commits into from Feb 1, 2024
Changes from 2 commits
30 changes: 28 additions & 2 deletions backend/editor/entries.py
@@ -60,13 +60,14 @@ async def create_node(self, label, entry, main_language_code):
        """
        params = {"id": entry}
        query = [f"""CREATE (n:{self.project_name}:{label})\n"""]
+       stopwords = await self.get_stopwords_dict()

        # Build all basic keys of a node
        if label == "ENTRY":
            # Normalizing new canonical tag
            language_code, canonical_tag = entry.split(":", 1)
            normalised_canonical_tag = parser_utils.normalize_text(
-               canonical_tag, main_language_code
+               canonical_tag, main_language_code, stopwords=stopwords
            )

            # Reconstructing and updating the node ID
@@ -438,6 +439,28 @@ async def get_children(self, entry):
        result = await get_current_transaction().run(query, {"id": entry})
        return await async_list(result)

+   async def get_stopwords_dict(self):
+       """
+       Helper function to get all stopwords in a taxonomy, in the form of a
+       dictionary where the keys are the language codes and the values are
+       the stopwords in the corresponding language
+       """
+       query = f"""
+           MATCH (s:{self.project_name}:STOPWORDS)
+           WITH keys(s) AS properties, s
+           UNWIND properties AS property
+           WITH s, property
+           WHERE property STARTS WITH 'tags_ids'
+           RETURN property AS tags_ids_lc, s[property] AS stopwords
+       """
+       result = await get_current_transaction().run(query)
+       records = await async_list(result)
+       stopwords_dict = {}
+       for record in records:
+           language_code = record["tags_ids_lc"].split("_")[-1]
+           stopwords_dict[language_code] = record["stopwords"]
+       return stopwords_dict

    async def update_node(self, label, entry, new_node):
        """
        Helper function used for updating a node with given id and label
@@ -471,14 +494,17 @@ async def update_node(self, label, entry, new_node):

        # Adding normalized tags ids corresponding to entry tags
        normalised_new_node = {}
+       stopwords = await self.get_stopwords_dict()
        for key in set(new_node.keys()) - deleted_keys:
            if key.startswith("tags_"):
                if "_ids_" not in key:
                    keys_language_code = key.split("_", 1)[1]
                    normalised_value = []
                    for value in new_node[key]:
                        normalised_value.append(
-                           parser_utils.normalize_text(value, keys_language_code)
+                           parser_utils.normalize_text(
+                               value, keys_language_code, stopwords=stopwords
+                           )
                        )
                    normalised_new_node[key] = new_node[key]
                    normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
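As a rough illustration of what this change does on update (the input values and stopwords below are hypothetical):

```python
# Hypothetical input to update_node for an ENTRY node:
new_node = {"tags_fr": ["huile de tournesol", "huile de colza"]}
# Assume the taxonomy's STOPWORDS node yields:
stopwords = {"fr": ["de"]}

# After the normalization loop above, the node carries both the raw tags
# and their stopword-free ids:
# normalised_new_node["tags_fr"]     == ["huile de tournesol", "huile de colza"]
# normalised_new_node["tags_ids_fr"] == ["huile-tournesol", "huile-colza"]
```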
19 changes: 3 additions & 16 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -98,35 +98,22 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
            line_count += 1
        yield line_count, ""  # to end the last entry if not ended

-   def _remove_stopwords(self, lc: str, words: str) -> str:
-       """Remove the stopwords that were read at the beginning of the file"""
-       # First check if this language has stopwords
-       if lc in self.stopwords:
-           words_to_remove = self.stopwords[lc]
-           new_words = []
-           for word in words.split("-"):
-               if word not in words_to_remove:
-                   new_words.append(word)
-           return ("-").join(new_words)
-       else:
-           return words
-
    def _add_line(self, line: str) -> str:
        """
        Get a normalized string but keeping the language code "lc:",
        used for id and parent tag
        """
        lc, line = line.split(":", 1)
        new_line = lc + ":"
-       new_line += self._remove_stopwords(lc, normalize_text(line, lc))
+       new_line += normalize_text(line, lc, stopwords=self.stopwords)
        return new_line

    def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
        """Get the language code "lc" and a list of normalized values"""
        lc, line = line.split(":", 1)
        new_line: list[str] = []
        for word in line.split(","):
-           new_line.append(self._remove_stopwords(lc, normalize_text(word, lc)))
+           new_line.append(normalize_text(word, lc, stopwords=self.stopwords))
        return lc, new_line

    def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
@@ -291,7 +278,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                tagsids_list = []
                for word in line.split(","):
                    tags_list.append(word.strip())
-                   word_normalized = self._remove_stopwords(lang, normalize_text(word, lang))
+                   word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
                    if word_normalized not in tagsids_list:
                        # in case 2 normalized synonyms are the same
                        tagsids_list.append(word_normalized)
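A quick sketch of the parser-side effect, calling `normalize_text` directly (the stopword list is assumed, as if it had been read from a `stopwords:fr:` line of the taxonomy file):

```python
from openfoodfacts_taxonomy_parser.utils import normalize_text

# Assumed stopwords, as the parser would have collected them into self.stopwords
stopwords = {"fr": ["de"]}

# _add_line keeps the "lc:" prefix and normalizes the rest; stopword removal
# is now folded into normalize_text itself:
lc, value = "fr:Huile de tournesol".split(":", 1)
assert lc + ":" + normalize_text(value, lc, stopwords=stopwords) == "fr:huile-tournesol"
```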
11 changes: 10 additions & 1 deletion parser/openfoodfacts_taxonomy_parser/utils.py
@@ -4,7 +4,7 @@
import unidecode


-def normalize_text(line: str, lang="default", char="-"):
+def normalize_text(line: str, lang="default", char="-", stopwords={}):
"""Normalize a string depending on the language code"""
line = unicodedata.normalize("NFC", line)

@@ -29,6 +29,15 @@ def normalize_text(line: str, lang="default", char="-"):
    # Removing excess "-"
    line = re.sub(r"-+", char, line)
    line = line.strip(char)
+
+   # Remove stopwords
+   if lang in stopwords:
+       stopwords = stopwords[lang]
+       line_surrounded_by_char = char + line + char
+       for stopword in stopwords:
+           line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
+       line = line_surrounded_by_char[1:-1]

    return line
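Walking through the new branch with a concrete (hypothetical) input shows why the line is padded with `char` before the replacements: padding guarantees that every stopword occurrence, including at the start or end of the line, is delimited by `char` on both sides.

```python
# Hypothetical walk-through of the stopword-removal block above:
line = "huile-de-tournesol"        # state after the earlier normalization steps
char = "-"
lang_stopwords = ["de"]            # stopwords["fr"], assumed for the example

padded = char + line + char        # "-huile-de-tournesol-"
for stopword in lang_stopwords:
    padded = padded.replace(char + stopword + char, char)
print(padded[1:-1])                # "huile-tournesol"
```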

