feat: taxonomy parser library #18
@@ -110,6 +110,7 @@ def normalizing(self, line, lang="default"):

    def remove_stopwords(self, lc, words):
        """to remove the stopwords that were read at the beginning of the file"""
        # First check if this language has stopwords
        if lc in self.stopwords:
            words_to_remove = self.stopwords[lc]
            new_words = []
@@ -142,7 +143,7 @@ def new_node_data(self):
            "main_language": "",
            "preceding_lines": [],
            "parent_tag": [],
            "src_position": 0,
            "src_position": None,
        }
        return data

@@ -164,7 +165,7 @@ def header_harvest(self, filename):
                break
            h += 1

        # we don't want to eat the comments of the next block and it remove the last separating line
        # we don't want to eat the comments of the next block and it removes the last separating line
        for i in range(len(header)):
            if header.pop():
                h -= 1
@@ -175,22 +176,37 @@ def header_harvest(self, filename):

    def entry_end(self, line, data):
        """Return True if the block ended"""
        if "stopwords" in line or "synonyms" in line or not line:
            # can be the end of an block or just 2 line separators,
        # stopwords and synonyms are one-liner, entries are separated by a blank line
        if line.startswith("stopwords") or line.startswith("synonyms") or not line:
            # can be the end of an block or just additional line separator,
            # file_iter() always end with ''
            if data["id"]:  # to be sure that it's an end
                return True
        return False

    def remove_separating_line(self, data):
        """
        To remove the one separating line that is always there,
        between synonyms part and stopwords part and before each entry
        """
        # first, check if there is at least one preceding line
        if data["preceding_lines"]:
        if "synonyms" in data["id"]:
            if data["id"].startswith("synonyms"):
                # it's a synonyms block,
                # if the previous block is a stopwords block,
                # there is at least one separating line
                if "stopwords" in self.is_before:
                    data["preceding_lines"].pop(0)
        elif "stopwords" in data["id"]:

            elif data["id"].startswith("stopwords"):
                # it's a stopwords block,
                # if the previous block is a synonyms block,
                # there is at least one separating line
                if "synonyms" in self.is_before:
                    data["preceding_lines"].pop(0)

            else:
                # it's an entry block, there is always a separating line
                data["preceding_lines"].pop(0)
        return data

@@ -199,7 +215,9 @@ def harvest(self, filename):
        index_stopwords = 0
        index_synonyms = 0
        language_code_prefix = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")
Review comment (not mandatory): as this is a constant and, in some way, a parameter, I would move it to a class attribute.
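A minimal sketch of that suggestion, assuming the surrounding Parser class (the attribute and method names here are illustrative, not the PR's actual code):

import re

class Parser:
    # compiled once as a class attribute instead of a local variable
    # recreated on every call to harvest()
    language_code_prefix = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")

    def is_language_prefixed(self, line):
        # any method can reuse the shared, pre-compiled pattern via self
        return bool(self.language_code_prefix.match(line))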
        self.stopwords = dict()
        self.stopwords = (
            dict()
        )  # it will contain a list of stopwords with their language code as key
Review comment: why not simply put the comment before the line? That's the way we normally do it! Also better use the …
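A sketch of the comment-before-the-line style being pointed at (illustrative only; the truncated part of the suggestion is not reconstructed here):

class Parser:
    def harvest(self, filename):
        # it will contain a list of stopwords with their language code as key
        self.stopwords = dict()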

        # header
        header, next_line = self.header_harvest(filename)
@@ -219,23 +237,18 @@ def harvest(self, filename):
            # harvest the line
            if not (line) or line[0] == "#":
                data["preceding_lines"].append(line)
                if not data[
                    "src_position"
                ]:  # to get the position of the footer if it's not empty
                    data["src_position"] = line_number + 1
            else:
                if (
                    len(data) == 5 and not data["parent_tag"]
                ):  # the beginning of the entry
                    if not data["src_position"]:
                        data["src_position"] = line_number + 1
                if "stopword" in line:
                if line.startswith("stopwords"):
                    id = "stopwords:" + str(index_stopwords)
                    data = self.set_data_id(data, id, line_number)
                    index_stopwords += 1
                    lc, value = self.get_lc_value(line[10:])
                    data["tags_" + lc] = value
                    # add the list with its lc
                    self.stopwords[lc] = value
                elif "synonym" in line:
                elif line.startswith("synonyms"):
                    id = "synonyms:" + str(index_synonyms)
                    data = self.set_data_id(data, id, line_number)
                    index_synonyms += 1
@@ -249,9 +262,9 @@ def harvest(self, filename):
                elif language_code_prefix.match(line):
                    if not data["id"]:
                        data["id"] = self.add_line(line.split(",", 1)[0])
                        data["main_language"] = data["id"][
                            :2
                        ]  # first 2 characters are language code
                        data["main_language"] = data["id"].split(":", 1)[
                            0
                        ]  # first 2 or 3 characters are language code
                    # add tags and tagsid
                    lang, line = line.split(":", 1)
                    tags_list = []
@@ -269,8 +282,7 @@ def harvest(self, filename):
                    data["prop_" + property_name + "_" + lc] = property_value
        data["id"] = "__footer__"
        data["preceding_lines"].pop(0)
        if not data["src_position"]:
            data["src_position"] = line_number + 1  # to get position if it's empty
        data["src_position"] = line_number + 1 - len(data["preceding_lines"])
        yield data

    def create_nodes(self, filename):
@@ -1,3 +1,3 @@
neo4j=='4.4.5'
re=='2.2.1'
Unidecode=='1.3.4'
neo4j==4.4.5
pytz==2022.1
Unidecode==1.3.4
This file was deleted.
@@ -5,49 +5,98 @@
# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test")


@pytest.fixture
def new_session():
    x = parser.Parser()
@pytest.fixture(autouse=True)
def test_setup():
    # delete all the nodes and relations in the database
    query="MATCH (n) DETACH DELETE n"
    x.session.run(query)
    return x
    parser.Parser().session.run(query)


def test_calling(new_session):
    x=new_session
def test_calling():
    test_parser = parser.Parser()
    session = test_parser.session

    #Create node test
    x.create_nodes(TEST_TAXONOMY_TXT)
    test_parser.create_nodes(TEST_TAXONOMY_TXT)

    # total number of nodes
    query="MATCH (n) RETURN COUNT(*)"
    result = x.session.run(query)
    result = session.run(query)
    number_of_nodes = result.value()[0]
    assert number_of_nodes == 13

    # header correctly added
    query="MATCH (n) WHERE n.id = '__header__' RETURN n.preceding_lines"
    result = x.session.run(query)
    result = session.run(query)
    header = result.value()[0]
    assert header == ['# test taxonomy']

    # comment / preceding lines correctly added
    query="MATCH (n:ENTRY) WHERE size(n.preceding_lines)>0 RETURN n.id,n.preceding_lines"
    result = x.session.run(query)
    nodes = result.values()
    number_of_nodes = len(nodes)
    assert number_of_nodes == 1
    assert nodes[0][0] == 'en:meat'
    assert nodes[0][1] == ['# meat','']

    # synonyms correctly added
    query="MATCH (n:SYNONYMS) RETURN n ORDER BY n.src_position"
    results = session.run(query)
    expected_synonyms = [
        { "id" : "synonyms:0",
          "tags_en" : ["passion fruit", "passionfruit"],
          "tags_ids_en" : ["passion-fruit", "passionfruit"],
          "preceding_lines" : [],
          "src_position" : 5 },
        { "id" : "synonyms:1",
          "tags_fr" : ["fruit de la passion", "maracuja", "passion"],
          "tags_ids_fr" : ["fruit-passion", "maracuja", "passion"],
          "preceding_lines" : [""],
          "src_position" : 7 }
    ]
    for i, result in enumerate(results):
        node = result.value()
        for key in expected_synonyms[i]:
            assert node[key] == expected_synonyms[i][key]

    # stopwords correctly added
    query="MATCH (n:STOPWORDS) RETURN n"
    results = session.run(query)
    expected_stopwords = {
        "id" : "stopwords:0",
        "tags_fr" : ["aux", "au", "de", "le", "du", "la", "a", "et"],
        "preceding_lines" : []
    }
    for result in results:
        node = result.value()
        for key in expected_stopwords:
            assert node[key] == expected_stopwords[key]

    # entries correctly added
    query = """
        MATCH (n:ENTRY)
        WHERE n.id='en:banana-yogurts'
        OR n.id='en:meat'
        RETURN n
        ORDER BY n.src_position
    """
    results = session.run(query)
    expected_entries = [
        { "tags_en" : ["banana yogurts"],
          "tags_ids_en" : ["banana-yogurts"],
          "tags_fr" : ["yaourts à la banane"],
          "tags_ids_fr" : ["yaourts-banane"],
          "preceding_lines" : [], },
        { "tags_en" : ["meat"],
          "tags_ids_en" : ["meat"],
          "preceding_lines" : ['# meat',''],
          "prop_vegan_en" : "no",
          "prop_carbon_footprint_fr_foodges_value_fr" : "10" }
    ]
    for i, result in enumerate(results):
        node = result.value()
        for key in expected_entries[i]:
            assert node[key] == expected_entries[i][key]


    #Child link test
Review comment: 👍 the links part is well tested.
    x.create_child_link()  # nodes already added
    test_parser.create_child_link()  # nodes already added
    query="MATCH (c)-[:is_child_of]->(p) RETURN c.id, p.id"
    results = x.session.run(query)
    results = session.run(query)
    created_pairs = results.values()

    # correct number of links
@@ -66,10 +115,11 @@ def test_calling(new_session):
    for pair in created_pairs:
        assert pair in expected_pairs


    # Order link test
    x.create_previous_link()
    test_parser.create_previous_link()
    query="MATCH (n)-[:is_before]->(p) RETURN n.id, p.id "
    results = x.session.run(query)
    results = session.run(query)
    created_pairs = results.values()

    # correct number of links
@@ -1,4 +1,11 @@
neo4j=='4.4.5'
pytest=='7.1.2'
re=='2.2.1'
Unidecode=='1.3.4'
attrs==22.1.0
iniconfig==1.1.1
neo4j==4.4.5
packaging==21.3
pluggy==1.0.0
py==1.11.0
pyparsing==3.0.9
pytest==7.1.2
pytz==2022.1
tomli==2.0.1
Unidecode==1.3.4
@@ -1,4 +1,3 @@
import pytest
import re
import pathlib
from openfoodfacts_taxonomy_parser import parser
Review comment: I think your assumptions here are too much based on what you have seen so far, without any guarantee. There is no guarantee that we have blank lines when we change block type, and so on. Remember that taxonomies are formatted by humans. You need at least to check that data["preceding_lines"][0] is empty. So I really think my proposal is the good one; see #18 (comment). If in rare cases it adds or removes a blank line, that is OK; we do not lose any important information.
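A minimal sketch of that kind of guard, assuming the data dict used by the parser (illustrative only, not the reviewer's exact linked proposal):

def remove_separating_line(self, data):
    """Sketch: drop the leading separator only when the first preceding line is really blank."""
    preceding = data["preceding_lines"]
    # taxonomies are edited by hand, so a separating blank line is not guaranteed;
    # check that the first preceding line is actually empty before removing it
    if preceding and not preceding[0].strip():
        preceding.pop(0)
    return data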