Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: taxonomy parser library #18

Merged
merged 29 commits into from
Jul 29, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
5405f36
build: add basic neo4j capability
alexgarel Jul 5, 2022
3f748b0
build: add neo4j with docker
alexgarel Jul 5, 2022
567f5ac
Creating python script
BryanH01 Jul 6, 2022
8aea655
Update Parser.py
BryanH01 Jul 6, 2022
a631182
Update Parser.py
BryanH01 Jul 6, 2022
c41afca
Update Parser.py
BryanH01 Jul 6, 2022
f9e4212
New function : parent
BryanH01 Jul 7, 2022
4a7c7b2
Update Parser.py
BryanH01 Jul 8, 2022
f71d563
More pythonic code ? + test
BryanH01 Jul 15, 2022
b2b2877
Changed parser filename and updated it
BryanH01 Jul 19, 2022
d57c678
Update test_Parser.py
BryanH01 Jul 20, 2022
0f9460e
Changed file name
BryanH01 Jul 20, 2022
002a89f
Update taxonomy_parser.py
BryanH01 Jul 20, 2022
fb01ebe
Update taxonomy_parser.py
BryanH01 Jul 20, 2022
b1118d5
Update taxonomy_parser.py
BryanH01 Jul 21, 2022
f95fdfe
Changed directory, Updated parser with the comment and new spec
BryanH01 Jul 25, 2022
784ded7
Changed name and added integration test
BryanH01 Jul 26, 2022
35bff8e
tests: fix tests to have correct import
alexgarel Jul 26, 2022
3c49563
Small fix for header reading
BryanH01 Jul 26, 2022
e2b4a2b
Add requirements.txt
BryanH01 Jul 26, 2022
e9e84ab
Added main_language, made some corrections
BryanH01 Jul 27, 2022
fdafad8
Merge branch 'main' into parser
BryanH01 Jul 27, 2022
8fafca2
Changed name previous_block to is_before
BryanH01 Jul 27, 2022
74c6dee
Updated following your comments
BryanH01 Jul 28, 2022
93ad192
Update parser/openfoodfacts_taxonomy_parser/parser.py
BryanH01 Jul 28, 2022
b11886f
Changed harvesting method to correctly harvest stopwords and synonyms
BryanH01 Jul 28, 2022
03130fc
Update parser.py
BryanH01 Jul 28, 2022
3bc6f46
Changed with your suggestions
BryanH01 Jul 29, 2022
a7044ba
Final changes ?
BryanH01 Jul 29, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 33 additions & 21 deletions parser/openfoodfacts_taxonomy_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def normalizing(self, line, lang="default"):

def remove_stopwords(self, lc, words):
"""to remove the stopwords that were read at the beginning of the file"""
# First check if this language has stopwords
if lc in self.stopwords:
words_to_remove = self.stopwords[lc]
new_words = []
Expand Down Expand Up @@ -142,7 +143,7 @@ def new_node_data(self):
"main_language": "",
"preceding_lines": [],
"parent_tag": [],
"src_position": 0,
"src_position": None,
}
return data

Expand All @@ -164,7 +165,7 @@ def header_harvest(self, filename):
break
h += 1

# we don't want to eat the comments of the next block and it remove the last separating line
# we don't want to eat the comments of the next block and it removes the last separating line
for i in range(len(header)):
if header.pop():
h -= 1
Expand All @@ -175,22 +176,37 @@ def header_harvest(self, filename):

def entry_end(self, line, data):
"""Return True if the block ended"""
if "stopwords" in line or "synonyms" in line or not line:
# can be the end of an block or just 2 line separators,
# stopwords and synonyms are one-liner, entries are separated by a blank line
if line.startswith("stopwords") or line.startswith("synonyms") or not line:
# can be the end of a block or just an additional line separator,
# file_iter() always ends with ''
if data["id"]: # to be sure that it's an end
return True
return False

def remove_separating_line(self, data):
"""
To remove the one separating line that is always there,
between synonyms part and stopwords part and before each entry
"""
# first, check if there is at least one preceding line
if data["preceding_lines"]:
if "synonyms" in data["id"]:
if data["id"].startswith("synonyms"):
# it's a synonyms block,
# if the previous block is a stopwords block,
# there is at least one separating line
if "stopwords" in self.is_before:
data["preceding_lines"].pop(0)
elif "stopwords" in data["id"]:

elif data["id"].startswith("stopwords"):
# it's a stopwords block,
# if the previous block is a synonyms block,
# there is at least one separating line
if "synonyms" in self.is_before:
data["preceding_lines"].pop(0)

else:
# it's an entry block, there is always a separating line
data["preceding_lines"].pop(0)
Copy link
Member

@alexgarel alexgarel Jul 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think your assumptions here are too heavily based upon what you have seen so far, without any guarantee. There is no guarantee that we have new lines if we change type and so on. Remember that taxonomies are formatted by humans.
You need at least to check that data["preceding_lines"][0] is empty.

So I really think my proposal is the good one.
see: #18 (comment)
If in rare cases it adds or removes a blank line, this is ok, I mean we do not lose any important information.

return data

Expand All @@ -199,7 +215,9 @@ def harvest(self, filename):
index_stopwords = 0
index_synonyms = 0
language_code_prefix = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not mandatory: As this is a constant and in someway a parameter, I would move it to a class attribute.

self.stopwords = dict()
self.stopwords = (
dict()
) # it will contain a list of stopwords with their language code as key
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not simply put the comment before the line ? That's the way we do normally !

Also better use the {} to init a dict (not a big deal though)

Suggested change
self.stopwords = (
dict()
) # it will contain a list of stopwords with their language code as key
# stopwords will contain a list of stopwords with their language code as key
self.stopwords = {}


# header
header, next_line = self.header_harvest(filename)
Expand All @@ -219,23 +237,18 @@ def harvest(self, filename):
# harvest the line
if not (line) or line[0] == "#":
data["preceding_lines"].append(line)
if not data[
"src_position"
]: # to get the position of the footer if it's not empty
data["src_position"] = line_number + 1
else:
if (
len(data) == 5 and not data["parent_tag"]
): # the beginning of the entry
if not data["src_position"]:
data["src_position"] = line_number + 1
if "stopword" in line:
if line.startswith("stopwords"):
id = "stopwords:" + str(index_stopwords)
data = self.set_data_id(data, id, line_number)
index_stopwords += 1
lc, value = self.get_lc_value(line[10:])
data["tags_" + lc] = value
# add the list with its lc
self.stopwords[lc] = value
elif "synonym" in line:
elif line.startswith("synonyms"):
id = "synonyms:" + str(index_synonyms)
data = self.set_data_id(data, id, line_number)
index_synonyms += 1
Expand All @@ -249,9 +262,9 @@ def harvest(self, filename):
elif language_code_prefix.match(line):
if not data["id"]:
data["id"] = self.add_line(line.split(",", 1)[0])
data["main_language"] = data["id"][
:2
] # first 2 characters are language code
data["main_language"] = data["id"].split(":", 1)[
0
] # first 2 or 3 characters are language code
# add tags and tagsid
lang, line = line.split(":", 1)
tags_list = []
Expand All @@ -269,8 +282,7 @@ def harvest(self, filename):
data["prop_" + property_name + "_" + lc] = property_value
data["id"] = "__footer__"
data["preceding_lines"].pop(0)
if not data["src_position"]:
data["src_position"] = line_number + 1 # to get position if it's empty
data["src_position"] = line_number + 1 - len(data["preceding_lines"])
yield data

def create_nodes(self, filename):
Expand Down
6 changes: 3 additions & 3 deletions parser/openfoodfacts_taxonomy_parser/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
neo4j=='4.4.5'
re=='2.2.1'
Unidecode=='1.3.4'
neo4j==4.4.5
pytz==2022.1
Unidecode==1.3.4
37 changes: 0 additions & 37 deletions parser/openfoodfacts_taxonomy_parser/test.txt

This file was deleted.

98 changes: 74 additions & 24 deletions parser/tests/integration/test_parser_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,49 +5,98 @@
# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test")


@pytest.fixture
def new_session():
x = parser.Parser()
@pytest.fixture(autouse=True)
def test_setup():
# delete all the nodes and relations in the database
query="MATCH (n) DETACH DELETE n"
x.session.run(query)
return x

parser.Parser().session.run(query)

def test_calling(new_session):
x=new_session
def test_calling():
test_parser = parser.Parser()
session = test_parser.session

#Create node test
x.create_nodes(TEST_TAXONOMY_TXT)
test_parser.create_nodes(TEST_TAXONOMY_TXT)

# total number of nodes
query="MATCH (n) RETURN COUNT(*)"
result = x.session.run(query)
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 13

# header correctly added
query="MATCH (n) WHERE n.id = '__header__' RETURN n.preceding_lines"
result = x.session.run(query)
result = session.run(query)
header = result.value()[0]
assert header == ['# test taxonomy']

# comment / preceding lines correctly added
query="MATCH (n:ENTRY) WHERE size(n.preceding_lines)>0 RETURN n.id,n.preceding_lines"
result = x.session.run(query)
nodes = result.values()
number_of_nodes = len(nodes)
assert number_of_nodes == 1
assert nodes[0][0] == 'en:meat'
assert nodes[0][1] == ['# meat','']

# synonyms correctly added
query="MATCH (n:SYNONYMS) RETURN n ORDER BY n.src_position"
results = session.run(query)
expected_synonyms = [
{ "id" : "synonyms:0",
"tags_en" : ["passion fruit", "passionfruit"],
"tags_ids_en" : ["passion-fruit", "passionfruit"],
"preceding_lines" : [],
"src_position" : 5 },
{ "id" : "synonyms:1",
"tags_fr" : ["fruit de la passion", "maracuja", "passion"],
"tags_ids_fr" : ["fruit-passion", "maracuja", "passion"],
"preceding_lines" : [""],
"src_position" : 7 }
]
for i, result in enumerate(results):
node = result.value()
for key in expected_synonyms[i]:
assert node[key] == expected_synonyms[i][key]


# stopwords correctly added
query="MATCH (n:STOPWORDS) RETURN n"
results = session.run(query)
expected_stopwords = {
"id" : "stopwords:0",
"tags_fr" : ["aux", "au", "de", "le", "du", "la", "a", "et"],
"preceding_lines" : []
}
for result in results:
node = result.value()
for key in expected_stopwords:
assert node[key] == expected_stopwords[key]


# entries correctly added
alexgarel marked this conversation as resolved.
Show resolved Hide resolved
query = """
MATCH (n:ENTRY)
WHERE n.id='en:banana-yogurts'
OR n.id='en:meat'
RETURN n
ORDER BY n.src_position
"""
results = session.run(query)
expected_entries = [
{ "tags_en" : ["banana yogurts"],
"tags_ids_en" : ["banana-yogurts"],
"tags_fr" : ["yaourts à la banane"],
"tags_ids_fr" : ["yaourts-banane"],
"preceding_lines" : [], },
{"tags_en" : ["meat"],
"tags_ids_en" : ["meat"],
"preceding_lines" : ['# meat',''],
"prop_vegan_en" : "no",
"prop_carbon_footprint_fr_foodges_value_fr" : "10" }
]
for i, result in enumerate(results):
node = result.value()
for key in expected_entries[i]:
assert node[key] == expected_entries[i][key]


#Child link test
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 links part is well tested.

x.create_child_link() # nodes already added
test_parser.create_child_link() # nodes already added
query="MATCH (c)-[:is_child_of]->(p) RETURN c.id, p.id"
results = x.session.run(query)
results = session.run(query)
created_pairs = results.values()

# correct number of links
Expand All @@ -66,10 +115,11 @@ def test_calling(new_session):
for pair in created_pairs:
assert pair in expected_pairs


# Order link test
x.create_previous_link()
test_parser.create_previous_link()
query="MATCH (n)-[:is_before]->(p) RETURN n.id, p.id "
results = x.session.run(query)
results = session.run(query)
created_pairs = results.values()

# correct number of links
Expand Down
15 changes: 11 additions & 4 deletions parser/tests/requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
neo4j=='4.4.5'
pytest=='7.1.2'
re=='2.2.1'
Unidecode=='1.3.4'
attrs==22.1.0
iniconfig==1.1.1
neo4j==4.4.5
packaging==21.3
pluggy==1.0.0
py==1.11.0
pyparsing==3.0.9
pytest==7.1.2
pytz==2022.1
tomli==2.0.1
Unidecode==1.3.4
1 change: 0 additions & 1 deletion parser/tests/unit/test_parser_unit.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pytest
import re
import pathlib
from openfoodfacts_taxonomy_parser import parser
Expand Down