feat: taxonomy parser library #18
@@ -110,6 +110,7 @@ def normalizing(self, line, lang="default"):

    def remove_stopwords(self, lc, words):
        """to remove the stopwords that were read at the beginning of the file"""
        # First check if this language has stopwords
        if lc in self.stopwords:
            words_to_remove = self.stopwords[lc]
            new_words = []
@@ -142,7 +143,7 @@ def new_node_data(self):
            "main_language": "",
            "preceding_lines": [],
            "parent_tag": [],
            "src_position": 0,
            "src_position": None,
        }
        return data

@@ -164,7 +165,7 @@ def header_harvest(self, filename):
                break
            h += 1

        # we don't want to eat the comments of the next block and it remove the last separating line
        # we don't want to eat the comments of the next block and it removes the last separating line
        for i in range(len(header)):
            if header.pop():
                h -= 1
@@ -175,22 +176,37 @@ def header_harvest(self, filename):

    def entry_end(self, line, data):
        """Return True if the block ended"""
        if "stopwords" in line or "synonyms" in line or not line:
            # can be the end of an block or just 2 line separators,
        # stopwords and synonyms are one-liner, entries are separated by a blank line
        if line.startswith("stopwords") or line.startswith("synonyms") or not line:
            # can be the end of an block or just additional line separator,
            # file_iter() always end with ''
            if data["id"]:  # to be sure that it's an end
                return True
        return False

    def remove_separating_line(self, data):
        """
        To remove the one separating line that is always there,
        between synonyms part and stopwords part and before each entry
        """
        # first, check if there is at least one preceding line
        if data["preceding_lines"]:
        if "synonyms" in data["id"]:
            if data["id"].startswith("synonyms"):
                # it's a synonyms block,
                # if the previous block is a stopwords block,
                # there is at least one separating line
                if "stopwords" in self.is_before:
                    data["preceding_lines"].pop(0)
        elif "stopwords" in data["id"]:

            elif data["id"].startswith("stopwords"):
                # it's a stopwords block,
                # if the previous block is a synonyms block,
                # there is at least one separating line
                if "synonyms" in self.is_before:
                    data["preceding_lines"].pop(0)

            else:
                # it's an entry block, there is always a separating line
                data["preceding_lines"].pop(0)
        return data

@@ -199,7 +215,9 @@ def harvest(self, filename):
        index_stopwords = 0
        index_synonyms = 0
        language_code_prefix = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")
Review comment (not mandatory): as this is a constant and, in some way, a parameter, I would move it to a class attribute.
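A minimal sketch of that suggestion, assuming the surrounding Parser class (the attribute and method names here are illustrative, not the PR's actual code):

import re

class Parser:
    # compiled once as a class attribute instead of a local variable
    # recreated on every call to harvest()
    language_code_prefix = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")

    def is_language_prefixed(self, line):
        # any method can reuse the shared, pre-compiled pattern via self
        return bool(self.language_code_prefix.match(line))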
        self.stopwords = dict()
        self.stopwords = (
            dict()
        )  # it will contain a list of stopwords with their language code as key
Review comment: why not simply put the comment before the line? That's the way we normally do it! Also better use the …
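A sketch of the comment-before-the-line style being pointed at (illustrative only; the truncated part of the suggestion is not reconstructed here):

class Parser:
    def harvest(self, filename):
        # it will contain a list of stopwords with their language code as key
        self.stopwords = dict()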

        # header
        header, next_line = self.header_harvest(filename)
@@ -219,23 +237,18 @@ def harvest(self, filename):
            # harvest the line
            if not (line) or line[0] == "#":
                data["preceding_lines"].append(line)
                if not data[
                    "src_position"
                ]:  # to get the position of the footer if it's not empty
                    data["src_position"] = line_number + 1
            else:
                if (
                    len(data) == 5 and not data["parent_tag"]
                ):  # the beginning of the entry
                    if not data["src_position"]:
                        data["src_position"] = line_number + 1
                if "stopword" in line:
                if line.startswith("stopwords"):
                    id = "stopwords:" + str(index_stopwords)
                    data = self.set_data_id(data, id, line_number)
                    index_stopwords += 1
                    lc, value = self.get_lc_value(line[10:])
                    data["tags_" + lc] = value
                    # add the list with its lc
                    self.stopwords[lc] = value
                elif "synonym" in line:
                elif line.startswith("synonyms"):
                    id = "synonyms:" + str(index_synonyms)
                    data = self.set_data_id(data, id, line_number)
                    index_synonyms += 1
@@ -249,9 +262,9 @@ def harvest(self, filename):
                elif language_code_prefix.match(line):
                    if not data["id"]:
                        data["id"] = self.add_line(line.split(",", 1)[0])
                        data["main_language"] = data["id"][
                            :2
                        ]  # first 2 characters are language code
                        data["main_language"] = data["id"].split(":", 1)[
                            0
                        ]  # first 2 or 3 characters are language code
                    # add tags and tagsid
                    lang, line = line.split(":", 1)
                    tags_list = []
@@ -269,8 +282,7 @@ def harvest(self, filename):
                    data["prop_" + property_name + "_" + lc] = property_value
        data["id"] = "__footer__"
        data["preceding_lines"].pop(0)
        if not data["src_position"]:
            data["src_position"] = line_number + 1  # to get position if it's empty
        data["src_position"] = line_number + 1 - len(data["preceding_lines"])
        yield data

    def create_nodes(self, filename):
@@ -1,3 +1,3 @@
neo4j=='4.4.5'
re=='2.2.1'
Unidecode=='1.3.4'
neo4j==4.4.5
pytz==2022.1
Unidecode==1.3.4
This file was deleted.
@@ -5,49 +5,98 @@
# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test")


@pytest.fixture
def new_session():
    x = parser.Parser()
@pytest.fixture(autouse=True)
def test_setup():
    # delete all the nodes and relations in the database
    query="MATCH (n) DETACH DELETE n"
    x.session.run(query)
    return x
    parser.Parser().session.run(query)


def test_calling(new_session):
    x=new_session
def test_calling():
    test_parser = parser.Parser()
    session = test_parser.session

    #Create node test
    x.create_nodes(TEST_TAXONOMY_TXT)
    test_parser.create_nodes(TEST_TAXONOMY_TXT)

    # total number of nodes
    query="MATCH (n) RETURN COUNT(*)"
    result = x.session.run(query)
    result = session.run(query)
    number_of_nodes = result.value()[0]
    assert number_of_nodes == 13

    # header correctly added
    query="MATCH (n) WHERE n.id = '__header__' RETURN n.preceding_lines"
    result = x.session.run(query)
    result = session.run(query)
    header = result.value()[0]
    assert header == ['# test taxonomy']

    # comment / preceding lines correctly added
    query="MATCH (n:ENTRY) WHERE size(n.preceding_lines)>0 RETURN n.id,n.preceding_lines"
    result = x.session.run(query)
    nodes = result.values()
    number_of_nodes = len(nodes)
    assert number_of_nodes == 1
    assert nodes[0][0] == 'en:meat'
    assert nodes[0][1] == ['# meat','']

    # synonyms correctly added
    query="MATCH (n:SYNONYMS) RETURN n ORDER BY n.src_position"
    results = session.run(query)
    expected_synonyms = [
        { "id" : "synonyms:0",
          "tags_en" : ["passion fruit", "passionfruit"],
          "tags_ids_en" : ["passion-fruit", "passionfruit"],
          "preceding_lines" : [],
          "src_position" : 5 },
        { "id" : "synonyms:1",
          "tags_fr" : ["fruit de la passion", "maracuja", "passion"],
          "tags_ids_fr" : ["fruit-passion", "maracuja", "passion"],
          "preceding_lines" : [""],
          "src_position" : 7 }
    ]
    for i, result in enumerate(results):
        node = result.value()
        for key in expected_synonyms[i]:
            assert node[key] == expected_synonyms[i][key]

    # stopwords correctly added
    query="MATCH (n:STOPWORDS) RETURN n"
    results = session.run(query)
    expected_stopwords = {
        "id" : "stopwords:0",
        "tags_fr" : ["aux", "au", "de", "le", "du", "la", "a", "et"],
        "preceding_lines" : []
    }
    for result in results:
        node = result.value()
        for key in expected_stopwords:
            assert node[key] == expected_stopwords[key]

    # entries correctly added
    query = """
        MATCH (n:ENTRY)
        WHERE n.id='en:banana-yogurts'
        OR n.id='en:meat'
        RETURN n
        ORDER BY n.src_position
    """
    results = session.run(query)
    expected_entries = [
        { "tags_en" : ["banana yogurts"],
          "tags_ids_en" : ["banana-yogurts"],
          "tags_fr" : ["yaourts à la banane"],
          "tags_ids_fr" : ["yaourts-banane"],
          "preceding_lines" : [], },
        { "tags_en" : ["meat"],
          "tags_ids_en" : ["meat"],
          "preceding_lines" : ['# meat',''],
          "prop_vegan_en" : "no",
          "prop_carbon_footprint_fr_foodges_value_fr" : "10" }
    ]
    for i, result in enumerate(results):
        node = result.value()
        for key in expected_entries[i]:
            assert node[key] == expected_entries[i][key]


    #Child link test
Review comment: 👍 the links part is well tested.
    x.create_child_link()  # nodes already added
    test_parser.create_child_link()  # nodes already added
    query="MATCH (c)-[:is_child_of]->(p) RETURN c.id, p.id"
    results = x.session.run(query)
    results = session.run(query)
    created_pairs = results.values()

    # correct number of links
@@ -66,10 +115,11 @@ def test_calling(new_session):
    for pair in created_pairs:
        assert pair in expected_pairs


    # Order link test
    x.create_previous_link()
    test_parser.create_previous_link()
    query="MATCH (n)-[:is_before]->(p) RETURN n.id, p.id "
    results = x.session.run(query)
    results = session.run(query)
    created_pairs = results.values()

    # correct number of links
@@ -1,4 +1,11 @@
neo4j=='4.4.5'
pytest=='7.1.2'
re=='2.2.1'
Unidecode=='1.3.4'
attrs==22.1.0
iniconfig==1.1.1
neo4j==4.4.5
packaging==21.3
pluggy==1.0.0
py==1.11.0
pyparsing==3.0.9
pytest==7.1.2
pytz==2022.1
tomli==2.0.1
Unidecode==1.3.4
@@ -1,4 +1,3 @@
import pytest
import re
import pathlib
from openfoodfacts_taxonomy_parser import parser
Review comment: I think your assumptions here are too much based on what you have seen so far, without any guarantee. There is no guarantee that we have blank lines when we change block type, and so on. Remember that taxonomies are formatted by humans. You need at least to check that data["preceding_lines"][0] is empty. So I really think my proposal is the good one; see #18 (comment). If in rare cases it adds or removes a blank line, that is OK; we do not lose any important information.
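A minimal sketch of that kind of guard, assuming the data dict used by the parser (illustrative only, not the reviewer's exact linked proposal):

def remove_separating_line(self, data):
    """Sketch: drop the leading separator only when the first preceding line is really blank."""
    preceding = data["preceding_lines"]
    # taxonomies are edited by hand, so a separating blank line is not guaranteed;
    # check that the first preceding line is actually empty before removing it
    if preceding and not preceding[0].strip():
        preceding.pop(0)
    return data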