feat: taxonomy parser library #18

Merged Jul 29, 2022 · 29 commits

Commits
5405f36
build: add basic neo4j capability
alexgarel Jul 5, 2022
3f748b0
build: add neo4j with docker
alexgarel Jul 5, 2022
567f5ac
Creating python script
BryanH01 Jul 6, 2022
8aea655
Update Parser.py
BryanH01 Jul 6, 2022
a631182
Update Parser.py
BryanH01 Jul 6, 2022
c41afca
Update Parser.py
BryanH01 Jul 6, 2022
f9e4212
New function : parent
BryanH01 Jul 7, 2022
4a7c7b2
Update Parser.py
BryanH01 Jul 8, 2022
f71d563
More pythonic code ? + test
BryanH01 Jul 15, 2022
b2b2877
Changed parser filename and updated it
BryanH01 Jul 19, 2022
d57c678
Update test_Parser.py
BryanH01 Jul 20, 2022
0f9460e
Changed file name
BryanH01 Jul 20, 2022
002a89f
Update taxonomy_parser.py
BryanH01 Jul 20, 2022
fb01ebe
Update taxonomy_parser.py
BryanH01 Jul 20, 2022
b1118d5
Update taxonomy_parser.py
BryanH01 Jul 21, 2022
f95fdfe
Changed directory, Updated parser with the comment and new spec
BryanH01 Jul 25, 2022
784ded7
Changed name and added integration test
BryanH01 Jul 26, 2022
35bff8e
tests: fix tests to have correct import
alexgarel Jul 26, 2022
3c49563
Small fix for header reading
BryanH01 Jul 26, 2022
e2b4a2b
Add requirements.txt
BryanH01 Jul 26, 2022
e9e84ab
Added main_language, made some corrections
BryanH01 Jul 27, 2022
fdafad8
Merge branch 'main' into parser
BryanH01 Jul 27, 2022
8fafca2
Change name previous_block to is_before
BryanH01 Jul 27, 2022
74c6dee
Updated following your comments
BryanH01 Jul 28, 2022
93ad192
Update parser/openfoodfacts_taxonomy_parser/parser.py
BryanH01 Jul 28, 2022
b11886f
Changed harvesting method to correctly harvest stopwords and synonyms
BryanH01 Jul 28, 2022
03130fc
Update parser.py
BryanH01 Jul 28, 2022
3bc6f46
Changed with your suggestions
BryanH01 Jul 29, 2022
a7044ba
Final changes ?
BryanH01 Jul 29, 2022
2 changes: 1 addition & 1 deletion doc/how-to-guides/develop-with-neo4j.md
@@ -72,4 +72,4 @@ If you are using a local python environment, the database will be accessible at
## Use it in another docker service

If you run another docker service and want to access neo4j, it will be accessible at `neo4j:7687`
-provided your container runs in the same network (should default to `taxonomy-editor_default`)
+provided your container runs in the same network (should default to `taxonomy-editor_default`)
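
As an illustration (not part of the diff): a minimal connection check with the neo4j Python driver, assuming it runs inside a container on that network.

from neo4j import GraphDatabase

# inside the taxonomy-editor_default network, the database is reachable
# by its docker service name instead of localhost
driver = GraphDatabase.driver("bolt://neo4j:7687")
with driver.session() as session:
    print(session.run("RETURN 1 AS ok").single()["ok"])  # prints 1 if reachable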
Empty file.
6 changes: 6 additions & 0 deletions parser/openfoodfacts_taxonomy_parser/exception.py
@@ -0,0 +1,6 @@
class DuplicateIDError(Exception):
    """Raised when attempting to change the id before the related node
    has been added, i.e. when the .txt file is missing a new line"""

    def __init__(self, line):
        exception_message = f"missing new line at line {line}"
        super().__init__(exception_message)
Comment on lines +4 to +6 (Member):

Just for next time (no need to change it now): it might be better to keep `line` in `exception.args` (a tuple) and override the `__str__` function. This is because, when logging, tools like Sentry can group exceptions, provided we keep the moving parts out of the message. :-)
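
A minimal sketch of that suggestion (hypothetical code, not part of this PR):

import logging

class DuplicateIDError(Exception):
    """Raised when the .txt file is missing a new line between two entries."""

    def __str__(self):
        # the line number stays in self.args (a tuple), so logging tools
        # such as Sentry can group all instances of this exception
        return f"missing new line at line {self.args[0]}"

try:
    raise DuplicateIDError(42)
except DuplicateIDError as e:
    logging.error(e)  # logs "missing new line at line 42"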

346 changes: 346 additions & 0 deletions parser/openfoodfacts_taxonomy_parser/parser.py
@@ -0,0 +1,346 @@
import re
import unicodedata

import unidecode
from neo4j import GraphDatabase

from .exception import DuplicateIDError


class Parser:
    def __init__(self, uri="bolt://localhost:7687"):
        self.driver = GraphDatabase.driver(uri)
        # creating a session doesn't raise an error even if there is no active database
        self.session = self.driver.session()

    def create_headernode(self, header):
        """Create the node for the header"""
        query = """
            CREATE (n:TEXT {id: '__header__'})
            SET n.preceding_lines = $header
            SET n.src_position = 1
        """
        self.session.run(query, header=header)

    def create_node(self, data):
        """Run the query to create the node with the data dictionary"""
        position_query = """
            SET n.is_before = $is_before
            SET n.preceding_lines = $preceding_lines
            SET n.src_position = $src_position
        """
        entry_query = ""
        if data["id"] == "__footer__":
            id_query = " CREATE (n:TEXT {id: $id}) \n"
        elif data["id"].startswith("synonyms"):
            id_query = " CREATE (n:SYNONYMS {id: $id}) \n"
        elif data["id"].startswith("stopwords"):
            id_query = " CREATE (n:STOPWORDS {id: $id}) \n"
        else:
            id_query = " CREATE (n:ENTRY {id: $id, main_language: $main_language}) \n"
        if data["parent_tag"]:
            entry_query += " SET n.parents = $parent_tag \n"
        for key in data:
            if key.startswith("prop_"):
                entry_query += " SET n." + key + " = $" + key + "\n"
        for key in data:
            if key.startswith("tags_"):
                entry_query += " SET n." + key + " = $" + key + "\n"

        query = id_query + entry_query + position_query
        self.session.run(query, data, is_before=self.is_before)

    def normalized_filename(self, filename):
        """Add the .txt extension if it is missing from the filename"""
        return filename if filename.endswith(".txt") else filename + ".txt"

    def file_iter(self, filename, start=0):
        """Generator to get the file line by line"""
        with open(filename, "r", encoding="utf8") as file:
            for line_number, line in enumerate(file):
                if line_number < start:
                    continue
                # sanitizing:
                # remove any space characters at the end of the line
                line = line.rstrip()
                # replace the typographic quote ’ with a simple quote '
                line = line.replace("’", "'")
                # replace commas between digits, and escaped commas (preceded by a \),
                # with a lower comma character
                # (to distinguish them from commas acting as tag separators)
                line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
                line = re.sub(r"\\,", "\\‚", line)
                # remove parentheses around roman numerals
                line = re.sub(r"\(([ivx]+)\)", r"\1", line, flags=re.I)
                yield line_number, line
        yield line_number, ""  # to end the last entry if it is not already ended

    def normalizing(self, line, lang="default"):
        """Normalize a string depending on the language code lang"""
        line = unicodedata.normalize("NFC", line)

        # removing accents
        if lang in ["fr", "ca", "es", "it", "nl", "pt", "sk", "en"]:
            line = re.sub(r"[¢£¤¥§©ª®°²³µ¶¹º¼½¾×‰€™]", "-", line)
            line = unidecode.unidecode(line)

        # lower case, except if the language is in the (currently empty) exception list
        if lang not in []:
            line = line.lower()

        # changing unwanted characters to "-"
        line = re.sub(r"[\u0000-\u0027\u200b]", "-", line)
        line = re.sub(r"&\w+;", "-", line)
        line = re.sub(
            r"[\s!\"#\$%&'()*+,\/:;<=>?@\[\\\]^_`{\|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×ˆ˜–—‘’‚“”„†‡•…‰‹›€™\t]",
            "-",
            line,
        )

        # removing excess "-"
        line = re.sub(r"-+", "-", line)
        line = line.strip("-")
        return line

    def remove_stopwords(self, lc, words):
        """Remove the stopwords that were read at the beginning of the file"""
        # first check if this language has stopwords
        if lc in self.stopwords:
            words_to_remove = self.stopwords[lc]
            new_words = []
            for word in words.split("-"):
                if word not in words_to_remove:
                    new_words.append(word)
            return "-".join(new_words)
        else:
            return words

    def add_line(self, line):
        """Get a normalized string, keeping the "lc:" language code prefix;
        used for ids and parent tags"""
        lc, line = line.split(":", 1)
        new_line = lc + ":"
        new_line += self.remove_stopwords(lc, self.normalizing(line, lc))
        return new_line

    def get_lc_value(self, line):
        """Get the language code "lc" and a list of normalized values"""
        lc, line = line.split(":", 1)
        new_line = []
        for word in line.split(","):
            new_line.append(self.remove_stopwords(lc, self.normalizing(word, lc)))
        return lc, new_line

    def new_node_data(self):
        """Create an empty dictionary that will be used to create a node"""
        data = {
            "id": "",
            "main_language": "",
            "preceding_lines": [],
            "parent_tag": [],
            "src_position": None,
        }
        return data

    def set_data_id(self, data, id, line_number):
        if not data["id"]:
            data["id"] = id
        else:
            raise DuplicateIDError(line_number)
        return data

    def header_harvest(self, filename):
        """Harvest the header (comment lines starting with #);
        it has its own function because some headers have multiple blocks"""
        h = 0
        header = []
        for _, line in self.file_iter(filename):
            if not line or line[0] == "#":
                header.append(line)
            else:
                break
            h += 1

        # we don't want to eat the comments of the next block,
        # and this removes the last separating line
        for _ in range(len(header)):
            if header.pop():
                h -= 1
            else:
                break

        return header, h

    def entry_end(self, line, data):
        """Return True if the block ended"""
        # stopwords and synonyms are one-liners; entries are separated by a blank line
        if line.startswith("stopwords") or line.startswith("synonyms") or not line:
            # this can be the end of a block or just an additional separating line;
            # file_iter() always ends with ''
            if data["id"]:  # to be sure that it's an end
                return True
        return False

    def remove_separating_line(self, data):
        """
        Remove the one separating line that is always there,
        between the synonyms part and the stopwords part and before each entry
        """
        # first, check if there is at least one preceding line
        if data["preceding_lines"] and not data["preceding_lines"][0]:
            if data["id"].startswith("synonyms"):
                # it's a synonyms block;
                # if the previous block is a stopwords block,
                # there is at least one separating line
                if "stopwords" in self.is_before:
                    data["preceding_lines"].pop(0)

            elif data["id"].startswith("stopwords"):
                # it's a stopwords block;
                # if the previous block is a synonyms block,
                # there is at least one separating line
                if "synonyms" in self.is_before:
                    data["preceding_lines"].pop(0)

            else:
                # it's an entry block; there is always a separating line
                data["preceding_lines"].pop(0)
        return data

    def harvest(self, filename):
        """Transform data from the file into dictionaries"""
        index_stopwords = 0
        index_synonyms = 0
        language_code_prefix = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")
Comment (Member):

Not mandatory: as this is a constant and in some way a parameter, I would move it to a class attribute.
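
A sketch of the class-attribute variant the comment describes (hypothetical, helper name assumed):

import re

class Parser:
    # class attribute: compiled once, shared by all instances,
    # and easy to override in a subclass if the format changes;
    # matches a 2-3 letter language code prefix such as "en:" or "fra:"
    LANGUAGE_CODE_PREFIX = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")

    def matches_language_prefix(self, line):
        # hypothetical helper showing the attribute in use
        return bool(self.LANGUAGE_CODE_PREFIX.match(line))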

        # self.stopwords will map each language code to its list of stopwords
        self.stopwords = {}

        # header
        header, next_line = self.header_harvest(filename)
        yield header
        self.is_before = "__header__"

        # the other entries
        data = self.new_node_data()
        for line_number, line in self.file_iter(filename, next_line):
            # yield data if the block ended
            if self.entry_end(line, data):
                data = self.remove_separating_line(data)
                yield data  # another function will use this dictionary to create a node
                self.is_before = data["id"]
                data = self.new_node_data()

            # harvest the line
            if not line or line[0] == "#":
                data["preceding_lines"].append(line)
            else:
                if not data["src_position"]:
                    data["src_position"] = line_number + 1
                if line.startswith("stopwords"):
                    id = "stopwords:" + str(index_stopwords)
                    data = self.set_data_id(data, id, line_number)
                    index_stopwords += 1
                    lc, value = self.get_lc_value(line[10:])
                    data["tags_" + lc] = value
                    # add the list with its lc
                    self.stopwords[lc] = value
                elif line.startswith("synonyms"):
                    id = "synonyms:" + str(index_synonyms)
                    data = self.set_data_id(data, id, line_number)
                    index_synonyms += 1
                    line = line[9:]
                    # line[3:] drops the "lc:" prefix (assumes a 2-letter language code)
                    tags = [words.strip() for words in line[3:].split(",")]
                    lc, value = self.get_lc_value(line)
                    data["tags_" + lc] = tags
                    data["tags_ids_" + lc] = value
                elif line[0] == "<":
                    data["parent_tag"].append(self.add_line(line[1:]))
                elif language_code_prefix.match(line):
                    if not data["id"]:
                        data["id"] = self.add_line(line.split(",", 1)[0])
                        # the first 2-3 characters before ":" are the language code
                        data["main_language"] = data["id"].split(":", 1)[0]
                    # add tags and tag ids
                    lang, line = line.split(":", 1)
                    tags_list = []
                    tagsids_list = []
                    for word in line.split(","):
                        tags_list.append(word.strip())
                        word_normalized = self.remove_stopwords(
                            lang, self.normalizing(word, lang)
                        )
                        tagsids_list.append(word_normalized)
                    data["tags_" + lang] = tags_list
                    data["tags_ids_" + lang] = tagsids_list
                else:
                    property_name, lc, property_value = line.split(":", 2)
                    data["prop_" + property_name + "_" + lc] = property_value
        # what remains at the end of the file is the footer
        data["id"] = "__footer__"
        data["preceding_lines"].pop(0)
        data["src_position"] = line_number + 1 - len(data["preceding_lines"])
        yield data

    def create_nodes(self, filename):
        """Add the nodes to the database"""
        filename = self.normalized_filename(filename)
        harvested_data = self.harvest(filename)
        self.create_headernode(next(harvested_data))
        for entry in harvested_data:
            self.create_node(entry)

    def create_previous_link(self):
        """Create the is_before relations between nodes"""
        query = "MATCH (n) WHERE exists(n.is_before) RETURN n.id, n.is_before"
        results = self.session.run(query)
        for result in results:
            id = result["n.id"]
            id_previous = result["n.is_before"]

            query = """
                MATCH (n) WHERE n.id = $id
                MATCH (p) WHERE p.id = $id_previous
                CREATE (p)-[:is_before]->(n)
            """
            self.session.run(query, id=id, id_previous=id_previous)

    def parent_search(self):
        """Get the parent and the child to link"""
        query = "MATCH (n) WHERE size(n.parents) > 0 RETURN n.id, n.parents"
        results = self.session.run(query)
        for result in results:
            id = result["n.id"]
            parent_list = result["n.parents"]
            for parent in parent_list:
                yield parent, id

    def create_child_link(self):
        """Create the is_child_of relations between nodes"""
        for parent, child_id in self.parent_search():
            lc, parent_id = parent.split(":")
            # the parent is matched through its normalized tag
            # in its tags_ids_<lc> property
            query = """ MATCH (p) WHERE $parent_id IN p.tags_ids_""" + lc
            query += """
                MATCH (c) WHERE c.id = $child_id
                CREATE (c)-[:is_child_of]->(p)
            """
            self.session.run(query, parent_id=parent_id, child_id=child_id)

    def delete_used_properties(self):
        query = "MATCH (n) SET n.is_before = null, n.parents = null"
        self.session.run(query)

    def __call__(self, filename):
        """Process the file"""
        self.create_nodes(filename)
        self.create_child_link()
        self.create_previous_link()
        # self.delete_used_properties()


if __name__ == "__main__":
    use = Parser()
    use("test")
3 changes: 3 additions & 0 deletions parser/openfoodfacts_taxonomy_parser/requirements.txt
@@ -0,0 +1,3 @@
neo4j==4.4.5
pytz==2022.1
Unidecode==1.3.4
Empty file added parser/tests/__init__.py
Empty file.