feat: add error logs to parser (#37)
to help spot problems instead of simply ignoring them silently.
BryanH01 authored Aug 19, 2022
1 parent 3eb9149 commit 2e8e49f
Showing 1 changed file with 116 additions and 23 deletions.
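
A quick orientation (editor's sketch, not part of the commit): with this change, malformed taxonomy lines are reported through Python's logging module instead of being dropped silently. Assuming a Neo4j instance at the default bolt://localhost:7687 and a taxonomy file named "test" (as in the __main__ block below), library usage could look like:

    import logging
    from openfoodfacts_taxonomy_parser.parser import Parser

    logging.basicConfig(level=logging.INFO)
    parser = Parser()
    parser("test")  # problems such as missing language codes are now logged with their line numbers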
139 changes: 116 additions & 23 deletions parser/openfoodfacts_taxonomy_parser/parser.py
@@ -1,9 +1,19 @@
from neo4j import GraphDatabase
import re, unicodedata, unidecode
from .exception import DuplicateIDError
import logging


def ellipsis(text, max=20):
"""Cut a text adding eventual ellipsis if we do not display it fully
"""
return text[:max] + ('...' if len(text) > max else '')
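# For example (editor's note, not in the original commit):
#   ellipsis("a very long line of taxonomy text") returns "a very long line of ..."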


class Parser:
"""Parse a taxonomy file and build a neo4j graph
"""

def __init__(self, uri="bolt://localhost:7687"):
self.driver = GraphDatabase.driver(uri)
self.session = (
@@ -212,9 +222,14 @@ def remove_separating_line(self, data):

def harvest(self, filename):
"""Transform data from file to dictionary"""
saved_nodes = []
index_stopwords = 0
index_synonyms = 0
language_code_prefix = re.compile("[a-zA-Z][a-zA-Z][a-zA-Z]?:")
language_code_prefix = re.compile(
r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
)
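        # matches language-code prefixes such as "en:", "fra:", "en-GB:" or "en_GB:"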
        # checks that a property name or language code contains only word characters
correctly_written = re.compile(r"\w+\Z")
# stopwords will contain a list of stopwords with their language code as key
self.stopwords = {}

@@ -228,41 +243,72 @@ def harvest(self, filename):
for line_number, line in self.file_iter(filename, next_line):
# yield data if block ended
if self.entry_end(line, data):
if data["id"] in saved_nodes:
msg = f"Entry with same id {data['id']} already created, "
msg += f"duplicate id in file at line {data['src_position']}. "
msg += f"Node creation cancelled"
logging.error(msg)
else:
data = self.remove_separating_line(data)
yield data # another function will use this dictionary to create a node
self.is_before = data["id"]
saved_nodes.append(data["id"])
data = self.new_node_data()

# harvest the line
            if not line or line[0] == "#":
# comment or blank
data["preceding_lines"].append(line)
else:
line = line.rstrip(",")
if not data["src_position"]:
data["src_position"] = line_number + 1
if line.startswith("stopwords"):
# general stopwords definition for a language
id = "stopwords:" + str(index_stopwords)
data = self.set_data_id(data, id, line_number)
index_stopwords += 1
try:
lc, value = self.get_lc_value(line[10:])
except ValueError:
logging.error(
"Missing language code at line %d ? '%s'",
line_number + 1,
ellipsis(line),
)
else:
data["tags_" + lc] = value
# add the list with its lc
self.stopwords[lc] = value
elif line.startswith("synonyms"):
# general synonyms definition for a language
id = "synonyms:" + str(index_synonyms)
data = self.set_data_id(data, id, line_number)
index_synonyms += 1
line = line[9:]
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self.get_lc_value(line)
except ValueError:
logging.error(
"Missing language code at line %d ? '%s'",
line_number + 1,
ellipsis(line),
)
else:
data["tags_" + lc] = tags
data["tags_ids_" + lc] = value
elif line[0] == "<":
# parent definition
data["parent_tag"].append(self.add_line(line[1:]))
elif language_code_prefix.match(line):
# synonyms definition
# to transform '-' from language code to '_'
line = line.replace("-", "_")
if not data["id"]:
data["id"] = self.add_line(line.split(",", 1)[0])
# first characters before ":" are the language code
data["main_language"] = data["id"].split(":", 1)[0]
# add tags and tagsid
lang, line = line.split(":", 1)
tags_list = []
@@ -272,26 +318,54 @@ def harvest(self, filename):
word_normalized = self.remove_stopwords(
lang, self.normalizing(word, lang)
)
if word_normalized not in tagsids_list:
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
data["tags_" + lang] = tags_list
data["tags_ids_" + lang] = tagsids_list
else:
# property definition
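                    # lines here look like "<property_name>:<lc>:<value>",
                    # e.g. "description:en: a textual description" (illustrative example)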
property_name = None
try:
property_name, lc, property_value = line.split(":", 2)
except ValueError:
logging.error(
"Reading error at line %d, unexpected format: '%s'",
line_number + 1,
ellipsis(line),
)
else:
# in case there is space before or after the colons
property_name = property_name.strip()
lc = lc.strip().replace("-", "_")
if not (
correctly_written.match(property_name)
and correctly_written.match(lc)
):
logging.error(
"Reading error at line %d, unexpected format: '%s'",
line_number + 1,
ellipsis(line),
)
if property_name:
data["prop_" + property_name + "_" + lc] = property_value

data["id"] = "__footer__"
data["preceding_lines"].pop(0)
data["src_position"] = line_number + 1 - len(data["preceding_lines"])
yield data

def create_nodes(self, filename):
"""Adding nodes to database"""
logging.info("Creating nodes")
filename = self.normalized_filename(filename)
harvested_data = self.harvest(filename)
self.create_headernode(next(harvested_data))
for entry in harvested_data:
self.create_node(entry)

def create_previous_link(self):
logging.info("Creating 'is_before' links")
query = "MATCH(n) WHERE exists(n.is_before) return n.id,n.is_before"
results = self.session.run(query)
for result in results:
@@ -301,9 +375,24 @@ def create_previous_link(self):
query = """
MATCH(n) WHERE n.id = $id
MATCH(p) WHERE p.id= $id_previous
CREATE (p)-[r:is_before]->(n)
RETURN r
"""
results = self.session.run(query, id=id, id_previous=id_previous)
relation = results.values()
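            # the query returns one row per created link: several rows mean an
            # id matched more than one node, zero rows mean a MATCH clause failed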
if len(relation) > 1:
logging.error(
"2 or more 'is_before' links created for ids %s and %s, "
"one of the ids isn't unique",
id,
id_previous,
)
            elif not relation:
logging.error(
"link not created between %s and %s",
id,
id_previous,
)

def parent_search(self):
"""Get the parent and the child to link"""
@@ -317,17 +406,20 @@ def parent_search(self):

def create_child_link(self):
"""Create the relations between nodes"""
logging.info("Creating 'is_child_of' links")
for parent, child_id in self.parent_search():
lc, parent_id = parent.split(":")
tags_ids = "tags_ids_" + lc
query = """ MATCH(p) WHERE $parent_id IN p.tags_ids_""" + lc
query += """
MATCH(c) WHERE c.id= $child_id
CREATE (c)-[r:is_child_of]->(p)
RETURN r
"""
result = self.session.run(query, parent_id=parent_id, child_id=child_id)
if not result.value():
                logging.warning(
                    "parent not found for child %s with parent %s",
                    child_id,
                    parent_id,
                )

def delete_used_properties(self):
query = "MATCH (n) SET n.is_before = null, n.parents = null"
@@ -342,5 +434,6 @@ def __call__(self, filename):


if __name__ == "__main__":
logging.basicConfig(filename="parser.log", encoding="utf-8", level=logging.INFO)
use = Parser()
use("test")
