Skip to content

Commit

Permalink
fix: fix export to local file (#352)
Browse files Browse the repository at this point in the history
* fix: update unparser to use project_label instead of multi_label

* refactor: move functions to parser utils

* refactor: change normalizing function name

* fix: lint entries

* fix: update function name
  • Loading branch information
eric-nguyen-cs authored Jan 19, 2024
1 parent 453ff89 commit b18ac03
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 52 deletions.
14 changes: 9 additions & 5 deletions backend/editor/entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import urllib.request # Sending requests

from fastapi import BackgroundTasks, HTTPException, UploadFile
from openfoodfacts_taxonomy_parser import normalizer # Normalizing tags
from openfoodfacts_taxonomy_parser import parser # Parser for taxonomies
from openfoodfacts_taxonomy_parser import unparser # Unparser for taxonomies
from openfoodfacts_taxonomy_parser import utils as parser_utils # Normalizing tags

from . import settings
from .exceptions import GithubBranchExistsError # Custom exceptions
Expand Down Expand Up @@ -65,7 +65,9 @@ async def create_node(self, label, entry, main_language_code):
if label == "ENTRY":
# Normalizing new canonical tag
language_code, canonical_tag = entry.split(":", 1)
normalised_canonical_tag = normalizer.normalizing(canonical_tag, main_language_code)
normalised_canonical_tag = parser_utils.normalize_text(
canonical_tag, main_language_code
)

# Reconstructing and updation of node ID
params["id"] = language_code + ":" + normalised_canonical_tag
Expand Down Expand Up @@ -226,7 +228,7 @@ def is_valid_branch_name(self):
"""
Helper function to check if a branch name is valid
"""
return normalizer.normalizing(self.branch_name, char="_") == self.branch_name
return parser_utils.normalize_text(self.branch_name, char="_") == self.branch_name

async def create_project(self, description):
"""
Expand Down Expand Up @@ -475,7 +477,9 @@ async def update_node(self, label, entry, new_node):
keys_language_code = key.split("_", 1)[1]
normalised_value = []
for value in new_node[key]:
normalised_value.append(normalizer.normalizing(value, keys_language_code))
normalised_value.append(
parser_utils.normalize_text(value, keys_language_code)
)
normalised_new_node[key] = new_node[key]
normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
else:
Expand Down Expand Up @@ -582,7 +586,7 @@ async def full_text_search(self, text):
"""
# Escape special characters
normalized_text = re.sub(r"[^A-Za-z0-9_]", r" ", text)
normalized_id_text = normalizer.normalizing(text)
normalized_id_text = parser_utils.normalize_text(text)

# If normalized text is empty, no searches are found
if normalized_text.strip() == "":
Expand Down
10 changes: 3 additions & 7 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from neo4j import GraphDatabase, Session, Transaction

from .logger import ParserConsoleLogger
from ..normalizer import normalizing
from ..utils import get_project_name, normalize_text
from .taxonomy_parser import (
NodeType,
PreviousLink,
Expand All @@ -26,10 +26,6 @@ def __init__(self, session: Session):
self.session = session
self.parser_logger = ParserConsoleLogger()

def _get_project_name(self, taxonomy_name: str, branch_name: str):
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name

def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str):
"""Create a TEXT, SYNONYMS or STOPWORDS node"""
if node_data.get_node_type() == NodeType.TEXT:
Expand Down Expand Up @@ -285,7 +281,7 @@ def _create_node_indexes(self, project_label: str):
self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds")

def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str):
project_label = self._get_project_name(taxonomy_name, branch_name)
project_label = get_project_name(taxonomy_name, branch_name)
# First create nodes, then create node indexes to accelerate relationship creation, then create relationships
self._create_other_nodes(taxonomy.other_nodes, project_label)
self._create_entry_nodes(taxonomy.entry_nodes, project_label)
Expand All @@ -299,7 +295,7 @@ def __call__(self, filename: str, branch_name: str, taxonomy_name: str):
"""Process the file"""
start_time = timeit.default_timer()

branch_name = normalizing(branch_name, char="_")
branch_name = normalize_text(branch_name, char="_")
taxonomy_parser = TaxonomyParser()
taxonomy = taxonomy_parser.parse_file(filename, self.parser_logger)
self._write_to_database(taxonomy, taxonomy_name, branch_name)
Expand Down
14 changes: 5 additions & 9 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .logger import ParserConsoleLogger
from .exception import DuplicateIDError
from ..normalizer import normalizing
from ..utils import normalize_filename, normalize_text


class NodeType(str, Enum):
Expand Down Expand Up @@ -75,10 +75,6 @@ class TaxonomyParser:
def __init__(self):
self.parser_logger = ParserConsoleLogger()

def _normalized_filename(self, filename: str) -> str:
"""Add the .txt extension if it is missing in the filename"""
return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "")

def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]:
"""Generator to get the file line by line"""
with open(filename, "r", encoding="utf8") as file:
Expand Down Expand Up @@ -122,15 +118,15 @@ def _add_line(self, line: str) -> str:
"""
lc, line = line.split(":", 1)
new_line = lc + ":"
new_line += self._remove_stopwords(lc, normalizing(line, lc))
new_line += self._remove_stopwords(lc, normalize_text(line, lc))
return new_line

def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
"""Get the language code "lc" and a list of normalized values"""
lc, line = line.split(":", 1)
new_line: list[str] = []
for word in line.split(","):
new_line.append(self._remove_stopwords(lc, normalizing(word, lc)))
new_line.append(self._remove_stopwords(lc, normalize_text(word, lc)))
return lc, new_line

def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
Expand Down Expand Up @@ -290,7 +286,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
tagsids_list = []
for word in line.split(","):
tags_list.append(word.strip())
word_normalized = self._remove_stopwords(lang, normalizing(word, lang))
word_normalized = self._remove_stopwords(lang, normalize_text(word, lang))
if word_normalized not in tagsids_list:
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
Expand Down Expand Up @@ -356,7 +352,7 @@ def parse_file(self, filename: str, logger: ParserConsoleLogger | None = None) -
self.parser_logger = logger
"""Process the file into a Taxonomy object"""
start_time = timeit.default_timer()
filename = self._normalized_filename(filename)
filename = normalize_filename(filename)
taxonomy = self._create_taxonomy(filename)
self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.")
self.parser_logger.info(
Expand Down
33 changes: 10 additions & 23 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from neo4j import GraphDatabase

from .normalizer import normalizing
from .utils import get_project_name, normalize_filename, normalize_text


class WriteTaxonomy:
Expand All @@ -12,20 +12,7 @@ class WriteTaxonomy:
def __init__(self, session):
self.session = session

def normalized_filename(self, filename):
"""add the .txt extension if it is missing in the filename"""
return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "")

def get_project_name(self, taxonomy_name, branch_name):
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name

def create_multi_label(self, taxonomy_name, branch_name):
"""Create a combined label with taxonomy name and branch name"""
project_name = self.get_project_name(taxonomy_name, branch_name)
return project_name + ":" + ("t_" + taxonomy_name) + ":" + ("b_" + branch_name)

def get_all_nodes(self, multi_label):
def get_all_nodes(self, project_label):
"""query the database and yield each node with its parents,
this function use the relationships between nodes"""
# This query first lists all the nodes in the "is_before" order
Expand All @@ -34,7 +21,7 @@ def get_all_nodes(self, multi_label):
# Note: OPTIONAL MATCH is used to return nodes without parents
query = f"""
MATCH path = ShortestPath(
(h:{multi_label}:TEXT)-[:is_before*]->(f:{multi_label}:TEXT)
(h:{project_label}:TEXT)-[:is_before*]->(f:{project_label}:TEXT)
)
WHERE h.id="__header__" AND f.id="__footer__"
WITH nodes(path) AS nodes, range(0, size(nodes(path))-1) AS indexes
Expand Down Expand Up @@ -92,9 +79,9 @@ def get_parents_lines(self, parents):
parent_id = parent["tags_" + lc][0]
yield "<" + lc + ":" + parent_id

def iter_lines(self, multi_label):
def iter_lines(self, project_label):
previous_block_id = ""
for node, parents in self.get_all_nodes(multi_label):
for node, parents in self.get_all_nodes(project_label):
node = dict(node)
has_content = node["id"] not in ["__header__", "__footer__"]
# eventually add a blank line but in specific case
Expand Down Expand Up @@ -134,16 +121,16 @@ def iter_lines(self, multi_label):

def rewrite_file(self, filename, lines):
"""Write a .txt file with the given name"""
filename = self.normalized_filename(filename)
filename = normalize_filename(filename)
with open(filename, "w", encoding="utf8") as file:
for line in lines:
file.write(line + "\n")

def __call__(self, filename, branch_name, taxonomy_name):
filename = self.normalized_filename(filename)
branch_name = normalizing(branch_name, char="_")
multi_label = self.create_multi_label(taxonomy_name, branch_name)
lines = self.iter_lines(multi_label)
filename = normalize_filename(filename)
branch_name = normalize_text(branch_name, char="_")
project_label = get_project_name(taxonomy_name, branch_name)
lines = self.iter_lines(project_label)
self.rewrite_file(filename, lines)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""
String normalizer
"""
import re
import unicodedata

import unidecode


def normalizing(line: str, lang="default", char="-"):
def normalize_text(line: str, lang="default", char="-"):
"""Normalize a string depending on the language code"""
line = unicodedata.normalize("NFC", line)

Expand All @@ -33,3 +30,13 @@ def normalizing(line: str, lang="default", char="-"):
line = re.sub(r"-+", char, line)
line = line.strip(char)
return line


def normalize_filename(filename: str) -> str:
    """Add the ``.txt`` extension if it is missing in the filename.

    :param filename: file name or path, with or without the ``.txt`` suffix
    :return: the filename, guaranteed to end with ``.txt``
    """
    # str.endswith already returns False for names shorter than the suffix
    # (e.g. "abc"), so the original `len(filename) < 4` guard is redundant.
    return filename if filename.endswith(".txt") else filename + ".txt"


def get_project_name(taxonomy_name: str, branch_name: str) -> str:
    """Build the project name identifying a (taxonomy, branch) pair."""
    return f"p_{taxonomy_name}_{branch_name}"
7 changes: 3 additions & 4 deletions parser/tests/unit/test_parser_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from openfoodfacts_taxonomy_parser import normalizer, parser
from openfoodfacts_taxonomy_parser import parser, utils

# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")
Expand All @@ -17,8 +17,7 @@
],
)
def test_normalized_filename(filename: str, normalized_name: str):
taxonomy_parser = parser.TaxonomyParser()
assert taxonomy_parser._normalized_filename(filename) == normalized_name
assert utils.normalize_filename(filename) == normalized_name


def test_fileiter(neo4j):
Expand All @@ -40,4 +39,4 @@ def test_fileiter(neo4j):
],
)
def test_normalizing(text: str, normalized_text: str, lang: str):
    """Check that normalize_text maps *text* to *normalized_text* for *lang*.

    Parametrized by the pytest.mark.parametrize decorator above (cases
    partially outside this view); each case supplies an input string,
    its expected normalized form, and the language code.
    """
    assert utils.normalize_text(text, lang) == normalized_text
assert utils.normalize_text(text, lang) == normalized_text

0 comments on commit b18ac03

Please sign in to comment.