From db5f5d8185557b270d3753d558e753c9b049b64d Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Tue, 5 Nov 2024 17:20:06 +0100 Subject: [PATCH] Fix data written by dictionary_pickler.py (#156) * Fix data written by dictionary_pickler.py The changes to the return types of `DictionaryFactory.get_dictionary()` in 63933fc broke the generation of dictionaries using `training/dictionary_pickler.py`. This commit fixes that again, by undoing the changes to `training/dictionary_pickler.py` made by 63933fc. * Fix tests for dictionary_pickler --- tests/test_dictionary_pickler.py | 4 ++-- training/dictionary_pickler.py | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/test_dictionary_pickler.py b/tests/test_dictionary_pickler.py index 37136f2..2fc806f 100644 --- a/tests/test_dictionary_pickler.py +++ b/tests/test_dictionary_pickler.py @@ -26,9 +26,9 @@ def test_logic() -> None: # different order mydict = dictionary_pickler._read_dict(testfile, "es", silent=True) assert len(mydict) == 5 - assert mydict["closeones"] == "closeone" + assert mydict[b"closeones"] == b"closeone" item = sorted(mydict.keys(), reverse=True)[0] - assert item == "valid-word" + assert item == b"valid-word" # file I/O assert dictionary_pickler._determine_path("lists", "de").endswith("de.txt") diff --git a/training/dictionary_pickler.py b/training/dictionary_pickler.py index 69f4692..15345d1 100644 --- a/training/dictionary_pickler.py +++ b/training/dictionary_pickler.py @@ -10,7 +10,7 @@ import re from operator import itemgetter from pathlib import Path -from typing import Dict, List, Optional +from typing import ByteString, Dict, List, Optional import simplemma from simplemma.strategies.defaultrules import DEFAULT_RULES @@ -49,7 +49,9 @@ def _determine_path(listpath: str, langcode: str) -> str: return str(Path(__file__).parent / filename) -def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]: +def _read_dict( + filepath: str, langcode: str, silent: bool +) -> Dict[ByteString, ByteString]: mydict: Dict[str, str] = {} myadditions: List[str] = [] i: int = 0 @@ -120,12 +122,12 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]: mydict[word] = word LOGGER.debug("%s %s", langcode, i) # sort and convert to bytestrings - return dict(sorted(mydict.items())) + return {k.encode("utf-8"): v.encode("utf-8") for k, v in sorted(mydict.items())} def _load_dict( langcode: str, listpath: str = "lists", silent: bool = True -) -> Dict[str, str]: +) -> Dict[ByteString, ByteString]: filepath = _determine_path(listpath, langcode) return _read_dict(filepath, langcode, silent)