Skip to content

Commit

Permalink
Fix data written by dictionary_pickler.py
Browse files (browse the repository at this point in the history)
The changes to the return types of `DictionaryFactory.get_dictionary()`
in 63933fc broke the generation of dictionaries using
`training/dictionary_pickler.py`. This commit restores it by reverting
the changes that 63933fc made to `training/dictionary_pickler.py`.
  • Loading branch information
Dunedan committed Nov 5, 2024
1 parent 10cc7fb commit 7e24ae0
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions training/dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import re
from operator import itemgetter
from pathlib import Path
from typing import Dict, List, Optional
from typing import ByteString, Dict, List, Optional

import simplemma
from simplemma.strategies.defaultrules import DEFAULT_RULES
Expand Down Expand Up @@ -49,7 +49,9 @@ def _determine_path(listpath: str, langcode: str) -> str:
return str(Path(__file__).parent / filename)


def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
def _read_dict(
filepath: str, langcode: str, silent: bool
) -> Dict[ByteString, ByteString]:
mydict: Dict[str, str] = {}
myadditions: List[str] = []
i: int = 0
Expand Down Expand Up @@ -120,12 +122,12 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
mydict[word] = word
LOGGER.debug("%s %s", langcode, i)
# sort and convert to bytestrings
return dict(sorted(mydict.items()))
return {k.encode("utf-8"): v.encode("utf-8") for k, v in sorted(mydict.items())}


def _load_dict(
langcode: str, listpath: str = "lists", silent: bool = True
) -> Dict[str, str]:
) -> Dict[ByteString, ByteString]:
filepath = _determine_path(listpath, langcode)
return _read_dict(filepath, langcode, silent)

Expand Down

0 comments on commit 7e24ae0

Please sign in to comment.