Skip to content

Commit

Permalink
use binary strings in dictionaries to save memory (#128)
Browse files Browse the repository at this point in the history
* use binary strings in dictionaries to save memory

* fix mypy

* update data

* fix code and descriptions

* remove unused import

* add in_place arg to dictionary_pickler

* explicit filename for readability
  • Loading branch information
adbar authored May 24, 2024
1 parent 2caac80 commit 546ab7e
Show file tree
Hide file tree
Showing 56 changed files with 50 additions and 26 deletions.
Binary file modified simplemma/strategies/dictionaries/data/ast.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/bg.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ca.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/cs.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/cy.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/da.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/de.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/el.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/en.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/enm.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/es.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/et.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/fa.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/fi.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/fr.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ga.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/gd.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/gl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/gv.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hbs.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hi.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hu.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hy.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/id.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/is.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/it.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ka.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/la.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/lb.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/lt.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/lv.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/mk.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ms.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/nb.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/nl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/nn.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/pl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/pt.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ro.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ru.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/se.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sk.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sq.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sv.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sw.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/tl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/tr.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/uk.plzma
Binary file not shown.
10 changes: 5 additions & 5 deletions simplemma/strategies/dictionaries/dictionary_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from functools import lru_cache
from os import listdir, path
from pathlib import Path
from typing import Dict
from typing import ByteString, Dict

if sys.version_info >= (3, 8):
from typing import Protocol
Expand All @@ -30,7 +30,7 @@
]


def _load_dictionary_from_disk(langcode: str) -> Dict[str, str]:
def _load_dictionary_from_disk(langcode: str) -> Dict[ByteString, ByteString]:
"""
Load a dictionary from disk.
Expand Down Expand Up @@ -68,7 +68,7 @@ class DictionaryFactory(Protocol):
def get_dictionary(
self,
lang: str,
) -> Dict[str, str]:
) -> Dict[ByteString, ByteString]:
"""
Get the dictionary for a specific language.
Expand Down Expand Up @@ -102,15 +102,15 @@ def __init__(self, cache_max_size: int = 8):
cache_max_size (int): The maximum size of the cache for loaded dictionaries.
Defaults to `8`.
"""
self._data: Dict[str, Dict[str, str]] = {}
self._data: Dict[str, Dict[ByteString, ByteString]] = {}
self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
_load_dictionary_from_disk
)

def get_dictionary(
self,
lang: str,
) -> Dict[str, str]:
) -> Dict[ByteString, ByteString]:
"""
Get the dictionary for a specific language.
Expand Down
16 changes: 12 additions & 4 deletions simplemma/strategies/dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
It provides lemmatization using dictionary lookup.
"""

from typing import Optional
from typing import ByteString, Dict, Optional

from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy
Expand All @@ -26,6 +26,13 @@ def __init__(
"""
self._dictionary_factory = dictionary_factory

def _get(
self, token: str, dictionary: Dict[ByteString, ByteString]
) -> Optional[str]:
"Convenience function to handle bytestring to string conversion."
result = dictionary.get(token.encode("utf-8"))
return result.decode("utf-8") if result else None # type: ignore[union-attr]

def get_lemma(self, token: str, lang: str) -> Optional[str]:
"""
Get Lemma using Dictionary Lookup
Expand All @@ -43,8 +50,9 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
"""
# Search the language data, reverse case to extend coverage.
dictionary = self._dictionary_factory.get_dictionary(lang)
if token in dictionary:
return dictionary[token]
result = self._get(token, dictionary)
if result:
return result
# Try upper or lowercase.
token = token.lower() if token[0].isupper() else token.capitalize()
return dictionary.get(token)
return self._get(token, dictionary)
4 changes: 2 additions & 2 deletions simplemma/strategies/greedy_dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def get_lemma(self, token: str, lang: str) -> str:
return token

dictionary = self._dictionary_factory.get_dictionary(lang)
candidate = token
candidate = token.encode("utf-8")
for _ in range(self._steps):
if candidate not in dictionary:
break
Expand All @@ -73,4 +73,4 @@ def get_lemma(self, token: str, lang: str) -> str:

candidate = new_candidate

return candidate
return candidate.decode("utf-8")
8 changes: 6 additions & 2 deletions simplemma/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
- [validate_lang_input][simplemma.utils.validate_lang_input]: Validates the language input and ensures it is a valid tuple.
"""

from typing import Tuple, Union
from typing import ByteString, Tuple, Union


def validate_lang_input(lang: Union[str, Tuple[str, ...]]) -> Tuple[str]:
Expand All @@ -31,7 +31,9 @@ def validate_lang_input(lang: Union[str, Tuple[str, ...]]) -> Tuple[str]:
return lang # type: ignore[return-value]


def levenshtein_dist(str1: str, str2: str) -> int:
def levenshtein_dist(
first: Union[ByteString, str], second: Union[ByteString, str]
) -> int:
"""
Calculate the Levenshtein distance between two strings.
Expand All @@ -47,6 +49,8 @@ def levenshtein_dist(str1: str, str2: str) -> int:
int: The Levenshtein distance between the two strings.
"""
str1 = first.encode("utf-8") if isinstance(first, str) else first
str2 = second.encode("utf-8") if isinstance(second, str) else second
# inspired by this noticeably faster code:
# https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b
if str1 == str2:
Expand Down
5 changes: 3 additions & 2 deletions tests/test_dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ def test_logic() -> None:
# different order
mydict = dictionary_pickler._read_dict(testfile, "es", silent=True)
assert len(mydict) == 5
assert mydict["closeones"] == "closeone"
assert mydict[b"closeones"] == b"closeone"
item = sorted(mydict.keys(), reverse=True)[0]
assert item == "valid-word"
assert item == b"valid-word"

# file I/O
assert dictionary_pickler._determine_path("lists", "de").endswith("de.txt")
Expand All @@ -37,3 +37,4 @@ def test_logic() -> None:
listpath = os.path.join(TEST_DIR, "data")
os_handle, temp_outputfile = tempfile.mkstemp(suffix=".pkl", text=True)
dictionary_pickler._pickle_dict("zz", listpath, temp_outputfile)
dictionary_pickler._pickle_dict("zz", listpath, in_place=True)
6 changes: 3 additions & 3 deletions tests/test_lemmatizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Tests for `simplemma` package."""

from typing import Dict
from typing import ByteString, Dict

import pytest

Expand All @@ -17,8 +17,8 @@ class CustomDictionaryFactory(DictionaryFactory):
def get_dictionary(
self,
lang: str,
) -> Dict[str, str]:
return {"testing": "the test works!!"}
) -> Dict[ByteString, ByteString]:
return {b"testing": b"the test works!!"}

assert (
Lemmatizer(
Expand Down
27 changes: 19 additions & 8 deletions training/dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import re
from operator import itemgetter
from pathlib import Path
from typing import Dict, List, Optional
from typing import ByteString, Dict, List, Optional

import simplemma
from simplemma.strategies.defaultrules import DEFAULT_RULES
Expand Down Expand Up @@ -49,7 +49,9 @@ def _determine_path(listpath: str, langcode: str) -> str:
return str(Path(__file__).parent / filename)


def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
def _read_dict(
filepath: str, langcode: str, silent: bool
) -> Dict[ByteString, ByteString]:
mydict: Dict[str, str] = {}
myadditions: List[str] = []
i: int = 0
Expand Down Expand Up @@ -80,8 +82,8 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
# print line if the rule is wrong
if (
len(columns[1]) > 6
and columns[1] != columns[0]
and langcode in DEFAULT_RULES
and columns[1] != columns[0]
):
rule = DEFAULT_RULES[langcode](columns[1])
if rule is not None and rule != columns[1]:
Expand Down Expand Up @@ -119,32 +121,41 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
for word in myadditions:
mydict[word] = word
LOGGER.debug("%s %s", langcode, i)
return dict(sorted(mydict.items()))
# sort and convert to bytestrings
return {k.encode("utf-8"): v.encode("utf-8") for k, v in sorted(mydict.items())}


def _load_dict(
langcode: str, listpath: str = "lists", silent: bool = True
) -> Dict[str, str]:
) -> Dict[ByteString, ByteString]:
filepath = _determine_path(listpath, langcode)
return _read_dict(filepath, langcode, silent)


def _pickle_dict(
langcode: str, listpath: str = "lists", filepath: Optional[str] = None
langcode: str = "en",
listpath: str = "lists",
filepath: Optional[str] = None,
in_place: bool = False,
) -> None:
mydict = _load_dict(langcode, listpath)
# sort dictionary to help saving space during compression
if langcode not in ("lt", "sw"):
mydict = dict(sorted(mydict.items(), key=itemgetter(1)))
if filepath is None:
filename = f"strategies/dictionaries/data/{langcode}.plzma"
filepath = str(Path(simplemma.__file__).parent / filename)
directory = (
Path(simplemma.__file__).parent
if in_place
else Path(__file__).parent.parent / "simplemma"
)
filepath = str(directory / filename)
with lzma.open(filepath, "wb") as filehandle: # , filters=my_filters, preset=9
pickle.dump(mydict, filehandle, protocol=4)
LOGGER.debug("%s %s", langcode, len(mydict))


if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
for listcode in SUPPORTED_LANGUAGES:
for listcode in sorted(SUPPORTED_LANGUAGES):
_pickle_dict(listcode)

0 comments on commit 546ab7e

Please sign in to comment.