Skip to content

Commit

Permalink
Merge pull request #949 from rkingsbury/encoding
Browse files Browse the repository at this point in the history
Add character encoding kwarg to JSONStore and FileStore
  • Loading branch information
rkingsbury authored May 13, 2024
2 parents 13bf6e7 + 3dbc2d6 commit 6093cf3
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 3 deletions.
9 changes: 8 additions & 1 deletion src/maggma/stores/file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def __init__(
read_only: bool = True,
include_orphans: bool = False,
json_name: str = "FileStore.json",
encoding: Optional[str] = None,
**kwargs,
):
"""
Expand Down Expand Up @@ -84,6 +85,11 @@ def __init__(
json_name: Name of the .json file to which metadata is saved. If read_only
is False, this file will be created in the root directory of the
FileStore.
encoding: Character encoding of files to be tracked by the store. The default
(None) follows Python's default behavior, which is to determine the character
encoding from the platform. This should work in the great majority of cases.
However, if you encounter a UnicodeDecodeError, consider setting the encoding
explicitly to 'utf8' or another encoding as appropriate.
kwargs: kwargs passed to MemoryStore.__init__()
"""
# this conditional block is needed in order to guarantee that the 'name'
Expand All @@ -101,6 +107,7 @@ def __init__(
self.include_orphans = include_orphans
self.read_only = read_only
self.max_depth = max_depth
self.encoding = encoding

self.metadata_store = JSONStore(
paths=[str(self.path / self.json_name)],
Expand Down Expand Up @@ -434,7 +441,7 @@ def query( # type: ignore
# TODO - could add more logic for detecting different file types
# and more nuanced exception handling
try:
with zopen(d["path"], "r") as f:
with zopen(d["path"], "r", encoding=self.encoding) as f:
data = f.read()
except Exception as e:
data = f"Unable to read: {e}"
Expand Down
11 changes: 9 additions & 2 deletions src/maggma/stores/mongolike.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,7 @@ def __init__(
read_only: bool = True,
serialization_option: Optional[int] = None,
serialization_default: Optional[Callable[[Any], Any]] = None,
encoding: Optional[str] = None,
**kwargs,
):
"""
Expand All @@ -641,9 +642,15 @@ def __init__(
option that will be passed to orjson.dumps when saving to the json file.
serialization_default:
default that will be passed to orjson.dumps when saving to the json file.
encoding: Character encoding used when opening the JSON file(s) managed by the store.
The default (None) follows Python's default behavior, which is to determine the
character encoding from the platform. This should work in the great majority of cases.
However, if you encounter a UnicodeDecodeError, consider setting the encoding
explicitly to 'utf8' or another encoding as appropriate.
"""
paths = paths if isinstance(paths, (list, tuple)) else [paths]
self.paths = paths
self.encoding = encoding

# file_writable overrides read_only for compatibility reasons
if "file_writable" in kwargs:
Expand Down Expand Up @@ -686,7 +693,7 @@ def connect(self, force_reset: bool = False):

# create the .json file if it does not exist
if not self.read_only and not Path(self.paths[0]).exists():
with zopen(self.paths[0], "w") as f:
with zopen(self.paths[0], "w", encoding=self.encoding) as f:
data: List[dict] = []
bytesdata = orjson.dumps(data)
f.write(bytesdata.decode("utf-8"))
Expand Down Expand Up @@ -763,7 +770,7 @@ def update_json_file(self):
"""
Updates the json file when a write-like operation is performed.
"""
with zopen(self.paths[0], "w") as f:
with zopen(self.paths[0], "w", encoding=self.encoding) as f:
data = list(self.query())
for d in data:
d.pop("_id")
Expand Down
9 changes: 9 additions & 0 deletions tests/stores/test_file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,12 @@ def test_this_dir():
fs = FileStore(".")
fs.connect()
assert not fs.name.endswith(".")


def test_encoding():
    """
    Check that a writable FileStore created with an explicit character
    encoding connects and leaves its metadata json file on disk.
    """
    store = FileStore(".", encoding="utf8", read_only=False)
    store.connect()

    # json_name defaults to "FileStore.json", created in the store's root directory
    metadata_file = Path("FileStore.json")
    assert metadata_file.exists()
5 changes: 5 additions & 0 deletions tests/stores/test_mongolike.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,11 @@ def test_json_store_load(jsonstore, test_dir):
jsonstore.connect()
assert len(list(jsonstore.query())) == 20

# test with non-default encoding
jsonstore = JSONStore(test_dir / "test_set" / "c.json.gz", encoding="utf8")
jsonstore.connect()
assert len(list(jsonstore.query())) == 20

# confirm descriptive error raised if you get a KeyError
jsonstore = JSONStore(test_dir / "test_set" / "c.json.gz", key="random_key")
with pytest.raises(KeyError, match="Key field 'random_key' not found"):
Expand Down

0 comments on commit 6093cf3

Please sign in to comment.