Skip to content

Commit

Permalink
Merge pull request #51 from statisticsnorway/20240821LeoFiltypefiks
Browse files Browse the repository at this point in the history
20240821 leo filtypefiks
  • Loading branch information
joxssb authored Aug 21, 2024
2 parents d54c964 + a030ad8 commit c2a4709
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ssb-konjunk"
version = "0.1.5"
version = "0.1.6"
description = "SSB Konjunk"
authors = ["Edvard Garmannslund <[email protected]>"]
license = "MIT"
Expand Down
25 changes: 19 additions & 6 deletions src/ssb_konjunk/saving.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@
from ssb_konjunk import timestamp


def _remove_edge_slashes(input_string: str) -> str:
def _remove_edge_slashes(input_string: str, only_last: bool = False) -> str:
"""Function to remove edge slashes in strings.
Args:
input_string: The string to remove / for.
only_last: True to remove only a trailing slash and keep any leading slash. Default: False.
Returns:
str: String without slashes.
"""
if input_string.startswith("/"):
if input_string.startswith("/") and not only_last:
input_string = input_string[1:]
if input_string.endswith("/"):
input_string = input_string[:-1]
Expand All @@ -38,6 +39,7 @@ def _structure_ssb_filepath(
folder: str | None = None,
version_number: int | None = None,
filetype: str = "parquet",
fs: dapla.gcs.GCSFileSystem | None = None,
) -> str:
"""Structure the name of the file to SSB-format and the path.
Expand All @@ -51,14 +53,18 @@ def _structure_ssb_filepath(
folder: Optional string if you want folders between 'datatilstand' and file.
version_number: Optional int for reading specific file.
filetype: String with default 'parquet', specifies file type.
fs: The filesystem; pass a GCSFileSystem when running on Dapla. Default: None.
Returns:
str: the full path to the file.
Raises:
ValueError: Raise if version number is not None or int.
"""
bucket = _remove_edge_slashes(bucket)
if fs is None:
bucket = _remove_edge_slashes(bucket, only_last=True)
else:
bucket = _remove_edge_slashes(bucket)
statistic = _remove_edge_slashes(statistic)
datatilstand = _remove_edge_slashes(datatilstand)
file_name = _remove_edge_slashes(file_name)
Expand All @@ -82,7 +88,9 @@ def _structure_ssb_filepath(
return file_path


def _get_files(folder_path: str, fs: dapla.gcs.GCSFileSystem | None) -> list[str]:
def _get_files(
folder_path: str, filetype: str, fs: dapla.gcs.GCSFileSystem | None
) -> list[str]:
"""Function to list files in a folder based on base name and timestamp."""
filenames = []

Expand All @@ -92,6 +100,9 @@ def _get_files(folder_path: str, fs: dapla.gcs.GCSFileSystem | None) -> list[str
else:
filenames = glob.glob(match_string)

# Only include files with the relevant file extension
filenames = [i for i in filenames if i.endswith(filetype)]

# Sorts the filenames according to version numbers.
filenames.sort()

Expand Down Expand Up @@ -299,9 +310,10 @@ def write_ssb_file(
datatilstand=datatilstand,
file_name=file_name,
folder=folder,
fs=fs,
)
# Get list with the filenames, if several, ordered by the highest version number at last.
files = _get_files(file_path, fs=fs)
files = _get_files(file_path, filetype, fs=fs)
# Find version number/decide whether to overwrite or make new version.
version_number = _find_version_number(files, stable_version)

Expand Down Expand Up @@ -358,11 +370,12 @@ def read_ssb_file(
folder=folder,
version_number=version_number,
filetype=filetype,
fs=fs,
)

if not version_number:
# If version number not specified then list out versions.
files = _get_files(file_path, fs=fs)
files = _get_files(file_path, filetype, fs=fs)
file_path = files[-1]

# Different functions used for reading depending on the filetype.
Expand Down
21 changes: 21 additions & 0 deletions tests/test_saving.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,24 @@ def test_verify_datatilstand() -> None:

# Need to pass input, inndata. For running: pytest -s
# assert _verify_datatilstand('overnatting') == 'inndata'


def test_verify_list_filtypes() -> None:
    """Test that filtering on file extension keeps only the requested file type.

    Applies the same filter-then-sort steps to a fixed list of file names,
    containing both csv and parquet files, and checks the exact result.
    """
    folder = "ssb-vare-tjen-korttid-data-produkt-test/vhi/inndata/utvalg/ra-0187"
    base = f"{folder}/ra-0187_p2024-01-01_p2024-01-31"

    filenames = [
        f"{base}.csv",
        f"{base}.parquet",
        f"{base}_69.parquet",
        f"{base}_v69.parquet",
        f"{base}_v70.csv",
    ]

    filetype = "parquet"

    # Only include files with the relevant file extension
    filenames = [i for i in filenames if i.endswith(filetype)]

    # Sorts the filenames according to version numbers.
    filenames.sort()

    # Compare against the exact expected list: a check that every remaining
    # name ends with the filetype would also pass if the filter dropped
    # everything, and would say nothing about the ordering.
    assert filenames == [
        f"{base}.parquet",
        f"{base}_69.parquet",
        f"{base}_v69.parquet",
    ]

0 comments on commit c2a4709

Please sign in to comment.