diff --git a/pyproject.toml b/pyproject.toml index bc7c071..9756c43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,9 @@ dependencies = [ "geopandas ~= 0.14", "pyogrio ~= 0.7", "pyarrow ~= 16.0", - "unidecode ~= 1.3" + "unidecode ~= 1.3", "tqdm ~= 4.66", + "typer ~= 0.12", ] [project.optional-dependencies] @@ -41,6 +42,9 @@ dev = [ "ipykernel ~= 6.26", ] +[project.scripts] +ifk-lantmateriet = "lantmateriet.cli:app" + [tool.setuptools.packages.find] where = ["src"] exclude = ["material"] diff --git a/src/lantmateriet/api.py b/src/lantmateriet/api.py index a1e1166..b94f916 100644 --- a/src/lantmateriet/api.py +++ b/src/lantmateriet/api.py @@ -5,11 +5,11 @@ import logging import os import zipfile -from typing import Optional from pathlib import Path -from tqdm import tqdm +from typing import Optional import requests +from tqdm import tqdm STATUS_OK = 200 BLOCK_SIZE = 1024 @@ -86,12 +86,21 @@ def download(self, title: str) -> None: url = self._download_enpoint[title]["href"] response = get_request(url) buffer = self._download(response) - self._unzip(buffer) + + if zipfile.is_zipfile(buffer) is True: + self._unzip(buffer) logger.info(f"Downloaded and unpacked {title} to {self._save_path}") def _download(self, response: requests.Response) -> io.BytesIO: - """Download file from url.""" + """Download file from url. + + Args: + response: requests response object + + Returns: + bytesio buffer + """ file_size = int(response.headers.get("Content-Length", 0)) buffer = io.BytesIO() with tqdm.wrapattr( @@ -104,9 +113,13 @@ def _download(self, response: requests.Response) -> io.BytesIO: return buffer - def _unzip(self, response: io.BytesIO): - """Extract zip and save to disk.""" - with zipfile.ZipFile(response) as zip: + def _unzip(self, buffer: io.BytesIO): + """Extract zip and save to disk. 
+ + Args: + buffer: buffer of downloaded content + """ + with zipfile.ZipFile(buffer) as zip: for member in tqdm(zip.infolist(), desc="Extracting"): try: zip.extract(member, self._save_path) diff --git a/src/lantmateriet/cli.py b/src/lantmateriet/cli.py new file mode 100644 index 0000000..686e945 --- /dev/null +++ b/src/lantmateriet/cli.py @@ -0,0 +1,37 @@ +"""CLI module.""" + +import typer +from lantmateriet.api import Lantmateriet +from lantmateriet.extract import extract +from tqdm import tqdm + +app = typer.Typer() + + +@app.callback() +def callback(): + """Lantmäteriet CLI client.""" + + +@app.command() +def download_all(order_id: str, save_path: str): + """Download files. + + Args: + order_id: lantmäteriet order id + save_path: path to save files to + """ + client = Lantmateriet(order_id, save_path) + all_files = client.available_files + for file in tqdm(all_files): + client.download(file) + + +@app.command() +def extract_all(path: str): + """Extract geojson from gpkg files. + + Args: + path: path to search for files + """ + extract(path) diff --git a/scripts/extract_geojson.py b/src/lantmateriet/extract.py similarity index 66% rename from scripts/extract_geojson.py rename to src/lantmateriet/extract.py index c982bce..082d3fd 100644 --- a/scripts/extract_geojson.py +++ b/src/lantmateriet/extract.py @@ -2,18 +2,18 @@ import glob import logging +from pathlib import Path from typing import TypeVar import fiona -import geopandas as gpd import pandas as pd import shapely from lantmateriet.config import Config50, config_50 from lantmateriet.line import Line from lantmateriet.point import Point from lantmateriet.polygon import Polygon +from lantmateriet.utils import normalise_item_names, read_first_entry, read_unique_names from ray.util.multiprocessing import Pool -from unidecode import unidecode Geometry = TypeVar("Geometry", Line, Polygon, Point) @@ -33,41 +33,6 @@ config = Config50() -def read_unique_names(file: str, layer: str, field: str) -> list[str]: - 
"""Read unique names from specified field in file.""" - return sorted( - list( - set( - gpd.read_file( - file, - use_arrow=True, - include_fields=[field], - ignore_geometry=True, - layer=layer, - )[field] - ) - ) - ) - - -def read_first_entry(file: str, layer: str) -> gpd.GeoDataFrame: - """Read info from file.""" - return gpd.read_file(file, use_arrow=True, layer=layer, rows=1) - - -def normalise_item_names(item_names: list[str]) -> dict[str, str]: - """Normalise item names to save format.""" - return { - x: "{:02d}_".format(i + 1) - + unidecode(x.lower()) - .replace(" ", "_") - .replace("-", "") - .replace(",", "") - .replace("/", "_") - for i, x in enumerate(item_names) - } - - def save_sweden_base(processed_geo_objects): """Save sweden base from all dissolved ground.""" df_sverige = ( @@ -87,7 +52,7 @@ def parallel_process(geo_object, output_name): """Parallel process.""" if geo_object.df is not None: geo_object.process() - geo_object.save("tmp", output_name) + geo_object.save("tmp2", output_name) if "mark" in geo_object._file_path: return geo_object.df.dissolve().explode(index_parts=False) @@ -97,7 +62,7 @@ def parallel_process(geo_object, output_name): def extract_geojson(file: str, layer: str): """Extract and save geojson files.""" - print(f"Working on {file} - {layer}") + logger.info(f"Working on {file} - {layer}") field = "objekttyp" if "text" in file or "text" in layer: @@ -118,12 +83,17 @@ def extract_geojson(file: str, layer: str): if "mark" in file: save_sweden_base(processed_geo_objects) - print(f"Saved {file} - {layer}") + logger.info(f"Saved {file} - {layer}") + +def extract(path: str): + """Run extraction of gpkg to geojson. 
-def run(): - """Run extraction.""" - files = glob.glob("topografi_50/*.gpkg") + Args: + path: path to search for gpkg files + """ + file_pattern = str(Path(path) / "*.gpkg") + files = glob.glob(file_pattern) all_files = [] for file in files: @@ -133,7 +103,3 @@ def run(): with Pool(WORKER_OUTER) as pool: pool.starmap(extract_geojson, all_files) - - -if __name__ == "__main__": - run() diff --git a/src/lantmateriet/utils.py b/src/lantmateriet/utils.py index 0338aa1..6e2ce0f 100644 --- a/src/lantmateriet/utils.py +++ b/src/lantmateriet/utils.py @@ -5,6 +5,9 @@ from functools import wraps from typing import Callable +import geopandas as gpd +from unidecode import unidecode + logger = logging.getLogger(__name__) @@ -34,3 +37,38 @@ def wrap(*args, **kw): return wrap return timeit_decorator + + +def read_unique_names(file: str, layer: str, field: str) -> list[str]: + """Read unique names from specified field in file.""" + return sorted( + list( + set( + gpd.read_file( + file, + use_arrow=True, + include_fields=[field], + ignore_geometry=True, + layer=layer, + )[field] + ) + ) + ) + + +def read_first_entry(file: str, layer: str) -> gpd.GeoDataFrame: + """Read info from file.""" + return gpd.read_file(file, use_arrow=True, layer=layer, rows=1) + + +def normalise_item_names(item_names: list[str]) -> dict[str, str]: + """Normalise item names to save format.""" + return { + x: "{:02d}_".format(i + 1) + + unidecode(x.lower()) + .replace(" ", "_") + .replace("-", "") + .replace(",", "") + .replace("/", "_") + for i, x in enumerate(item_names) + }