Skip to content

Commit

Permalink
Add CLI to download and extract files
Browse files Browse the repository at this point in the history
  • Loading branch information
mgcth committed May 4, 2024
1 parent c087f71 commit 741b187
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 55 deletions.
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ dependencies = [
"geopandas ~= 0.14",
"pyogrio ~= 0.7",
"pyarrow ~= 16.0",
"unidecode ~= 1.3"
"unidecode ~= 1.3",
"tqdm ~= 4.66",
"typer ~= 0.12",
]

[project.optional-dependencies]
Expand All @@ -41,6 +42,9 @@ dev = [
"ipykernel ~= 6.26",
]

[project.scripts]
ifk-lantmateriet = "lantmateriet.cli:app"

[tool.setuptools.packages.find]
where = ["src"]
exclude = ["material"]
Expand Down
27 changes: 20 additions & 7 deletions src/lantmateriet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import logging
import os
import zipfile
from typing import Optional
from pathlib import Path
from tqdm import tqdm
from typing import Optional

import requests
from tqdm import tqdm

STATUS_OK = 200
BLOCK_SIZE = 1024
Expand Down Expand Up @@ -86,12 +86,21 @@ def download(self, title: str) -> None:
url = self._download_enpoint[title]["href"]
response = get_request(url)
buffer = self._download(response)
self._unzip(buffer)

if zipfile.is_zipfile(buffer) is True:
self._unzip(buffer)

logger.info(f"Downloaded and unpacked {title} to {self._save_path}")

def _download(self, response: requests.Response) -> io.BytesIO:
"""Download file from url."""
"""Download file from url.
Args:
response: requests response object
Returns:
bytesio buffer
"""
file_size = int(response.headers.get("Content-Length", 0))
buffer = io.BytesIO()
with tqdm.wrapattr(
Expand All @@ -104,9 +113,13 @@ def _download(self, response: requests.Response) -> io.BytesIO:

return buffer

def _unzip(self, response: io.BytesIO):
"""Extract zip and save to disk."""
with zipfile.ZipFile(response) as zip:
def _unzip(self, buffer: io.BytesIO):
"""Extract zip and save to disk.
Args:
buffer: buffer of downloaded content
"""
with zipfile.ZipFile(buffer) as zip:
for member in tqdm(zip.infolist(), desc="Extracting"):
try:
zip.extract(member, self._save_path)
Expand Down
37 changes: 37 additions & 0 deletions src/lantmateriet/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""CLI module."""

import typer
from lantmateriet.api import Lantmateriet
from lantmateriet.extract import extract
from tqdm import tqdm

app = typer.Typer()


@app.callback()
def callback() -> None:
    """Lantmäteriet CLI client."""
    # The function has no body beyond its docstring; it is only registered on
    # ``app`` through the decorator above.
    # NOTE(review): Typer is documented to render a callback's docstring as the
    # application's top-level help text - confirm before rewording it.


@app.command()
def download_all(order_id: str, save_path: str):
    """Download files.

    Args:
        order_id: lantmäteriet order id
        save_path: path to save files to
    """
    # One client is shared by every download of this order.
    api_client = Lantmateriet(order_id, save_path)

    # Outer progress bar: one tick per entry in the client's available files.
    progress = tqdm(api_client.available_files)
    for title in progress:
        api_client.download(title)


@app.command()
def extract_all(path: str):
    """Extract geojson from gpkg files.

    Args:
        path: path to search for files
    """
    # Thin CLI wrapper: all work is delegated to lantmateriet.extract.extract.
    extract(path=path)
60 changes: 13 additions & 47 deletions scripts/extract_geojson.py → src/lantmateriet/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@

import glob
import logging
from pathlib import Path
from typing import TypeVar

import fiona
import geopandas as gpd
import pandas as pd
import shapely
from lantmateriet.config import Config50, config_50
from lantmateriet.line import Line
from lantmateriet.point import Point
from lantmateriet.polygon import Polygon
from lantmateriet.utils import normalise_item_names, read_first_entry, read_unique_names
from ray.util.multiprocessing import Pool
from unidecode import unidecode

Geometry = TypeVar("Geometry", Line, Polygon, Point)

Expand All @@ -33,41 +33,6 @@
config = Config50()


def read_unique_names(file: str, layer: str, field: str) -> list[str]:
    """Read the sorted, de-duplicated values of one attribute field.

    Only ``field`` is requested from ``layer`` and geometry is ignored.

    Args:
        file: path to the file to read
        layer: name of the layer within the file
        field: name of the attribute column to collect values from

    Returns:
        unique non-null values of ``field`` in ascending order
    """
    values = gpd.read_file(
        file,
        use_arrow=True,
        include_fields=[field],
        ignore_geometry=True,
        layer=layer,
    )[field]
    # Missing values cannot be ordered against strings, so drop them before
    # sorting. sorted() accepts the set directly; no intermediate list needed.
    return sorted(set(values.dropna()))


def read_first_entry(file: str, layer: str) -> gpd.GeoDataFrame:
    """Load only the first row of a layer.

    Args:
        file: path to the file to read
        layer: name of the layer within the file

    Returns:
        result of ``gpd.read_file`` limited to a single row (``rows=1``)
    """
    first_row = gpd.read_file(file, layer=layer, rows=1, use_arrow=True)
    return first_row


def normalise_item_names(item_names: list[str]) -> dict[str, str]:
    """Map each item name to a numbered, file-friendly name.

    Each name is lowercased and passed through ``unidecode``; then spaces and
    slashes become underscores while hyphens and commas are removed. The
    result is prefixed with the item's 1-based position, zero-padded to two
    digits, and an underscore.

    Args:
        item_names: original item names, in the order they should be numbered

    Returns:
        dict from original name to normalised name
    """
    replacements = ((" ", "_"), ("-", ""), (",", ""), ("/", "_"))

    normalised = {}
    for position, name in enumerate(item_names, start=1):
        slug = unidecode(name.lower())
        for old, new in replacements:
            slug = slug.replace(old, new)
        normalised[name] = f"{position:02d}_{slug}"
    return normalised


def save_sweden_base(processed_geo_objects):
"""Save sweden base from all dissolved ground."""
df_sverige = (
Expand All @@ -87,7 +52,7 @@ def parallel_process(geo_object, output_name):
"""Parallel process."""
if geo_object.df is not None:
geo_object.process()
geo_object.save("tmp", output_name)
geo_object.save("tmp2", output_name)

if "mark" in geo_object._file_path:
return geo_object.df.dissolve().explode(index_parts=False)
Expand All @@ -97,7 +62,7 @@ def parallel_process(geo_object, output_name):

def extract_geojson(file: str, layer: str):
"""Extract and save geojson files."""
print(f"Working on {file} - {layer}")
logger.info(f"Working on {file} - {layer}")
field = "objekttyp"

if "text" in file or "text" in layer:
Expand All @@ -118,12 +83,17 @@ def extract_geojson(file: str, layer: str):
if "mark" in file:
save_sweden_base(processed_geo_objects)

print(f"Saved {file} - {layer}")
logger.info(f"Saved {file} - {layer}")


def extract(path: str):
"""Run extraction of gpkg to geojson.
def run():
"""Run extraction."""
files = glob.glob("topografi_50/*.gpkg")
Args:
path: path to search for gpkg files
"""
file_pattern = str(Path(path) / "*.gpkg")
files = glob.glob(file_pattern)

all_files = []
for file in files:
Expand All @@ -133,7 +103,3 @@ def run():

with Pool(WORKER_OUTER) as pool:
pool.starmap(extract_geojson, all_files)


if __name__ == "__main__":
run()
38 changes: 38 additions & 0 deletions src/lantmateriet/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from functools import wraps
from typing import Callable

import geopandas as gpd
from unidecode import unidecode

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -34,3 +37,38 @@ def wrap(*args, **kw):
return wrap

return timeit_decorator


def read_unique_names(file: str, layer: str, field: str) -> list[str]:
    """Read the sorted, de-duplicated values of one attribute field.

    Only ``field`` is requested from ``layer`` and geometry is ignored.

    Args:
        file: path to the file to read
        layer: name of the layer within the file
        field: name of the attribute column to collect values from

    Returns:
        unique non-null values of ``field`` in ascending order
    """
    values = gpd.read_file(
        file,
        use_arrow=True,
        include_fields=[field],
        ignore_geometry=True,
        layer=layer,
    )[field]
    # Missing values cannot be ordered against strings, so drop them before
    # sorting. sorted() accepts the set directly; no intermediate list needed.
    return sorted(set(values.dropna()))


def read_first_entry(file: str, layer: str) -> gpd.GeoDataFrame:
    """Load only the first row of a layer.

    Args:
        file: path to the file to read
        layer: name of the layer within the file

    Returns:
        result of ``gpd.read_file`` limited to a single row (``rows=1``)
    """
    first_row = gpd.read_file(file, layer=layer, rows=1, use_arrow=True)
    return first_row


def normalise_item_names(item_names: list[str]) -> dict[str, str]:
    """Map each item name to a numbered, file-friendly name.

    Each name is lowercased and passed through ``unidecode``; then spaces and
    slashes become underscores while hyphens and commas are removed. The
    result is prefixed with the item's 1-based position, zero-padded to two
    digits, and an underscore.

    Args:
        item_names: original item names, in the order they should be numbered

    Returns:
        dict from original name to normalised name
    """
    replacements = ((" ", "_"), ("-", ""), (",", ""), ("/", "_"))

    normalised = {}
    for position, name in enumerate(item_names, start=1):
        slug = unidecode(name.lower())
        for old, new in replacements:
            slug = slug.replace(old, new)
        normalised[name] = f"{position:02d}_{slug}"
    return normalised

0 comments on commit 741b187

Please sign in to comment.