diff --git a/climateset/download/downloader.py b/climateset/download/downloader.py
index 638fd29..a212cfa 100644
--- a/climateset/download/downloader.py
+++ b/climateset/download/downloader.py
@@ -1,6 +1,8 @@
 import argparse
 import os
 import os.path
+import pathlib
+import subprocess
 from typing import Union

 import numpy as np
@@ -135,7 +137,7 @@ def __init__(  # noqa: C901
         variables = [v.replace(" ", "_").replace("-", "_") for v in variables]
         self.logger.info(f"Cleaned variables : {variables}")
         for v in variables:
-            t = get_keys_from_value(VAR_SOURCE_LOOKUP, v)
+            t = get_keys_from_value(VAR_SOURCE_LOOKUP, v, self.logger)
             if t == "model":
                 self.model_vars.append(v)
             elif t == "raw":
@@ -258,6 +260,7 @@ def download_from_model_single_var(  # noqa: C901
             raise RuntimeError
         """
+        self.logger.info("Using download_from_model_single_var() function")

         ctx = conn.new_context(
             project=project,
@@ -364,10 +367,12 @@ def download_from_model_single_var(  # noqa: C901
         self.logger.info(f"Chunksize : {chunksize}")

         nominal_resolution = nominal_resolution.replace(" ", "_")
+        self.logger.info(f"Nominal resolution : {nominal_resolution}")

         for f in file_names:
             # try to open dataset
             try:
+                self.logger.info(f"Opening {f}")
                 ds = xr.open_dataset(f, chunks={"time": chunksize}, engine="netcdf4")
             except OSError:
@@ -377,19 +382,7 @@ def download_from_model_single_var(  # noqa: C901
                 continue

             if nominal_resolution == "none":
-                try:
-                    # check if we really have no nominal resolution
-
-                    # first compute degree by looking at the longitude increment
-                    degree = abs(ds.lon[0].item() - ds.lon[1].item())
-                    # in principle lon and lat should be the same, however, this is just an approximation
-                    # same approximation used by climate modeling centers
-                    # information is just for informing the structure, resolution will be checked
-                    # in preprocessing
-                    nominal_resolution = int(degree * 100)
-                    self.logger.info(f"Infering nominal resolution: {nominal_resolution}")
-                except Exception as error:
-                    self.logger.warning(f"Caught the following exception but continuing : {error}")
+                nominal_resolution = self.infer_nominal_resolution(ds, nominal_resolution)

             years = np.unique(ds.time.dt.year.to_numpy())
             self.logger.info(f"Data covering years: {years[0]} to {years[-1]}")
@@ -468,17 +461,7 @@ def download_meta_historic_biomassburning_single_var(

         # choose nominal resolution if existent
         try:
-            nominal_resolutions = list(ctx.facet_counts["nominal_resolution"].keys())
-            self.logger.info(f"Available nominal resolution : {nominal_resolutions}")
-
-            # deal with mulitple nominal resoulitions, taking smalles one as default
-            if len(nominal_resolutions) > 1:
-                self.logger.info(
-                    "Multiple nominal resolutions exist, choosing smallest_nominal resolution (trying), "
-                    "please do a check up"
-                )
-            nominal_resolution = nominal_resolutions[0]
-            self.logger.info(f"Choosing nominal resolution : {nominal_resolution}")
+            nominal_resolution = self.get_nominal_resolution(ctx)
         except IndexError:
             self.logger.info("No nominal resolution")
             nominal_resolution = "none"
@@ -552,6 +535,19 @@ def download_meta_historic_biomassburning_single_var(
             self.logger.info(outfile)
             ds_y.to_netcdf(outfile)

+    def get_nominal_resolution(self, ctx):
+        nominal_resolutions = list(ctx.facet_counts["nominal_resolution"].keys())
+        self.logger.info(f"Available nominal resolution : {nominal_resolutions}")
+        # deal with multiple nominal resolutions, taking the smallest one as default
+        if len(nominal_resolutions) > 1:
+            self.logger.info(
+                "Multiple nominal resolutions exist, tentatively choosing the smallest one, "
+                "please double-check"
+            )
+        nominal_resolution = nominal_resolutions[0]
+        self.logger.info(f"Choosing nominal resolution : {nominal_resolution}")
+        return nominal_resolution
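+
+    # Illustrative usage (pattern mirrored from both call sites in this patch):
+    # the "nominal_resolution" facet may be absent, in which case indexing it
+    # raises IndexError, so callers guard the helper like this:
+    #
+    #     try:
+    #         nominal_resolution = self.get_nominal_resolution(ctx)
+    #     except IndexError:
+    #         nominal_resolution = "none"
+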
     # TODO Fix complexity issue
     def download_raw_input_single_var(  # noqa: C901
         self,
@@ -575,11 +571,12 @@ def download_raw_input_single_var(  # noqa: C901
             defaul_grid_label (str): default gridding method in which the data is provided
             save_to_meta (bool): if data should be saved to the meta folder instead of the input4mips folder
         """
+        self.logger.info("Using download_raw_input_single_var() function")
         conn = SearchConnection(self.model_node_link, distrib=False)

         facets = "project,frequency,variable,nominal_resolution,version,target_mip,grid_label"

-        # basic constraining (projec, var, institution)
+        # basic constraining (project, var, institution)

         ctx = conn.new_context(
             project=project,
@@ -605,17 +602,7 @@ def download_raw_input_single_var(  # noqa: C901

         # choose nominal resolution if existent
         try:
-            nominal_resolutions = list(ctx.facet_counts["nominal_resolution"].keys())
-            self.logger.info(f"Available nominal resolution : {nominal_resolutions}")
-
-            # deal with multiple nominal resolutions, taking smallest one as default
-            if len(nominal_resolutions) > 1:
-                self.logger.info(
-                    "Multiple nominal resolutions exist, choosing smallest_nominal resolution (trying), "
-                    "please do a check up"
-                )
-            nominal_resolution = nominal_resolutions[0]
-            self.logger.info(f"Choosing nominal resolution : {nominal_resolution}")
+            nominal_resolution = self.get_nominal_resolution(ctx)

             ctx = ctx.constrain(nominal_resolution=nominal_resolution)
         except IndexError:
@@ -641,17 +628,17 @@ def download_raw_input_single_var(  # noqa: C901
             frequency = ""

         # target mip group
-        target_mips = list(ctx.facet_counts["target_mip"].keys())
-        self.logger.info(f"Available target mips: {target_mips}")
+        mips_targets = list(ctx.facet_counts["target_mip"].keys())
+        self.logger.info(f"Available target mips: {mips_targets}")
         ctx_origin = ctx

         self.logger.info("\n")
-        if len(target_mips) == 0:
-            target_mips = [None]
-        for t in target_mips:
-            self.logger.info(f"Target mip: {t}")
-            if t is not None:
-                ctx = ctx_origin.constrain(target_mip=t)
+        if len(mips_targets) == 0:
+            mips_targets = [None]
+        for target in mips_targets:
+            self.logger.info(f"Target mip: {target}")
+            if target is not None:
+                ctx = ctx_origin.constrain(target_mip=target)

             versions = list(ctx.facet_counts["version"].keys())
             self.logger.info(f"Available versions : {versions}")
@@ -672,98 +659,103 @@ def download_raw_input_single_var(  # noqa: C901

             ctx = ctx_origin_v.constrain(version=version)

-            result = ctx.search()
+            results = ctx.search()

-            self.logger.info(f"Result len {len(result)}")
+            self.logger.info(f"Result len {len(results)}")

-            files_list = [r.file_context().search() for r in result]
+            temp_download_path = RAW_DATA / "tmp"
+            # exist_ok=True makes a separate existence check unnecessary
+            pathlib.Path(temp_download_path).mkdir(parents=True, exist_ok=True)
+            for result in results:
+                file_context = result.file_context()
+                download_script = file_context.get_download_script()
+                subprocess.run(["bash", "-c", download_script, "download", "-s"], shell=False, cwd=temp_download_path)
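+                # note: "download" only fills $0 for "bash -c"; "-s" is passed to
+                # the generated script (assumption: the standard ESGF wget-script
+                # flag for skipping certificate-based authentication)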
self.logger.info(f"Chunksize : {chunksize}") + for f in files_list: + experiment = self.extract_target_mip_exp_name(str(f), target) + self.logger.info(f"Experiment : {experiment}") - # replacing spaces for file naming - nominal_resolution = nominal_resolution.replace(" ", "_") + # make sure to only download data for wanted scenarios + if experiment in self.experiments: + self.logger.info(f"Saving data for experiment : {experiment}") + else: + self.logger.info( + f"Experiment {experiment} not in wanted experiments ({self.experiments}). Skipping" + ) + continue - for f in file_names: - experiment = self.extract_target_mip_exp_name(f, t) + try: + self.logger.info(f"Opening dataset [{f}]") + with xr.open_dataset(f) as ds: + dataset = ds + except OSError as os_error: + self.logger.error(f"Having problems opening the dateset [{f}]. Original file will not be") + self.logger.error(os_error) + continue - # make sure to only download data for wanted scenarios - if experiment in self.experiments: - self.logger.info(f"Downloading data for experiment : {experiment}") - else: - self.logger.info( - f"Experiment {experiment} not in wanted experiments ({self.experiments}). Skipping" - ) - continue + if nominal_resolution == "none": + nominal_resolution = self.infer_nominal_resolution(dataset, nominal_resolution) - try: - ds = xr.open_dataset(f, chunks={"time": chunksize}) - except OSError: - self.logger.info("Having problems downloading the dateset. The server might be down. Skipping") - continue + years = np.unique(dataset.time.dt.year.to_numpy()) + self.logger.info(f"Data covering years: {years[0]} to {years[-1]}") + year_tag = f"{years[0]}_{years[-1]}" - if nominal_resolution == "none": - try: - # check if we really have no nomianl resolution - # first compute degree by looking at the longitude increment - degree = abs(ds.lon[0].item() - ds.lon[1].item()) - # in principal lon and lat should be the same, however, this is just an approximation - # same approximation used by climate modeling centers - # information is just for informing the structure, resolution will be checked - # in preprocessing - nominal_resolution = int(degree * 100) - self.logger.info(f"Infering nominal resolution: {nominal_resolution}") - except Exception as error: - self.logger.warning(f"Caught the following exception but continuing : {error}") - - years = np.unique(ds.time.dt.year.to_numpy()) - self.logger.info(f"Data covering years: {years[0]} to {years[-1]}") - - if variable in self.biomass_vars: - variable = f"{variable}_em_biomassburning" - - for y in years: - y = str(y) - - # Check whether the specified path exists or not - if save_to_meta: - # if meta, we have future openburning stuff + if variable in self.biomass_vars: + variable = f"{variable}_em_biomassburning" + nominal_resolution = nominal_resolution.strip() + nominal_resolution = nominal_resolution.replace(" ", "_") + # Check whether the specified path exists or not + base_file_name = f"{experiment}_{variable}_{nominal_resolution}_{frequency}_{grid_label}_{year_tag}.nc" + if save_to_meta: + # if meta, we have future openburning stuff - out_dir = ( - f"future-openburning/{experiment}/{variable.split('_')[0]}/" - f"{nominal_resolution}/{frequency}/{y}/" - ) - out_name = ( - f"future_openburning_{experiment}_{variable}_{nominal_resolution}" - f"_{frequency}_{grid_label}_{y}.nc" - ) - path = os.path.join(self.meta_dir_parent, out_dir) - else: - out_dir = f"{project}/{experiment}/{variable}/{nominal_resolution}/{frequency}/{y}/" - out_name = ( - 
f"{project}_{experiment}_{variable}_{nominal_resolution}_{frequency}" - f"_{grid_label}_{y}.nc" - ) - path = os.path.join(self.data_dir_parent, out_dir) + out_dir = ( + f"future-openburning/{experiment}/{variable.split('_')[0]}/{nominal_resolution}/{frequency}/" + ) + out_name = f"future_openburning_{base_file_name}" + path = os.path.join(self.meta_dir_parent, out_dir) + else: + out_dir = f"{project}/{experiment}/{variable}/{nominal_resolution}/{frequency}/" + out_name = f"{project}_{base_file_name}" + path = os.path.join(self.data_dir_parent, out_dir) - os.makedirs(path, exist_ok=True) - outfile = path + out_name + os.makedirs(path, exist_ok=True) + outfile = path + out_name - if (not self.overwrite) and os.path.isfile(outfile): - self.logger.info(f"File {outfile} already exists, skipping.") - else: - self.logger.info(f"Selecting specific year : {y}") - ds_y = ds.sel(time=y) - self.logger.info(ds_y) + if (not self.overwrite) and os.path.isfile(outfile): + self.logger.info(f"File {outfile} already exists, skipping.") + else: + self.logger.info("Writing file") + self.logger.info(outfile) + chunk_size = RES_TO_CHUNKSIZE[frequency] + dataset = dataset.chunk({"time": chunk_size}) + dataset.to_netcdf(outfile, engine="h5netcdf") - self.logger.info("Writing file") - self.logger.info(outfile) - ds_y.to_netcdf(outfile) + def infer_nominal_resolution(self, ds: xr.Dataset, nominal_resolution: str) -> str: + """ + This method checks if there really is not nominal resolution by trying to compute it from the longitude + increment. + + In principle lon and lat should be the same, however, this is just an approximation + same approximation used by climate modeling centers information is just for + informing the structure, resolution will be checked in preprocessing. + + Args: + ds: + nominal_resolution: + + Returns: + """ + nom_res = nominal_resolution + try: + degree = abs(ds.lon[0].item() - ds.lon[1].item()) + nom_res = int(degree * 100) + self.logger.info(f"Inferring nominal resolution: {nom_res}") + except Exception as error: + self.logger.warning(f"Caught the following exception but continuing : {error}") + return nom_res def extract_target_mip_exp_name(self, filename: str, target_mip: str): """ @@ -903,6 +895,7 @@ def download_raw_input( parser = argparse.ArgumentParser() parser.add_argument("--cfg", help="Path to config file.") args = parser.parse_args() + cli_logger = create_logger("Runtime") with open(args.cfg, "r", encoding="utf-8") as stream: cfg = yaml.safe_load(stream) @@ -910,15 +903,15 @@ def download_raw_input( try: models = cfg["models"] except Exception as error: - LOGGER.warning(f"Caught the following exception but continuing : {error}") - LOGGER.info("No climate models specified. Assuming only input4mips data should be downloaded.") + cli_logger.warning(f"Caught the following exception but continuing : {error}") + cli_logger.info("No climate models specified. 
+    logger.propagate = False
     return logger

diff --git a/poetry.lock b/poetry.lock
index 29068aa..633e24e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1035,6 +1035,57 @@ files = [
     {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
 ]

+[[package]]
+name = "h5netcdf"
+version = "1.3.0"
+description = "netCDF4 via h5py"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "h5netcdf-1.3.0-py3-none-any.whl", hash = "sha256:f2df69dcd3665dc9c4d43eb6529dedd113b2508090d12ac973573305a8406465"},
+    {file = "h5netcdf-1.3.0.tar.gz", hash = "sha256:a171c027daeb34b24c24a3b6304195b8eabbb6f10c748256ed3cfe19806383cf"},
+]
+
+[package.dependencies]
+h5py = "*"
+packaging = "*"
+
+[package.extras]
+test = ["netCDF4", "pytest"]
+
+[[package]]
+name = "h5py"
+version = "3.11.0"
+description = "Read and write HDF5 files from Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "h5py-3.11.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1625fd24ad6cfc9c1ccd44a66dac2396e7ee74940776792772819fc69f3a3731"},
+    {file = "h5py-3.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c072655ad1d5fe9ef462445d3e77a8166cbfa5e599045f8aa3c19b75315f10e5"},
+    {file = "h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77b19a40788e3e362b54af4dcf9e6fde59ca016db2c61360aa30b47c7b7cef00"},
+    {file = "h5py-3.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:ef4e2f338fc763f50a8113890f455e1a70acd42a4d083370ceb80c463d803972"},
+    {file = "h5py-3.11.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bbd732a08187a9e2a6ecf9e8af713f1d68256ee0f7c8b652a32795670fb481ba"},
+    {file = "h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75bd7b3d93fbeee40860fd70cdc88df4464e06b70a5ad9ce1446f5f32eb84007"},
+    {file = "h5py-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52c416f8eb0daae39dabe71415cb531f95dce2d81e1f61a74537a50c63b28ab3"},
+    {file = "h5py-3.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:083e0329ae534a264940d6513f47f5ada617da536d8dccbafc3026aefc33c90e"},
+    {file = "h5py-3.11.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a76cae64080210389a571c7d13c94a1a6cf8cb75153044fd1f822a962c97aeab"},
+    {file = "h5py-3.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3736fe21da2b7d8a13fe8fe415f1272d2a1ccdeff4849c1421d2fb30fd533bc"},
+    {file = "h5py-3.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa6ae84a14103e8dc19266ef4c3e5d7c00b68f21d07f2966f0ca7bdb6c2761fb"},
+    {file = "h5py-3.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:21dbdc5343f53b2e25404673c4f00a3335aef25521bd5fa8c707ec3833934892"},
+    {file = "h5py-3.11.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:754c0c2e373d13d6309f408325343b642eb0f40f1a6ad21779cfa9502209e150"},
+    {file = "h5py-3.11.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:731839240c59ba219d4cb3bc5880d438248533366f102402cfa0621b71796b62"},
+    {file = "h5py-3.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ec9df3dd2018904c4cc06331951e274f3f3fd091e6d6cc350aaa90fa9b42a76"},
+    {file = "h5py-3.11.0-cp38-cp38-win_amd64.whl", hash = "sha256:55106b04e2c83dfb73dc8732e9abad69d83a436b5b82b773481d95d17b9685e1"},
+    {file = "h5py-3.11.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f4e025e852754ca833401777c25888acb96889ee2c27e7e629a19aee288833f0"},
+    {file = "h5py-3.11.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c4b760082626120031d7902cd983d8c1f424cdba2809f1067511ef283629d4b"},
+    {file = "h5py-3.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67462d0669f8f5459529de179f7771bd697389fcb3faab54d63bf788599a48ea"},
+    {file = "h5py-3.11.0-cp39-cp39-win_amd64.whl", hash = "sha256:d9c944d364688f827dc889cf83f1fca311caf4fa50b19f009d1f2b525edd33a3"},
+    {file = "h5py-3.11.0.tar.gz", hash = "sha256:7b7e8f78072a2edec87c9836f25f34203fd492a4475709a18b417a33cfb21fa9"},
+]
+
+[package.dependencies]
+numpy = ">=1.17.3"
+
 [[package]]
 name = "httpcore"
 version = "1.0.5"
@@ -3775,4 +3826,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more

 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.12"
-content-hash = "d76b3689feb9d2d29deadd68ddfd3b2adf291a77bdcc4f7c57910d704d3713d4"
+content-hash = "37e6b99df4033a199590905ead26bb5f1dda8a03bdac776eaf6e87bdcc87bb51"
diff --git a/pyproject.toml b/pyproject.toml
index b301a05..e896455 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ xarray = "^2024.5.0"
 esgf-pyclient = "^0.3.1"
 docformatter = {extras = ["toml"], version = "^1.7.5"}
 myproxyclient = "^2.1.1"
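+# writing backend for the downloader: datasets are saved via to_netcdf(engine="h5netcdf")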
hash = "sha256:754c0c2e373d13d6309f408325343b642eb0f40f1a6ad21779cfa9502209e150"}, + {file = "h5py-3.11.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:731839240c59ba219d4cb3bc5880d438248533366f102402cfa0621b71796b62"}, + {file = "h5py-3.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ec9df3dd2018904c4cc06331951e274f3f3fd091e6d6cc350aaa90fa9b42a76"}, + {file = "h5py-3.11.0-cp38-cp38-win_amd64.whl", hash = "sha256:55106b04e2c83dfb73dc8732e9abad69d83a436b5b82b773481d95d17b9685e1"}, + {file = "h5py-3.11.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f4e025e852754ca833401777c25888acb96889ee2c27e7e629a19aee288833f0"}, + {file = "h5py-3.11.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c4b760082626120031d7902cd983d8c1f424cdba2809f1067511ef283629d4b"}, + {file = "h5py-3.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67462d0669f8f5459529de179f7771bd697389fcb3faab54d63bf788599a48ea"}, + {file = "h5py-3.11.0-cp39-cp39-win_amd64.whl", hash = "sha256:d9c944d364688f827dc889cf83f1fca311caf4fa50b19f009d1f2b525edd33a3"}, + {file = "h5py-3.11.0.tar.gz", hash = "sha256:7b7e8f78072a2edec87c9836f25f34203fd492a4475709a18b417a33cfb21fa9"}, +] + +[package.dependencies] +numpy = ">=1.17.3" + [[package]] name = "httpcore" version = "1.0.5" @@ -3775,4 +3826,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.12" -content-hash = "d76b3689feb9d2d29deadd68ddfd3b2adf291a77bdcc4f7c57910d704d3713d4" +content-hash = "37e6b99df4033a199590905ead26bb5f1dda8a03bdac776eaf6e87bdcc87bb51" diff --git a/pyproject.toml b/pyproject.toml index b301a05..e896455 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ xarray = "^2024.5.0" esgf-pyclient = "^0.3.1" docformatter = {extras = ["toml"], version = "^1.7.5"} myproxyclient = "^2.1.1" +h5netcdf = "^1.3.0" [tool.poetry.group.secondary.dependencies] jinja2 = ">=3.1.4"