From b32f058413fc6be46aac945d45a04b219bbf5c1c Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 21:14:10 +0200 Subject: [PATCH] add notebook for chunking methods --- docs/chunking.ipynb | 891 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 891 insertions(+) create mode 100644 docs/chunking.ipynb diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb new file mode 100644 index 00000000..c171dea6 --- /dev/null +++ b/docs/chunking.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e515b4bd-a302-45a9-8464-56b67a73a46c", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96e9149e-fc6d-4048-8e45-a29966e5c6b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from itertools import product\n", + "import numpy as np\n", + "\n", + "import xarray as xr\n", + "import xbitinfo as xb" + ] + }, + { + "cell_type": "markdown", + "id": "b64e0873-0a27-4757-947a-4a559a102288", + "metadata": {}, + "source": [ + "## Data loading" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "320224c9-06e2-428a-8614-8ed0d15eee82", + "metadata": {}, + "outputs": [], + "source": [ + "# load data\n", + "ds = xr.tutorial.load_dataset(\"air_temperature\") \n", + "chunks = {'lat':5,'lon':10} # Defining chunks that will be used for the reading/bitrounding/writing\n", + "ds = ds.chunk(chunks) # Apply chunking" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f3120040-79f1-4a7f-a61f-afec9fb3ca5b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 25, time: 2920, lon: 53)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
+       "  * lon      (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
+       "  * time     (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
+       "Data variables:\n",
+       "    air      (time, lat, lon) float32 dask.array<chunksize=(2920, 5, 10), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  COARDS\n",
+       "    title:        4x daily NMC reanalysis (1948)\n",
+       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
+       "    platform:     Model\n",
+       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
" + ], + "text/plain": [ + "\n", + "Dimensions: (lat: 25, time: 2920, lon: 53)\n", + "Coordinates:\n", + " * lat (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n", + " * lon (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n", + " * time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n", + "Data variables:\n", + " air (time, lat, lon) float32 dask.array\n", + "Attributes:\n", + " Conventions: COARDS\n", + " title: 4x daily NMC reanalysis (1948)\n", + " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", + " platform: Model\n", + " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..." + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "b9e8fe5a-2e4e-4dfd-8026-0991e9988668", + "metadata": {}, + "source": [ + "## Saving to file" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e011a900-5da2-40be-a292-d81a0cafcd6d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_11221/350902741.py:1: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " ds.to_netcdf(\"0.air_original.nc\")\n" + ] + } + ], + "source": [ + "ds.to_netcdf(\"0.air_original.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "2b98628e-cbcb-4018-8565-4c0324cf2d61", + "metadata": {}, + "source": [ + "## Compress with `to_compressed_netcdf`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "99d02f35-85fc-4a8d-94a0-880ac2ffbb72", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ayoubf/Projects/xbitinfo/xbitinfo/save_compressed.py:121: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " 
self._obj.to_netcdf(\n" + ] + } + ], + "source": [ + "ds.to_compressed_netcdf(\"1.air_compressed_all.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "5f5aae30-4a0a-401c-9018-9e34626c3d2c", + "metadata": {}, + "source": [ + "## Compress with bitrounding" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fdf077dd-6494-4c38-9461-5ea7ac370a01", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eedef4b12a25419fba0f9d7348e619a2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00>> slices_from_chunks(((2, 2), (3, 3, 3))) # doctest: +NORMALIZE_WHITESPACE\n", + " [(slice(0, 2, None), slice(0, 3, None)),\n", + " (slice(0, 2, None), slice(3, 6, None)),\n", + " (slice(0, 2, None), slice(6, 9, None)),\n", + " (slice(2, 4, None), slice(0, 3, None)),\n", + " (slice(2, 4, None), slice(3, 6, None)),\n", + " (slice(2, 4, None), slice(6, 9, None))]\n", + " \"\"\"\n", + " cumdims = []\n", + " for bds in chunks:\n", + " out = np.empty(len(bds)+1, dtype=int)\n", + " out[0] = 0\n", + " np.cumsum(bds, out=out[1:])\n", + " cumdims.append(out)\n", + " slices = [\n", + " [slice(s, s + dim) for s, dim in zip(starts, shapes)]\n", + " for starts, shapes in zip(cumdims, chunks)\n", + " ]\n", + " return list(product(*slices))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "19032fcd-93bc-48b8-ba1f-beba9673491b", + "metadata": {}, + "outputs": [], + "source": [ + "fn = 'air.zarr' # Output filename\n", + "ds.to_compressed_zarr(fn, compute=False, mode='w') # Creates empty file structure" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7221b47f-b8f4-4ebf-bc2b-cb61d12989be", + "metadata": {}, + "outputs": [], + "source": [ + "dims = ds.air.dims\n", + "len_dims = len(dims)\n", + "slices = slices_from_chunks(ds.air.chunks)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7c2ed18f-4dc8-4f5c-88ed-ae5ad41d1647", + 
"metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "for b, block in enumerate(ds.air.data.to_delayed().ravel()): # Loop over each chunk\n", + " #slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", + " ds_block = xr.Dataset({'air':(dims, block.compute())}) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", + " rounded_ds = bitrounding(ds_block) # Apply bitrounding\n", + " rounded_ds.to_zarr(fn, region={dims[d]:s for (d,s) in enumerate(slices[b])}) # Write individual chunk to disk" + ] + }, + { + "cell_type": "markdown", + "id": "9ae3603f-291d-4c7f-92a8-95d9935daf35", + "metadata": {}, + "source": [ + "## Creating smaller datasets as chunks and compressing" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1d53f86f-fa72-4161-a364-8c1f78dba6d6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "at_least_zero = lambda x: max(x, 0)\n", + "\n", + "chunk_long, chunk_lat = [10, 5] # for int division\n", + "var = 'lat'\n", + "\n", + "dss = []\n", + "dss_bitrounded = []\n", + "dss_kbits = []\n", + "\n", + "long_c = int(ds.lon.size / chunk_long)\n", + "lat_c = int(ds.lat.size / chunk_lat)\n", + "\n", + "for i in range(long_c):\n", + " for j in range(lat_c):\n", + " temp_ds = ds.isel(lon=slice(i*chunk_long, (i+1)*chunk_long),\n", + " lat=slice(j*chunk_lat, (j+1)*chunk_lat))\n", + " dss.append(temp_ds)\n", + " temp_info_pbit = xb.get_bitinformation(temp_ds, dim=var, implementation=\"python\")\n", + " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", + " # temp_keepbits = temp_keepbits.map(at_least_zero)\n", + " dss_kbits.append(temp_keepbits)\n", + " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", + " dss_bitrounded.append(temp_ds_bitrounded)\n", + "\n", + " if i == 0 and j == 0 : \n", + " MERGED_ds_bitr = temp_ds_bitrounded\n", + " else:\n", + " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, 
temp_ds_bitrounded])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7e1c4b35-4767-47e7-9ddf-357250b631b8", + "metadata": {}, + "outputs": [], + "source": [ + "MERGED_ds_bitr.to_compressed_netcdf(\"3.air_chunked_bitr_compressed.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "d3b60c66-252d-48a6-af93-a00c9ca8f0ba", + "metadata": {}, + "source": [ + "## Comparing file sizes" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "998581b5-6ad9-4f6f-9c61-d0bf1486ec7f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.5M\t0.air_original.nc\n", + "1.7M\t1.air_compressed_all.nc\n", + "1.3M\t2.air_bitrounded_compressed.nc\n", + "776K\t3.air_chunked_bitr_compressed.nc\n", + "1.1M\tair.zarr\n" + ] + } + ], + "source": [ + "!du -hs *.nc *.zarr" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:bitinfo] *", + "language": "python", + "name": "conda-env-bitinfo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}