📦 Refresh ATL11 Zarr data to 20200404
Phew, lots of behind-the-scenes issues to get all these conversions going! The disk volume I was chucking my files into ran out of space, so I had to restart the ATL06_to_ATL11_Antarctica.sh script using `parallel --resume-failed` at about 25% (the progress bar had actually reached 50%, but no files were stored!). Thank goodness for the resumable 'log' file. The HDF5 to Zarr conversion also had some hiccups, seemingly running to 99% and then pausing there, so I had to run a few conversions manually. That, and juggling files across different disk volumes on the servers. Anyway, we've got two more reference ground tracks and are now at 1387 instead of 1385! Also reduced the number of 'exception' cases since there's more data to concatenate now; they should disappear altogether, give or take a few more ICESat-2 cycles!
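The resumable run hinges on GNU Parallel's `--joblog log` file, a tab-separated table with one row per job. A minimal sketch of checking it for failures before a `--resume-failed` rerun (this mirrors the pandas cell added in the diff below; column names come from the joblog format):

    import pandas as pd

    # GNU Parallel's --joblog output is tab-separated, with columns including
    # Seq, Host, Starttime, JobRuntime, Send, Receive, Exitval, Signal, Command.
    df_log = pd.read_csv(filepath_or_buffer="log", sep="\t")
    failed = df_log.query(expr="Exitval > 0")  # non-zero Exitval means the job failed
    print(f"{len(failed)} of {len(df_log)} jobs failed")
    # `parallel --resume-failed --joblog log ...` will then retry only those jobs.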
weiji14 committed Jun 9, 2020
1 parent 303f4f4 commit 5198b2b
Showing 2 changed files with 57 additions and 47 deletions.
72 changes: 42 additions & 30 deletions atl06_to_atl11.ipynb
@@ -145,7 +145,7 @@
" f\" --cycles {first_cycle} {last_cycle}\"\n",
" f\" --Release 3\"\n",
" f\" --directory 'ATL06.003/**/'\"\n",
" f\" --out_dir ATL11.001\\n\",\n",
" f\" --out_dir ATL11.001\\n\"\n",
" )\n",
" writelines.sort() # sort writelines in place\n",
"\n",
@@ -168,11 +168,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[7m100% 4161:0=0s python3 ATL11/ATL06_to_ATL11.py 1387 12 --cycles 01 06 --Release \u001b[0m\u001b[0m\n"
]
}
],
"source": [
"# !PYTHONPATH=`pwd` PYTHONWARNINGS=\"ignore\" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --resume-failed --results logdir --joblog log --jobs 64 > /dev/null"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# !PYTHONPATH=`pwd` PYTHONWARNINGS=\"ignore\" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --results logdir --joblog log --jobs 64 > /dev/null"
"# df_log = pd.read_csv(filepath_or_buffer=\"log\", sep=\"\\t\")\n",
"# df_log.query(expr=\"Exitval > 0\")"
]
},
{
@@ -194,7 +212,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {
"lines_to_next_cell": 1
},
@@ -203,7 +221,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"6 ICESat-2 cycles available\n"
"7 ICESat-2 cycles available\n"
]
}
],
@@ -217,7 +235,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -254,7 +272,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {
"lines_to_next_cell": 2
},
@@ -263,7 +281,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1387/1387 [00:08<00:00, 170.72it/s]\n"
"100%|██████████| 1387/1387 [00:09<00:00, 142.77it/s]\n"
]
}
],
@@ -278,20 +296,16 @@
" try:\n",
" assert len(atl11files) == 3 # Should be 3 files for Orbital Segments 10,11,12\n",
" except AssertionError:\n",
" if (len(atl11files) == 0 and rgt + 1 in [47, 214]) or (\n",
" len(atl11files) == 2\n",
" and rgt + 1 in [31, 54, 73, 100, 106, 161, 603, 915, 1045, 1106, 1151]\n",
" ):\n",
" if len(atl11files) == 2 and rgt + 1 in [208, 1036]:\n",
" pass\n",
" else:\n",
" raise\n",
" # Note [\"ATL11.001/ATL11_014512_0103_03_v001.h5\", \"ATL11.001/ATL11_115810_0104_03_v001.h5\"]\n",
" # are missing pt2 and pt3 groups\n",
" # Note [\"ATL11.001/ATL11_014512_0206_03_v001.h5\"] is missing pt2 and pt3 groups\n",
"\n",
" if atl11files:\n",
" pattern: dict = intake.source.utils.reverse_format(\n",
" format_string=\"ATL11.001/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{revision:2}_v{version:3}.h5\",\n",
" resolved_string=sorted(atl11files)[1],\n",
" resolved_string=sorted(atl11files)[1], # get the '11' one, not '10' or '12'\n",
" )\n",
" zarrfilepath: str = \"ATL11.001z123/ATL11_{referencegroundtrack}1x_{cycles}_{revision}_v{version}.zarr\".format(\n",
" **pattern\n",
@@ -301,14 +315,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1385/1385 [00:33<00:00, 41.79it/s]\n"
"100%|██████████| 1387/1387 [00:23<00:00, 60.25it/s]\n"
]
}
],
@@ -332,10 +346,10 @@
" )\n",
"\n",
" # Special exceptions to skip over\n",
" if atl11file in (\n",
" \"ATL11.001/ATL11_014512_0103_03_v001.h5\",\n",
" \"ATL11.001/ATL11_115810_0104_03_v001.h5\",\n",
" ) and pair in (\"pt2\", \"pt3\"):\n",
" if atl11file in (\"ATL11.001/ATL11_014512_0206_03_v001.h5\",) and pair in (\n",
" \"pt2\",\n",
" \"pt3\",\n",
" ):\n",
" continue\n",
" # print(atl11file, pair)\n",
" # xr.open_dataset(\n",
@@ -350,19 +364,19 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1385/1385 [39:46<00:00, 1.72s/it]\n"
"100%|█████████▉| 1387/1387 [46:42<00:00, 3.00it/s] "
]
}
],
"source": [
"# Do all the HDF5 to Zarr conversion!\n",
"# Do all the HDF5 to Zarr conversion! Should take less than an hour to run.\n",
"# Check conversion progress here, https://stackoverflow.com/a/37901797/6611055\n",
"futures = [client.compute(store_task) for store_task in stores]\n",
"for f in tqdm.tqdm(\n",
@@ -373,26 +387,24 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [
{
"data": {
"text/plain": [
"(147664, 4)"
"(185215, 6)"
]
},
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds = xr.open_dataset(\n",
" zarrfilepath, engine=\"zarr\", backend_kwargs={\"consolidated\": True},\n",
")\n",
"ds = xr.open_dataset(zarrfilepath, engine=\"zarr\", backend_kwargs={\"consolidated\": True})\n",
"ds.h_corr.__array__().shape"
]
},
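For context on the `sorted(atl11files)[1]` line above: `intake.source.utils.reverse_format` inverts a `str.format`-style template, pulling named fields back out of a resolved string. A small sketch using a hypothetical ATL11 filename (field widths as in the notebook's format string):

    import intake.source.utils

    pattern: dict = intake.source.utils.reverse_format(
        format_string="ATL11.001/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{revision:2}_v{version:3}.h5",
        resolved_string="ATL11.001/ATL11_138711_0107_03_v001.h5",  # hypothetical filename
    )
    # Expect something like {"referencegroundtrack": "1387", "orbitalsegment": "11",
    # "cycles": "0107", "revision": "03", "version": "001"}, ready to fill the
    # "ATL11_{referencegroundtrack}1x_{cycles}_{revision}_v{version}.zarr" template.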
32 changes: 15 additions & 17 deletions atl06_to_atl11.py
@@ -89,7 +89,7 @@ def first_last_cycle_numbers(referencegroundtrack: int, orbitalsegment: int):
f" --cycles {first_cycle} {last_cycle}"
f" --Release 3"
f" --directory 'ATL06.003/**/'"
f" --out_dir ATL11.001\n",
f" --out_dir ATL11.001\n"
)
writelines.sort() # sort writelines in place

@@ -107,7 +107,11 @@ def first_last_cycle_numbers(referencegroundtrack: int, orbitalsegment: int):
# - O. Tange (2018): GNU Parallel 2018, Mar 2018, ISBN 9781387509881, DOI https://doi.org/10.5281/zenodo.1146014

# %%
# !PYTHONPATH=`pwd` PYTHONWARNINGS="ignore" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --results logdir --joblog log --jobs 64 > /dev/null
# !PYTHONPATH=`pwd` PYTHONWARNINGS="ignore" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --resume-failed --results logdir --joblog log --jobs 64 > /dev/null

# %%
# df_log = pd.read_csv(filepath_or_buffer="log", sep="\t")
# df_log.query(expr="Exitval > 0")

# %% [markdown]
# ## Convert from HDF5 to Zarr format
@@ -172,20 +176,16 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
try:
assert len(atl11files) == 3 # Should be 3 files for Orbital Segments 10,11,12
except AssertionError:
if (len(atl11files) == 0 and rgt + 1 in [47, 214]) or (
len(atl11files) == 2
and rgt + 1 in [31, 54, 73, 100, 106, 161, 603, 915, 1045, 1106, 1151]
):
if len(atl11files) == 2 and rgt + 1 in [208, 1036]:
pass
else:
raise
# Note ["ATL11.001/ATL11_014512_0103_03_v001.h5", "ATL11.001/ATL11_115810_0104_03_v001.h5"]
# are missing pt2 and pt3 groups
# Note ["ATL11.001/ATL11_014512_0206_03_v001.h5"] is missing pt2 and pt3 groups

if atl11files:
pattern: dict = intake.source.utils.reverse_format(
format_string="ATL11.001/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{revision:2}_v{version:3}.h5",
resolved_string=sorted(atl11files)[1],
resolved_string=sorted(atl11files)[1], # get the '11' one, not '10' or '12'
)
zarrfilepath: str = "ATL11.001z123/ATL11_{referencegroundtrack}1x_{cycles}_{revision}_v{version}.zarr".format(
**pattern
@@ -213,10 +213,10 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
)

# Special exceptions to skip over
if atl11file in (
"ATL11.001/ATL11_014512_0103_03_v001.h5",
"ATL11.001/ATL11_115810_0104_03_v001.h5",
) and pair in ("pt2", "pt3"):
if atl11file in ("ATL11.001/ATL11_014512_0206_03_v001.h5",) and pair in (
"pt2",
"pt3",
):
continue
# print(atl11file, pair)
# xr.open_dataset(
@@ -229,7 +229,7 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
stores.append(store_task)

# %%
# Do all the HDF5 to Zarr conversion!
# Do all the HDF5 to Zarr conversion! Should take less than an hour to run.
# Check conversion progress here, https://stackoverflow.com/a/37901797/6611055
futures = [client.compute(store_task) for store_task in stores]
for f in tqdm.tqdm(
@@ -238,9 +238,7 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
pass

# %%
ds = xr.open_dataset(
zarrfilepath, engine="zarr", backend_kwargs={"consolidated": True},
)
ds = xr.open_dataset(zarrfilepath, engine="zarr", backend_kwargs={"consolidated": True})
ds.h_corr.__array__().shape


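The conversion loop that the diff elides follows a standard dask.distributed pattern: submit each delayed `to_zarr` task with `client.compute`, then iterate over the completed futures inside a tqdm progress bar. A self-contained sketch, assuming `stores` holds the delayed store tasks built earlier and that a local cluster suffices:

    import tqdm
    from dask.distributed import Client, as_completed

    client = Client()  # assumption: a local dask cluster, not the servers used here
    futures = [client.compute(store_task) for store_task in stores]
    for f in tqdm.tqdm(iterable=as_completed(futures), total=len(futures)):
        pass  # each completed future is one ATL11 file group written out to Zarr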
