📦 Refresh ATL11 Zarr data to 20200404
Phew, lots of behind-the-scenes issues to get all these conversions going! The disk volume I was chucking my files into ran out of space, so I had to restart the ATL06_to_ATL11_Antarctica.sh script using `parallel --resume-failed` at about 25% (the progress bar had actually reached 50%, but no files were stored!). Thank goodness for the resumable 'log' file. The HDF5 to Zarr conversion also had some hiccups, seemingly running to 99% and then pausing there, so I had to run a few conversions manually. That, and juggling files across different disk volumes on the servers. Anyway, we've got two more reference ground tracks and are now at 1387 instead of 1385! Also reduced the number of 'exception' cases since there's more data to concatenate now; they should disappear altogether, give or take a few more ICESat-2 cycles!
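The resumable run hinges on GNU Parallel's `--joblog log` file, a tab-separated table with one row per job. A minimal sketch of checking it for failures before a `--resume-failed` rerun (this mirrors the pandas cell added in the diff below; column names come from the joblog format):

    import pandas as pd

    # GNU Parallel's --joblog output is tab-separated, with columns including
    # Seq, Host, Starttime, JobRuntime, Send, Receive, Exitval, Signal, Command.
    df_log = pd.read_csv(filepath_or_buffer="log", sep="\t")
    failed = df_log.query(expr="Exitval > 0")  # non-zero Exitval means the job failed
    print(f"{len(failed)} of {len(df_log)} jobs failed")
    # `parallel --resume-failed --joblog log ...` will then retry only those jobs.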
weiji14 committed Jun 9, 2020
1 parent 303f4f4 commit 5198b2b
Showing 2 changed files with 57 additions and 47 deletions.
72 changes: 42 additions & 30 deletions atl06_to_atl11.ipynb
@@ -145,7 +145,7 @@
" f\" --cycles {first_cycle} {last_cycle}\"\n",
" f\" --Release 3\"\n",
" f\" --directory 'ATL06.003/**/'\"\n",
" f\" --out_dir ATL11.001\\n\",\n",
" f\" --out_dir ATL11.001\\n\"\n",
" )\n",
" writelines.sort() # sort writelines in place\n",
"\n",
@@ -168,11 +168,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[7m100% 4161:0=0s python3 ATL11/ATL06_to_ATL11.py 1387 12 --cycles 01 06 --Release \u001b[0m\u001b[0m\n"
]
}
],
"source": [
"# !PYTHONPATH=`pwd` PYTHONWARNINGS=\"ignore\" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --resume-failed --results logdir --joblog log --jobs 64 > /dev/null"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# !PYTHONPATH=`pwd` PYTHONWARNINGS=\"ignore\" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --results logdir --joblog log --jobs 64 > /dev/null"
"# df_log = pd.read_csv(filepath_or_buffer=\"log\", sep=\"\\t\")\n",
"# df_log.query(expr=\"Exitval > 0\")"
]
},
{
@@ -194,7 +212,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {
"lines_to_next_cell": 1
},
@@ -203,7 +221,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"6 ICESat-2 cycles available\n"
"7 ICESat-2 cycles available\n"
]
}
],
@@ -217,7 +235,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -254,7 +272,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {
"lines_to_next_cell": 2
},
@@ -263,7 +281,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1387/1387 [00:08<00:00, 170.72it/s]\n"
"100%|██████████| 1387/1387 [00:09<00:00, 142.77it/s]\n"
]
}
],
@@ -278,20 +296,16 @@
" try:\n",
" assert len(atl11files) == 3 # Should be 3 files for Orbital Segments 10,11,12\n",
" except AssertionError:\n",
" if (len(atl11files) == 0 and rgt + 1 in [47, 214]) or (\n",
" len(atl11files) == 2\n",
" and rgt + 1 in [31, 54, 73, 100, 106, 161, 603, 915, 1045, 1106, 1151]\n",
" ):\n",
" if len(atl11files) == 2 and rgt + 1 in [208, 1036]:\n",
" pass\n",
" else:\n",
" raise\n",
" # Note [\"ATL11.001/ATL11_014512_0103_03_v001.h5\", \"ATL11.001/ATL11_115810_0104_03_v001.h5\"]\n",
" # are missing pt2 and pt3 groups\n",
" # Note [\"ATL11.001/ATL11_014512_0206_03_v001.h5\"] is missing pt2 and pt3 groups\n",
"\n",
" if atl11files:\n",
" pattern: dict = intake.source.utils.reverse_format(\n",
" format_string=\"ATL11.001/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{revision:2}_v{version:3}.h5\",\n",
" resolved_string=sorted(atl11files)[1],\n",
" resolved_string=sorted(atl11files)[1], # get the '11' one, not '10' or '12'\n",
" )\n",
" zarrfilepath: str = \"ATL11.001z123/ATL11_{referencegroundtrack}1x_{cycles}_{revision}_v{version}.zarr\".format(\n",
" **pattern\n",
@@ -301,14 +315,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1385/1385 [00:33<00:00, 41.79it/s]\n"
"100%|██████████| 1387/1387 [00:23<00:00, 60.25it/s]\n"
]
}
],
@@ -332,10 +346,10 @@
" )\n",
"\n",
" # Special exceptions to skip over\n",
" if atl11file in (\n",
" \"ATL11.001/ATL11_014512_0103_03_v001.h5\",\n",
" \"ATL11.001/ATL11_115810_0104_03_v001.h5\",\n",
" ) and pair in (\"pt2\", \"pt3\"):\n",
" if atl11file in (\"ATL11.001/ATL11_014512_0206_03_v001.h5\",) and pair in (\n",
" \"pt2\",\n",
" \"pt3\",\n",
" ):\n",
" continue\n",
" # print(atl11file, pair)\n",
" # xr.open_dataset(\n",
@@ -350,19 +364,19 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1385/1385 [39:46<00:00, 1.72s/it]\n"
"100%|█████████▉| 1387/1387 [46:42<00:00, 3.00it/s] "
]
}
],
"source": [
"# Do all the HDF5 to Zarr conversion!\n",
"# Do all the HDF5 to Zarr conversion! Should take less than an hour to run.\n",
"# Check conversion progress here, https://stackoverflow.com/a/37901797/6611055\n",
"futures = [client.compute(store_task) for store_task in stores]\n",
"for f in tqdm.tqdm(\n",
@@ -373,26 +387,24 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [
{
"data": {
"text/plain": [
"(147664, 4)"
"(185215, 6)"
]
},
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds = xr.open_dataset(\n",
" zarrfilepath, engine=\"zarr\", backend_kwargs={\"consolidated\": True},\n",
")\n",
"ds = xr.open_dataset(zarrfilepath, engine=\"zarr\", backend_kwargs={\"consolidated\": True})\n",
"ds.h_corr.__array__().shape"
]
},
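For context on the `sorted(atl11files)[1]` line above: `intake.source.utils.reverse_format` inverts a `str.format`-style template, pulling named fields back out of a resolved string. A small sketch using a hypothetical ATL11 filename (field widths as in the notebook's format string):

    import intake.source.utils

    pattern: dict = intake.source.utils.reverse_format(
        format_string="ATL11.001/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{revision:2}_v{version:3}.h5",
        resolved_string="ATL11.001/ATL11_138711_0107_03_v001.h5",  # hypothetical filename
    )
    # Expect something like {"referencegroundtrack": "1387", "orbitalsegment": "11",
    # "cycles": "0107", "revision": "03", "version": "001"}, ready to fill the
    # "ATL11_{referencegroundtrack}1x_{cycles}_{revision}_v{version}.zarr" template.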
32 changes: 15 additions & 17 deletions atl06_to_atl11.py
@@ -89,7 +89,7 @@ def first_last_cycle_numbers(referencegroundtrack: int, orbitalsegment: int):
f" --cycles {first_cycle} {last_cycle}"
f" --Release 3"
f" --directory 'ATL06.003/**/'"
f" --out_dir ATL11.001\n",
f" --out_dir ATL11.001\n"
)
writelines.sort() # sort writelines in place

@@ -107,7 +107,11 @@ def first_last_cycle_numbers(referencegroundtrack: int, orbitalsegment: int):
# - O. Tange (2018): GNU Parallel 2018, Mar 2018, ISBN 9781387509881, DOI https://doi.org/10.5281/zenodo.1146014

# %%
# !PYTHONPATH=`pwd` PYTHONWARNINGS="ignore" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --results logdir --joblog log --jobs 64 > /dev/null
# !PYTHONPATH=`pwd` PYTHONWARNINGS="ignore" parallel -a ATL06_to_ATL11_Antarctica.sh --bar --resume-failed --results logdir --joblog log --jobs 64 > /dev/null

# %%
# df_log = pd.read_csv(filepath_or_buffer="log", sep="\t")
# df_log.query(expr="Exitval > 0")

# %% [markdown]
# ## Convert from HDF5 to Zarr format
@@ -172,20 +176,16 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
try:
assert len(atl11files) == 3 # Should be 3 files for Orbital Segments 10,11,12
except AssertionError:
if (len(atl11files) == 0 and rgt + 1 in [47, 214]) or (
len(atl11files) == 2
and rgt + 1 in [31, 54, 73, 100, 106, 161, 603, 915, 1045, 1106, 1151]
):
if len(atl11files) == 2 and rgt + 1 in [208, 1036]:
pass
else:
raise
# Note ["ATL11.001/ATL11_014512_0103_03_v001.h5", "ATL11.001/ATL11_115810_0104_03_v001.h5"]
# are missing pt2 and pt3 groups
# Note ["ATL11.001/ATL11_014512_0206_03_v001.h5"] is missing pt2 and pt3 groups

if atl11files:
pattern: dict = intake.source.utils.reverse_format(
format_string="ATL11.001/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{revision:2}_v{version:3}.h5",
resolved_string=sorted(atl11files)[1],
resolved_string=sorted(atl11files)[1], # get the '11' one, not '10' or '12'
)
zarrfilepath: str = "ATL11.001z123/ATL11_{referencegroundtrack}1x_{cycles}_{revision}_v{version}.zarr".format(
**pattern
@@ -213,10 +213,10 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
)

# Special exceptions to skip over
if atl11file in (
"ATL11.001/ATL11_014512_0103_03_v001.h5",
"ATL11.001/ATL11_115810_0104_03_v001.h5",
) and pair in ("pt2", "pt3"):
if atl11file in ("ATL11.001/ATL11_014512_0206_03_v001.h5",) and pair in (
"pt2",
"pt3",
):
continue
# print(atl11file, pair)
# xr.open_dataset(
@@ -229,7 +229,7 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
stores.append(store_task)

# %%
# Do all the HDF5 to Zarr conversion!
# Do all the HDF5 to Zarr conversion! Should take less than an hour to run.
# Check conversion progress here, https://stackoverflow.com/a/37901797/6611055
futures = [client.compute(store_task) for store_task in stores]
for f in tqdm.tqdm(
@@ -238,9 +238,7 @@ def open_ATL11(atl11file: str, group: str) -> xr.Dataset:
pass

# %%
ds = xr.open_dataset(
zarrfilepath, engine="zarr", backend_kwargs={"consolidated": True},
)
ds = xr.open_dataset(zarrfilepath, engine="zarr", backend_kwargs={"consolidated": True})
ds.h_corr.__array__().shape


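The conversion loop that the diff elides follows a standard dask.distributed pattern: submit each delayed `to_zarr` task with `client.compute`, then iterate over the completed futures inside a tqdm progress bar. A self-contained sketch, assuming `stores` holds the delayed store tasks built earlier and that a local cluster suffices:

    import tqdm
    from dask.distributed import Client, as_completed

    client = Client()  # assumption: a local dask cluster, not the servers used here
    futures = [client.compute(store_task) for store_task in stores]
    for f in tqdm.tqdm(iterable=as_completed(futures), total=len(futures)):
        pass  # each completed future is one ATL11 file group written out to Zarr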
