From 789618dedb4588a25c06f46159ec1e00d5d7e6b5 Mon Sep 17 00:00:00 2001 From: Rob Hanna - NOAA <90854818+RobHanna-NOAA@users.noreply.github.com> Date: Fri, 25 Oct 2024 19:03:50 +0000 Subject: [PATCH 1/3] v4.5.11.2 docker file for OWP, non root user (#1322) --- Dockerfile => Dockerfile.dev | 0 Dockerfile.prod | 129 +++++++++++++++++++++++++++++++++++ README.md | 12 ++-- docs/CHANGELOG.md | 24 +++++++ fim_pipeline.sh | 2 +- fim_post_processing.sh | 2 +- fim_pre_processing.sh | 4 ++ fim_process_unit_wb.sh | 2 + 8 files changed, 168 insertions(+), 7 deletions(-) rename Dockerfile => Dockerfile.dev (100%) create mode 100644 Dockerfile.prod diff --git a/Dockerfile b/Dockerfile.dev similarity index 100% rename from Dockerfile rename to Dockerfile.dev diff --git a/Dockerfile.prod b/Dockerfile.prod new file mode 100644 index 000000000..5b702e75e --- /dev/null +++ b/Dockerfile.prod @@ -0,0 +1,129 @@ +## Temporary image to build the libraries and only save the needed artifacts +FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4 AS builder +WORKDIR /opt/builder +ARG dataDir=/data +ARG projectDir=/foss_fim +ARG depDir=/dependencies +ARG taudemVersion=98137bb6541a0d0077a9c95becfed4e56d0aa0ac +ARG taudemVersion2=81f7a07cdd3721617a30ee4e087804fddbcffa88 +ENV taudemDir=$depDir/taudem/bin +ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin + +# remove reference to missing repo +# RUN rm /etc/apt/sources.list.d/apache-arrow.sources + +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/dtarb/taudem.git +RUN git clone https://github.com/fernandoa123/cybergis-toolkit.git taudem_accelerated_flowDirections + +RUN apt-get update --fix-missing && apt-get install -y cmake mpich \ + libgtest-dev libboost-test-dev libnetcdf-dev && rm -rf /var/lib/apt/lists/* + +## Compile Main taudem repo ## +RUN mkdir -p taudem/bin +RUN cd taudem \ + && git checkout $taudemVersion \ + && cd src \ + && make + +## Compile taudem repo with accelerated flow directions ## +RUN cd taudem_accelerated_flowDirections/taudem \ + && git checkout $taudemVersion2 \ + && mkdir build \ + && cd build \ + && cmake .. 
\
+    && make
+
+RUN mkdir -p $taudemDir
+RUN mkdir -p $taudemDir2
+
+## Move needed binaries to the next stage of the image
+RUN cd taudem/bin && mv -t $taudemDir flowdircond streamnet gagewatershed catchhydrogeo dinfdistdown
+RUN cd taudem_accelerated_flowDirections/taudem/build/bin && mv -t $taudemDir2 d8flowdir dinfflowdir
+
+
+###############################################################################################
+# Base Image that has GDAL, PROJ, etc
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4
+ARG dataDir=/data
+ENV projectDir=/foss_fim
+ARG depDir=/dependencies
+ENV inputsDir=$dataDir/inputs
+ENV outputsDir=/outputs
+ENV srcDir=$projectDir/src
+ENV toolsDir=$projectDir/tools
+ENV workDir=/fim_temp
+ENV taudemDir=$depDir/taudem/bin
+ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin
+
+## ADDING FIM GROUP ##
+ARG GroupID=1370800235
+ARG GroupName=fim
+RUN addgroup --gid $GroupID $GroupName
+ENV GID=$GroupID
+ENV GN=$GroupName
+
+RUN mkdir -p $workDir
+RUN mkdir -p $depDir
+COPY --from=builder $depDir $depDir
+
+# remove reference to missing repo
+# RUN rm /etc/apt/sources.list.d/apache-arrow.sources
+
+RUN apt-get update --fix-missing && rm -rf /var/lib/apt/lists/*
+RUN apt update --fix-missing
+
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y p7zip-full python3-pip time mpich parallel libgeos-dev expect tmux rsync tzdata wget
+
+RUN apt auto-remove
+
+## adding AWS CLI (for bash) ##
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
+    unzip awscliv2.zip && \
+    ./aws/install
+
+## adding environment variables for numba and python ##
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+ENV PYTHONUNBUFFERED=TRUE
+
+## ADD TO PATHS ##
+ENV PATH="$projectDir:${PATH}"
+ENV PYTHONPATH="${PYTHONPATH}:${projectDir}:${srcDir}:${srcDir}/utils:${projectDir}/data:${toolsDir}"
+
+## install python 3 modules ##
+COPY Pipfile .
+COPY Pipfile.lock .
+RUN pip3 install pipenv==2024.0.1 && PIP_NO_CACHE_DIR=off pipenv install --system --deploy --ignore-pipfile
+
+# ----------------------------------
+# Mar 2023 / Sep 2024
+# There are some nuances in the whitebox python downloads in that the first time it loads,
+# it goes to the internet and downloads the latest/greatest WBT (whiteboxtools) engine, which is
+# required for the whitebox python library to work. We don't want FIM attempting a download
+# each time a container is opened and the whitebox engine is called.
+# Instead we set up the WBT engine at time of docker build (same as Taudem and AWS).
+# Whitebox code detects that the engine is there and makes no attempt to update it.
+# We download and unzip it to the same folder where pip deployed the whitebox library.
+# Whitebox also always attempts to download a folder called testdata regardless of use. 
+# We added an empty folder to fake out whitebox_tools.py so it doesn't try to download the folder
+ENV WBT_PATH=/usr/local/lib/python3.10/dist-packages/whitebox/WBT
+RUN wget -P $WBT_PATH https://www.whiteboxgeo.com/WBT_Linux/WhiteboxTools_linux_musl.zip && \
+    unzip -o $WBT_PATH/WhiteboxTools_linux_musl.zip -d $WBT_PATH && \
+    cp $WBT_PATH/WhiteboxTools_linux_amd64/WBT/whitebox_tools $WBT_PATH
+# ----------------------------------
+
+# The container will automatically use this account to run
+ARG RuntimeUser=svc_user
+RUN useradd -u 8877 -g $GroupName -s /bin/bash $RuntimeUser
+RUN chmod 777 $workDir
+RUN mkdir -p "/home/${RuntimeUser}"
+RUN chmod 777 /home/$RuntimeUser
+
+## RUN UMASK TO CHANGE DEFAULT PERMISSIONS ##
+ADD ./src/entrypoint.sh /
+ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]
+
+## This results in the default user being svc_user
+USER $RuntimeUser
diff --git a/README.md b/README.md
index 7bb7207c6..8e3d786e9 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ This software uses the Height Above Nearest Drainage (HAND) method to generate R
 
 # FIM Version 4
 
+#### Note: While we use the phrase "FIM" regularly, the phrase "HAND" is also used and is generally interchangeable. Most output folders now follow the convention of "hand_4_x_x_x".
+
 ## Accessing Data through ESIP S3 Bucket
 
 The latest national generated HAND data and a subset of the inputs can be found in an Amazon S3 Bucket hosted by [Earth Science Information Partners (ESIP)](https://www.esipfed.org/). These data can be accessed using the AWS CLI tools. Please contact Carson Pruitt (carson.pruitt@noaa.gov) or Fernando Salas (fernando.salas@noaa.gov) if you experience issues with permissions.
@@ -49,12 +51,12 @@ aws s3 ls s3://noaa-nws-owp-fim/hand_fim/ --profile esip
 
 Download a directory of sample outputs for a single HUC8:
 ```
-aws s3 sync s3://noaa-nws-owp-fim/hand_fim/outputs/fim_4_4_0_0/12090301 \
+aws s3 sync s3://noaa-nws-owp-fim/hand_fim/outputs/hand_4_5_2_11/12090301 \
     /your_local_folder_name/12090301 --profile esip
 ```
-By adjusting pathing, you can also download entire directories such as the `fim_4_4_0_0` folder. An entire output FIM set (e.g. `fim_4_4_0_0`) is approximately 1.1 TB.
+By adjusting pathing, you can also download entire directories such as the `hand_4_5_2_11` folder. An entire output HAND set is approximately 1.7 TB.
 
-**Note**: There may be newer editions than `fim_4_4_0_0`, and it is recommended to adjust the command above for the latest version.
+**Note**: There may be newer editions than `hand_4_5_11_1`, and it is recommended to adjust the command above for the latest version.
 
 ## Setting up your Environment
 
@@ -85,7 +87,7 @@ Git will auto create a subfolder named `inundation-mapping` where the code will
 ### Installation
 
 1. Install Docker : [Docker](https://docs.docker.com/get-docker/)
-2. Build Docker Image : `docker build -f Dockerfile -t <image_name>:<tag> <path/to/repository>`
+2. Build Docker Image : `docker build -f Dockerfile.dev -t <image_name>:<tag> <path/to/repository>`
 3. Create FIM group on host machine:
    - Linux: `groupadd -g 1370800178 fim`
 4. 
Change group ownership of repo (needs to be redone whenever a new file is added to the repo):
@@ -128,7 +130,7 @@ docker run --rm -it --name <your_container_name> \
 ```
 For example:
 ```bash
-docker run --rm -it --name robs_container \
+docker run --rm -it --name Robs_container \
 	-v /home/projects/fim/code/inundation-mapping/:/foss_fim \
 	-v /home/projects/fim/data/outputs/:/outputs \
 	-v /home/projects/fim/data/outputs_temp/:/fim_temp \
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 2498181f6..4618b15f7 100755
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,6 +1,30 @@
 All notable changes to this project will be documented in this file.
 We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.
 
+## v4.5.11.2 - 2024-10-25 - [PR#1322](https://github.com/NOAA-OWP/inundation-mapping/pull/1322)
+
+For security reasons, we needed to create a Docker image that does not use the root user in any way. The new `Dockerfile.prod` file is to be used when we want a non-root user. The original `Dockerfile` has been renamed to `Dockerfile.dev` and will continue to use its root user, which has no problems interacting with external mounts.
+
+Note regarding pip and pipenv installs:
+In `Dockerfile.prod`, you cannot do installs or updates using either pipenv or pip. Those types of tests and adjustments need to be done in `Dockerfile.dev`, which also allows changes to the `Pipfile` and `Pipfile.lock`. Both Docker files share the Pipfiles, so the two stay in sync.
+
+### File Renames
+- Was: `Dockerfile`, now `Dockerfile.dev`
+
+### Additions
+
+- `Dockerfile.prod`: as described
+
+### Changes
+- `README.md`: changed notes from `Dockerfile` to `Dockerfile.dev`. Also added some notes about the new convention of output folders starting with `hand_` instead of `fim_`.
+- `fim_pipeline.sh`: permission changes for the new `Dockerfile.prod`.
+- `fim_post_processing.sh`: permission changes for the new `Dockerfile.prod`.
+- `fim_pre_processing.sh`: permission changes for the new `Dockerfile.prod`.
+- `fim_process_unit_wb.sh`: permission changes for the new `Dockerfile.prod`.
+
+

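As a usage sketch (not part of the committed files): building and running the non-root image might look like the following, where the image name, tag, container name, and host paths are placeholders. The UID/GID values come from `Dockerfile.prod` above.

```bash
# Build the non-root production image (name and tag are hypothetical).
docker build -f Dockerfile.prod -t fim_prod:latest .

# The container runs as svc_user (UID 8877) in the fim group (GID 1370800235),
# so the mounted host folders must be writable by that group rather than by root.
docker run --rm -it --name fim_prod_container \
    -v /home/projects/fim/data/outputs/:/outputs \
    -v /home/projects/fim/data/outputs_temp/:/fim_temp \
    fim_prod:latest
```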
+
+
 ## v4.5.11.1 - 2024-10-16 - [PR#1318](https://github.com/NOAA-OWP/inundation-mapping/pull/1318)
 
 Bug fixes to address issues during `fim_pipeline.sh`.
diff --git a/fim_pipeline.sh b/fim_pipeline.sh
index 3934c27a7..35987e0f1 100755
--- a/fim_pipeline.sh
+++ b/fim_pipeline.sh
@@ -121,7 +121,7 @@ echo "---------------------------------------------------"
 ## POST PROCESSING
 
 # Remove run from the fim_temp directory
-rm -d $workDir/$runName
+rm -df $workDir/$runName
 
 # Pipe into post processing
 . $projectDir/fim_post_processing.sh -n $runName -j $jobMaxLimit
diff --git a/fim_post_processing.sh b/fim_post_processing.sh
index e866f3c9d..ccf2e10df 100755
--- a/fim_post_processing.sh
+++ b/fim_post_processing.sh
@@ -267,7 +267,7 @@ Tcount
 l_echo $startDiv"Resetting Permissions"
 Tstart
     # super slow to change chmod on the log folder. Not really mandatory anyway
-    find $outputDestDir -maxdepth 1 -type f -exec chmod 666 {} +  # just root level files
+    find $outputDestDir -maxdepth 1 -type f -exec chmod 777 {} +  # just root level files
 Tcount
 
diff --git a/fim_pre_processing.sh b/fim_pre_processing.sh
index bd556940c..658377db8 100755
--- a/fim_pre_processing.sh
+++ b/fim_pre_processing.sh
@@ -204,6 +204,7 @@ if [ ! -d $outputDestDir ]; then
     mkdir -p $outputDestDir
     chmod 777 $outputDestDir
     mkdir -p $tempRunDir
+    chmod 777 $tempRunDir
 else
     # remove these directories and files on a new or overwrite run
     rm -rdf $outputDestDir/logs
@@ -231,6 +232,9 @@ cp $envFile $outputDestDir/params.env
 
 args_file=$outputDestDir/runtime_args.env
 
+# reset it again (this time recursive, for the new incoming folders)
+chmod 777 -R $outputDestDir
+
 # the jobHucLimit is not from the args files, only jobBranchLimit
 echo "export runName=$runName" >> $args_file
 echo "export jobHucLimit=$jobHucLimit" >> $args_file
diff --git a/fim_process_unit_wb.sh b/fim_process_unit_wb.sh
index 1c013f777..962b1df15 100755
--- a/fim_process_unit_wb.sh
+++ b/fim_process_unit_wb.sh
@@ -78,6 +78,8 @@ fi
 # make outputs directory
 mkdir -p $tempHucDataDir
 mkdir -p $tempBranchDataDir
+chmod 777 $tempHucDataDir
+chmod 777 $tempBranchDataDir
 
 # Clean out previous unit logs and branch logs starting with this huc
 rm -f $outputDestDir/logs/unit/"$hucNumber"_unit.log

From 889a7854613e314ae0e57471332a795bc357541f Mon Sep 17 00:00:00 2001
From: Rob Hanna - NOAA <90854818+RobHanna-NOAA@users.noreply.github.com>
Date: Fri, 25 Oct 2024 19:07:58 +0000
Subject: [PATCH 2/3] 4.5.11.3 Fix error and warning scan bug (#1320)

---
 docs/CHANGELOG.md      | 13 +++++++++++++
 fim_post_processing.sh |  4 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 4618b15f7..f818d53da 100755
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,6 +1,19 @@
 All notable changes to this project will be documented in this file.
 We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.
 
+
+## v4.5.11.3 - 2024-10-25 - [PR#1320](https://github.com/NOAA-OWP/inundation-mapping/pull/1320)
+
+The fix: during the post-processing scan for the words "error" and "warning", the search was only finding records that had either of those two words as standalone words, not as part of bigger phrases (i.e., "error" was found, but not "fielderror"). Wildcards were added to the patterns, and it is now fixed.
+
+Note: it now finds a good handful more errors and warnings that were being missed in earlier code versions.
+
+### Changes
+- `fim_post_processing.sh`: fix as described.
+
+

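A quick, self-contained way to see the broadened match in action (the log file path and its contents are made up for this demo):

```bash
# A line with "error" embedded in a longer token was previously missed.
printf 'fielderror in column 3\nall good here\n' > /tmp/demo.log

# The new pattern matches "error" anywhere inside a larger phrase.
grep -H -R -i -n ".*error.*" /tmp/demo.log
# expected output: /tmp/demo.log:1:fielderror in column 3
```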
+
+
 ## v4.5.11.2 - 2024-10-25 - [PR#1322](https://github.com/NOAA-OWP/inundation-mapping/pull/1322)
 
 For security reasons, we needed to create a Docker image that does not use the root user in any way. The new `Dockerfile.prod` file is to be used when we want a non-root user. The original `Dockerfile` has been renamed to `Dockerfile.dev` and will continue to use its root user, which has no problems interacting with external mounts.
diff --git a/fim_post_processing.sh b/fim_post_processing.sh
index ccf2e10df..91db004d6 100755
--- a/fim_post_processing.sh
+++ b/fim_post_processing.sh
@@ -275,11 +275,11 @@ l_echo $startDiv"Scanning logs for errors and warnings. This can take quite a fe
 echo "Results will be saved in root not inside the log folder."
 Tstart
 # grep -H -r -i -n "error" $outputDestDir/logs/ > $outputDestDir/all_errors_from_logs.log
-find $outputDestDir -type f | grep -H -r -i -n "error" $outputDestDir/logs/ > \
+find $outputDestDir -type f | grep -H -R -i -n ".*error.*" $outputDestDir/logs/ > \
     $outputDestDir/all_errors_from_logs.log &
 l_echo "error scan done, now on to warnings scan"
 
-find $outputDestDir -type f | grep -H -r -i -n "warning" $outputDestDir/logs/ > \
+find $outputDestDir -type f | grep -H -R -i -n ".*warning.*" $outputDestDir/logs/ > \
     $outputDestDir/all_warnings_from_logs.log &
 l_echo "warning scan done"
 Tcount

From 3acec5e89e075771e42b53abb6a36fb46eeb9aeb Mon Sep 17 00:00:00 2001
From: ZahraGhahremani-NOAA <100253864+ZahraGhahremani@users.noreply.github.com>
Date: Fri, 1 Nov 2024 19:30:30 +0000
Subject: [PATCH 3/3] v4.5.12.0 Cut down Alaska HUCs runtime (#1327)

---
 docs/CHANGELOG.md                        | 13 +++++++
 src/add_crosswalk.py                     | 49 +++++++++++++++++++-----
 src/delineate_hydros_and_produce_HAND.sh |  1 +
 tools/bridge_inundation.py               |  7 +++-
 4 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index f818d53da..a07eac6a2 100755
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,6 +1,19 @@
 All notable changes to this project will be documented in this file.
 We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.
 
+## v4.5.12.0 - 2024-11-01 - [PR#1327](https://github.com/NOAA-OWP/inundation-mapping/pull/1327)
+
+The purpose of this PR is to cut down the runtime for four Alaska HUCs (19020104, 19020503, 19020402, and 19020602). It significantly optimizes runtime by replacing a nested for loop, used for updating rating curves for small segments, with a vectorized process. These changes were applied only to the Alaska HUCs.
+As part of this PR, a small modification was also made to `bridge_inundation.py`.
+
+### Changes
+
+- `src/add_crosswalk.py`
+- `src/delineate_hydros_and_produce_HAND.sh`
+- `tools/bridge_inundation.py`
+
+

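For context: `delineate_hydros_and_produce_HAND.sh` now forwards the HUC number to `add_crosswalk.py` via a new `-u` flag, and the script chooses the vectorized path based on the HUC prefix. A minimal shell sketch of that gate (the HUC value is just an example; the real check is `huc_id.startswith('19')` in the Python diff below):

```bash
# Alaska HUCs begin with "19"; they take the vectorized rating-curve
# update, while all other HUCs keep the original per-segment loop.
huc_id=19020104
if [[ $huc_id == 19* ]]; then
    echo "Alaska HUC $huc_id: vectorized rating-curve update"
else
    echo "HUC $huc_id: original per-segment update loop"
fi
```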
+ ## v4.5.11.3 - 2024-10-25 - [PR#1320](https://github.com/NOAA-OWP/inundation-mapping/pull/1320) diff --git a/src/add_crosswalk.py b/src/add_crosswalk.py index 61622b2e3..ba1f31abf 100755 --- a/src/add_crosswalk.py +++ b/src/add_crosswalk.py @@ -36,6 +36,7 @@ def add_crosswalk( small_segments_filename, min_catchment_area, min_stream_length, + huc_id, calibration_mode=False, ): input_catchments = gpd.read_file(input_catchments_fileName, engine="pyogrio", use_arrow=True) @@ -110,6 +111,8 @@ def add_crosswalk( output_flows = output_flows.merge(output_catchments.filter(items=['HydroID', 'areasqkm']), on='HydroID') + output_flows = output_flows.drop_duplicates(subset='HydroID') + output_flows['ManningN'] = mannings_n if output_flows.NextDownID.dtype != 'int': @@ -281,16 +284,43 @@ def add_crosswalk( sml_segs.to_csv(small_segments_filename, index=False) print("Update rating curves for short reaches.") - for index, segment in sml_segs.iterrows(): - short_id = segment[0] - update_id = segment[1] - new_values = output_src.loc[output_src['HydroID'] == update_id][['Stage', 'Discharge (m3s-1)']] + if huc_id.startswith('19'): + print("Update rating curves for short reaches in Alaska.") + # Create a DataFrame with new values for discharge based on 'update_id' + new_values = output_src[output_src['HydroID'].isin(sml_segs['update_id'])][ + ['HydroID', 'Stage', 'Discharge (m3s-1)'] + ] - for src_index, src_stage in new_values.iterrows(): - output_src.loc[ - (output_src['HydroID'] == short_id) & (output_src['Stage'] == src_stage[0]), - ['Discharge (m3s-1)'], - ] = src_stage[1] + # Merge this new values DataFrame with sml_segs on 'update_id' and 'HydroID' + sml_segs_with_values = sml_segs.merge( + new_values, left_on='update_id', right_on='HydroID', suffixes=('', '_new') + ) + sml_segs_with_values = sml_segs_with_values[['short_id', 'Stage', 'Discharge (m3s-1)']] + merged_output_src = output_src.merge( + sml_segs_with_values[['short_id', 'Stage', 'Discharge (m3s-1)']], + left_on=['HydroID', 'Stage'], + right_on=['short_id', 'Stage'], + suffixes=('', '_df2'), + ) + merged_output_src = merged_output_src[['HydroID', 'Stage', 'Discharge (m3s-1)_df2']] + output_src = pd.merge(output_src, merged_output_src, on=['HydroID', 'Stage'], how='left') + output_src['Discharge (m3s-1)'] = output_src['Discharge (m3s-1)_df2'].fillna( + output_src['Discharge (m3s-1)'] + ) + output_src = output_src.drop(columns=['Discharge (m3s-1)_df2']) + else: + for index, segment in sml_segs.iterrows(): + short_id = segment[0] + update_id = segment[1] + new_values = output_src.loc[output_src['HydroID'] == update_id][ + ['Stage', 'Discharge (m3s-1)'] + ] + + for src_index, src_stage in new_values.iterrows(): + output_src.loc[ + (output_src['HydroID'] == short_id) & (output_src['Stage'] == src_stage[0]), + ['Discharge (m3s-1)'], + ] = src_stage[1] output_src = output_src.merge(crosswalk[['HydroID', 'feature_id']], on='HydroID') @@ -429,6 +459,7 @@ def add_crosswalk( help="Mannings n. Accepts single parameter set or list of parameter set in calibration mode. 
Currently input as csv.", required=True, ) + parser.add_argument("-u", "--huc-id", help="HUC ID", required=False) parser.add_argument("-z", "--input-nwmcat-fileName", help="NWM catchment polygon", required=True) parser.add_argument("-p", "--extent", help="GMS only for now", default="GMS", required=False) parser.add_argument( diff --git a/src/delineate_hydros_and_produce_HAND.sh b/src/delineate_hydros_and_produce_HAND.sh index 2c827cc57..2ebab872e 100755 --- a/src/delineate_hydros_and_produce_HAND.sh +++ b/src/delineate_hydros_and_produce_HAND.sh @@ -242,6 +242,7 @@ python3 $srcDir/add_crosswalk.py \ -t $tempCurrentBranchDataDir/hydroTable_$current_branch_id.csv \ -w $tempHucDataDir/wbd8_clp.gpkg \ -b $b_arg \ + -u $hucNumber \ -y $tempCurrentBranchDataDir/nwm_catchments_proj_subset.tif \ -m $manning_n \ -z $z_arg \ diff --git a/tools/bridge_inundation.py b/tools/bridge_inundation.py index 6f9238196..720aedbf1 100644 --- a/tools/bridge_inundation.py +++ b/tools/bridge_inundation.py @@ -87,6 +87,11 @@ def bridge_risk_status( # Concatenate all GeoDataFrame into a single GeoDataFrame bridge_points = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True)) + if bridge_points.feature_id.dtype != 'int': + bridge_points.feature_id = bridge_points.feature_id.astype(int) + if flow_file_data.feature_id.dtype != 'int': + flow_file_data.feature_id = flow_file_data.feature_id.astype(int) + # Find the common feature_id between flow_file and bridge_points merged_bri = bridge_points.merge(flow_file_data, on='feature_id', how='inner') @@ -111,7 +116,7 @@ def risk_class(row): bridge_out = merged_bri.loc[merged_data_max] bridge_out.reset_index(drop=True, inplace=True) bridge_out.drop('risk', axis=1, inplace=True) - bridge_out.to_file(output_dir, driver='GPKG', layer='bridge_risk_status') + bridge_out.to_file(output_dir, index=False, driver="GPKG", engine='fiona') return bridge_out
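
One way to spot-check the rewritten GeoPackage output of `bridge_risk_status` is GDAL's `ogrinfo`, which is already available in these images since they build on the GDAL base image. The output path below is a placeholder:

```bash
# Summarize layers and feature counts of the bridge risk output (hypothetical path).
ogrinfo -al -so /outputs/my_run/bridge_risk_status.gpkg
```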