
Commit

…into dev-fim-performance-bug
RobHanna-NOAA committed Nov 19, 2024
2 parents aaf9810 + 3acec5e commit 8cc206d
Showing 11 changed files with 243 additions and 19 deletions.
File renamed without changes.
129 changes: 129 additions & 0 deletions Dockerfile.prod
@@ -0,0 +1,129 @@
## Temporary image to build the libraries and only save the needed artifacts
FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4 AS builder
WORKDIR /opt/builder
ARG dataDir=/data
ARG projectDir=/foss_fim
ARG depDir=/dependencies
ARG taudemVersion=98137bb6541a0d0077a9c95becfed4e56d0aa0ac
ARG taudemVersion2=81f7a07cdd3721617a30ee4e087804fddbcffa88
ENV taudemDir=$depDir/taudem/bin
ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin

# remove reference to missing repo
# RUN rm /etc/apt/sources.list.d/apache-arrow.sources

RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/dtarb/taudem.git
RUN git clone https://github.com/fernandoa123/cybergis-toolkit.git taudem_accelerated_flowDirections

RUN apt-get update --fix-missing && apt-get install -y cmake mpich \
libgtest-dev libboost-test-dev libnetcdf-dev && rm -rf /var/lib/apt/lists/*

## Compile Main taudem repo ##
RUN mkdir -p taudem/bin
RUN cd taudem \
&& git checkout $taudemVersion \
&& cd src \
&& make

## Compile taudem repo with accelerated flow directions ##
RUN cd taudem_accelerated_flowDirections/taudem \
&& git checkout $taudemVersion2 \
&& mkdir build \
&& cd build \
&& cmake .. \
&& make

RUN mkdir -p $taudemDir
RUN mkdir -p $taudemDir2

## Move needed binaries to the next stage of the image
RUN cd taudem/bin && mv -t $taudemDir flowdircond streamnet gagewatershed catchhydrogeo dinfdistdown
RUN cd taudem_accelerated_flowDirections/taudem/build/bin && mv -t $taudemDir2 d8flowdir dinfflowdir


###############################################################################################
# Base Image that has GDAL, PROJ, etc
FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4
ARG dataDir=/data
ENV projectDir=/foss_fim
ARG depDir=/dependencies
ENV inputsDir=$dataDir/inputs
ENV outputsDir=/outputs
ENV srcDir=$projectDir/src
ENV toolsDir=$projectDir/tools
ENV workDir=/fim_temp
ENV taudemDir=$depDir/taudem/bin
ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin

## ADDING FIM GROUP ##
ARG GroupID=1370800235
ARG GroupName=fim
RUN addgroup --gid $GroupID $GroupName
ENV GID=$GroupID
ENV GN=$GroupName

RUN mkdir -p $workDir
RUN mkdir -p $depDir
COPY --from=builder $depDir $depDir

# remove reference to missing repo
# RUN rm /etc/apt/sources.list.d/apache-arrow.sources

RUN apt-get update --fix-missing && rm -rf /var/lib/apt/lists/*
RUN apt update --fix-missing

RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y p7zip-full python3-pip time mpich parallel libgeos-dev expect tmux rsync tzdata wget

RUN apt auto-remove

## adding AWS CLI (for bash) ##
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
unzip awscliv2.zip && \
./aws/install

## adding environment variables for numba and python ##
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE

## ADD TO PATHS ##
ENV PATH="$projectDir:${PATH}"
ENV PYTHONPATH="${PYTHONPATH}:${projectDir}:${srcDir}:${srcDir}/utils:${projectDir}/data:${toolsDir}"

## install python 3 modules ##
COPY Pipfile .
COPY Pipfile.lock .
RUN pip3 install pipenv==2024.0.1 && PIP_NO_CACHE_DIR=off pipenv install --system --deploy --ignore-pipfile

# ----------------------------------
# Mar 2023 / Sep 2024
# There are some nuances in the whitebox python downloads in that the first time it loads
# it goes to the internet and downloads the latest/greatest WBT (whiteboxtools) engine which is
# required for the whitebox python library to work. We don't want to have FIM attempting a download
# each time a container is opened and the whitebox engine is called.
# Instead we will set up the WBT engine at time of docker build (same as Taudem and AWS).
# Whitebox code detects that the engine is there and makes no attempt to update it.
# We download and unzip it to the same file folder that pip deployed the whitebox library.
# Whitebox also attempts to always download a folder called testdata regardless of use.
# We added an empty folder to fake out whitebox_tools.py so it doesn't try to download the folder.
ENV WBT_PATH=/usr/local/lib/python3.10/dist-packages/whitebox/WBT
RUN wget -P $WBT_PATH https://www.whiteboxgeo.com/WBT_Linux/WhiteboxTools_linux_musl.zip && \
unzip -o $WBT_PATH/WhiteboxTools_linux_musl.zip -d $WBT_PATH && \
cp $WBT_PATH/WhiteboxTools_linux_amd64/WBT/whitebox_tools $WBT_PATH
# ----------------------------------

# The container will automatically use this account at runtime
ARG RuntimeUser=svc_user
RUN useradd -u 8877 -g $GroupName -s /bin/bash $RuntimeUser
RUN chmod 777 $workDir
RUN mkdir -p "/home/${RuntimeUser}"
RUN chmod 777 /home/$RuntimeUser

## RUN UMASK TO CHANGE DEFAULT PERMISSIONS ##
ADD ./src/entrypoint.sh /
ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]

## This results in the default user being svc_user
USER $RuntimeUser
12 changes: 7 additions & 5 deletions README.md
@@ -10,6 +10,8 @@ This software uses the Height Above Nearest Drainage (HAND) method to generate R

# FIM Version 4

#### Note: While we use the phrase "FIM" regularly, the phrase "HAND" is also used and is generally interchangeable. Most output folders now follow the convention of "hand_4_x_x_x".

## Accessing Data through ESIP S3 Bucket
The latest national generated HAND data and a subset of the inputs can be found in an Amazon S3 Bucket hosted by [Earth Science Information Partners (ESIP)](https://www.esipfed.org/). These data can be accessed using the AWS CLI tools. Please contact Carson Pruitt ([email protected]) or Fernando Salas ([email protected]) if you experience issues with permissions.

@@ -49,12 +51,12 @@ aws s3 ls s3://noaa-nws-owp-fim/hand_fim/ --profile esip

Download a directory of sample outputs for a single HUC8:
```
aws s3 sync s3://noaa-nws-owp-fim/hand_fim/outputs/fim_4_4_0_0/12090301 \
aws s3 sync s3://noaa-nws-owp-fim/hand_fim/outputs/hand_4_5_2_11/12090301 \
/your_local_folder_name/12090301 --profile esip
```
By adjusting pathing, you can also download entire directories such as the `fim_4_4_0_0` folder. An entire output FIM set (e.g. `fim_4_4_0_0`) is approximately 1.1 TB.
By adjusting pathing, you can also download entire directories such as the `hand_4_5_2_11` folder. An entire output HAND set is approximately 1.7 TB.

**Note**: There may be newer editions than `fim_4_4_0_0`, and it is recommended to adjust the command above for the latest version.
**Note**: There may be newer editions than `hand_4_5_2_11`, and it is recommended to adjust the command above for the latest version.

## Setting up your Environment

@@ -85,7 +87,7 @@ Git will auto create a subfolder named `inundation-mapping` where the code will

### Installation
1. Install Docker : [Docker](https://docs.docker.com/get-docker/)
2. Build Docker Image : `docker build -f Dockerfile -t <image_name>:<tag> <path/to/repository>`
2. Build Docker Image : `docker build -f Dockerfile.dev -t <image_name>:<tag> <path/to/repository>`
3. Create FIM group on host machine:
- Linux: `groupadd -g 1370800178 fim`
4. Change group ownership of repo (needs to be redone when a new file occurs in the repo):
@@ -128,7 +130,7 @@ docker run --rm -it --name <your_container_name> \
```
For example:
```bash
docker run --rm -it --name robs_container \
docker run --rm -it --name Robs_container \
-v /home/projects/fim/code/inundation-mapping/:/foss_fim \
-v /home/projects/fim/data/outputs/:/outputs \
-v /home/projects/fim/data/outputs_temp/:/fim_temp \
50 changes: 50 additions & 0 deletions docs/CHANGELOG.md
@@ -1,6 +1,56 @@
All notable changes to this project will be documented in this file.
We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

## v4.5.12.0 - 2024-11-01 - [PR#1327](https://github.com/NOAA-OWP/inundation-mapping/pull/1327)

The purpose of this PR is to cut down the runtime for four Alaska HUCs (19020104, 19020503, 19020402, and 19020602). It significantly optimizes runtime by replacing a nested for loop, used for updating the rating curve for small segments, with a vectorized process. These changes were applied only to the Alaska HUCs.
As part of this PR, a small modification was also applied to `bridge_inundation.py`.
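
A minimal sketch of the vectorized idea is shown below. The column names (`short_id`, `update_id`, `Stage`, `Discharge (m3s-1)`) mirror the code in this commit, but the values are synthetic and the frame names are illustrative only; the actual implementation lives in `src/add_crosswalk.py`.
```python
import pandas as pd

# Synthetic rating curve: one row per (HydroID, Stage)
src = pd.DataFrame({
    'HydroID': [1, 1, 2, 2],
    'Stage': [0.0, 1.0, 0.0, 1.0],
    'Discharge (m3s-1)': [0.0, 5.0, 0.0, 9.0],
})
# Short segments: each short_id takes its discharges from its update_id reach
sml_segs = pd.DataFrame({'short_id': [1], 'update_id': [2]})

# Pull the donor curves once, then align them to the short reaches by Stage
donors = src.loc[src['HydroID'].isin(sml_segs['update_id']),
                 ['HydroID', 'Stage', 'Discharge (m3s-1)']]
updates = sml_segs.merge(donors, left_on='update_id', right_on='HydroID')[
    ['short_id', 'Stage', 'Discharge (m3s-1)']
]

# One merge on (HydroID, Stage) replaces the per-row nested loop
src = src.merge(updates, left_on=['HydroID', 'Stage'], right_on=['short_id', 'Stage'],
                how='left', suffixes=('', '_new'))
src['Discharge (m3s-1)'] = src['Discharge (m3s-1)_new'].fillna(src['Discharge (m3s-1)'])
src = src.drop(columns=['short_id', 'Discharge (m3s-1)_new'])
print(src)
```
Replacing the per-row loop with a single merge keyed on `(HydroID, Stage)` is what drives the runtime savings for the Alaska HUCs.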

### Changes

- `src/add_crosswalk.py`
- `src/delineate_hydros_and_produce_HAND.sh`
- `tools/bridge_inundation.py`

<br/><br/>


## v4.5.11.3 - 2024-10-25 - [PR#1320](https://github.com/NOAA-OWP/inundation-mapping/pull/1320)

The fix: During the post-processing scan for the words "error" or "warning", it was only finding records which had either of those two words as standalone words and not as part of bigger phrases; e.g., "error" was found, but not "fielderror". Added wildcards and it is now fixed.

Note: it is finding a good handful more errors and warnings that were being missed in earlier code versions.

### Changes
`fim_post_processing.sh`: fix as described.

<br/><br/>


## v4.5.11.2 - 2024-10-25 - [PR#1322](https://github.com/NOAA-OWP/inundation-mapping/pull/1322)

For security reasons, we needed to create a docker image that does not use the root user in any way. The new `Dockerfile.prod` file is to be used when we want to use a non-root user. The original `Dockerfile` has been renamed to `Dockerfile.dev` and will continue to use its root user, which has no problems interacting with external mounts.

Note: Regarding pip or pipenv installs.
In `Dockerfile.prod`, you cannot do installs or updates using either pipenv or pip. Those types of tests and adjustments need to be done in `Dockerfile.dev`. `Dockerfile.dev` also allows changes to the `Pipfile` and `Pipfile.lock`. Both docker files share the Pipfiles, so it should be just fine.

### File Renames
- Was: `Dockerfile`, now `Dockerfile.dev`

### Additions

- Dockerfile.prod: as described

### Changes
- `README.md`: changed references from `Dockerfile` to `Dockerfile.dev`. Also added some notes about the new convention of outputs no longer starting with `fim_` but now with `hand_`.
- `fim_pipeline.sh`: Permissions change for the new `Dockerfile.prod`.
- `fim_post_processing.sh`: Permissions change for the new `Dockerfile.prod`.
- `fim_pre_processing.sh`: Permissions change for the new `Dockerfile.prod`.
- `fim_process_unit_wb.sh`: Permissions change for the new `Dockerfile.prod`.

<br/><br/>


## v4.5.11.1 - 2024-10-16 - [PR#1318](https://github.com/NOAA-OWP/inundation-mapping/pull/1318)

Bug fixes to address issues during `fim_pipeline.sh`.
2 changes: 1 addition & 1 deletion fim_pipeline.sh
@@ -121,7 +121,7 @@ echo "---------------------------------------------------"
## POST PROCESSING

# Remove run from the fim_temp directory
rm -d $workDir/$runName
rm -df $workDir/$runName

# Pipe into post processing
. $projectDir/fim_post_processing.sh -n $runName -j $jobMaxLimit
6 changes: 3 additions & 3 deletions fim_post_processing.sh
@@ -267,19 +267,19 @@ Tcount
l_echo $startDiv"Resetting Permissions"
Tstart
# super slow to change chmod on the log folder. Not really mandatory anyway
find $outputDestDir -maxdepth 1 -type f -exec chmod 666 {} + # just root level files
find $outputDestDir -maxdepth 1 -type f -exec chmod 777 {} + # just root level files
Tcount


l_echo $startDiv"Scanning logs for errors and warnings. This can take quite a few minutes so stand by."
echo "Results will be saved in root not inside the log folder."
Tstart
# grep -H -r -i -n "error" $outputDestDir/logs/ > $outputDestDir/all_errors_from_logs.log
find $outputDestDir -type f | grep -H -r -i -n "error" $outputDestDir/logs/ > \
find $outputDestDir -type f | grep -H -R -i -n ".*error.*" $outputDestDir/logs/ > \
$outputDestDir/all_errors_from_logs.log &
l_echo "error scan done, now on to warnings scan"

find $outputDestDir -type f | grep -H -r -i -n "warning" $outputDestDir/logs/ > \
find $outputDestDir -type f | grep -H -R -i -n ".*warning.*" $outputDestDir/logs/ > \
$outputDestDir/all_warnings_from_logs.log &
l_echo "warning scan done"
Tcount
4 changes: 4 additions & 0 deletions fim_pre_processing.sh
@@ -204,6 +204,7 @@ if [ ! -d $outputDestDir ]; then
mkdir -p $outputDestDir
chmod 777 $outputDestDir
mkdir -p $tempRunDir
chmod 777 $tempRunDir
else
# remove these directories and files on a new or overwrite run
rm -rdf $outputDestDir/logs
@@ -231,6 +232,9 @@ cp $envFile $outputDestDir/params.env

args_file=$outputDestDir/runtime_args.env

# reset it again (this time recursively, for the new incoming folders)
chmod 777 -R $outputDestDir

# the jobHucLimit is not from the args files, only jobBranchLimit
echo "export runName=$runName" >> $args_file
echo "export jobHucLimit=$jobHucLimit" >> $args_file
2 changes: 2 additions & 0 deletions fim_process_unit_wb.sh
@@ -78,6 +78,8 @@ fi
# make outputs directory
mkdir -p $tempHucDataDir
mkdir -p $tempBranchDataDir
chmod 777 $tempHucDataDir
chmod 777 $tempBranchDataDir

# Clean out previous unit logs and branch logs starting with this huc
rm -f $outputDestDir/logs/unit/"$hucNumber"_unit.log
49 changes: 40 additions & 9 deletions src/add_crosswalk.py
@@ -36,6 +36,7 @@ def add_crosswalk(
small_segments_filename,
min_catchment_area,
min_stream_length,
huc_id,
calibration_mode=False,
):
input_catchments = gpd.read_file(input_catchments_fileName, engine="pyogrio", use_arrow=True)
@@ -110,6 +111,8 @@ def add_crosswalk(

output_flows = output_flows.merge(output_catchments.filter(items=['HydroID', 'areasqkm']), on='HydroID')

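    # Keep only one row per HydroID (the merge above can introduce duplicates)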
output_flows = output_flows.drop_duplicates(subset='HydroID')

output_flows['ManningN'] = mannings_n

if output_flows.NextDownID.dtype != 'int':
@@ -281,16 +284,43 @@ def add_crosswalk(
sml_segs.to_csv(small_segments_filename, index=False)
print("Update rating curves for short reaches.")

for index, segment in sml_segs.iterrows():
short_id = segment[0]
update_id = segment[1]
new_values = output_src.loc[output_src['HydroID'] == update_id][['Stage', 'Discharge (m3s-1)']]
if huc_id.startswith('19'):
print("Update rating curves for short reaches in Alaska.")
# Create a DataFrame with new values for discharge based on 'update_id'
new_values = output_src[output_src['HydroID'].isin(sml_segs['update_id'])][
['HydroID', 'Stage', 'Discharge (m3s-1)']
]

for src_index, src_stage in new_values.iterrows():
output_src.loc[
(output_src['HydroID'] == short_id) & (output_src['Stage'] == src_stage[0]),
['Discharge (m3s-1)'],
] = src_stage[1]
# Merge this new values DataFrame with sml_segs on 'update_id' and 'HydroID'
sml_segs_with_values = sml_segs.merge(
new_values, left_on='update_id', right_on='HydroID', suffixes=('', '_new')
)
sml_segs_with_values = sml_segs_with_values[['short_id', 'Stage', 'Discharge (m3s-1)']]
merged_output_src = output_src.merge(
sml_segs_with_values[['short_id', 'Stage', 'Discharge (m3s-1)']],
left_on=['HydroID', 'Stage'],
right_on=['short_id', 'Stage'],
suffixes=('', '_df2'),
)
merged_output_src = merged_output_src[['HydroID', 'Stage', 'Discharge (m3s-1)_df2']]
output_src = pd.merge(output_src, merged_output_src, on=['HydroID', 'Stage'], how='left')
output_src['Discharge (m3s-1)'] = output_src['Discharge (m3s-1)_df2'].fillna(
output_src['Discharge (m3s-1)']
)
output_src = output_src.drop(columns=['Discharge (m3s-1)_df2'])
else:
for index, segment in sml_segs.iterrows():
short_id = segment[0]
update_id = segment[1]
new_values = output_src.loc[output_src['HydroID'] == update_id][
['Stage', 'Discharge (m3s-1)']
]

for src_index, src_stage in new_values.iterrows():
output_src.loc[
(output_src['HydroID'] == short_id) & (output_src['Stage'] == src_stage[0]),
['Discharge (m3s-1)'],
] = src_stage[1]

output_src = output_src.merge(crosswalk[['HydroID', 'feature_id']], on='HydroID')

@@ -429,6 +459,7 @@ def add_crosswalk(
help="Mannings n. Accepts single parameter set or list of parameter set in calibration mode. Currently input as csv.",
required=True,
)
parser.add_argument("-u", "--huc-id", help="HUC ID", required=False)
parser.add_argument("-z", "--input-nwmcat-fileName", help="NWM catchment polygon", required=True)
parser.add_argument("-p", "--extent", help="GMS only for now", default="GMS", required=False)
parser.add_argument(
1 change: 1 addition & 0 deletions src/delineate_hydros_and_produce_HAND.sh
@@ -242,6 +242,7 @@ python3 $srcDir/add_crosswalk.py \
-t $tempCurrentBranchDataDir/hydroTable_$current_branch_id.csv \
-w $tempHucDataDir/wbd8_clp.gpkg \
-b $b_arg \
-u $hucNumber \
-y $tempCurrentBranchDataDir/nwm_catchments_proj_subset.tif \
-m $manning_n \
-z $z_arg \
7 changes: 6 additions & 1 deletion tools/bridge_inundation.py
@@ -87,6 +87,11 @@ def bridge_risk_status(
# Concatenate all GeoDataFrame into a single GeoDataFrame
bridge_points = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

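    # Ensure feature_id is an integer on both sides so the merge keys align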
if bridge_points.feature_id.dtype != 'int':
bridge_points.feature_id = bridge_points.feature_id.astype(int)
if flow_file_data.feature_id.dtype != 'int':
flow_file_data.feature_id = flow_file_data.feature_id.astype(int)

# Find the common feature_id between flow_file and bridge_points
merged_bri = bridge_points.merge(flow_file_data, on='feature_id', how='inner')

@@ -111,7 +116,7 @@ def risk_class(row):
bridge_out = merged_bri.loc[merged_data_max]
bridge_out.reset_index(drop=True, inplace=True)
bridge_out.drop('risk', axis=1, inplace=True)
bridge_out.to_file(output_dir, driver='GPKG', layer='bridge_risk_status')
bridge_out.to_file(output_dir, index=False, driver="GPKG", engine='fiona')

return bridge_out

