
Commit

…into dev-fim-performance-bug
RobHanna-NOAA committed Nov 19, 2024
2 parents aaf9810 + 3acec5e commit 8cc206d
Showing 11 changed files with 243 additions and 19 deletions.
File renamed without changes.
129 changes: 129 additions & 0 deletions Dockerfile.prod
@@ -0,0 +1,129 @@
## Temporary image to build the libraries and only save the needed artifacts
FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4 AS builder
WORKDIR /opt/builder
ARG dataDir=/data
ARG projectDir=/foss_fim
ARG depDir=/dependencies
ARG taudemVersion=98137bb6541a0d0077a9c95becfed4e56d0aa0ac
ARG taudemVersion2=81f7a07cdd3721617a30ee4e087804fddbcffa88
ENV taudemDir=$depDir/taudem/bin
ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin

# remove reference to missing repo
# RUN rm /etc/apt/sources.list.d/apache-arrow.sources

RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/dtarb/taudem.git
RUN git clone https://github.com/fernandoa123/cybergis-toolkit.git taudem_accelerated_flowDirections

RUN apt-get update --fix-missing && apt-get install -y cmake mpich \
libgtest-dev libboost-test-dev libnetcdf-dev && rm -rf /var/lib/apt/lists/*

## Compile Main taudem repo ##
RUN mkdir -p taudem/bin
RUN cd taudem \
&& git checkout $taudemVersion \
&& cd src \
&& make

## Compile taudem repo with accelerated flow directions ##
RUN cd taudem_accelerated_flowDirections/taudem \
&& git checkout $taudemVersion2 \
&& mkdir build \
&& cd build \
&& cmake .. \
&& make

RUN mkdir -p $taudemDir
RUN mkdir -p $taudemDir2

## Move needed binaries to the next stage of the image
RUN cd taudem/bin && mv -t $taudemDir flowdircond streamnet gagewatershed catchhydrogeo dinfdistdown
RUN cd taudem_accelerated_flowDirections/taudem/build/bin && mv -t $taudemDir2 d8flowdir dinfflowdir


###############################################################################################
# Base Image that has GDAL, PROJ, etc
FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4
ARG dataDir=/data
ENV projectDir=/foss_fim
ARG depDir=/dependencies
ENV inputsDir=$dataDir/inputs
ENV outputsDir=/outputs
ENV srcDir=$projectDir/src
ENV toolsDir=$projectDir/tools
ENV workDir=/fim_temp
ENV taudemDir=$depDir/taudem/bin
ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin

## ADDING FIM GROUP ##
ARG GroupID=1370800235
ARG GroupName=fim
RUN addgroup --gid $GroupID $GroupName
ENV GID=$GroupID
ENV GN=$GroupName

RUN mkdir -p $workDir
RUN mkdir -p $depDir
COPY --from=builder $depDir $depDir

# remove reference to missing repo
# RUN rm /etc/apt/sources.list.d/apache-arrow.sources

RUN apt-get update --fix-missing && rm -rf /var/lib/apt/lists/*
RUN apt update --fix-missing

RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y p7zip-full python3-pip time mpich parallel libgeos-dev expect tmux rsync tzdata wget

RUN apt auto-remove

## adding AWS CLI (for bash) ##
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
unzip awscliv2.zip && \
./aws/install

## adding environment variables for numba and python ##
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE

## ADD TO PATHS ##
ENV PATH="$projectDir:${PATH}"
ENV PYTHONPATH="${PYTHONPATH}:${projectDir}:${srcDir}:${srcDir}/utils:${projectDir}/data:${toolsDir}"

## install python 3 modules ##
COPY Pipfile .
COPY Pipfile.lock .
RUN pip3 install pipenv==2024.0.1 && PIP_NO_CACHE_DIR=off pipenv install --system --deploy --ignore-pipfile

# ----------------------------------
# Mar 2023 / Sep 2024
# There are some nuances in the whitebox python downloads in that the first time it loads
# it goes to the internet and downloads the latest/greatest WBT (whiteboxtools) engine which is
# required for the whitebox python library to work. We don't want to have FIM attempting a download
# each time a container is opened and the whitebox engine is called.
# Instead we will set up the WBT engine at time of docker build (same as Taudem and AWS).
# Whitebox code detects that the engine is there and makes no attempt to update it.
# We download and unzip it to the same file folder that pip deployed the whitebox library.
# Whitebox also attempts to always download a folder called testdata regardless of use.
# We added an empty folder to fake out whitebox_tools.py so it doesn't try to download the folder.
ENV WBT_PATH=/usr/local/lib/python3.10/dist-packages/whitebox/WBT
RUN wget -P $WBT_PATH https://www.whiteboxgeo.com/WBT_Linux/WhiteboxTools_linux_musl.zip && \
unzip -o $WBT_PATH/WhiteboxTools_linux_musl.zip -d $WBT_PATH && \
cp $WBT_PATH/WhiteboxTools_linux_amd64/WBT/whitebox_tools $WBT_PATH
# ----------------------------------

# The container will automatically use this account at runtime
ARG RuntimeUser=svc_user
RUN useradd -u 8877 -g $GroupName -s /bin/bash $RuntimeUser
RUN chmod 777 $workDir
RUN mkdir -p "/home/${RuntimeUser}"
RUN chmod 777 /home/$RuntimeUser

## RUN UMASK TO CHANGE DEFAULT PERMISSIONS ##
ADD ./src/entrypoint.sh /
ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]

## This results in the default user being svc_user
USER $RuntimeUser
12 changes: 7 additions & 5 deletions README.md
@@ -10,6 +10,8 @@ This software uses the Height Above Nearest Drainage (HAND) method to generate R

# FIM Version 4

#### Note: While we use the phrase "FIM" regularly, the phrase "HAND" is also used and is generally interchangeable. Most output folders now follow the convention of "hand_4_x_x_x".

## Accessing Data through ESIP S3 Bucket
The latest national generated HAND data and a subset of the inputs can be found in an Amazon S3 Bucket hosted by [Earth Science Information Partners (ESIP)](https://www.esipfed.org/). These data can be accessed using the AWS CLI tools. Please contact Carson Pruitt ([email protected]) or Fernando Salas ([email protected]) if you experience issues with permissions.

@@ -49,12 +51,12 @@ aws s3 ls s3://noaa-nws-owp-fim/hand_fim/ --profile esip

Download a directory of sample outputs for a single HUC8:
```
aws s3 sync s3://noaa-nws-owp-fim/hand_fim/outputs/fim_4_4_0_0/12090301 \
aws s3 sync s3://noaa-nws-owp-fim/hand_fim/outputs/hand_4_5_2_11/12090301 \
/your_local_folder_name/12090301 --profile esip
```
By adjusting pathing, you can also download entire directories such as the `fim_4_4_0_0` folder. An entire output FIM set (e.g. `fim_4_4_0_0`) is approximately 1.1 TB.
By adjusting pathing, you can also download entire directories such as the `hand_4_5_2_11` folder. An entire output HAND set is approximately 1.7 TB.

**Note**: There may be newer editions than `fim_4_4_0_0`, and it is recommended to adjust the command above for the latest version.
**Note**: There may be newer editions than `hand_4_5_2_11`, and it is recommended to adjust the command above for the latest version.

## Setting up your Environment

@@ -85,7 +87,7 @@ Git will auto create a subfolder named `inundation-mapping` where the code will

### Installation
1. Install Docker : [Docker](https://docs.docker.com/get-docker/)
2. Build Docker Image : `docker build -f Dockerfile -t <image_name>:<tag> <path/to/repository>`
2. Build Docker Image : `docker build -f Dockerfile.dev -t <image_name>:<tag> <path/to/repository>`
3. Create FIM group on host machine:
- Linux: `groupadd -g 1370800178 fim`
4. Change group ownership of repo (needs to be redone when a new file occurs in the repo):
@@ -128,7 +130,7 @@ docker run --rm -it --name <your_container_name> \
```
For example:
```bash
docker run --rm -it --name robs_container \
docker run --rm -it --name Robs_container \
-v /home/projects/fim/code/inundation-mapping/:/foss_fim \
-v /home/projects/fim/data/outputs/:/outputs \
-v /home/projects/fim/data/outputs_temp/:/fim_temp \
50 changes: 50 additions & 0 deletions docs/CHANGELOG.md
@@ -1,6 +1,56 @@
All notable changes to this project will be documented in this file.
We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

## v4.5.12.0 - 2024-11-01 - [PR#1327](https://github.com/NOAA-OWP/inundation-mapping/pull/1327)

The purpose of this PR is to cut down the runtime for four Alaska HUCs (19020104, 19020503, 19020402, and 19020602). It significantly optimizes runtime by replacing a nested for loop, used for updating the rating curve for small segments, with a vectorized process. These changes were applied only to the Alaska HUCs.
As part of this PR, a small modification was also applied to `bridge_inundation.py`.
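
A minimal sketch of the vectorized idea is shown below. The column names (`short_id`, `update_id`, `Stage`, `Discharge (m3s-1)`) mirror the code in this commit, but the values are synthetic and the frame names are illustrative only; the actual implementation lives in `src/add_crosswalk.py`.
```python
import pandas as pd

# Synthetic rating curve: one row per (HydroID, Stage)
src = pd.DataFrame({
    'HydroID': [1, 1, 2, 2],
    'Stage': [0.0, 1.0, 0.0, 1.0],
    'Discharge (m3s-1)': [0.0, 5.0, 0.0, 9.0],
})
# Short segments: each short_id takes its discharges from its update_id reach
sml_segs = pd.DataFrame({'short_id': [1], 'update_id': [2]})

# Pull the donor curves once, then align them to the short reaches by Stage
donors = src.loc[src['HydroID'].isin(sml_segs['update_id']),
                 ['HydroID', 'Stage', 'Discharge (m3s-1)']]
updates = sml_segs.merge(donors, left_on='update_id', right_on='HydroID')[
    ['short_id', 'Stage', 'Discharge (m3s-1)']
]

# One merge on (HydroID, Stage) replaces the per-row nested loop
src = src.merge(updates, left_on=['HydroID', 'Stage'], right_on=['short_id', 'Stage'],
                how='left', suffixes=('', '_new'))
src['Discharge (m3s-1)'] = src['Discharge (m3s-1)_new'].fillna(src['Discharge (m3s-1)'])
src = src.drop(columns=['short_id', 'Discharge (m3s-1)_new'])
print(src)
```
Replacing the per-row loop with a single merge keyed on `(HydroID, Stage)` is what drives the runtime savings for the Alaska HUCs.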

### Changes

- `src/add_crosswalk.py`
- `src/delineate_hydros_and_produce_HAND.sh`
- `tools/bridge_inundation.py`

<br/><br/>


## v4.5.11.3 - 2024-10-25 - [PR#1320](https://github.com/NOAA-OWP/inundation-mapping/pull/1320)

The fix: During the post-processing scan for the words "error" or "warning", it was only finding records which had either of those two words as standalone words and not as part of bigger phrases; e.g., "error" was found, but not "fielderror". Added wildcards and it is now fixed.

Note: it is finding a good handful more errors and warnings that were being missed in earlier code versions.

### Changes
`fim_post_processing.sh`: fix as described.

<br/><br/>


## v4.5.11.2 - 2024-10-25 - [PR#1322](https://github.com/NOAA-OWP/inundation-mapping/pull/1322)

For security reasons, we needed to create a docker image that does not use the root user in any way. The new `Dockerfile.prod` file is to be used when we want to use a non-root user. The original `Dockerfile` has been renamed to `Dockerfile.dev` and will continue to use its root user, which has no problems interacting with external mounts.

Note: Regarding pip or pipenv installs.
In `Dockerfile.prod`, you cannot do installs or updates using either pipenv or pip. Those types of tests and adjustments need to be done in `Dockerfile.dev`. `Dockerfile.dev` also allows changes to the `Pipfile` and `Pipfile.lock`. Both docker files share the Pipfiles, so it should be just fine.

### File Renames
- Was: `Dockerfile`, now `Dockerfile.dev`

### Additions

- Dockerfile.prod: as described

### Changes
- `README.md`: changed references from `Dockerfile` to `Dockerfile.dev`. Also added some notes about the new convention of outputs no longer starting with `fim_` but now with `hand_`.
- `fim_pipeline.sh`: Permissions change for the new `Dockerfile.prod`.
- `fim_post_processing.sh`: Permissions change for the new `Dockerfile.prod`.
- `fim_pre_processing.sh`: Permissions change for the new `Dockerfile.prod`.
- `fim_process_unit_wb.sh`: Permissions change for the new `Dockerfile.prod`.

<br/><br/>


## v4.5.11.1 - 2024-10-16 - [PR#1318](https://github.com/NOAA-OWP/inundation-mapping/pull/1318)

Bug fixes to address issues during `fim_pipeline.sh`.
2 changes: 1 addition & 1 deletion fim_pipeline.sh
@@ -121,7 +121,7 @@ echo "---------------------------------------------------"
## POST PROCESSING

# Remove run from the fim_temp directory
rm -d $workDir/$runName
rm -df $workDir/$runName

# Pipe into post processing
. $projectDir/fim_post_processing.sh -n $runName -j $jobMaxLimit
6 changes: 3 additions & 3 deletions fim_post_processing.sh
@@ -267,19 +267,19 @@ Tcount
l_echo $startDiv"Resetting Permissions"
Tstart
# super slow to change chmod on the log folder. Not really mandatory anyway
find $outputDestDir -maxdepth 1 -type f -exec chmod 666 {} + # just root level files
find $outputDestDir -maxdepth 1 -type f -exec chmod 777 {} + # just root level files
Tcount


l_echo $startDiv"Scanning logs for errors and warnings. This can take quite a few minutes so stand by."
echo "Results will be saved in root not inside the log folder."
Tstart
# grep -H -r -i -n "error" $outputDestDir/logs/ > $outputDestDir/all_errors_from_logs.log
find $outputDestDir -type f | grep -H -r -i -n "error" $outputDestDir/logs/ > \
find $outputDestDir -type f | grep -H -R -i -n ".*error.*" $outputDestDir/logs/ > \
$outputDestDir/all_errors_from_logs.log &
l_echo "error scan done, now on to warnings scan"

find $outputDestDir -type f | grep -H -r -i -n "warning" $outputDestDir/logs/ > \
find $outputDestDir -type f | grep -H -R -i -n ".*warning.*" $outputDestDir/logs/ > \
$outputDestDir/all_warnings_from_logs.log &
l_echo "warning scan done"
Tcount
4 changes: 4 additions & 0 deletions fim_pre_processing.sh
@@ -204,6 +204,7 @@ if [ ! -d $outputDestDir ]; then
mkdir -p $outputDestDir
chmod 777 $outputDestDir
mkdir -p $tempRunDir
chmod 777 $tempRunDir
else
# remove these directories and files on a new or overwrite run
rm -rdf $outputDestDir/logs
@@ -231,6 +232,9 @@ cp $envFile $outputDestDir/params.env

args_file=$outputDestDir/runtime_args.env

# reset it again (this time recursively, for the new incoming folders)
chmod 777 -R $outputDestDir

# the jobHucLimit is not from the args files, only jobBranchLimit
echo "export runName=$runName" >> $args_file
echo "export jobHucLimit=$jobHucLimit" >> $args_file
2 changes: 2 additions & 0 deletions fim_process_unit_wb.sh
@@ -78,6 +78,8 @@ fi
# make outputs directory
mkdir -p $tempHucDataDir
mkdir -p $tempBranchDataDir
chmod 777 $tempHucDataDir
chmod 777 $tempBranchDataDir

# Clean out previous unit logs and branch logs starting with this huc
rm -f $outputDestDir/logs/unit/"$hucNumber"_unit.log
49 changes: 40 additions & 9 deletions src/add_crosswalk.py
@@ -36,6 +36,7 @@ def add_crosswalk(
small_segments_filename,
min_catchment_area,
min_stream_length,
huc_id,
calibration_mode=False,
):
input_catchments = gpd.read_file(input_catchments_fileName, engine="pyogrio", use_arrow=True)
@@ -110,6 +111,8 @@ def add_crosswalk(

output_flows = output_flows.merge(output_catchments.filter(items=['HydroID', 'areasqkm']), on='HydroID')

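    # Keep only one row per HydroID (the merge above can introduce duplicates)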
output_flows = output_flows.drop_duplicates(subset='HydroID')

output_flows['ManningN'] = mannings_n

if output_flows.NextDownID.dtype != 'int':
@@ -281,16 +284,43 @@ def add_crosswalk(
sml_segs.to_csv(small_segments_filename, index=False)
print("Update rating curves for short reaches.")

for index, segment in sml_segs.iterrows():
short_id = segment[0]
update_id = segment[1]
new_values = output_src.loc[output_src['HydroID'] == update_id][['Stage', 'Discharge (m3s-1)']]
if huc_id.startswith('19'):
print("Update rating curves for short reaches in Alaska.")
# Create a DataFrame with new values for discharge based on 'update_id'
new_values = output_src[output_src['HydroID'].isin(sml_segs['update_id'])][
['HydroID', 'Stage', 'Discharge (m3s-1)']
]

for src_index, src_stage in new_values.iterrows():
output_src.loc[
(output_src['HydroID'] == short_id) & (output_src['Stage'] == src_stage[0]),
['Discharge (m3s-1)'],
] = src_stage[1]
# Merge this new values DataFrame with sml_segs on 'update_id' and 'HydroID'
sml_segs_with_values = sml_segs.merge(
new_values, left_on='update_id', right_on='HydroID', suffixes=('', '_new')
)
sml_segs_with_values = sml_segs_with_values[['short_id', 'Stage', 'Discharge (m3s-1)']]
merged_output_src = output_src.merge(
sml_segs_with_values[['short_id', 'Stage', 'Discharge (m3s-1)']],
left_on=['HydroID', 'Stage'],
right_on=['short_id', 'Stage'],
suffixes=('', '_df2'),
)
merged_output_src = merged_output_src[['HydroID', 'Stage', 'Discharge (m3s-1)_df2']]
output_src = pd.merge(output_src, merged_output_src, on=['HydroID', 'Stage'], how='left')
output_src['Discharge (m3s-1)'] = output_src['Discharge (m3s-1)_df2'].fillna(
output_src['Discharge (m3s-1)']
)
output_src = output_src.drop(columns=['Discharge (m3s-1)_df2'])
else:
for index, segment in sml_segs.iterrows():
short_id = segment[0]
update_id = segment[1]
new_values = output_src.loc[output_src['HydroID'] == update_id][
['Stage', 'Discharge (m3s-1)']
]

for src_index, src_stage in new_values.iterrows():
output_src.loc[
(output_src['HydroID'] == short_id) & (output_src['Stage'] == src_stage[0]),
['Discharge (m3s-1)'],
] = src_stage[1]

output_src = output_src.merge(crosswalk[['HydroID', 'feature_id']], on='HydroID')

@@ -429,6 +459,7 @@ def add_crosswalk(
help="Mannings n. Accepts single parameter set or list of parameter set in calibration mode. Currently input as csv.",
required=True,
)
parser.add_argument("-u", "--huc-id", help="HUC ID", required=False)
parser.add_argument("-z", "--input-nwmcat-fileName", help="NWM catchment polygon", required=True)
parser.add_argument("-p", "--extent", help="GMS only for now", default="GMS", required=False)
parser.add_argument(
1 change: 1 addition & 0 deletions src/delineate_hydros_and_produce_HAND.sh
@@ -242,6 +242,7 @@ python3 $srcDir/add_crosswalk.py \
-t $tempCurrentBranchDataDir/hydroTable_$current_branch_id.csv \
-w $tempHucDataDir/wbd8_clp.gpkg \
-b $b_arg \
-u $hucNumber \
-y $tempCurrentBranchDataDir/nwm_catchments_proj_subset.tif \
-m $manning_n \
-z $z_arg \
7 changes: 6 additions & 1 deletion tools/bridge_inundation.py
@@ -87,6 +87,11 @@ def bridge_risk_status(
# Concatenate all GeoDataFrame into a single GeoDataFrame
bridge_points = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

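    # Ensure feature_id is an integer on both sides so the merge keys align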
if bridge_points.feature_id.dtype != 'int':
bridge_points.feature_id = bridge_points.feature_id.astype(int)
if flow_file_data.feature_id.dtype != 'int':
flow_file_data.feature_id = flow_file_data.feature_id.astype(int)

# Find the common feature_id between flow_file and bridge_points
merged_bri = bridge_points.merge(flow_file_data, on='feature_id', how='inner')

@@ -111,7 +116,7 @@ def risk_class(row):
bridge_out = merged_bri.loc[merged_data_max]
bridge_out.reset_index(drop=True, inplace=True)
bridge_out.drop('risk', axis=1, inplace=True)
bridge_out.to_file(output_dir, driver='GPKG', layer='bridge_risk_status')
bridge_out.to_file(output_dir, index=False, driver="GPKG", engine='fiona')

return bridge_out

