From 3307021100c130de1e5838487ab0f6bd773b1131 Mon Sep 17 00:00:00 2001
From: "Leaf, Andrew T"
Date: Wed, 16 Oct 2024 11:35:55 -0500
Subject: [PATCH] feat(nhdplus_utils.read_nhdplus_hr): support reading
 multiple geodatabases; add test

---
 sfrmaker/nhdplus_utils.py           | 115 ++++++++++++++++++++--------
 sfrmaker/test/conftest.py           |   9 +--
 sfrmaker/test/test_nhdplus_utils.py |  33 +++++++-
 3 files changed, 117 insertions(+), 40 deletions(-)

diff --git a/sfrmaker/nhdplus_utils.py b/sfrmaker/nhdplus_utils.py
index 079e125a..4225de50 100644
--- a/sfrmaker/nhdplus_utils.py
+++ b/sfrmaker/nhdplus_utils.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 import time
 import warnings
+import numpy as np
 import pandas as pd
 import geopandas as gpd
 from gisutils import shp2df, get_shapefile_crs
@@ -260,44 +261,90 @@ def read_nhdplus(shapefiles, bbox_filter=None,
     return df
 
 
-def read_nhdplus_hr(NHDPlusHR_path, bbox_filter=None, drop_fcodes=None):
+def read_nhdplus_hr(NHDPlusHR_paths, bbox_filter=None, drop_fcodes=None):
+    """Read flowlines from one or more NHDPlus High Resolution (HR)
+    geodatabases into a single DataFrame, joining value-added and
+    routing attributes on NHDPlusID.
+
+    Parameters
+    ----------
+    NHDPlusHR_paths : str, pathlike or sequence of paths
+        NHDPlus High Resolution (HR) geodatabases.
+    bbox_filter : tuple or polygon shapefile, optional
+        Only read NHDPlus HR content within this area;
+        by default None (read everything).
+    drop_fcodes : int or sequence of ints, optional
+        Exclude NHDPlus flowlines with these FCodes;
+        by default None (keep everything).
+
+    Returns
+    -------
+    fl : pandas DataFrame
+        NHDPlus HR flowlines including the following attributes:
+
+        ========== ===========================================================================
+        NHDPlusID  Unique identifier for each flowline and its attributes
+        ArbolateSu Total length of upstream drainage, in km (from NHDPlusFlowlineVAA table)
+        StreamOrde Strahler stream order (from NHDPlusFlowlineVAA table)
+        MaxElevSmo Starting elevation for each flowline, in cm (from NHDPlusFlowlineVAA table)
+        MinElevSmo Ending elevation for each flowline, in cm (from NHDPlusFlowlineVAA table)
+        Divergence Distributary number (from NHDPlusFlowlineVAA table)
+        ToNHDPID   Next downstream NHDPlusID (from NHDPlusFlow table)
+        ========== ===========================================================================
+
+    Notes
+    -----
+    * At divergences where there are multiple downstream flowlines,
+      those with Divergence == 2 (minor distributaries) are removed,
+      so that there is only one downstream NHDPlusID.
+    """
     ta = time.time()
-    print('reading {}...'.format(NHDPlusHR_path))
     # read NHDFLowlines from NHDPlusHR_path (NHDPlus HR OpenFileGDB)
-    fl = gpd.read_file(NHDPlusHR_path, driver='OpenFileGDB', layer='NHDFlowline')
-    # get crs information from flowlines
-    fl_crs = fl.crs
-
-    if filter is not None:
-        print('filtering flowlines...')
-
-        # ensure that filter bbox is in same crs as flowlines
-        # get filters from shapefiles, shapley Polygons or GeoJSON polygons
-        if bbox_filter is not None:
-            if bbox_filter is not isinstance(bbox_filter, tuple):
-                bbox_filter = get_bbox(bbox_filter, dest_crs=fl_crs)
+    if isinstance(NHDPlusHR_paths, (str, Path)):
+        NHDPlusHR_paths = [NHDPlusHR_paths]
+    fls = []
+    for f in NHDPlusHR_paths:
+        print(f'reading {f}...')
+        fl = gpd.read_file(f, driver='OpenFileGDB', layer='NHDFlowline')
+        # get crs information from flowlines
+        fl_crs = fl.crs
 
-        # filter to bbox using geopandas spatial indexing
-        fl = fl.cx[bbox_filter[0]:bbox_filter[2], bbox_filter[1]:bbox_filter[3]]
+        if bbox_filter is not None:
+            print('filtering flowlines...')
 
-    # read NHDPlusFlowlineVAA file from NHDPlusHR_path (NHDPlus HR OpenFileGDB) and merge with flowlines
-    flvaa = gpd.read_file(NHDPlusHR_path, driver='OpenFileGDB', layer='NHDPlusFlowlineVAA')
-    fl = fl.merge(flvaa[['NHDPlusID', 'ArbolateSu','StreamOrde', 'MaxElevSmo', 'MinElevSmo', 'Divergence']],
-                  on='NHDPlusID', how='left'
-                  )
-
-    # read NHDPlusFlow file from NHDPlusHR_path (NHDPlus HR OpenFileGDB)
-    pf = gpd.read_file(NHDPlusHR_path, driver='OpenFileGDB', layer='NHDPlusFlow')
-
-    # Remove features classified as minor divergence pathways (Divergence == 2)
-    # from PlusFlow table
-    pf_routing_dict = get_hr_routing(pf, fl)
-
-    # Add routing information from PlusFlow table.
-    # Set any remaining comids not in fromcomid_list to zero
-    # (outlets or inlets from outside model)
-    fl['ToNHDPID'] = [pf_routing_dict[i] if i in pf_routing_dict else 0.0 for i in fl.NHDPlusID]
-    print("finished in {:.2f}s\n".format(time.time() - ta))
+            # ensure that filter bbox is in same crs as flowlines
+            # get filters from shapefiles, shapely Polygons or GeoJSON polygons
+            if not isinstance(bbox_filter, tuple):
+                bbox_filter = get_bbox(bbox_filter, dest_crs=fl_crs)
+
+            # filter to bbox using geopandas spatial indexing
+            fl = fl.cx[bbox_filter[0]:bbox_filter[2], bbox_filter[1]:bbox_filter[3]]
+
+        # read NHDPlusFlowlineVAA table from the geodatabase and merge with flowlines
+        flvaa = gpd.read_file(f, driver='OpenFileGDB', layer='NHDPlusFlowlineVAA')
+        fl = fl.merge(flvaa[['NHDPlusID', 'ArbolateSu', 'StreamOrde', 'MaxElevSmo', 'MinElevSmo', 'Divergence']],
+                      on='NHDPlusID', how='left'
+                      )
+
+        # read NHDPlusFlow table from the geodatabase
+        pf = gpd.read_file(f, driver='OpenFileGDB', layer='NHDPlusFlow')
+
+        # Remove features classified as minor divergence pathways (Divergence == 2)
+        # from PlusFlow table
+        pf_routing_dict = get_hr_routing(pf, fl)
+
+        # Add routing information from PlusFlow table.
+        # Set any remaining comids not in fromcomid_list to zero
+        # (outlets or inlets from outside the model)
+        fl['ToNHDPID'] = [pf_routing_dict[i] if i in pf_routing_dict else 0.0 for i in fl.NHDPlusID]
+        print("finished in {:.2f}s\n".format(time.time() - ta))
+        fls.append(fl)
+    fl = pd.concat(fls, axis=0)
+    if drop_fcodes is not None:
+        if np.isscalar(drop_fcodes):
+            keep = fl['FCode'] != drop_fcodes
+        else:
+            keep = ~fl['FCode'].isin(drop_fcodes)
+        fl = fl.loc[keep].copy()
     return fl
diff --git a/sfrmaker/test/conftest.py b/sfrmaker/test/conftest.py
index 67656799..4e630067 100644
--- a/sfrmaker/test/conftest.py
+++ b/sfrmaker/test/conftest.py
@@ -48,13 +48,12 @@ def test_data_path():
 @pytest.fixture(scope="session", autouse=True)
 def outdir(project_root_path):
-    folder = os.path.join(project_root_path, 'sfrmaker/test/temp/')
+    folder = project_root_path / 'sfrmaker/test/temp/'
     reset = True
 
     if reset:
-        if os.path.isdir(folder):
-            shutil.rmtree(folder)
-        os.makedirs(folder)
-    return Path(folder)
+        shutil.rmtree(folder, ignore_errors=True)
+        folder.mkdir()
+    return folder
 
 
 @pytest.fixture(scope="session")
diff --git a/sfrmaker/test/test_nhdplus_utils.py b/sfrmaker/test/test_nhdplus_utils.py
index f56fadc4..6df53e50 100644
--- a/sfrmaker/test/test_nhdplus_utils.py
+++ b/sfrmaker/test/test_nhdplus_utils.py
@@ -1 +1,32 @@
-# TODO: add unit tests for test_nhdplus_utils.py
+"""Unit tests for sfrmaker/nhdplus_utils.py"""
+import pytest
+from sfrmaker.nhdplus_utils import (
+    read_nhdplus_hr
+)
+
+
+@pytest.mark.parametrize('nhdplus_files,bbox_filter,expected_nrows,drop_fcodes', (
+    ('neversink_rondout/NHDPlus_HR_1.gdb',
+     (-74.68061523310072, 41.74556802490179, -74.44311875132166, 41.848862965337595),
+     132, None),
+    ('neversink_rondout/NHDPlus_HR_1.gdb',
+     'examples/neversink_rondout/Model_Extent.shp',
+     75, 55800),
+    (['neversink_rondout/NHDPlus_HR_1.gdb', 'neversink_rondout/NHDPlus_HR_2.gdb'],
+     'examples/neversink_rondout/Model_Extent.shp', 386, None
+     )
+)
+)
+def test_read_nhdplus_hr(nhdplus_files, bbox_filter, expected_nrows, drop_fcodes,
+                         datapath):
+
+    if not isinstance(nhdplus_files, str):
+        NHDPlusHR_paths = [datapath / f for f in nhdplus_files]
+    else:
+        NHDPlusHR_paths = datapath / nhdplus_files
+    results = read_nhdplus_hr(NHDPlusHR_paths, bbox_filter=bbox_filter,
+                              drop_fcodes=drop_fcodes)
+    expected_cols = ['NHDPlusID', 'ArbolateSu', 'StreamOrde', 'MaxElevSmo',
+                     'MinElevSmo', 'Divergence', 'ToNHDPID']
+    assert not set(expected_cols).difference(results.columns)
+    assert len(results) == expected_nrows
\ No newline at end of file
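
Usage sketch (not part of the patch): the snippet below illustrates how the updated
read_nhdplus_hr can be called with a list of geodatabases, mirroring the new test above.
The geodatabase and shapefile paths are hypothetical placeholders; the FCode value is
simply the one used in the test parametrization.

    from sfrmaker.nhdplus_utils import read_nhdplus_hr

    # Any number of NHDPlus HR OpenFileGDBs can now be passed as a list
    # (a single str or pathlib.Path still works). Paths here are hypothetical.
    gdbs = ['NHDPlus_HR_1.gdb', 'NHDPlus_HR_2.gdb']

    flowlines = read_nhdplus_hr(
        gdbs,
        # bbox_filter may be a (xmin, ymin, xmax, ymax) tuple in the flowline CRS,
        # or a polygon shapefile that is converted to a bbox in that CRS via get_bbox()
        bbox_filter='Model_Extent.shp',
        # drop flowlines with this FCode (a sequence of FCodes also works)
        drop_fcodes=55800,
    )

    # One DataFrame concatenated across geodatabases, with NHDPlusFlowlineVAA
    # attributes and ToNHDPID routing joined on NHDPlusID.
    print(len(flowlines))
    print(flowlines[['NHDPlusID', 'ToNHDPID', 'StreamOrde']].head())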