From 11f05dd7fd6540cfd5f561a382c20b311f4fecc3 Mon Sep 17 00:00:00 2001 From: "Leaf, Andrew T" Date: Thu, 17 Oct 2024 14:35:02 -0500 Subject: [PATCH] refactor: use strings universally as datatype for line identifiers, to avoid confusion withfloats vs ints and issues with long identifiers such as those in NHDPlus HR that can't be represented as integers in 32-bit contexts --- sfrmaker/flows.py | 18 ++++++++++---- sfrmaker/lines.py | 38 ++++++++++++++++++----------- sfrmaker/nhdplus_utils.py | 8 +++--- sfrmaker/observations.py | 1 + sfrmaker/routing.py | 12 ++++----- sfrmaker/sfrdata.py | 7 ++++-- sfrmaker/test/conftest.py | 3 ++- sfrmaker/test/test_flows.py | 33 ++++++++++++++----------- sfrmaker/test/test_nhdplus_utils.py | 20 +++++++++++++++ sfrmaker/test/test_observations.py | 2 +- sfrmaker/test/test_preprocessing.py | 2 +- sfrmaker/test/test_routing.py | 27 +++++++++++--------- sfrmaker/utils.py | 2 +- 13 files changed, 112 insertions(+), 61 deletions(-) diff --git a/sfrmaker/flows.py b/sfrmaker/flows.py index 401f283d..499a03f9 100644 --- a/sfrmaker/flows.py +++ b/sfrmaker/flows.py @@ -5,6 +5,7 @@ import warnings import numpy as np import pandas as pd +import geopandas as gpd from shapely.geometry import box import flopy from sfrmaker.routing import find_path, make_graph @@ -68,7 +69,7 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data, # parent and inset reach data if isinstance(parent_reach_data, str) or isinstance(parent_reach_data, Path): - prd = shp2df(parent_reach_data) + prd = gpd.read_file(parent_reach_data) elif isinstance(parent_reach_data, pd.DataFrame): prd = parent_reach_data.copy() else: @@ -78,10 +79,11 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data, prd['ireach'] = 1 mustinclude_cols = {'line_id', 'rno', 'iseg', 'ireach', 'geometry'} assert len(mustinclude_cols.intersection(prd.columns)) == len(mustinclude_cols) - + prd['line_id'] = prd['line_id'].astype(int).astype(str) + if isinstance(inset_reach_data, str) or isinstance(inset_reach_data, Path): if inset_reach_data.endswith('.shp'): - ird = shp2df(inset_reach_data) + ird = gpd.read_file(inset_reach_data) else: ird = pd.read_csv(inset_reach_data) elif isinstance(inset_reach_data, pd.DataFrame): @@ -93,7 +95,8 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data, ird['ireach'] = 1 mustinclude_cols = {'line_id', 'rno', 'iseg', 'ireach'} assert len(mustinclude_cols.intersection(ird.columns)) == len(mustinclude_cols) - + ird['line_id'] = ird['line_id'].astype(int).astype(str) + graph = make_graph(ird.rno.values, ird.outreach.values, one_to_many=False) # cull parent reach data to only lines that cross or are just upstream of inset boundary @@ -267,6 +270,10 @@ def add_to_perioddata(sfrdata, data, flowline_routing=None, # cull data to valid periods data = data.loc[data[period_column] >= 0].copy() + + # convert line IDs to strings + if line_id_column in data.columns: + data[line_id_column] = data[line_id_column].astype(int).astype(str) # map NHDPlus COMIDs to reach numbers if flowline_routing is not None: @@ -274,7 +281,8 @@ def add_to_perioddata(sfrdata, data, flowline_routing=None, "Data need an id column so {} locations can be mapped to reach numbers".format(variable) # replace ids that are not keys (those outside the network) with zeros # (0 is the exit condition for finding paths in get_next_id_in_subset) - flowline_routing = {k: v if v in flowline_routing.keys() else 0 + # also force all IDs to strings + flowline_routing = {str(int(k)): str(int(v)) if v in flowline_routing.keys() else '0' for k, v in flowline_routing.items()} rno_column = 'rno' r1 = sfrd.reach_data.loc[sfrd.reach_data.ireach == 1] diff --git a/sfrmaker/lines.py b/sfrmaker/lines.py index 3e0fc5be..3d11eb79 100644 --- a/sfrmaker/lines.py +++ b/sfrmaker/lines.py @@ -141,13 +141,14 @@ def geometry_length_units(self): 'foot': 'feet', 'meters': 'meters', 'metre': 'meters'} - if self.crs.is_geographic: - raise ValueError('Flowline geometries need to be in a ' - 'projected coordinate system; supply a model grid ' - 'with a projected CRS to the Lines.to_sfr() method or ' - 'run Lines.to_crs() to reproject the flowlines.' - ) - self._geometry_length_units = valid_units.get(self.crs.axis_info[0].unit_name) + if self.crs is not None: + if self.crs.is_geographic: + raise ValueError('Flowline geometries need to be in a ' + 'projected coordinate system; supply a model grid ' + 'with a projected CRS to the Lines.to_sfr() method or ' + 'run Lines.to_crs() to reproject the flowlines.' + ) + self._geometry_length_units = valid_units.get(self.crs.axis_info[0].unit_name) if self._geometry_length_units is None: print("Warning: No length units specified in CRS for input LineStrings " "or length units not recognized" @@ -195,10 +196,10 @@ def routing(self): routing = pick_toids(routing, self.elevup) # set toids not in routing dataset to 0 # (outlet condition) - routing = {k: v if v in routing.keys() else 0 + routing = {k: v if v in routing.keys() else '0' for k, v in routing.items()} else: - routing = {self.df.id.values[0]: 0} + routing = {self.df.id.values[0]: '0'} self._routing = routing self._last_routing_dict_update = time.time() return self._routing @@ -569,7 +570,7 @@ def from_shapefile(cls, shapefile, df = gpd.read_file(shapefile, bbox=bbox_filter) assert 'geometry' in df.columns, "No feature geometries found in {}.".format(shapefile) - return cls.from_dataframe(df, + return cls.from_dataframe(df, id_column=id_column, routing_column=routing_column, arbolate_sum_column2=arbolate_sum_column2, @@ -697,6 +698,14 @@ def from_dataframe(cls, df, to_drop = set(rename_cols.values()).intersection(df.columns) df.drop(to_drop, axis=1, inplace=True) df.rename(columns=rename_cols, inplace=True) + + # convert IDs to strings + df['id'] = df['id'].astype(int).astype(str) + # if reading from NHDPlus, to-ids may already be in lists + if np.isscalar(df['toid'].values[0]): + df['toid'] = df['toid'].astype(int).astype(str) + else: + df['toid'] = [[str(int(toid)) for toid in toids] for toids in df['toid']] column_order = ['id', 'toid', 'asum1', 'asum2', @@ -1078,13 +1087,14 @@ def to_sfr(self, grid=None, groups = lengths.groupby('line_id') # fragments grouped by parent line reach_cumsums = [] - ordered_ids = rd.line_id.loc[rd.line_id.diff() != 0].values - for id in ordered_ids: - grp = groups.get_group(id).sort_values(by='ireach') + #ordered_ids = rd.line_id.loc[rd.line_id.diff() != 0].values + ordered_ids = lengths['line_id'].unique() + for line_id in ordered_ids: + grp = groups.get_group(line_id).sort_values(by='ireach') dist = np.cumsum(grp.rchlen.values) - 0.5 * grp.rchlen.values reach_cumsums.append(dist) reach_cumsums = np.concatenate(reach_cumsums) - segment_asums = [asum1s[id] for id in lengths.line_id] + segment_asums = [asum1s[line_id] for line_id in lengths.line_id] reach_asums = segment_asums + reach_cumsums *\ convert_length_units(self.geometry_length_units, self.asum_units) # maintain positive asums; lengths in NHD often aren't exactly equal to feature lengths diff --git a/sfrmaker/nhdplus_utils.py b/sfrmaker/nhdplus_utils.py index e25158a4..72b5bbed 100644 --- a/sfrmaker/nhdplus_utils.py +++ b/sfrmaker/nhdplus_utils.py @@ -167,7 +167,7 @@ def load_nhdplus_v2(NHDPlus_paths=None, # read flowlines and attribute tables into dataframes fl = read_nhdplus(NHDFlowlines, bbox_filter=bbox_filter) pfvaa = read_nhdplus(PlusFlowlineVAA) - pf = shp2df(PlusFlow) + pf = read_nhdplus(PlusFlow) elevs = read_nhdplus(elevslope) # join flowline and attribute dataframes @@ -195,7 +195,7 @@ def get_tocomids(pf, fromcomid_list): # subset PlusFlow entries for comids that are not in fromcomid_list # comids may be missing because they are outside of the model # or if the fromcomid_list dataset was edited (resulting in breaks in the routing) - missing_tocomids = ~pf.TOCOMID.isin(comids) & (pf.TOCOMID != 0) + missing_tocomids = ~pf.TOCOMID.isin(comids) & (pf.TOCOMID != '0') missing = pf.loc[missing_tocomids, ['FROMCOMID', 'TOCOMID']].copy() # recursively crawl the PlusFlow table # to try to find a downstream comid that is in fromcomid_list @@ -205,7 +205,7 @@ def get_tocomids(pf, fromcomid_list): # set any remaining comids not in fromcomid_list to zero # (outlets or inlets from outside model) - pf.loc[~pf.FROMCOMID.isin(comids), 'FROMCOMID'] = 0 + pf.loc[~pf.FROMCOMID.isin(comids), 'FROMCOMID'] = '0' tocomid = pf.TOCOMID.values fromcomid = pf.FROMCOMID.values tocomids = [tocomid[fromcomid == c].tolist() for c in comids] @@ -228,7 +228,7 @@ def find_next_comid(comid, pftable, comids, max_levels=10): # (often these will be in different levelpaths, # so there is no way to determine a preferred routing path) return list(set(nextocomid).intersection(comids))[0] - return 0 + return '0' def read_nhdplus(shapefiles, bbox_filter=None, diff --git a/sfrmaker/observations.py b/sfrmaker/observations.py index 72d74686..f943dc9a 100644 --- a/sfrmaker/observations.py +++ b/sfrmaker/observations.py @@ -123,6 +123,7 @@ def add_observations(sfrdata, data, flowline_routing=None, # get reach number from site locations in source hydrography (line_ids) elif line_id_column in data.columns: + data[line_id_column] = data[line_id_column].astype(int).astype(str) # map NHDPlus COMIDs to reach numbers if flowline_routing is None: line_id = dict(zip(reach_data.iseg, reach_data.line_id)) diff --git a/sfrmaker/routing.py b/sfrmaker/routing.py index 6574aa20..3420ba42 100644 --- a/sfrmaker/routing.py +++ b/sfrmaker/routing.py @@ -98,7 +98,7 @@ def get_upsegs(graph_r, seg): return all_upsegs -def find_path(graph, start, end=0, limit=None): +def find_path(graph, start, end='0', limit=None): """Get a path through the routing network, from a segment to an outlet. @@ -122,11 +122,11 @@ def find_path(graph, start, end=0, limit=None): if limit is None: limit = len(graph) path = [start] - next = start + next_id = start for i in range(limit): - next = graph[next] - path.append(next) - if next == end: + next_id = graph[next_id] + path.append(next_id) + if str(next_id) == str(end): break return path @@ -299,7 +299,7 @@ def get_next_id_in_subset(subset, routing, ids): ids : revised list of first values downstream of the values in ids (determined by routing) that are also in subset. """ - subset = set(subset).union({0}) + subset = set(subset).union({'0'}) routing = routing.copy() if np.isscalar(ids): ids = [ids] diff --git a/sfrmaker/sfrdata.py b/sfrmaker/sfrdata.py index a4003601..8413a895 100644 --- a/sfrmaker/sfrdata.py +++ b/sfrmaker/sfrdata.py @@ -110,7 +110,7 @@ class SFRData(DataPackage): 'thts2', 'thti2', 'eps2', 'uhc2'] dtypes = {'rno': int, 'node': int, 'k': int, 'i': int, 'j': int, - 'iseg': int, 'ireach': int, 'outreach': int, 'line_id': 'int64', + 'iseg': int, 'ireach': int, 'outreach': int, 'line_id': 'object', 'per': int, 'nseg': int, 'icalc': int, 'outseg': int, 'iupseg': int, 'iprior': int, 'nstrpts': int, 'name': object, 'geometry': object} @@ -1315,6 +1315,8 @@ def from_yaml(cls, config_file, package_name=None, output_path=None, # resample inflows to model stress periods inflows_input['id_column'] = inflows_input['line_id_column'] inflows_by_stress_period = pd.read_csv(inflows_input['filename']) + inflows_by_stress_period[inflows_input['id_column']] =\ + inflows_by_stress_period[inflows_input['id_column']].astype(int).astype(str) # check if all inflow sites are included in sfr network missing_sites = set(inflows_by_stress_period[inflows_input['id_column']]). \ @@ -1381,7 +1383,7 @@ def to_riv(self, segments=None, rno=None, line_ids=None, drop_in_sfr=True): rno : sequence of ints Convert the listed reach numbers (nro), and all downstream reaches, to the RIV package. If None, all reaches are converted (default). - line_ids : sequence of ints + line_ids : sequence of ints or strings Convert the segments corresponding to the line_ids (unique identifiers for LineString features in the source hydrography), and all downstream segments to the RIV package. If None, all reaches @@ -1409,6 +1411,7 @@ def to_riv(self, segments=None, rno=None, line_ids=None, drop_in_sfr=True): if line_ids is not None: if np.isscalar(line_ids): line_ids = [line_ids] + line_ids = [str(lid) for lid in line_ids] loc = loc & self.reach_data.line_id.isin(line_ids) if rno is not None: if np.isscalar(rno): diff --git a/sfrmaker/test/conftest.py b/sfrmaker/test/conftest.py index 4e630067..e8bebf35 100644 --- a/sfrmaker/test/conftest.py +++ b/sfrmaker/test/conftest.py @@ -261,9 +261,10 @@ def lines_from_shapefile(test_data_path): def shellmound_sfrdata(shellmound_model, lines_from_shapefile, shellmound_grid): m = shellmound_model + lines = copy.deepcopy(lines_from_shapefile) # from the lines and StructuredGrid instances, make a sfrmaker.sfrdata instance # (lines are intersected with the model grid and converted to reaches, etc.) - sfrdata = lines_from_shapefile.to_sfr(grid=shellmound_grid, + sfrdata = lines.to_sfr(grid=shellmound_grid, model=m) return sfrdata diff --git a/sfrmaker/test/test_flows.py b/sfrmaker/test/test_flows.py index 2f6a1946..ed66e916 100644 --- a/sfrmaker/test/test_flows.py +++ b/sfrmaker/test/test_flows.py @@ -39,14 +39,14 @@ def test_get_inflow_locations_from_parent_model(outdir, shellmound_grid, shellmo ) shellmound_sfrdata.export_cells('{}/shellmound_sfr_cells.shp'.format(outdir)) shellmound_sfrdata.export_lines('{}/shellmound_sfr_lines.shp'.format(outdir)) - expected_line_ids = {1000005, # yazoo - 18046670, # Tallahatchie River - 938030409, # unnamed stream along southern boundary - 17955337, # unnamed stream in NW corner - 17955371, # downstream of "Wild Bill Bayou" - 17955445, # unnamed stream just east of 17955281 - 17958187, # big sunflower river - 17956547, # unnamed trib to BSR on west side + expected_line_ids = {'1000005', # yazoo + '18046670', # Tallahatchie River + '938030409', # unnamed stream along southern boundary + '17955337', # unnamed stream in NW corner + '17955371', # downstream of "Wild Bill Bayou" + '17955445', # unnamed stream just east of 17955281 + '17958187', # big sunflower river + '17956547', # unnamed trib to BSR on west side } symmetric_diff = expected_line_ids ^ set(df.line_id) assert len(symmetric_diff) == 0 @@ -95,7 +95,7 @@ def test_add_to_perioddata(shellmound_sfrdata): # two inflows applied upstream of model on different paths; # inflows should be applied to the first lines in the model # along their respective paths - flowline_routing[6] = 1000005 + flowline_routing['6'] = '1000005' flows = pd.DataFrame({'Q_avg': [100., 10., 200., 20.], 'per': [0, 1, 0, 1], 'line_id': [6, 6, 4, 4]}) @@ -109,7 +109,7 @@ def test_add_to_perioddata(shellmound_sfrdata): assert np.allclose(sfrd.period_data['inflow'].sort_values(), flows['Q_avg'].sort_values()) assert np.allclose(sfrd.period_data.index.get_level_values(0), sorted(flows['per'])) - lines = {flowline_routing[6], seq[-1]} + lines = {flowline_routing['6'], seq[-1]} rnos = rd.loc[rd.line_id.isin(lines), 'rno'] assert not set(sfrd.period_data.index.levels[1]).difference(rnos) assert len(set(sfrd.period_data.index.levels[1])) == 2 @@ -132,7 +132,12 @@ def test_add_to_perioddata(shellmound_sfrdata): data_column='Q_avg', one_inflow_per_path=True) assert np.allclose(sfrd.period_data['inflow'].values, [300., 100., 30., 10.]) - assert np.allclose(sfrd.period_data.index.levels[1], [354, 394]) + # these reach numbers are liable to change + rnos = [ + sfrd.reach_data.loc[sfrd.reach_data['line_id'] == seq[-1], 'rno'].min(), + sfrd.reach_data.loc[sfrd.reach_data['line_id'] == '1000005', 'rno'].min() + ] + assert np.allclose(sfrd.period_data.index.levels[1], rnos) # two inflows applied along same path in model; # one_inflow_per_path=False @@ -140,9 +145,9 @@ def test_add_to_perioddata(shellmound_sfrdata): flows = pd.DataFrame({'Q_avg': [100., 10., 200., 20.], 'per': [0, 1, 0, 1], - 'line_id': [17955337, 17955337, - flowline_routing[17955337], - flowline_routing[17955337] + 'line_id': ['17955337', '17955337', + flowline_routing['17955337'], + flowline_routing['17955337'] ]}) sfrd._period_data = None add_to_perioddata(sfrd, flows, diff --git a/sfrmaker/test/test_nhdplus_utils.py b/sfrmaker/test/test_nhdplus_utils.py index 36f4f905..b4b8d16e 100644 --- a/sfrmaker/test/test_nhdplus_utils.py +++ b/sfrmaker/test/test_nhdplus_utils.py @@ -1,6 +1,7 @@ """Unit tests for test_nhdplus_utils.py""" import pytest from sfrmaker.nhdplus_utils import ( + load_nhdplus_v2, read_nhdplus, read_nhdplus_hr ) @@ -62,3 +63,22 @@ def test_read_nhdplus(nhdplus_files, bbox_filter, expected_nrows, # NHDPlusIDs should be treated as strings comid_col = [c for c in results.columns if c.lower() in {'comid', 'froncomid', 'tocomid'}] assert all([type(s) == str for s in results[comid_col].astype(int).astype(str)]) + + +@pytest.mark.parametrize('kwargs,bbox_filter', ( + ({'NHDPlus_paths': 'tylerforks/NHDPlus'}, 'tylerforks/active_area.shp'), + ({'NHDFlowlines': 'tylerforks/NHDPlus/NHDSnapshot/Hydrography/NHDFlowline.shp', + 'elevslope': 'tylerforks/NHDPlus/NHDPlusAttributes/elevslope.dbf', + 'PlusFlowlineVAA': 'tylerforks/NHDPlus/NHDPlusAttributes/PlusFlowlineVAA.dbf', + 'PlusFlow': 'tylerforks/NHDPlus/NHDPlusAttributes/PlusFlow.dbf' + }, 'tylerforks/active_area.shp') +)) +def test_load_nhdplus_v2(kwargs, bbox_filter, datapath): + kwargs = {k: datapath / v for k, v in kwargs.items()} + if isinstance(bbox_filter, str): + bbox_filter = datapath / bbox_filter + results = load_nhdplus_v2(**kwargs, bbox_filter=bbox_filter) + # all comids and tocomids should be strings + assert all([type(s) == str for s in results['COMID'].astype(int).astype(str)]) + assert all([type(s) == str for comids in results['COMID'].astype(int).astype(str) + for s in comids ]) diff --git a/sfrmaker/test/test_observations.py b/sfrmaker/test/test_observations.py index 814080d5..6cd6bead 100644 --- a/sfrmaker/test/test_observations.py +++ b/sfrmaker/test/test_observations.py @@ -117,7 +117,7 @@ def test_add_observations_from_line_ids(shellmound_sfrdata, flux_observation_dat # get the last reach in each segment rd = shellmound_sfrdata.reach_data.sort_values(by=['iseg', 'ireach'], axis=0).groupby('iseg').last() rno = dict(zip(rd.line_id, rd.rno)) - assert set(obs.rno) == set([rno[lid] for lid in flux_observation_data.line_id]) + assert set(obs.rno) == set([rno[str(lid)] for lid in flux_observation_data.line_id]) rd = shellmound_sfrdata.reach_data iseg_ireach = dict(list(zip(rd.rno, zip(rd.iseg, rd.ireach)))) for i, r in obs.iterrows(): diff --git a/sfrmaker/test/test_preprocessing.py b/sfrmaker/test/test_preprocessing.py index 218cf08c..16070b65 100644 --- a/sfrmaker/test/test_preprocessing.py +++ b/sfrmaker/test/test_preprocessing.py @@ -423,6 +423,6 @@ def test_preprocess_nhdplus_hr_waterbodies(project_root_path, outdir): dest_crs=26905, outfile=outfile) df = gpd.read_file(outfile) df.crs == 26905 - assert 'Beaver Lake' in df['GNIS_Name'] + assert 'Beaver Lake' in df['GNIS_Name'].values # the next line shouldn't be there either assert 75004400012864 not in df['NHDPlusID'].values diff --git a/sfrmaker/test/test_routing.py b/sfrmaker/test/test_routing.py index bfb0baec..20d60274 100644 --- a/sfrmaker/test/test_routing.py +++ b/sfrmaker/test/test_routing.py @@ -6,21 +6,23 @@ get_previous_ids_in_subset) -def add_line_sequence(routing, nlines=4): +def add_line_sequence(routing, nlines=4, string_ids=False): headwater_line_ids = set(routing.keys()).difference(routing.values()) # add in some lines upstream of a headwater n = nlines new_lines = set(range(n + 2)).difference(routing) new_routing = {} - id = new_lines.pop() - sequence = [id] + lid = new_lines.pop() + sequence = [lid] for i in range(n): nextid = new_lines.pop() - new_routing[id] = nextid - id = nextid + new_routing[lid] = nextid + lid = nextid sequence.append(nextid) end = headwater_line_ids.pop() - new_routing[id] = end + new_routing[lid] = end + if string_ids: + new_routing = {str(k): str(v) for k, v in new_routing.items()} sequence.append(end) routing.update(new_routing) return sequence @@ -32,15 +34,16 @@ def test_get_next_id_in_subset(shellmound_sfrdata): sfr_routing = shellmound_sfrdata.segment_routing.copy() # routing for source hydrography - routing = {line_id.get(k, 0): line_id.get(v, 0) + routing = {str(line_id.get(k, 0)): line_id.get(v, '0') for k, v in sfr_routing.items()} nlines = 4 - seq = add_line_sequence(routing, nlines=nlines) + seq = add_line_sequence(routing, nlines=nlines, string_ids=True) - result = get_next_id_in_subset(rd.line_id, routing, set(range(nlines + 2))) - assert set(result).difference(rd.line_id) == {0} - assert set(result) == {0, seq[-1]} - assert len(result) == len(range(nlines + 2)) + ids = {str(lid) for lid in set(range(nlines + 1))} + result = get_next_id_in_subset(rd.line_id, routing, ids) + assert set(result).difference(rd.line_id) == {'0'} + assert set(result) == {'0', seq[-1]} + assert len(result) == len(range(nlines + 1)) def test_get_previous_ids_in_subset(): diff --git a/sfrmaker/utils.py b/sfrmaker/utils.py index 379ee597..c8ee60f5 100644 --- a/sfrmaker/utils.py +++ b/sfrmaker/utils.py @@ -352,7 +352,7 @@ def width_from_arbolate_sum(asum, a=0.1193, b=0.5032, minimum_width=1., input_un Original equation from Feinstein et al (2010), for arbolate sum of 1,000 km: >>> width = width_from_arbolate_sum(1000, 0.1193, 0.5032, input_units='kilometers', output_units='feet') >>> round(width, 2) - np.float64(124.69) + 124.69 """ if not np.isscalar(asum): asum = np.atleast_1d(np.squeeze(asum))