refactor: use strings universally as datatype for line identifiers, t…

…o avoid confusion withfloats vs ints and issues with long identifiers such as those in NHDPlus HR that can't be represented as integers in 32-bit contexts
aleaf · Oct 17, 2024 · 11f05dd · 11f05dd
1 parent 22d4f0b
commit 11f05dd
Show file tree

Hide file tree

Showing 13 changed files with 112 additions and 61 deletions.
diff --git a/sfrmaker/flows.py b/sfrmaker/flows.py
@@ -5,6 +5,7 @@
 import warnings
 import numpy as np
 import pandas as pd
+import geopandas as gpd
 from shapely.geometry import box
 import flopy
 from sfrmaker.routing import find_path, make_graph
@@ -68,7 +69,7 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data,
 
     # parent and inset reach data
     if isinstance(parent_reach_data, str) or isinstance(parent_reach_data, Path):
-        prd = shp2df(parent_reach_data)
+        prd = gpd.read_file(parent_reach_data)
     elif isinstance(parent_reach_data, pd.DataFrame):
         prd = parent_reach_data.copy()
     else:
@@ -78,10 +79,11 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data,
         prd['ireach'] = 1
     mustinclude_cols = {'line_id', 'rno', 'iseg', 'ireach', 'geometry'}
     assert len(mustinclude_cols.intersection(prd.columns)) == len(mustinclude_cols)
-
+    prd['line_id'] = prd['line_id'].astype(int).astype(str)
+
     if isinstance(inset_reach_data, str) or isinstance(inset_reach_data, Path):
         if inset_reach_data.endswith('.shp'):
-            ird = shp2df(inset_reach_data)
+            ird = gpd.read_file(inset_reach_data)
         else:
             ird = pd.read_csv(inset_reach_data)
     elif isinstance(inset_reach_data, pd.DataFrame):
@@ -93,7 +95,8 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data,
         ird['ireach'] = 1
     mustinclude_cols = {'line_id', 'rno', 'iseg', 'ireach'}
     assert len(mustinclude_cols.intersection(ird.columns)) == len(mustinclude_cols)
-
+    ird['line_id'] = ird['line_id'].astype(int).astype(str)
+
     graph = make_graph(ird.rno.values, ird.outreach.values, one_to_many=False)
 
     # cull parent reach data to only lines that cross or are just upstream of inset boundary
@@ -267,14 +270,19 @@ def add_to_perioddata(sfrdata, data, flowline_routing=None,
 
     # cull data to valid periods
     data = data.loc[data[period_column] >= 0].copy()
+
+    # convert line IDs to strings
+    if line_id_column in data.columns:
+        data[line_id_column] = data[line_id_column].astype(int).astype(str)
 
     # map NHDPlus COMIDs to reach numbers
     if flowline_routing is not None:
         assert line_id_column in data.columns, \
             "Data need an id column so {} locations can be mapped to reach numbers".format(variable)
         # replace ids that are not keys (those outside the network) with zeros
         # (0 is the exit condition for finding paths in get_next_id_in_subset)
-        flowline_routing = {k: v if v in flowline_routing.keys() else 0 
+        # also force all IDs to strings
+        flowline_routing = {str(int(k)): str(int(v)) if v in flowline_routing.keys() else '0' 
                             for k, v in flowline_routing.items()}
         rno_column = 'rno'
         r1 = sfrd.reach_data.loc[sfrd.reach_data.ireach == 1]

diff --git a/sfrmaker/lines.py b/sfrmaker/lines.py
@@ -141,13 +141,14 @@ def geometry_length_units(self):
                        'foot': 'feet',
                        'meters': 'meters',
                        'metre': 'meters'}
-        if self.crs.is_geographic:
-            raise ValueError('Flowline geometries need to be in a '
-                             'projected coordinate system; supply a model grid '
-                             'with a projected CRS to the Lines.to_sfr() method or '
-                             'run Lines.to_crs() to reproject the flowlines.'
-                             )
-        self._geometry_length_units = valid_units.get(self.crs.axis_info[0].unit_name)
+        if self.crs is not None:
+            if self.crs.is_geographic:
+                raise ValueError('Flowline geometries need to be in a '
+                                'projected coordinate system; supply a model grid '
+                                'with a projected CRS to the Lines.to_sfr() method or '
+                                'run Lines.to_crs() to reproject the flowlines.'
+                                )
+            self._geometry_length_units = valid_units.get(self.crs.axis_info[0].unit_name)
         if self._geometry_length_units is None:
             print("Warning: No length units specified in CRS for input LineStrings "
                   "or length units not recognized"
@@ -195,10 +196,10 @@ def routing(self):
                     routing = pick_toids(routing, self.elevup)
                 # set toids not in routing dataset to 0
                 # (outlet condition)
-                routing = {k: v if v in routing.keys() else 0 
+                routing = {k: v if v in routing.keys() else '0' 
                             for k, v in routing.items()}
             else:
-                routing = {self.df.id.values[0]: 0}
+                routing = {self.df.id.values[0]: '0'}
             self._routing = routing
             self._last_routing_dict_update = time.time()
         return self._routing
@@ -569,7 +570,7 @@ def from_shapefile(cls, shapefile,
         df = gpd.read_file(shapefile, bbox=bbox_filter)
         assert 'geometry' in df.columns, "No feature geometries found in {}.".format(shapefile)
 
-        return cls.from_dataframe(df,
+        return cls.from_dataframe(df,   
                                   id_column=id_column,
                                   routing_column=routing_column,
                                   arbolate_sum_column2=arbolate_sum_column2,
@@ -697,6 +698,14 @@ def from_dataframe(cls, df,
         to_drop = set(rename_cols.values()).intersection(df.columns)
         df.drop(to_drop, axis=1, inplace=True)
         df.rename(columns=rename_cols, inplace=True)
+
+        # convert IDs to strings
+        df['id'] = df['id'].astype(int).astype(str)
+        # if reading from NHDPlus, to-ids may already be in lists
+        if np.isscalar(df['toid'].values[0]):
+            df['toid'] = df['toid'].astype(int).astype(str)
+        else:
+            df['toid'] = [[str(int(toid)) for toid in toids] for toids in df['toid']]
 
         column_order = ['id', 'toid',
                         'asum1', 'asum2',
@@ -1078,13 +1087,14 @@ def to_sfr(self, grid=None,
         groups = lengths.groupby('line_id')  # fragments grouped by parent line
 
         reach_cumsums = []
-        ordered_ids = rd.line_id.loc[rd.line_id.diff() != 0].values
-        for id in ordered_ids:
-            grp = groups.get_group(id).sort_values(by='ireach')
+        #ordered_ids = rd.line_id.loc[rd.line_id.diff() != 0].values
+        ordered_ids = lengths['line_id'].unique()
+        for line_id in ordered_ids:
+            grp = groups.get_group(line_id).sort_values(by='ireach')
             dist = np.cumsum(grp.rchlen.values) - 0.5 * grp.rchlen.values
             reach_cumsums.append(dist)
         reach_cumsums = np.concatenate(reach_cumsums)
-        segment_asums = [asum1s[id] for id in lengths.line_id]
+        segment_asums = [asum1s[line_id] for line_id in lengths.line_id]
         reach_asums = segment_asums + reach_cumsums *\
             convert_length_units(self.geometry_length_units, self.asum_units)
         # maintain positive asums; lengths in NHD often aren't exactly equal to feature lengths

diff --git a/sfrmaker/nhdplus_utils.py b/sfrmaker/nhdplus_utils.py
@@ -167,7 +167,7 @@ def load_nhdplus_v2(NHDPlus_paths=None,
     # read flowlines and attribute tables into dataframes
     fl = read_nhdplus(NHDFlowlines, bbox_filter=bbox_filter)
     pfvaa = read_nhdplus(PlusFlowlineVAA)
-    pf = shp2df(PlusFlow)
+    pf = read_nhdplus(PlusFlow)
     elevs = read_nhdplus(elevslope)
 
     # join flowline and attribute dataframes
@@ -195,7 +195,7 @@ def get_tocomids(pf, fromcomid_list):
     # subset PlusFlow entries for comids that are not in fromcomid_list
     # comids may be missing because they are outside of the model
     # or if the fromcomid_list dataset was edited (resulting in breaks in the routing)
-    missing_tocomids = ~pf.TOCOMID.isin(comids) & (pf.TOCOMID != 0)
+    missing_tocomids = ~pf.TOCOMID.isin(comids) & (pf.TOCOMID != '0')
     missing = pf.loc[missing_tocomids, ['FROMCOMID', 'TOCOMID']].copy()
     # recursively crawl the PlusFlow table
     # to try to find a downstream comid that is in fromcomid_list
@@ -205,7 +205,7 @@ def get_tocomids(pf, fromcomid_list):
 
     # set any remaining comids not in fromcomid_list to zero
     # (outlets or inlets from outside model)
-    pf.loc[~pf.FROMCOMID.isin(comids), 'FROMCOMID'] = 0
+    pf.loc[~pf.FROMCOMID.isin(comids), 'FROMCOMID'] = '0'
     tocomid = pf.TOCOMID.values
     fromcomid = pf.FROMCOMID.values
     tocomids = [tocomid[fromcomid == c].tolist() for c in comids]
@@ -228,7 +228,7 @@ def find_next_comid(comid, pftable, comids, max_levels=10):
             # (often these will be in different levelpaths,
             # so there is no way to determine a preferred routing path)
             return list(set(nextocomid).intersection(comids))[0]
-    return 0
+    return '0'
 
 
 def read_nhdplus(shapefiles, bbox_filter=None,

diff --git a/sfrmaker/observations.py b/sfrmaker/observations.py
@@ -123,6 +123,7 @@ def add_observations(sfrdata, data, flowline_routing=None,
 
         # get reach number from site locations in source hydrography (line_ids)
         elif line_id_column in data.columns:
+            data[line_id_column] = data[line_id_column].astype(int).astype(str)
             # map NHDPlus COMIDs to reach numbers
             if flowline_routing is None:
                 line_id = dict(zip(reach_data.iseg, reach_data.line_id))

diff --git a/sfrmaker/routing.py b/sfrmaker/routing.py
@@ -98,7 +98,7 @@ def get_upsegs(graph_r, seg):
     return all_upsegs
 
 
-def find_path(graph, start, end=0, limit=None):
+def find_path(graph, start, end='0', limit=None):
     """Get a path through the routing network,
     from a segment to an outlet.
 
@@ -122,11 +122,11 @@ def find_path(graph, start, end=0, limit=None):
     if limit is None:
         limit = len(graph)
     path = [start]
-    next = start
+    next_id = start
     for i in range(limit):
-        next = graph[next]
-        path.append(next)
-        if next == end:
+        next_id = graph[next_id]
+        path.append(next_id)
+        if str(next_id) == str(end):
             break
     return path
 
@@ -299,7 +299,7 @@ def get_next_id_in_subset(subset, routing, ids):
     ids : revised list of first values downstream of the values in ids (determined by routing)
         that are also in subset.
     """
-    subset = set(subset).union({0})
+    subset = set(subset).union({'0'})
     routing = routing.copy()
     if np.isscalar(ids):
         ids = [ids]

diff --git a/sfrmaker/sfrdata.py b/sfrmaker/sfrdata.py
@@ -110,7 +110,7 @@ class SFRData(DataPackage):
               'thts2', 'thti2', 'eps2', 'uhc2']
 
     dtypes = {'rno': int, 'node': int, 'k': int, 'i': int, 'j': int,
-              'iseg': int, 'ireach': int, 'outreach': int, 'line_id': 'int64',
+              'iseg': int, 'ireach': int, 'outreach': int, 'line_id': 'object',
               'per': int, 'nseg': int, 'icalc': int, 'outseg': int,
               'iupseg': int, 'iprior': int, 'nstrpts': int,
               'name': object, 'geometry': object}
@@ -1315,6 +1315,8 @@ def from_yaml(cls, config_file, package_name=None, output_path=None,
             # resample inflows to model stress periods
             inflows_input['id_column'] = inflows_input['line_id_column']
             inflows_by_stress_period = pd.read_csv(inflows_input['filename'])
+            inflows_by_stress_period[inflows_input['id_column']] =\
+                inflows_by_stress_period[inflows_input['id_column']].astype(int).astype(str)
 
             # check if all inflow sites are included in sfr network
             missing_sites = set(inflows_by_stress_period[inflows_input['id_column']]). \
@@ -1381,7 +1383,7 @@ def to_riv(self, segments=None, rno=None, line_ids=None, drop_in_sfr=True):
         rno : sequence of ints
             Convert the listed reach numbers (nro), and all downstream reaches,
             to the RIV package. If None, all reaches are converted (default).
-        line_ids : sequence of ints
+        line_ids : sequence of ints or strings
             Convert the segments corresponding to the line_ids 
             (unique identifiers for LineString features in the source hydrography), 
             and all downstream segments to the RIV package. If None, all reaches 
@@ -1409,6 +1411,7 @@ def to_riv(self, segments=None, rno=None, line_ids=None, drop_in_sfr=True):
         if line_ids is not None:
             if np.isscalar(line_ids):
                 line_ids = [line_ids]
+            line_ids = [str(lid) for lid in line_ids]
             loc = loc & self.reach_data.line_id.isin(line_ids)
         if rno is not None:
             if np.isscalar(rno):

diff --git a/sfrmaker/test/conftest.py b/sfrmaker/test/conftest.py
@@ -261,9 +261,10 @@ def lines_from_shapefile(test_data_path):
 def shellmound_sfrdata(shellmound_model, lines_from_shapefile,
                        shellmound_grid):
     m = shellmound_model
+    lines = copy.deepcopy(lines_from_shapefile)
     # from the lines and StructuredGrid instances, make a sfrmaker.sfrdata instance
     # (lines are intersected with the model grid and converted to reaches, etc.)
-    sfrdata = lines_from_shapefile.to_sfr(grid=shellmound_grid,
+    sfrdata = lines.to_sfr(grid=shellmound_grid,
                                           model=m)
     return sfrdata
 

diff --git a/sfrmaker/test/test_flows.py b/sfrmaker/test/test_flows.py
@@ -39,14 +39,14 @@ def test_get_inflow_locations_from_parent_model(outdir, shellmound_grid, shellmo
                                                 )
     shellmound_sfrdata.export_cells('{}/shellmound_sfr_cells.shp'.format(outdir))
     shellmound_sfrdata.export_lines('{}/shellmound_sfr_lines.shp'.format(outdir))
-    expected_line_ids = {1000005,  # yazoo
-                         18046670,  # Tallahatchie River
-                         938030409,  # unnamed stream along southern boundary
-                         17955337,  # unnamed stream in NW corner
-                         17955371,  # downstream of "Wild Bill Bayou"
-                         17955445,  # unnamed stream just east of 17955281
-                         17958187,  # big sunflower river
-                         17956547,  # unnamed trib to BSR on west side
+    expected_line_ids = {'1000005',  # yazoo
+                         '18046670',  # Tallahatchie River
+                         '938030409',  # unnamed stream along southern boundary
+                         '17955337',  # unnamed stream in NW corner
+                         '17955371',  # downstream of "Wild Bill Bayou"
+                         '17955445',  # unnamed stream just east of 17955281
+                         '17958187',  # big sunflower river
+                         '17956547',  # unnamed trib to BSR on west side
                          }
     symmetric_diff = expected_line_ids ^ set(df.line_id)
     assert len(symmetric_diff) == 0
@@ -95,7 +95,7 @@ def test_add_to_perioddata(shellmound_sfrdata):
     # two inflows applied upstream of model on different paths;
     # inflows should be applied to the first lines in the model 
     # along their respective paths
-    flowline_routing[6] = 1000005
+    flowline_routing['6'] = '1000005'
     flows = pd.DataFrame({'Q_avg': [100., 10., 200., 20.],
                           'per': [0, 1, 0, 1],
                           'line_id': [6, 6, 4, 4]})
@@ -109,7 +109,7 @@ def test_add_to_perioddata(shellmound_sfrdata):
     assert np.allclose(sfrd.period_data['inflow'].sort_values(), flows['Q_avg'].sort_values())
     assert np.allclose(sfrd.period_data.index.get_level_values(0), 
                        sorted(flows['per']))
-    lines = {flowline_routing[6], seq[-1]}
+    lines = {flowline_routing['6'], seq[-1]}
     rnos = rd.loc[rd.line_id.isin(lines), 'rno']
     assert not set(sfrd.period_data.index.levels[1]).difference(rnos)
     assert len(set(sfrd.period_data.index.levels[1])) == 2
@@ -132,17 +132,22 @@ def test_add_to_perioddata(shellmound_sfrdata):
                       data_column='Q_avg',
                       one_inflow_per_path=True)
     assert np.allclose(sfrd.period_data['inflow'].values, [300., 100., 30., 10.])
-    assert np.allclose(sfrd.period_data.index.levels[1], [354, 394])
+    # these reach numbers are liable to change
+    rnos = [
+        sfrd.reach_data.loc[sfrd.reach_data['line_id'] == seq[-1], 'rno'].min(),
+        sfrd.reach_data.loc[sfrd.reach_data['line_id'] == '1000005', 'rno'].min()
+    ]
+    assert np.allclose(sfrd.period_data.index.levels[1], rnos)
 
     # two inflows applied along same path in model;
     # one_inflow_per_path=False
     # inflows should be applied to their respective reaches
 
     flows = pd.DataFrame({'Q_avg': [100., 10., 200., 20.],
                           'per': [0, 1, 0, 1],
-                          'line_id': [17955337, 17955337, 
-                                      flowline_routing[17955337], 
-                                      flowline_routing[17955337]
+                          'line_id': ['17955337', '17955337', 
+                                      flowline_routing['17955337'], 
+                                      flowline_routing['17955337']
                                       ]})
     sfrd._period_data = None
     add_to_perioddata(sfrd, flows,

diff --git a/sfrmaker/test/test_nhdplus_utils.py b/sfrmaker/test/test_nhdplus_utils.py
@@ -1,6 +1,7 @@
 """Unit tests for test_nhdplus_utils.py"""
 import pytest
 from sfrmaker.nhdplus_utils import (
+    load_nhdplus_v2,
     read_nhdplus,
     read_nhdplus_hr
 )
@@ -62,3 +63,22 @@ def test_read_nhdplus(nhdplus_files, bbox_filter, expected_nrows,
     # NHDPlusIDs should be treated as strings
     comid_col = [c for c in results.columns if c.lower() in {'comid', 'froncomid', 'tocomid'}]
     assert all([type(s) == str for s in results[comid_col].astype(int).astype(str)])
+
+
+@pytest.mark.parametrize('kwargs,bbox_filter', (
+    ({'NHDPlus_paths': 'tylerforks/NHDPlus'}, 'tylerforks/active_area.shp'),
+    ({'NHDFlowlines': 'tylerforks/NHDPlus/NHDSnapshot/Hydrography/NHDFlowline.shp',
+      'elevslope': 'tylerforks/NHDPlus/NHDPlusAttributes/elevslope.dbf',
+      'PlusFlowlineVAA': 'tylerforks/NHDPlus/NHDPlusAttributes/PlusFlowlineVAA.dbf',
+      'PlusFlow': 'tylerforks/NHDPlus/NHDPlusAttributes/PlusFlow.dbf'
+      }, 'tylerforks/active_area.shp')
+))
+def test_load_nhdplus_v2(kwargs, bbox_filter, datapath):
+    kwargs = {k: datapath / v for k, v in kwargs.items()}
+    if isinstance(bbox_filter, str):
+        bbox_filter = datapath / bbox_filter
+    results = load_nhdplus_v2(**kwargs, bbox_filter=bbox_filter)
+    # all comids and tocomids should be strings
+    assert all([type(s) == str for s in results['COMID'].astype(int).astype(str)])
+    assert all([type(s) == str for comids in results['COMID'].astype(int).astype(str) 
+         for s in comids ])
diff --git a/sfrmaker/test/test_observations.py b/sfrmaker/test/test_observations.py
@@ -117,7 +117,7 @@ def test_add_observations_from_line_ids(shellmound_sfrdata, flux_observation_dat
     # get the last reach in each segment
     rd = shellmound_sfrdata.reach_data.sort_values(by=['iseg', 'ireach'], axis=0).groupby('iseg').last()
     rno = dict(zip(rd.line_id, rd.rno))
-    assert set(obs.rno) == set([rno[lid] for lid in flux_observation_data.line_id])
+    assert set(obs.rno) == set([rno[str(lid)] for lid in flux_observation_data.line_id])
     rd = shellmound_sfrdata.reach_data
     iseg_ireach = dict(list(zip(rd.rno, zip(rd.iseg, rd.ireach))))
     for i, r in obs.iterrows():

diff --git a/sfrmaker/test/test_preprocessing.py b/sfrmaker/test/test_preprocessing.py
@@ -423,6 +423,6 @@ def test_preprocess_nhdplus_hr_waterbodies(project_root_path, outdir):
                                       dest_crs=26905, outfile=outfile)
     df = gpd.read_file(outfile)
     df.crs == 26905
-    assert 'Beaver Lake' in df['GNIS_Name']
+    assert 'Beaver Lake' in df['GNIS_Name'].values
     # the next line shouldn't be there either
     assert 75004400012864 not in df['NHDPlusID'].values