Skip to content

Commit

Permalink
refactor: use strings universally as datatype for line identifiers, t…
Browse files Browse the repository at this point in the history
…o avoid confusion withfloats vs ints and issues with long identifiers such as those in NHDPlus HR that can't be represented as integers in 32-bit contexts
  • Loading branch information
aleaf committed Oct 17, 2024
1 parent 22d4f0b commit 11f05dd
Show file tree
Hide file tree
Showing 13 changed files with 112 additions and 61 deletions.
18 changes: 13 additions & 5 deletions sfrmaker/flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import warnings
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import box
import flopy
from sfrmaker.routing import find_path, make_graph
Expand Down Expand Up @@ -68,7 +69,7 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data,

# parent and inset reach data
if isinstance(parent_reach_data, str) or isinstance(parent_reach_data, Path):
prd = shp2df(parent_reach_data)
prd = gpd.read_file(parent_reach_data)
elif isinstance(parent_reach_data, pd.DataFrame):
prd = parent_reach_data.copy()
else:
Expand All @@ -78,10 +79,11 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data,
prd['ireach'] = 1
mustinclude_cols = {'line_id', 'rno', 'iseg', 'ireach', 'geometry'}
assert len(mustinclude_cols.intersection(prd.columns)) == len(mustinclude_cols)

prd['line_id'] = prd['line_id'].astype(int).astype(str)

if isinstance(inset_reach_data, str) or isinstance(inset_reach_data, Path):
if inset_reach_data.endswith('.shp'):
ird = shp2df(inset_reach_data)
ird = gpd.read_file(inset_reach_data)
else:
ird = pd.read_csv(inset_reach_data)
elif isinstance(inset_reach_data, pd.DataFrame):
Expand All @@ -93,7 +95,8 @@ def get_inflow_locations_from_parent_model(parent_reach_data, inset_reach_data,
ird['ireach'] = 1
mustinclude_cols = {'line_id', 'rno', 'iseg', 'ireach'}
assert len(mustinclude_cols.intersection(ird.columns)) == len(mustinclude_cols)

ird['line_id'] = ird['line_id'].astype(int).astype(str)

graph = make_graph(ird.rno.values, ird.outreach.values, one_to_many=False)

# cull parent reach data to only lines that cross or are just upstream of inset boundary
Expand Down Expand Up @@ -267,14 +270,19 @@ def add_to_perioddata(sfrdata, data, flowline_routing=None,

# cull data to valid periods
data = data.loc[data[period_column] >= 0].copy()

# convert line IDs to strings
if line_id_column in data.columns:
data[line_id_column] = data[line_id_column].astype(int).astype(str)

# map NHDPlus COMIDs to reach numbers
if flowline_routing is not None:
assert line_id_column in data.columns, \
"Data need an id column so {} locations can be mapped to reach numbers".format(variable)
# replace ids that are not keys (those outside the network) with zeros
# (0 is the exit condition for finding paths in get_next_id_in_subset)
flowline_routing = {k: v if v in flowline_routing.keys() else 0
# also force all IDs to strings
flowline_routing = {str(int(k)): str(int(v)) if v in flowline_routing.keys() else '0'
for k, v in flowline_routing.items()}
rno_column = 'rno'
r1 = sfrd.reach_data.loc[sfrd.reach_data.ireach == 1]
Expand Down
38 changes: 24 additions & 14 deletions sfrmaker/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,14 @@ def geometry_length_units(self):
'foot': 'feet',
'meters': 'meters',
'metre': 'meters'}
if self.crs.is_geographic:
raise ValueError('Flowline geometries need to be in a '
'projected coordinate system; supply a model grid '
'with a projected CRS to the Lines.to_sfr() method or '
'run Lines.to_crs() to reproject the flowlines.'
)
self._geometry_length_units = valid_units.get(self.crs.axis_info[0].unit_name)
if self.crs is not None:
if self.crs.is_geographic:
raise ValueError('Flowline geometries need to be in a '
'projected coordinate system; supply a model grid '
'with a projected CRS to the Lines.to_sfr() method or '
'run Lines.to_crs() to reproject the flowlines.'
)
self._geometry_length_units = valid_units.get(self.crs.axis_info[0].unit_name)
if self._geometry_length_units is None:
print("Warning: No length units specified in CRS for input LineStrings "
"or length units not recognized"
Expand Down Expand Up @@ -195,10 +196,10 @@ def routing(self):
routing = pick_toids(routing, self.elevup)
# set toids not in routing dataset to 0
# (outlet condition)
routing = {k: v if v in routing.keys() else 0
routing = {k: v if v in routing.keys() else '0'
for k, v in routing.items()}
else:
routing = {self.df.id.values[0]: 0}
routing = {self.df.id.values[0]: '0'}
self._routing = routing
self._last_routing_dict_update = time.time()
return self._routing
Expand Down Expand Up @@ -569,7 +570,7 @@ def from_shapefile(cls, shapefile,
df = gpd.read_file(shapefile, bbox=bbox_filter)
assert 'geometry' in df.columns, "No feature geometries found in {}.".format(shapefile)

return cls.from_dataframe(df,
return cls.from_dataframe(df,
id_column=id_column,
routing_column=routing_column,
arbolate_sum_column2=arbolate_sum_column2,
Expand Down Expand Up @@ -697,6 +698,14 @@ def from_dataframe(cls, df,
to_drop = set(rename_cols.values()).intersection(df.columns)
df.drop(to_drop, axis=1, inplace=True)
df.rename(columns=rename_cols, inplace=True)

# convert IDs to strings
df['id'] = df['id'].astype(int).astype(str)
# if reading from NHDPlus, to-ids may already be in lists
if np.isscalar(df['toid'].values[0]):
df['toid'] = df['toid'].astype(int).astype(str)
else:
df['toid'] = [[str(int(toid)) for toid in toids] for toids in df['toid']]

column_order = ['id', 'toid',
'asum1', 'asum2',
Expand Down Expand Up @@ -1078,13 +1087,14 @@ def to_sfr(self, grid=None,
groups = lengths.groupby('line_id') # fragments grouped by parent line

reach_cumsums = []
ordered_ids = rd.line_id.loc[rd.line_id.diff() != 0].values
for id in ordered_ids:
grp = groups.get_group(id).sort_values(by='ireach')
#ordered_ids = rd.line_id.loc[rd.line_id.diff() != 0].values
ordered_ids = lengths['line_id'].unique()
for line_id in ordered_ids:
grp = groups.get_group(line_id).sort_values(by='ireach')
dist = np.cumsum(grp.rchlen.values) - 0.5 * grp.rchlen.values
reach_cumsums.append(dist)
reach_cumsums = np.concatenate(reach_cumsums)
segment_asums = [asum1s[id] for id in lengths.line_id]
segment_asums = [asum1s[line_id] for line_id in lengths.line_id]
reach_asums = segment_asums + reach_cumsums *\
convert_length_units(self.geometry_length_units, self.asum_units)
# maintain positive asums; lengths in NHD often aren't exactly equal to feature lengths
Expand Down
8 changes: 4 additions & 4 deletions sfrmaker/nhdplus_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def load_nhdplus_v2(NHDPlus_paths=None,
# read flowlines and attribute tables into dataframes
fl = read_nhdplus(NHDFlowlines, bbox_filter=bbox_filter)
pfvaa = read_nhdplus(PlusFlowlineVAA)
pf = shp2df(PlusFlow)
pf = read_nhdplus(PlusFlow)
elevs = read_nhdplus(elevslope)

# join flowline and attribute dataframes
Expand Down Expand Up @@ -195,7 +195,7 @@ def get_tocomids(pf, fromcomid_list):
# subset PlusFlow entries for comids that are not in fromcomid_list
# comids may be missing because they are outside of the model
# or if the fromcomid_list dataset was edited (resulting in breaks in the routing)
missing_tocomids = ~pf.TOCOMID.isin(comids) & (pf.TOCOMID != 0)
missing_tocomids = ~pf.TOCOMID.isin(comids) & (pf.TOCOMID != '0')
missing = pf.loc[missing_tocomids, ['FROMCOMID', 'TOCOMID']].copy()
# recursively crawl the PlusFlow table
# to try to find a downstream comid that is in fromcomid_list
Expand All @@ -205,7 +205,7 @@ def get_tocomids(pf, fromcomid_list):

# set any remaining comids not in fromcomid_list to zero
# (outlets or inlets from outside model)
pf.loc[~pf.FROMCOMID.isin(comids), 'FROMCOMID'] = 0
pf.loc[~pf.FROMCOMID.isin(comids), 'FROMCOMID'] = '0'
tocomid = pf.TOCOMID.values
fromcomid = pf.FROMCOMID.values
tocomids = [tocomid[fromcomid == c].tolist() for c in comids]
Expand All @@ -228,7 +228,7 @@ def find_next_comid(comid, pftable, comids, max_levels=10):
# (often these will be in different levelpaths,
# so there is no way to determine a preferred routing path)
return list(set(nextocomid).intersection(comids))[0]
return 0
return '0'


def read_nhdplus(shapefiles, bbox_filter=None,
Expand Down
1 change: 1 addition & 0 deletions sfrmaker/observations.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def add_observations(sfrdata, data, flowline_routing=None,

# get reach number from site locations in source hydrography (line_ids)
elif line_id_column in data.columns:
data[line_id_column] = data[line_id_column].astype(int).astype(str)
# map NHDPlus COMIDs to reach numbers
if flowline_routing is None:
line_id = dict(zip(reach_data.iseg, reach_data.line_id))
Expand Down
12 changes: 6 additions & 6 deletions sfrmaker/routing.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def get_upsegs(graph_r, seg):
return all_upsegs


def find_path(graph, start, end=0, limit=None):
def find_path(graph, start, end='0', limit=None):
"""Get a path through the routing network,
from a segment to an outlet.
Expand All @@ -122,11 +122,11 @@ def find_path(graph, start, end=0, limit=None):
if limit is None:
limit = len(graph)
path = [start]
next = start
next_id = start
for i in range(limit):
next = graph[next]
path.append(next)
if next == end:
next_id = graph[next_id]
path.append(next_id)
if str(next_id) == str(end):
break
return path

Expand Down Expand Up @@ -299,7 +299,7 @@ def get_next_id_in_subset(subset, routing, ids):
ids : revised list of first values downstream of the values in ids (determined by routing)
that are also in subset.
"""
subset = set(subset).union({0})
subset = set(subset).union({'0'})
routing = routing.copy()
if np.isscalar(ids):
ids = [ids]
Expand Down
7 changes: 5 additions & 2 deletions sfrmaker/sfrdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ class SFRData(DataPackage):
'thts2', 'thti2', 'eps2', 'uhc2']

dtypes = {'rno': int, 'node': int, 'k': int, 'i': int, 'j': int,
'iseg': int, 'ireach': int, 'outreach': int, 'line_id': 'int64',
'iseg': int, 'ireach': int, 'outreach': int, 'line_id': 'object',
'per': int, 'nseg': int, 'icalc': int, 'outseg': int,
'iupseg': int, 'iprior': int, 'nstrpts': int,
'name': object, 'geometry': object}
Expand Down Expand Up @@ -1315,6 +1315,8 @@ def from_yaml(cls, config_file, package_name=None, output_path=None,
# resample inflows to model stress periods
inflows_input['id_column'] = inflows_input['line_id_column']
inflows_by_stress_period = pd.read_csv(inflows_input['filename'])
inflows_by_stress_period[inflows_input['id_column']] =\
inflows_by_stress_period[inflows_input['id_column']].astype(int).astype(str)

# check if all inflow sites are included in sfr network
missing_sites = set(inflows_by_stress_period[inflows_input['id_column']]). \
Expand Down Expand Up @@ -1381,7 +1383,7 @@ def to_riv(self, segments=None, rno=None, line_ids=None, drop_in_sfr=True):
rno : sequence of ints
Convert the listed reach numbers (nro), and all downstream reaches,
to the RIV package. If None, all reaches are converted (default).
line_ids : sequence of ints
line_ids : sequence of ints or strings
Convert the segments corresponding to the line_ids
(unique identifiers for LineString features in the source hydrography),
and all downstream segments to the RIV package. If None, all reaches
Expand Down Expand Up @@ -1409,6 +1411,7 @@ def to_riv(self, segments=None, rno=None, line_ids=None, drop_in_sfr=True):
if line_ids is not None:
if np.isscalar(line_ids):
line_ids = [line_ids]
line_ids = [str(lid) for lid in line_ids]
loc = loc & self.reach_data.line_id.isin(line_ids)
if rno is not None:
if np.isscalar(rno):
Expand Down
3 changes: 2 additions & 1 deletion sfrmaker/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,10 @@ def lines_from_shapefile(test_data_path):
def shellmound_sfrdata(shellmound_model, lines_from_shapefile,
shellmound_grid):
m = shellmound_model
lines = copy.deepcopy(lines_from_shapefile)
# from the lines and StructuredGrid instances, make a sfrmaker.sfrdata instance
# (lines are intersected with the model grid and converted to reaches, etc.)
sfrdata = lines_from_shapefile.to_sfr(grid=shellmound_grid,
sfrdata = lines.to_sfr(grid=shellmound_grid,
model=m)
return sfrdata

Expand Down
33 changes: 19 additions & 14 deletions sfrmaker/test/test_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ def test_get_inflow_locations_from_parent_model(outdir, shellmound_grid, shellmo
)
shellmound_sfrdata.export_cells('{}/shellmound_sfr_cells.shp'.format(outdir))
shellmound_sfrdata.export_lines('{}/shellmound_sfr_lines.shp'.format(outdir))
expected_line_ids = {1000005, # yazoo
18046670, # Tallahatchie River
938030409, # unnamed stream along southern boundary
17955337, # unnamed stream in NW corner
17955371, # downstream of "Wild Bill Bayou"
17955445, # unnamed stream just east of 17955281
17958187, # big sunflower river
17956547, # unnamed trib to BSR on west side
expected_line_ids = {'1000005', # yazoo
'18046670', # Tallahatchie River
'938030409', # unnamed stream along southern boundary
'17955337', # unnamed stream in NW corner
'17955371', # downstream of "Wild Bill Bayou"
'17955445', # unnamed stream just east of 17955281
'17958187', # big sunflower river
'17956547', # unnamed trib to BSR on west side
}
symmetric_diff = expected_line_ids ^ set(df.line_id)
assert len(symmetric_diff) == 0
Expand Down Expand Up @@ -95,7 +95,7 @@ def test_add_to_perioddata(shellmound_sfrdata):
# two inflows applied upstream of model on different paths;
# inflows should be applied to the first lines in the model
# along their respective paths
flowline_routing[6] = 1000005
flowline_routing['6'] = '1000005'
flows = pd.DataFrame({'Q_avg': [100., 10., 200., 20.],
'per': [0, 1, 0, 1],
'line_id': [6, 6, 4, 4]})
Expand All @@ -109,7 +109,7 @@ def test_add_to_perioddata(shellmound_sfrdata):
assert np.allclose(sfrd.period_data['inflow'].sort_values(), flows['Q_avg'].sort_values())
assert np.allclose(sfrd.period_data.index.get_level_values(0),
sorted(flows['per']))
lines = {flowline_routing[6], seq[-1]}
lines = {flowline_routing['6'], seq[-1]}
rnos = rd.loc[rd.line_id.isin(lines), 'rno']
assert not set(sfrd.period_data.index.levels[1]).difference(rnos)
assert len(set(sfrd.period_data.index.levels[1])) == 2
Expand All @@ -132,17 +132,22 @@ def test_add_to_perioddata(shellmound_sfrdata):
data_column='Q_avg',
one_inflow_per_path=True)
assert np.allclose(sfrd.period_data['inflow'].values, [300., 100., 30., 10.])
assert np.allclose(sfrd.period_data.index.levels[1], [354, 394])
# these reach numbers are liable to change
rnos = [
sfrd.reach_data.loc[sfrd.reach_data['line_id'] == seq[-1], 'rno'].min(),
sfrd.reach_data.loc[sfrd.reach_data['line_id'] == '1000005', 'rno'].min()
]
assert np.allclose(sfrd.period_data.index.levels[1], rnos)

# two inflows applied along same path in model;
# one_inflow_per_path=False
# inflows should be applied to their respective reaches

flows = pd.DataFrame({'Q_avg': [100., 10., 200., 20.],
'per': [0, 1, 0, 1],
'line_id': [17955337, 17955337,
flowline_routing[17955337],
flowline_routing[17955337]
'line_id': ['17955337', '17955337',
flowline_routing['17955337'],
flowline_routing['17955337']
]})
sfrd._period_data = None
add_to_perioddata(sfrd, flows,
Expand Down
20 changes: 20 additions & 0 deletions sfrmaker/test/test_nhdplus_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit tests for test_nhdplus_utils.py"""
import pytest
from sfrmaker.nhdplus_utils import (
load_nhdplus_v2,
read_nhdplus,
read_nhdplus_hr
)
Expand Down Expand Up @@ -62,3 +63,22 @@ def test_read_nhdplus(nhdplus_files, bbox_filter, expected_nrows,
# NHDPlusIDs should be treated as strings
comid_col = [c for c in results.columns if c.lower() in {'comid', 'froncomid', 'tocomid'}]
assert all([type(s) == str for s in results[comid_col].astype(int).astype(str)])


@pytest.mark.parametrize('kwargs,bbox_filter', (
({'NHDPlus_paths': 'tylerforks/NHDPlus'}, 'tylerforks/active_area.shp'),
({'NHDFlowlines': 'tylerforks/NHDPlus/NHDSnapshot/Hydrography/NHDFlowline.shp',
'elevslope': 'tylerforks/NHDPlus/NHDPlusAttributes/elevslope.dbf',
'PlusFlowlineVAA': 'tylerforks/NHDPlus/NHDPlusAttributes/PlusFlowlineVAA.dbf',
'PlusFlow': 'tylerforks/NHDPlus/NHDPlusAttributes/PlusFlow.dbf'
}, 'tylerforks/active_area.shp')
))
def test_load_nhdplus_v2(kwargs, bbox_filter, datapath):
kwargs = {k: datapath / v for k, v in kwargs.items()}
if isinstance(bbox_filter, str):
bbox_filter = datapath / bbox_filter
results = load_nhdplus_v2(**kwargs, bbox_filter=bbox_filter)
# all comids and tocomids should be strings
assert all([type(s) == str for s in results['COMID'].astype(int).astype(str)])
assert all([type(s) == str for comids in results['COMID'].astype(int).astype(str)
for s in comids ])
2 changes: 1 addition & 1 deletion sfrmaker/test/test_observations.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def test_add_observations_from_line_ids(shellmound_sfrdata, flux_observation_dat
# get the last reach in each segment
rd = shellmound_sfrdata.reach_data.sort_values(by=['iseg', 'ireach'], axis=0).groupby('iseg').last()
rno = dict(zip(rd.line_id, rd.rno))
assert set(obs.rno) == set([rno[lid] for lid in flux_observation_data.line_id])
assert set(obs.rno) == set([rno[str(lid)] for lid in flux_observation_data.line_id])
rd = shellmound_sfrdata.reach_data
iseg_ireach = dict(list(zip(rd.rno, zip(rd.iseg, rd.ireach))))
for i, r in obs.iterrows():
Expand Down
2 changes: 1 addition & 1 deletion sfrmaker/test/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,6 @@ def test_preprocess_nhdplus_hr_waterbodies(project_root_path, outdir):
dest_crs=26905, outfile=outfile)
df = gpd.read_file(outfile)
df.crs == 26905
assert 'Beaver Lake' in df['GNIS_Name']
assert 'Beaver Lake' in df['GNIS_Name'].values
# the next line shouldn't be there either
assert 75004400012864 not in df['NHDPlusID'].values
Loading

0 comments on commit 11f05dd

Please sign in to comment.