From 46a9149585ff20f464bd88469db00ba963b66673 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:12:50 -0700 Subject: [PATCH 1/2] Improve quadtree_point_in_polygon performance by 5x --- python/cuspatial/cuspatial/core/geoseries.py | 38 ++++++++----------- .../cuspatial/cuspatial/core/spatial/join.py | 17 +++++---- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/python/cuspatial/cuspatial/core/geoseries.py b/python/cuspatial/cuspatial/core/geoseries.py index b8d7c4945..254a8138e 100644 --- a/python/cuspatial/cuspatial/core/geoseries.py +++ b/python/cuspatial/cuspatial/core/geoseries.py @@ -229,26 +229,28 @@ def __init__(self, list_series, meta): @property def x(self): - return self.xy[::2].reset_index(drop=True) + return cudf.Series(self.xy.values[::2]) @property def y(self): - return self.xy[1::2].reset_index(drop=True) + return cudf.Series(self.xy.values[1::2]) - @property + @cached_property def xy(self): - features = self._get_current_features(self._type) + features = self.column if hasattr(features, "leaves"): - return cudf.Series(features.leaves().values) + return cudf.Series._from_column(features.leaves()) else: return cudf.Series() - def _get_current_features(self, type): + @cached_property + def column(self): + """Return the ListColumn reordered by union offset.""" # Resample the existing features so that the offsets returned # by `_offset` methods reflect previous slicing, and match # the values returned by .xy. existing_indices = self._meta.union_offsets[ - self._meta.input_types == type.value + self._meta.input_types == self._type.value ] existing_features = self._col.take(existing_indices._column) return existing_features @@ -265,10 +267,6 @@ def point_indices(self): self._meta.input_types != -1 ] - def column(self): - """Return the ListColumn reordered by union offset.""" - return self._get_current_features(self._type) - class MultiPointGeoColumnAccessor(GeoColumnAccessor): def __init__(self, list_series, meta): super().__init__(list_series, meta) @@ -276,7 +274,7 @@ def __init__(self, list_series, meta): @property def geometry_offset(self): - return self._get_current_features(self._type).offsets.values + return self.column.offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -292,13 +290,11 @@ def __init__(self, list_series, meta): @property def geometry_offset(self): - return self._get_current_features(self._type).offsets.values + return self.column.offsets.values @property def part_offset(self): - return self._get_current_features( - self._type - ).elements.offsets.values + return self.column.elements.offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -314,19 +310,15 @@ def __init__(self, list_series, meta): @property def geometry_offset(self): - return self._get_current_features(self._type).offsets.values + return self.column.offsets.values @property def part_offset(self): - return self._get_current_features( - self._type - ).elements.offsets.values + return self.column.elements.offsets.values @property def ring_offset(self): - return self._get_current_features( - self._type - ).elements.elements.offsets.values + return self.column.elements.elements.offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the diff --git a/python/cuspatial/cuspatial/core/spatial/join.py b/python/cuspatial/cuspatial/core/spatial/join.py index c237fe9d3..fdaeb8f4a 100644 --- a/python/cuspatial/cuspatial/core/spatial/join.py +++ b/python/cuspatial/cuspatial/core/spatial/join.py @@ -214,14 +214,15 @@ def quadtree_point_in_polygon( raise ValueError( "`polygons` Geoseries must contains only polygons geometries." ) - - points_x = as_column(points.points.x) - points_y = as_column(points.points.y) - - poly_offsets = as_column(polygons.polygons.part_offset) - ring_offsets = as_column(polygons.polygons.ring_offset) - poly_points_x = as_column(polygons.polygons.x) - poly_points_y = as_column(polygons.polygons.y) + points_data = points.points + points_x = as_column(points_data.x) + points_y = as_column(points_data.y) + + polygon_data = polygons.polygons + poly_offsets = as_column(polygon_data.part_offset) + ring_offsets = as_column(polygon_data.ring_offset) + poly_points_x = as_column(polygon_data.x) + poly_points_y = as_column(polygon_data.y) return DataFrame._from_data( *spatial_join.quadtree_point_in_polygon( From 8d7714a5944dcd1bccb7d53c070641d4feb4506c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:21:06 -0700 Subject: [PATCH 2/2] Fix failing tests --- python/cuspatial/cuspatial/core/geoseries.py | 57 ++++++++------------ 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/python/cuspatial/cuspatial/core/geoseries.py b/python/cuspatial/cuspatial/core/geoseries.py index 254a8138e..37da66744 100644 --- a/python/cuspatial/cuspatial/core/geoseries.py +++ b/python/cuspatial/cuspatial/core/geoseries.py @@ -221,11 +221,18 @@ def sizes(self): ) class GeoColumnAccessor: - def __init__(self, list_series, meta): + def __init__(self, list_series, meta, typ): self._series = list_series self._col = self._series._column self._meta = meta - self._type = Feature_Enum.POINT + self._type = typ + # Resample the existing features so that the offsets returned + # by `_offset` methods reflect previous slicing, and match + # the values returned by .xy. + existing_indices = self._meta.union_offsets[ + self._meta.input_types == self._type.value + ] + self._existing_features = self._col.take(existing_indices._column) @property def x(self): @@ -237,23 +244,15 @@ def y(self): @cached_property def xy(self): - features = self.column + features = self.column() if hasattr(features, "leaves"): return cudf.Series._from_column(features.leaves()) else: return cudf.Series() - @cached_property def column(self): """Return the ListColumn reordered by union offset.""" - # Resample the existing features so that the offsets returned - # by `_offset` methods reflect previous slicing, and match - # the values returned by .xy. - existing_indices = self._meta.union_offsets[ - self._meta.input_types == self._type.value - ] - existing_features = self._col.take(existing_indices._column) - return existing_features + return self._existing_features def point_indices(self): # Return a cupy.ndarray containing the index values that each @@ -268,13 +267,9 @@ def point_indices(self): ] class MultiPointGeoColumnAccessor(GeoColumnAccessor): - def __init__(self, list_series, meta): - super().__init__(list_series, meta) - self._type = Feature_Enum.MULTIPOINT - @property def geometry_offset(self): - return self.column.offsets.values + return self.column().offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -284,17 +279,13 @@ def point_indices(self): return cp.repeat(self._meta.input_types.index, sizes) class LineStringGeoColumnAccessor(GeoColumnAccessor): - def __init__(self, list_series, meta): - super().__init__(list_series, meta) - self._type = Feature_Enum.LINESTRING - @property def geometry_offset(self): - return self.column.offsets.values + return self.column().offsets.values @property def part_offset(self): - return self.column.elements.offsets.values + return self.column().elements.offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -304,21 +295,17 @@ def point_indices(self): return cp.repeat(self._meta.input_types.index, sizes) class PolygonGeoColumnAccessor(GeoColumnAccessor): - def __init__(self, list_series, meta): - super().__init__(list_series, meta) - self._type = Feature_Enum.POLYGON - @property def geometry_offset(self): - return self.column.offsets.values + return self.column().offsets.values @property def part_offset(self): - return self.column.elements.offsets.values + return self.column().elements.offsets.values @property def ring_offset(self): - return self.column.elements.elements.offsets.values + return self.column().elements.elements.offsets.values def point_indices(self): # Return a cupy.ndarray containing the index values from the @@ -332,27 +319,29 @@ def point_indices(self): @property def points(self): """Access the `PointsArray` of the underlying `GeoArrowBuffers`.""" - return self.GeoColumnAccessor(self._column.points, self._column._meta) + return self.GeoColumnAccessor( + self._column.points, self._column._meta, Feature_Enum.POINT + ) @property def multipoints(self): """Access the `MultiPointArray` of the underlying `GeoArrowBuffers`.""" return self.MultiPointGeoColumnAccessor( - self._column.mpoints, self._column._meta + self._column.mpoints, self._column._meta, Feature_Enum.MULTIPOINT ) @property def lines(self): """Access the `LineArray` of the underlying `GeoArrowBuffers`.""" return self.LineStringGeoColumnAccessor( - self._column.lines, self._column._meta + self._column.lines, self._column._meta, Feature_Enum.LINESTRING ) @property def polygons(self): """Access the `PolygonArray` of the underlying `GeoArrowBuffers`.""" return self.PolygonGeoColumnAccessor( - self._column.polygons, self._column._meta + self._column.polygons, self._column._meta, Feature_Enum.POLYGON ) def __repr__(self):