Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix performance regression in quadtree_point_in_polygon by 5x #1446

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 29 additions & 48 deletions python/cuspatial/cuspatial/core/geoseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,37 +221,38 @@ def sizes(self):
)

class GeoColumnAccessor:
def __init__(self, list_series, meta):
def __init__(self, list_series, meta, typ):
self._series = list_series
self._col = self._series._column
self._meta = meta
self._type = Feature_Enum.POINT
self._type = typ
# Resample the existing features so that the offsets returned
# by `_offset` methods reflect previous slicing, and match
# the values returned by .xy.
existing_indices = self._meta.union_offsets[
self._meta.input_types == self._type.value
]
self._existing_features = self._col.take(existing_indices._column)

@property
def x(self):
return self.xy[::2].reset_index(drop=True)
return cudf.Series(self.xy.values[::2])

@property
def y(self):
return self.xy[1::2].reset_index(drop=True)
return cudf.Series(self.xy.values[1::2])

@property
@cached_property
def xy(self):
features = self._get_current_features(self._type)
features = self.column()
if hasattr(features, "leaves"):
return cudf.Series(features.leaves().values)
return cudf.Series._from_column(features.leaves())
else:
return cudf.Series()

def _get_current_features(self, type):
# Resample the existing features so that the offsets returned
# by `_offset` methods reflect previous slicing, and match
# the values returned by .xy.
existing_indices = self._meta.union_offsets[
self._meta.input_types == type.value
]
existing_features = self._col.take(existing_indices._column)
return existing_features
def column(self):
"""Return the ListColumn reordered by union offset."""
return self._existing_features

def point_indices(self):
# Return a cupy.ndarray containing the index values that each
Expand All @@ -265,18 +266,10 @@ def point_indices(self):
self._meta.input_types != -1
]

def column(self):
"""Return the ListColumn reordered by union offset."""
return self._get_current_features(self._type)

class MultiPointGeoColumnAccessor(GeoColumnAccessor):
def __init__(self, list_series, meta):
super().__init__(list_series, meta)
self._type = Feature_Enum.MULTIPOINT

@property
def geometry_offset(self):
return self._get_current_features(self._type).offsets.values
return self.column().offsets.values

def point_indices(self):
# Return a cupy.ndarray containing the index values from the
Expand All @@ -286,19 +279,13 @@ def point_indices(self):
return cp.repeat(self._meta.input_types.index, sizes)

class LineStringGeoColumnAccessor(GeoColumnAccessor):
def __init__(self, list_series, meta):
super().__init__(list_series, meta)
self._type = Feature_Enum.LINESTRING

@property
def geometry_offset(self):
return self._get_current_features(self._type).offsets.values
return self.column().offsets.values

@property
def part_offset(self):
return self._get_current_features(
self._type
).elements.offsets.values
return self.column().elements.offsets.values

def point_indices(self):
# Return a cupy.ndarray containing the index values from the
Expand All @@ -308,25 +295,17 @@ def point_indices(self):
return cp.repeat(self._meta.input_types.index, sizes)

class PolygonGeoColumnAccessor(GeoColumnAccessor):
def __init__(self, list_series, meta):
super().__init__(list_series, meta)
self._type = Feature_Enum.POLYGON

@property
def geometry_offset(self):
return self._get_current_features(self._type).offsets.values
return self.column().offsets.values

@property
def part_offset(self):
return self._get_current_features(
self._type
).elements.offsets.values
return self.column().elements.offsets.values

@property
def ring_offset(self):
return self._get_current_features(
self._type
).elements.elements.offsets.values
return self.column().elements.elements.offsets.values

def point_indices(self):
# Return a cupy.ndarray containing the index values from the
Expand All @@ -340,27 +319,29 @@ def point_indices(self):
@property
def points(self):
"""Access the `PointsArray` of the underlying `GeoArrowBuffers`."""
return self.GeoColumnAccessor(self._column.points, self._column._meta)
return self.GeoColumnAccessor(
self._column.points, self._column._meta, Feature_Enum.POINT
)

@property
def multipoints(self):
"""Access the `MultiPointArray` of the underlying `GeoArrowBuffers`."""
return self.MultiPointGeoColumnAccessor(
self._column.mpoints, self._column._meta
self._column.mpoints, self._column._meta, Feature_Enum.MULTIPOINT
)

@property
def lines(self):
"""Access the `LineArray` of the underlying `GeoArrowBuffers`."""
return self.LineStringGeoColumnAccessor(
self._column.lines, self._column._meta
self._column.lines, self._column._meta, Feature_Enum.LINESTRING
)

@property
def polygons(self):
"""Access the `PolygonArray` of the underlying `GeoArrowBuffers`."""
return self.PolygonGeoColumnAccessor(
self._column.polygons, self._column._meta
self._column.polygons, self._column._meta, Feature_Enum.POLYGON
)

def __repr__(self):
Expand Down
17 changes: 9 additions & 8 deletions python/cuspatial/cuspatial/core/spatial/join.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,15 @@ def quadtree_point_in_polygon(
raise ValueError(
"`polygons` Geoseries must contains only polygons geometries."
)

points_x = as_column(points.points.x)
points_y = as_column(points.points.y)

poly_offsets = as_column(polygons.polygons.part_offset)
ring_offsets = as_column(polygons.polygons.ring_offset)
poly_points_x = as_column(polygons.polygons.x)
poly_points_y = as_column(polygons.polygons.y)
points_data = points.points
points_x = as_column(points_data.x)
points_y = as_column(points_data.y)
Comment on lines +217 to +219
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should allow users to avoid packing points into a GeoSeries if desired.

Since GeoSeries is a Union<List<List<List<Float32>>>, each 8-byte point pair requires an additional 13 bytes (1 + 4 * 3 ) of indices, which is prohibitively expensive when the dataset is large.

I suggest we either change the API to accept separate x/y columns again (while still accepting and unpacking the points from a GeoSeries), or provide some sort of wrapper for a pair of x/y columns that can be accessed the same way as the GeoSeries.

At the moment, here's my workaround:

class Points:
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.points = self
        self._column = self
        self._meta = self
        self.input_types = Series([Feature_Enum.POINT.value])

    def __len__(self) -> int:
        return len(self.x)


x = Series([ ... ])
y = Series([ ... ])
# Wrap x and y cols in a Points instance which duck-types
# GeoSeries' accessors ala `p.points.x` and `p.points.y`
quadtree_on_points(Points(x, y), ...)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Historically it looks like the API changed in #934 where quadtree_point_in_polygon was updated to accept a GeoSeries points and polygon inputs. I suppose @isVoid is best to make the call on whether we should change the API back to accept x/y points again.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #1486


polygon_data = polygons.polygons
poly_offsets = as_column(polygon_data.part_offset)
ring_offsets = as_column(polygon_data.ring_offset)
poly_points_x = as_column(polygon_data.x)
poly_points_y = as_column(polygon_data.y)

return DataFrame._from_data(
*spatial_join.quadtree_point_in_polygon(
Expand Down
Loading