diff --git a/examples/example.parquet b/examples/example.parquet index 481e66b..7c653bb 100644 Binary files a/examples/example.parquet and b/examples/example.parquet differ diff --git a/examples/example_metadata.json b/examples/example_metadata.json index ed69b7c..a4b9889 100644 --- a/examples/example_metadata.json +++ b/examples/example_metadata.json @@ -8,6 +8,26 @@ 180.0, 83.6451 ], + "covering": { + "bbox": { + "xmax": [ + "bbox", + "xmax" + ], + "xmin": [ + "bbox", + "xmin" + ], + "ymax": [ + "bbox", + "ymax" + ], + "ymin": [ + "bbox", + "ymin" + ] + } + }, "crs": { "$schema": "https://proj.org/schemas/v0.6/projjson.schema.json", "area": "World.", diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 4782e57..8b2e79d 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -58,6 +58,8 @@ Each geometry column in the dataset MUST be included in the `columns` field abov | edges | string | Name of the coordinate system for the edges. Must be one of `"planar"` or `"spherical"`. The default value is `"planar"`. | | bbox | \[number] | Bounding Box of the geometries in the file, formatted according to [RFC 7946, section 5](https://tools.ietf.org/html/rfc7946#section-5). | | epoch | number | Coordinate epoch in case of a dynamic CRS, expressed as a decimal year. | +| covering | object | Object containing bounding box column names to help accelerate spatial data retrieval | + #### crs @@ -134,6 +136,38 @@ For non-geographic coordinate reference systems, the items in the bbox are minim The bbox values are in the same coordinate reference system as the geometry. +#### covering + +The covering field specifies optional simplified representations of each geometry. The keys of the "covering" object MUST be a supported encoding. 
Currently the only supported encoding is "bbox", which specifies the names of [bounding box columns](#bounding-box-columns). + +Example: +``` +"covering": { + "bbox": { + "xmin": ["bbox", "xmin"], + "ymin": ["bbox", "ymin"], + "xmax": ["bbox", "xmax"], + "ymax": ["bbox", "ymax"] + } +} +``` + +##### bbox covering encoding + +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group and page index bounding box summary statistics. Furthermore, a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. + +The format of the `bbox` encoding is `{"xmin": ["column_name", "xmin"], "ymin": ["column_name", "ymin"], "xmax": ["column_name", "xmax"], "ymax": ["column_name", "ymax"]}`. The arrays represent Parquet schema paths for nested groups. In this example, `column_name` is a Parquet group with fields `xmin`, `ymin`, `xmax`, `ymax`. The value in `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Columns](#bounding-box-columns) definition. To constrain each path to a field of a single bounding box group, the second item in each array MUST be the name of the corresponding field: `xmin`, `ymin`, `xmax`, or `ymax`. All arrays MUST use the same column name. + +The value specified in this field should not be confused with the top-level [`bbox`](#bbox) field, which contains the single bounding box of this geometry column over the whole GeoParquet file. + +Note: The technique of using the bounding box to improve spatial queries does not apply to geometries that cross the antimeridian. Such geometries are unsupported by this method. + +### Bounding Box Columns + +A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax` representing the geometry's coordinate range. 
As with the top-level [`bbox`](#bbox) field, the values follow the GeoJSON specification (RFC 7946, section 5), which also describes how to represent the bbox for geometries that cross the antimeridian. For three-dimensional geometries, the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`, and all fields MUST use the same type. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. + +The bounding box column MUST be at the root of the schema. The bounding box column MUST NOT be nested in a group. + ### Additional information #### Feature identifiers diff --git a/format-specs/schema.json b/format-specs/schema.json index ae31ee0..10b0a7b 100644 --- a/format-specs/schema.json +++ b/format-specs/schema.json @@ -71,6 +71,50 @@ }, "epoch": { "type": "number" + }, + "covering": { + "type": "object", + "minProperties": 1, + "properties": { + "bbox": { + "type": "object", + "required": ["xmin", "xmax", "ymin", "ymax"], + "properties": { + "xmin": { + "type": "array", + "items": [ + { "type": "string" }, + { "const": "xmin" } + ], + "additionalItems": false + }, + "xmax": { + "type": "array", + "items": [ + { "type": "string" }, + { "const": "xmax" } + ], + "additionalItems": false + }, + "ymin": { + "type": "array", + "items": [ + { "type": "string" }, + { "const": "ymin" } + ], + "additionalItems": false + }, + "ymax": { + "type": "array", + "items": [ + { "type": "string" }, + { "const": "ymax" } + ], + "additionalItems": false + } + } + } + } } } } diff --git a/scripts/generate_example.py b/scripts/generate_example.py index 27ea618..c913c71 100644 --- a/scripts/generate_example.py +++ b/scripts/generate_example.py @@ -8,6 +8,7 @@ >>> import 
json, pprint, pyarrow.parquet as pq >>> pprint.pprint(json.loads(pq.read_schema("example.parquet").metadata[b"geo"])) """ +from collections import OrderedDict import json import pathlib @@ -19,6 +20,14 @@ df = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres")) df = df.to_crs("ogc:84") + +geometry_bbox = df.bounds.rename( + OrderedDict( + [("minx", "xmin"), ("miny", "ymin"), ("maxx", "xmax"), ("maxy", "ymax")] + ), + axis=1, +) +df["bbox"] = geometry_bbox.to_dict("records") table = pa.Table.from_pandas(df.head().to_wkb()) @@ -39,14 +48,19 @@ def get_version() -> str: "crs": json.loads(df.crs.to_json()), "edges": "planar", "bbox": [round(x, 4) for x in df.total_bounds], + "covering": { + "bbox": { + "xmin": ["bbox", "xmin"], + "ymin": ["bbox", "ymin"], + "xmax": ["bbox", "xmax"], + "ymax": ["bbox", "ymax"], + }, + }, }, }, } -schema = ( - table.schema - .with_metadata({"geo": json.dumps(metadata)}) -) +schema = table.schema.with_metadata({"geo": json.dumps(metadata)}) table = table.cast(schema) pq.write_table(table, HERE / "../examples/example.parquet") diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index cda7fb1..f0217c1 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -40,8 +40,8 @@ def get_version() -> str: "columns": { "geometry": { "encoding": "WKB", - "geometry_types": [], - }, + "geometry_types": [] + } }, } @@ -210,6 +210,88 @@ def get_version() -> str: metadata["columns"]["geometry"]["epoch"] = "2015.1" invalid_cases["epoch_string"] = metadata +# Geometry Bbox +metadata_covering_template = copy.deepcopy(metadata_template) +metadata_covering_template["columns"]["geometry"]["covering"] = { + "bbox": { + "xmin": ["bbox", "xmin"], + "ymin": ["bbox", "ymin"], + "xmax": ["bbox", "xmax"], + "ymax": ["bbox", "ymax"], + }, +} + + +# Allow "any_column.xmin" etc. 
+metadata = copy.deepcopy(metadata_covering_template) +valid_cases["valid_default_bbox"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"] = { + "xmin": ["any_column", "xmin"], + "ymin": ["any_column", "ymin"], + "xmax": ["any_column", "xmax"], + "ymax": ["any_column", "ymax"], +} +valid_cases["valid_but_not_bbox_struct_name"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"].pop("bbox") +invalid_cases["empty_geometry_bbox"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"] = {} +invalid_cases["empty_geometry_bbox_missing_fields"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("xmin") +invalid_cases["covering_bbox_missing_xmin"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("ymin") +invalid_cases["covering_bbox_missing_ymin"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("xmax") +invalid_cases["covering_bbox_missing_xmax"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("ymax") +invalid_cases["covering_bbox_missing_ymax"] = metadata + +# Invalid bbox xmin/xmax/ymin/ymax values +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = ["bbox", "not_xmin"] +invalid_cases["covering_bbox_invalid_xmin"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = ["bbox", "not_xmax"] +invalid_cases["covering_bbox_invalid_xmax"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) 
+metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = ["bbox", "not_ymin"] +invalid_cases["covering_bbox_invalid_ymin"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = ["bbox", "not_ymax"] +invalid_cases["covering_bbox_invalid_ymax"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = ["bbox", "xmin", "invalid_extra"] +invalid_cases["covering_bbox_extra_xmin_elements"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = ["bbox", "xmax", "invalid_extra"] +invalid_cases["covering_bbox_extra_xmax_elements"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = ["bbox", "ymin", "invalid_extra"] +invalid_cases["covering_bbox_extra_ymin_elements"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = ["bbox", "ymax", "invalid_extra"] +invalid_cases["covering_bbox_extra_ymax_elements"] = metadata + # # Tests