From 0882e85223e58d62def9bd375c86d6bd1707e1ca Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 27 Nov 2023 14:07:15 -0500 Subject: [PATCH 01/15] Add geometry_bbox proposal definition * Add documentation to the top-level GeoParquet description and definition. * Add the geometry_bbox definition to the json schema * Add a few tests. Verify with `pytest test_json_schema.py` --- format-specs/geoparquet.md | 16 ++++++++++++++++ format-specs/schema.json | 10 ++++++++++ scripts/test_json_schema.py | 15 +++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 4782e57..81bcaf4 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -58,6 +58,8 @@ Each geometry column in the dataset MUST be included in the `columns` field abov | edges | string | Name of the coordinate system for the edges. Must be one of `"planar"` or `"spherical"`. The default value is `"planar"`. | | bbox | \[number] | Bounding Box of the geometries in the file, formatted according to [RFC 7946, section 5](https://tools.ietf.org/html/rfc7946#section-5). | | epoch | number | Coordinate epoch in case of a dynamic CRS, expressed as a decimal year. | +| geometry_bbox | object | Object specifying a column name of a [Bounding Box Column](#bounding-box-columns). | + #### crs @@ -134,6 +136,20 @@ For non-geographic coordinate reference systems, the items in the bbox are minim The bbox values are in the same coordinate reference system as the geometry. +#### geometry_bbox + +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the name of a column containing the bounding box of the geometry for every row. 
+ +The format of `geometry_bbox` is `{"name": "column_name"}` where `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. + +Note: the value specified in this field should not be confused with the [`bbox`](#bbox) field which contains the single bounding box of this geometry over the whole GeoParquet file. + +### Bounding Box Columns + +A bounding box column MUST be a Parquet struct with required fields `xmin`, `xmax`, `ymin`, and `ymax`. For three dimensions the additional fields `zmin` and `zmax` MUST be present. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row not contain a geometry value, the row MUST NOT contain a bounding box value. + +The bounding box column MUST be at the root of the schema. The bounding box column MUST NOT be nested in a group. 
+ ### Additional information #### Feature identifiers diff --git a/format-specs/schema.json b/format-specs/schema.json index ae31ee0..6f27af6 100644 --- a/format-specs/schema.json +++ b/format-specs/schema.json @@ -71,6 +71,16 @@ }, "epoch": { "type": "number" + }, + "geometry_bbox": { + "type": "object", + "required": ["column"], + "properties": { + "column": { + "type": "string", + "minLength": 1 + } + } } } } diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index cda7fb1..dfafbce 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -41,6 +41,9 @@ def get_version() -> str: "geometry": { "encoding": "WKB", "geometry_types": [], + "geometry_bbox": { + "column": "bbox", + }, }, }, } @@ -210,6 +213,18 @@ def get_version() -> str: metadata["columns"]["geometry"]["epoch"] = "2015.1" invalid_cases["epoch_string"] = metadata +# Geometry Bbox + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["geometry_bbox"].pop("column") +invalid_cases["empty_geometry_bbox"] = metadata + + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["geometry_bbox"]["column"] = "" +invalid_cases["empty_geometry_bbox_column"] = metadata + + # # Tests From e74af7d61beeaec32761f26251eff6f7be4cafa0 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 27 Nov 2023 14:55:49 -0500 Subject: [PATCH 02/15] Update generate_example to include geometry_bbox parameter --- examples/example.parquet | Bin 27814 -> 29627 bytes examples/example_metadata.json | 3 +++ scripts/generate_example.py | 15 +++++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/example.parquet b/examples/example.parquet index 481e66b38952aac5ee9e2737bc454da3fc1c2794..1e7bc885ae58391ccca1a666aacc08d8ae447fc4 100644 GIT binary patch delta 2319 zcmcIkeP~-%6u)_0W60W3=3aP_I<`7vux0sJYR$Be`(FByWa*N(G%v~iki0ZWv*u&k zHfa=zj#0`cNC}7zVIU$hiYOI9RD1~P1R4EfPUoD8x+&u3P#Fk*-E&`(*1aJ3M=yEz 
z-E;0azjJ=)oV=GV>fZWZ_v#1H8XIE81FVrn+Yz!Nq|+@uk@2kUobFt-dOb^P-OiJd z-Abi+=hfdtzw+#jz@Klw6H-{jn%FxAh+7YiWB=>&D{Nmmlz)zO9nIb}r(1>de09 zu3QLvv^{HDaE2xX#MElF%wG9U!>s!6o}K!Zv8;Rp%(QszqcoXCTGROaef82CZ~C0d z#ajPIJ0HGBiM@AI`T1{`$FBeN)7%BGhx=)MrP zmY@qk9L6{fvH;P%V5x@ndi!m*`-wC%;R9t&APEr(Hkwy}cw*iTvIrXwvU*Pw015N6 zwzAEZ3=P?A038leqcM6%kMu#p!7-}%~;UR6pHDYsX#?y207zM@7R>n9E?IgxL zuV$G7@C^ymY9^e~vMfdIsp1S=<3l0hWRH-ImRz6-3p5@*lKO*Li9CEUG0taXNz5c- z6HtBvSRw6$-m1xf5q0%>n}DP3D*(mhkEWdNOgs|AIPfU3?=O)}B|!U1#sjSCr5R(5 zqR55Oxdg@cPJ*c&a;M#$RHK6ZhFVb?V-sb$C?1sy(fO&yJ*f%BBikgTP9~mEpc5Eg zFW89MJi-s-`e!p;_d-4SioB0+@&`v!gE+j^)p*x4`90Q^UWESRhx}%QLRfVC1BPFW AFaQ7m delta 569 zcmZ8dJ4?e*6u!1lic}{X60ImGPC`u}XoU_ZZ5kg`n?P;U#XdBBM5`8>Di;5M;E+Xd zcNXgA?&J?p2UkJ-1x}uGt0JBxC+B|W^?mpLZQ$m4;Pzo-SUOXg7;2TPyZgmx#6Q`p zg@i#VCb_+d?Yp2PyuJs7!I`jd^%xjoOqvyg&Tdoui1aRYYN7FNx68x`Sk`)Bu|Gq6 zipI%0&ayx6Bm=A9x1FgWZ9#JYG(a=F-i>n^K9>`F_8b8K9)3o}EqP$jQO5_e1g5YF zph7NflE(y==5PU}O_PI51Og9vlp;^_6xppTt*9K->)4dgi5s3ZFsdAGkmtPk5mxE9 zrC6S=$PHWBIHlz{Fbm*-=D~8m41J!W6!_Q26O){F(OzrYihhb{i`E?d(#-pxhofVL zS}hfL7s!-Kq|q(l5Iy6^Tf#X}(ds2dohuepw{-9$*zEH|k{O`=o5Tb8@Z0!NtwDzn vB#HJ%55JqTkw86tE!2;Ckntv!qJ2vWLNe($ThlQ&#@N4?y+j$yfhB(dS)iUF diff --git a/examples/example_metadata.json b/examples/example_metadata.json index ed69b7c..b31f1fd 100644 --- a/examples/example_metadata.json +++ b/examples/example_metadata.json @@ -108,6 +108,9 @@ }, "edges": "planar", "encoding": "WKB", + "geometry_bbox": { + "column": "bbox" + }, "geometry_types": [ "Polygon", "MultiPolygon" diff --git a/scripts/generate_example.py b/scripts/generate_example.py index 27ea618..50bc284 100644 --- a/scripts/generate_example.py +++ b/scripts/generate_example.py @@ -8,6 +8,7 @@ >>> import json, pprint, pyarrow.parquet as pq >>> pprint.pprint(json.loads(pq.read_schema("example.parquet").metadata[b"geo"])) """ +from collections import OrderedDict import json import pathlib @@ -19,6 +20,14 @@ df = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres")) df = 
df.to_crs("ogc:84") + +geometry_bbox = df.bounds.rename( + OrderedDict( + [("minx", "xmin"), ("miny", "ymin"), ("maxx", "xmax"), ("maxy", "ymax")] + ), + axis=1, +) +df["bbox"] = geometry_bbox.to_dict("records") table = pa.Table.from_pandas(df.head().to_wkb()) @@ -39,14 +48,12 @@ def get_version() -> str: "crs": json.loads(df.crs.to_json()), "edges": "planar", "bbox": [round(x, 4) for x in df.total_bounds], + "geometry_bbox": {"column": "bbox"}, }, }, } -schema = ( - table.schema - .with_metadata({"geo": json.dumps(metadata)}) -) +schema = table.schema.with_metadata({"geo": json.dumps(metadata)}) table = table.cast(schema) pq.write_table(table, HERE / "../examples/example.parquet") From 8790bb4dd53173d2cbc6f22ec9859208590ecdf0 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Tue, 28 Nov 2023 14:56:05 -0500 Subject: [PATCH 03/15] Change to covering from geometry_bbox. Doc updates. Change the geometry_bbox to the broader "covering" section. Update tests and examples. Made some documentation updates: * Parquet schema -> group * Do not require zmin/zmax if geometries have 3 dimensions --- examples/example.parquet | Bin 29627 -> 29631 bytes examples/example_metadata.json | 8 +++++--- format-specs/geoparquet.md | 21 ++++++++++++++++----- format-specs/schema.json | 16 +++++++++++----- scripts/generate_example.py | 4 +++- scripts/test_json_schema.py | 11 ++++++----- 6 files changed, 41 insertions(+), 19 deletions(-) diff --git a/examples/example.parquet b/examples/example.parquet index 1e7bc885ae58391ccca1a666aacc08d8ae447fc4..2b5da28690fcc81b1f66a6aa32b42c339ca35fb9 100644 GIT binary patch delta 697 zcmYjOzi--55Qe2<$k3reO*;gkpbQya?8I&8Qk{)$AS#7a6oWd12hB^20RzRPR26$N zzIN)~xl+aq{d2N*?$o=ZiDJq3?#FlczVF`6ZTk0Z`q$-2`up|O$?#F3H%>lI&qpf% zD)%KX#}|C|#?gE@LN_ld^;K`G4{4q9VJ(v4P4tfyAXfUNR1K4HHA?x=eX&O<# zT(vmqhS*ee(Xd)L<1DLNtL6BJY{xua1g0?v%=gRw7A9~Y?pD!SD-0vUi*P{ym~Q2K zN_2x-5}581@iV$+a!<3V|HjXr`_`FkaTgd=DwfdDc3e-3S$0KS>`Ro0 z-5Tq4BTJ7q32ov*Z%vPSr`m=K${8I|+$X;E+3< 
zD)SYbkT`PSbNG$U%+h8huV;4NynS!%*SFluTkh%ZEcfU6{w!Ml8upT?HyL05)H?Z{ z|Gs{k?&4!p`F=s4Fsw~{r7g*g>r zhR?ypK+oWiQUNx=QHYJI0WAQZaz#WIAq}HtZ3?l6ncb|?p~9;$p^a~>VASnH317F}myp*gEy3}hzW?>&e$xXHA7Q68;KKyZvO~n)3 z*sNlQYGH;!kPuTG9Lm1-oPOYSn&cb_XQ-8?YQzwVTdUST!hJKc#qhn&wRMG zNhSU str: "crs": json.loads(df.crs.to_json()), "edges": "planar", "bbox": [round(x, 4) for x in df.total_bounds], - "geometry_bbox": {"column": "bbox"}, + "covering": { + "box": {"column": "bbox"}, + }, }, }, } diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index dfafbce..2ef8d27 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -41,8 +41,10 @@ def get_version() -> str: "geometry": { "encoding": "WKB", "geometry_types": [], - "geometry_bbox": { - "column": "bbox", + "covering": { + "box": { + "column": "bbox", + }, }, }, }, @@ -216,16 +218,15 @@ def get_version() -> str: # Geometry Bbox metadata = copy.deepcopy(metadata_template) -metadata["columns"]["geometry"]["geometry_bbox"].pop("column") +metadata["columns"]["geometry"]["covering"].pop("box") invalid_cases["empty_geometry_bbox"] = metadata metadata = copy.deepcopy(metadata_template) -metadata["columns"]["geometry"]["geometry_bbox"]["column"] = "" +metadata["columns"]["geometry"]["covering"]["box"]["column"] = "" invalid_cases["empty_geometry_bbox_column"] = metadata - # # Tests @pytest.mark.parametrize( From 388f74334e08e4310a199bae417c2e7891718d89 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 29 Jan 2024 09:48:29 -0500 Subject: [PATCH 04/15] Refactor bbox covering to specify each element Refactor the covering bbox to be an object with fields specifying each bbox element. "Pin" them to have to end in ".xmin", ".xmax", etc. 
--- examples/example.parquet | Bin 29631 -> 29794 bytes examples/example_metadata.json | 7 +++-- format-specs/geoparquet.md | 13 +++++--- format-specs/schema.json | 20 +++++++++--- scripts/generate_example.py | 7 ++++- scripts/test_json_schema.py | 55 ++++++++++++++++++++++++++++++--- 6 files changed, 86 insertions(+), 16 deletions(-) diff --git a/examples/example.parquet b/examples/example.parquet index 2b5da28690fcc81b1f66a6aa32b42c339ca35fb9..5e547b18dd60640272f896d34672f07b82f5dffc 100644 GIT binary patch delta 932 zcmZ8fO>YuW6onW~7rN4zc4b4-bW!5~#UkC^_lEL~Wh@NCstjd@Nf`zT1u8M@bvb{) zok>?F{Q>?B*KTy{Kk(lBijcl!-rSFK&pYSM{P-RH^(XrM^Lg~=-Iw#llcgQZW5>IT z*xdI97{qKWyy4{aZtP~W$O#tbew#*CNK6>HQM@6A+A1>V3Gd&P4fWR_T-Md@&+t#u;?aIPYAwTxIxx~=+FqTEn@-wOO zpf_ooDX%QS>zWUH{dF&w=rPB}oYiz!a}Rp`3H6+2BI7jU087#`!zXvqw`Y*2RD&!) z*+rSMLLMy2CCSzasr2jmfu_B~EL$y5QKB(SXk&jRGB5B10E0F`x$0X!nx+iI_^~$C zw04*p?Y}^;Tx?9!nnnk?vK|Jby_Y8XThD5dYk6d;XUd&?fV z|~7E5Z|XW$W8hgcd=Eon*$b&ME+SHtG7+F{i{5IfcX&0nD%CKQ&0sboNQFJ=oAPyVdzQw<9!w02IG*^GAQ3oj8SZoa)w@ zA1iBQs-V!`Y(qOb@&_%~cMd|+IjB+n20o&@6|8_5qFEgxidz88#oz%*M%Co6P&pX@ zLt|hV_!hc1tzlSkHh1!J!B#=i=V(bS z83xTNLy}$g7U!dITCN9JPCM!%WIuYZ7NmKsIOX>L?c>7c7~vKtr2=DBPVTW}6=|dH zT8%ea=K?m_a!Jc(qAW9MuKP9zy%?Juj~GW^z#&IFq^H-i|Bm#QRUd|y*$J%=Z5L~EJLNHGsC+yRO@XCl)70ZQ`s!pb1DyfGr3Qho3KfJS!Jdp?o&p$ZreOiDRL zRK%NC!tciO$suzpw6?&%!Cu}A>_m0>cFfnNqj7$TNQa14rr&~^?+6kwVie|^R$XEj ifO*`P4jzw(b1 str: "edges": "planar", "bbox": [round(x, 4) for x in df.total_bounds], "covering": { - "box": {"column": "bbox"}, + "bbox": { + "xmin": "bbox.xmin", + "ymin": "bbox.ymin", + "xmax": "bbox.xmax", + "ymax": "bbox.ymax", + }, }, }, }, diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index 2ef8d27..260d528 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -42,8 +42,11 @@ def get_version() -> str: "encoding": "WKB", "geometry_types": [], "covering": { - "box": { - "column": "bbox", + "bbox": { + "xmin": "bbox.xmin", + "ymin": "bbox.ymin", + 
"xmax": "bbox.xmax", + "ymax": "bbox.ymax", }, }, }, @@ -217,15 +220,57 @@ def get_version() -> str: # Geometry Bbox +# Allow "any_column.xmin" etc. metadata = copy.deepcopy(metadata_template) -metadata["columns"]["geometry"]["covering"].pop("box") +metadata["columns"]["geometry"]["covering"]["bbox"] = { + "xmin": "any_column.xmin", + "ymin": "any_column.ymin", + "xmax": "any_column.xmax", + "ymax": "any_column.ymax", +} +valid_cases["valid_but_not_bbox_struct_name"] = metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"].pop("bbox") invalid_cases["empty_geometry_bbox"] = metadata metadata = copy.deepcopy(metadata_template) -metadata["columns"]["geometry"]["covering"]["box"]["column"] = "" -invalid_cases["empty_geometry_bbox_column"] = metadata +metadata["columns"]["geometry"]["covering"]["bbox"] = {} +invalid_cases["empty_geometry_bbox_missing_fields"] = metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("xmin") +invalid_cases["covering_bbox_missing_xmin"] = metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("ymin") +invalid_cases["covering_bbox_missing_ymin"] = metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("xmax") +invalid_cases["covering_bbox_missing_xmax"] = metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"].pop("ymax") +invalid_cases["covering_bbox_missing_ymax"] = metadata +## Invalid bbox xmin/xmax/ymin/ymax values +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = "not_bbox_dot_xmin" +invalid_cases["covering_bbox_invalid_xmin"] = metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = "not_bbox_dot_xmax" +invalid_cases["covering_bbox_invalid_xmax"] 
= metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = "not_bbox_dot_ymin" +invalid_cases["covering_bbox_invalid_ymin"] = metadata + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = "not_bbox_dot_ymax" +invalid_cases["covering_bbox_invalid_ymax"] = metadata # # Tests From 7a02a39ba3a246a9f8b91d51ae60c6e3bfe95077 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 29 Jan 2024 12:29:04 -0500 Subject: [PATCH 05/15] Run update_example_schemas Fixes failing test --- examples/example_metadata.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/example_metadata.json b/examples/example_metadata.json index b6fea3e..4cda34a 100644 --- a/examples/example_metadata.json +++ b/examples/example_metadata.json @@ -10,10 +10,10 @@ ], "covering": { "bbox": { - "xmin": "bbox.xmin", "xmax": "bbox.xmax", - "ymin": "bbox.ymin", - "ymax": "bbox.ymax" + "xmin": "bbox.xmin", + "ymax": "bbox.ymax", + "ymin": "bbox.ymin" } }, "crs": { From bf95d7d3fb4696b3e6188a9905fa9f5379db4e19 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 29 Jan 2024 16:57:30 -0500 Subject: [PATCH 06/15] Documentation updates --- format-specs/geoparquet.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 1a466f9..e9d4e6f 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -58,7 +58,7 @@ Each geometry column in the dataset MUST be included in the `columns` field abov | edges | string | Name of the coordinate system for the edges. Must be one of `"planar"` or `"spherical"`. The default value is `"planar"`. | | bbox | \[number] | Bounding Box of the geometries in the file, formatted according to [RFC 7946, section 5](https://tools.ietf.org/html/rfc7946#section-5). 
| | epoch | number | Coordinate epoch in case of a dynamic CRS, expressed as a decimal year. | -| covering | object | Object containing information like bounding boxes to help accelerate spatial data retrieval | +| covering | object | Object containing bounding box column names to help accelerate spatial data retrieval | #### crs @@ -162,7 +162,7 @@ Note: the value specified in this field should not be confused with the top-leve ### Bounding Box Columns -A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax`. For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row not contain a geometry value, the row MUST NOT contain a bounding box value. +A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax`. For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. The bounding box column MUST be at the root of the schema. The bounding box column MUST NOT be nested in a group. 
From 6b3fc8010e7cccd8cbefc1cc7e98dac38b6c62eb Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 29 Jan 2024 17:05:39 -0500 Subject: [PATCH 07/15] Remove covering from top-level metadata template --- scripts/test_json_schema.py | 45 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index 260d528..8387328 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -40,16 +40,8 @@ def get_version() -> str: "columns": { "geometry": { "encoding": "WKB", - "geometry_types": [], - "covering": { - "bbox": { - "xmin": "bbox.xmin", - "ymin": "bbox.ymin", - "xmax": "bbox.xmax", - "ymax": "bbox.ymax", - }, - }, - }, + "geometry_types": [] + } }, } @@ -219,9 +211,19 @@ def get_version() -> str: invalid_cases["epoch_string"] = metadata # Geometry Bbox +metadata_covering_template = copy.deepcopy(metadata_template) +metadata_covering_template["columns"]["geometry"]["covering"] = { + "bbox": { + "xmin": "bbox.xmin", + "ymin": "bbox.ymin", + "xmax": "bbox.xmax", + "ymax": "bbox.ymax", + }, +} + # Allow "any_column.xmin" etc. 
-metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"] = { "xmin": "any_column.xmin", "ymin": "any_column.ymin", @@ -230,45 +232,44 @@ def get_version() -> str: } valid_cases["valid_but_not_bbox_struct_name"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"].pop("bbox") invalid_cases["empty_geometry_bbox"] = metadata - -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"] = {} invalid_cases["empty_geometry_bbox_missing_fields"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"].pop("xmin") invalid_cases["covering_bbox_missing_xmin"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"].pop("ymin") invalid_cases["covering_bbox_missing_ymin"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"].pop("xmax") invalid_cases["covering_bbox_missing_xmax"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"].pop("ymax") invalid_cases["covering_bbox_missing_ymax"] = metadata ## Invalid bbox xmin/xmax/ymin/ymax values -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = "not_bbox_dot_xmin" invalid_cases["covering_bbox_invalid_xmin"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) 
metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = "not_bbox_dot_xmax" invalid_cases["covering_bbox_invalid_xmax"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = "not_bbox_dot_ymin" invalid_cases["covering_bbox_invalid_ymin"] = metadata -metadata = copy.deepcopy(metadata_template) +metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = "not_bbox_dot_ymax" invalid_cases["covering_bbox_invalid_ymax"] = metadata From 8bfb5b7316516e12f8f30da857bb63451c4705c9 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 29 Jan 2024 22:04:41 -0500 Subject: [PATCH 08/15] Update docs. Describe bounding box as the "coordinate range" of the geometry which is language used by the GeoJSON spec --- format-specs/geoparquet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index e9d4e6f..8fb67b9 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -162,7 +162,7 @@ Note: the value specified in this field should not be confused with the top-leve ### Bounding Box Columns -A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax`. For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. 
+A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax` representing the geometry's coordinate range. For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. The bounding box column MUST be at the root of the schema. The bounding box column MUST NOT be nested in a group. From 44097bfa29cb956d3451c3c14dd100b85b4e1ef7 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Tue, 30 Jan 2024 13:57:02 -0500 Subject: [PATCH 09/15] Update covering bbox fields to be parquet schema paths Update bbox so that each element is an array. 
For example: ["bbox", "xmin"] which represents a top-level bbox struct field with an xmin member beneath it --- examples/example.parquet | Bin 29794 -> 29838 bytes examples/example_metadata.json | 20 +++++++++++--- format-specs/geoparquet.md | 12 ++++----- format-specs/schema.json | 34 +++++++++++++++++------- scripts/generate_example.py | 8 +++--- scripts/test_json_schema.py | 46 +++++++++++++++++++++++---------- 6 files changed, 84 insertions(+), 36 deletions(-) diff --git a/examples/example.parquet b/examples/example.parquet index 5e547b18dd60640272f896d34672f07b82f5dffc..7c653bb259012832e65eea60506a4c1984117239 100644 GIT binary patch delta 893 zcmZ8f-D(q25N^DXfab0S6vPrL6)J4E-PWv7WX|TNXKidYRyT=Mn@!S_jhm#+e~OU% z7QERf@C}6Ch~P{32wo}ZJ2*32D)n5t_L^c00GfF)O->EzCFLyOMM)*NJyJ122 zO)cL}3k4tm&)($XkNO&)=LUYIV~?Ab97xMHJU1B0V2$u5<91KEGaGYEL;4dXyTP#L z8h-gD;nuKHR{owZJlU~*Z$tUJ7WmD)-fCKQgRx6kZprKSVDFqE1Kpe-m38yTAw`fO zQ(b>O2B?#p28@p&3m_+`8Kt8y$uCkS0f0jBiKa~uIBPf>z;iN`c*!3V+96+XevW+R z1r3!j*w3G?YMR*#zn`ou9(&wKxY}6LkhQ?x4-@_6B#Y*Bxr^y(gc2rUzQ3}FD`BEP zzEq+W0I~oz1c&LGq(%I2GuA(XG1Xuv#w7nmmk<^q8e08lF~i z)aC+)j4Ue6H{RECSerA0)%pgJigqh$trn+|i58)*8omlYog|Z7ADnu+*(%^YeWJJ8 zVPf!LPL0H<;_3nSc(G4{95O_R5=W!eF=WF6E9>T{kq<^ycXN%R_?S32t9C#*Bq~qh z_y>r{sDo4BBvcL%o)2z4;#KX4G0qo6qfw=ze*?dKu-W!@m>g!}_wOw%6rrU51@Yzy AOaK4? delta 881 zcmZ8fO>Yum5C#uT4?UWGc#)7MJ=C~BiAZ}ev+p9^@*%8+WjU3X!Y+~Jy9+fj+-qV? 
z_7C_IOnUPNc-EU|PkQtpI5RKO#y4ShXP#$fpJ!e^{Eq(o75#j35&d!d{^DlCjY~OP zJYD#`LHtQ9eofEgm@0CD#f9Hmtc#p5^Iq|q81LvrcDKN?UfIXfw%74nyPx(nI!ED0 z_r(L4hu_`(t)cD$=zyo}-bTGIn@e%lbNp<&IPhjIJKZY^=ymMmNcPNU<{InYVfm!j z5%)1~<}}=Yeg9#7#=4DUw%bV1R{`eu%U|~898M`+YLEjcx~NH%pbFqnE=Z$6R3+9h z4mIsLX1TISMS(6bp}=Me*a$j*!Qvw23U~aophFqxQ!UW6RwbipG`U(LQfN+gw%Hz0 zpdPGHSJc9<J@@|kszRCgwe_Vc3Rr~EAA;|R^m*^geQ}OCGLZP)Or{=bQ4(A zb=C`QBB)oUva4pQ9!R^MXb;|S@pT2866YA$x?sQJ&Ki?oNRtN5Q@tx~tJ`dpW3&ex zI7hLxWF>}J+-e%(c=|ZzP+`12;@83&+4hX$#$dWQPU*!;5>iB!N!{so8{1v3Bu diff --git a/examples/example_metadata.json b/examples/example_metadata.json index 4cda34a..a4b9889 100644 --- a/examples/example_metadata.json +++ b/examples/example_metadata.json @@ -10,10 +10,22 @@ ], "covering": { "bbox": { - "xmax": "bbox.xmax", - "xmin": "bbox.xmin", - "ymax": "bbox.ymax", - "ymin": "bbox.ymin" + "xmax": [ + "bbox", + "xmax" + ], + "xmin": [ + "bbox", + "xmin" + ], + "ymax": [ + "bbox", + "ymax" + ], + "ymin": [ + "bbox", + "ymin" + ] } }, "crs": { diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 8fb67b9..0a2d9e1 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -144,19 +144,19 @@ Example: ``` "covering": { "bbox": { - "xmin": "bbox.xmin", - "ymin": "bbox.ymin", - "xmax": "bbox.xmax", - "ymax": "bbox.ymax" + "xmin": ["bbox", "xmin"], + "ymin": ["bbox", "ymin"], + "xmax": ["bbox", "xmax"], + "ymax": ["bbox", "ymax"] } } ``` ##### bbox covering encoding -Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the name of a column containing the bounding box of the geometry for every row. +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group bounding box summary statistics. 
Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. -The format of `bbox` encoding is `{"xmin": "column_name.xmin", "ymin": "column_name.ymin", "xmax": "column_name.xmax", "ymax": "column_name.ymax"}` where `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. The values MUST end in `.xmin`, `.ymin`, etc. All values MUST use the same column name. +The format of the `bbox` encoding is `{"xmin": ["column_name", "xmin"], "ymin": ["column_name", "ymin"], "xmax": ["column_name", "xmax"], "ymax": ["column_name", "ymax"]}`. The arrays represent Parquet schema paths for nested groups. In this example, `column_name` is a Parquet group with fields `xmin`, `ymin`, `xmax`, `ymax`. The value in `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. In order to constrain this value to a single bounding group field, the second item in each element MUST be `xmin`, `ymin`, etc. All values MUST use the same column name. Note: the value specified in this field should not be confused with the top-level [`bbox`](#bbox) field which contains the single bounding box of this geometry over the whole GeoParquet file. 
diff --git a/format-specs/schema.json b/format-specs/schema.json index 49a155f..fe67679 100644 --- a/format-specs/schema.json +++ b/format-specs/schema.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema#", + "$schema": "http://json-schema.org/draft-06/schema#", "title": "GeoParquet", "description": "Parquet metadata included in the geo field.", "type": "object", @@ -81,20 +81,36 @@ "required": ["xmin", "xmax", "ymin", "ymax"], "properties": { "xmin": { - "type": "string", - "pattern": "\\.xmin$" + "type": "array", + "items": [ + { "type": "string" }, + { "const": "xmin" } + ], + "additionalItems": false }, "xmax": { - "type": "string", - "pattern": "\\.xmax$" + "type": "array", + "items": [ + { "type": "string" }, + { "const": "xmax" } + ], + "additionalItems": false }, "ymin": { - "type": "string", - "pattern": "\\.ymin$" + "type": "array", + "items": [ + { "type": "string" }, + { "const": "ymin" } + ], + "additionalItems": false }, "ymax": { - "type": "string", - "pattern": "\\.ymax$" + "type": "array", + "items": [ + { "type": "string" }, + { "const": "ymax" } + ], + "additionalItems": false } } } diff --git a/scripts/generate_example.py b/scripts/generate_example.py index 9b9a50d..c913c71 100644 --- a/scripts/generate_example.py +++ b/scripts/generate_example.py @@ -50,10 +50,10 @@ def get_version() -> str: "bbox": [round(x, 4) for x in df.total_bounds], "covering": { "bbox": { - "xmin": "bbox.xmin", - "ymin": "bbox.ymin", - "xmax": "bbox.xmax", - "ymax": "bbox.ymax", + "xmin": ["bbox", "xmin"], + "ymin": ["bbox", "ymin"], + "xmax": ["bbox", "xmax"], + "ymax": ["bbox", "ymax"], }, }, }, diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index 8387328..f0217c1 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -214,21 +214,24 @@ def get_version() -> str: metadata_covering_template = copy.deepcopy(metadata_template) metadata_covering_template["columns"]["geometry"]["covering"] = { "bbox": { - 
"xmin": "bbox.xmin", - "ymin": "bbox.ymin", - "xmax": "bbox.xmax", - "ymax": "bbox.ymax", + "xmin": ["bbox", "xmin"], + "ymin": ["bbox", "ymin"], + "xmax": ["bbox", "xmax"], + "ymax": ["bbox", "ymax"], }, } # Allow "any_column.xmin" etc. +metadata = copy.deepcopy(metadata_covering_template) +valid_cases["valid_default_bbox"] = metadata + metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"]["bbox"] = { - "xmin": "any_column.xmin", - "ymin": "any_column.ymin", - "xmax": "any_column.xmax", - "ymax": "any_column.ymax", + "xmin": ["any_column", "xmin"], + "ymin": ["any_column", "ymin"], + "xmax": ["any_column", "xmax"], + "ymax": ["any_column", "ymax"], } valid_cases["valid_but_not_bbox_struct_name"] = metadata @@ -256,23 +259,40 @@ def get_version() -> str: metadata["columns"]["geometry"]["covering"]["bbox"].pop("ymax") invalid_cases["covering_bbox_missing_ymax"] = metadata -## Invalid bbox xmin/xmax/ymin/ymax values +# Invalid bbox xmin/xmax/ymin/ymax values metadata = copy.deepcopy(metadata_covering_template) -metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = "not_bbox_dot_xmin" +metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = ["bbox", "not_xmin"] invalid_cases["covering_bbox_invalid_xmin"] = metadata metadata = copy.deepcopy(metadata_covering_template) -metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = "not_bbox_dot_xmax" +metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = ["bbox", "not_xmax"] invalid_cases["covering_bbox_invalid_xmax"] = metadata metadata = copy.deepcopy(metadata_covering_template) -metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = "not_bbox_dot_ymin" +metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = ["bbox", "not_ymin"] invalid_cases["covering_bbox_invalid_ymin"] = metadata metadata = copy.deepcopy(metadata_covering_template) -metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = "not_bbox_dot_ymax" 
+metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = ["bbox", "not_ymax"] invalid_cases["covering_bbox_invalid_ymax"] = metadata +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = ["bbox", "xmin", "invalid_extra"] +invalid_cases["covering_bbox_extra_xmin_elements"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = ["bbox", "xmax", "invalid_extra"] +invalid_cases["covering_bbox_extra_xmax_elements"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = ["bbox", "ymin", "invalid_extra"] +invalid_cases["covering_bbox_extra_ymin_elements"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = ["bbox", "ymax", "invalid_extra"] +invalid_cases["covering_bbox_extra_ymax_elements"] = metadata + + # # Tests @pytest.mark.parametrize( From e0b3d475a2154beb5bad562c23ea32d0beea7813 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Tue, 30 Jan 2024 13:59:06 -0500 Subject: [PATCH 10/15] Fix schema version to 7. 
Accidentally committed in last version for testing --- format-specs/schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format-specs/schema.json b/format-specs/schema.json index fe67679..10b0a7b 100644 --- a/format-specs/schema.json +++ b/format-specs/schema.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-06/schema#", + "$schema": "http://json-schema.org/draft-07/schema#", "title": "GeoParquet", "description": "Parquet metadata included in the geo field.", "type": "object", From 40ebb37b3cf701a14e9c8925e58d86e5a878ab69 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Fri, 2 Feb 2024 09:06:04 -0500 Subject: [PATCH 11/15] Update docs * Mention that bounding boxes are useful for both row group statistics and page indexes * Require that bbox encoding columns be all of the same type (float or double) --- format-specs/geoparquet.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 0a2d9e1..c0b582b 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -154,7 +154,7 @@ Example: ##### bbox covering encoding -Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group or page index bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. 
The format of the `bbox` encoding is `{"xmin": ["column_name", "xmin"], "ymin": ["column_name", "ymin"], "xmax": ["column_name", "xmax"], "ymax": ["column_name", "ymax"]}`. The arrays represent Parquet schema paths for nested groups. In this example, `column_name` is a Parquet group with fields `xmin`, `ymin`, `xmax`, `ymax`. The value in `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. In order to constrain this value to a single bounding group field, the second item in each element MUST be `xmin`, `ymin`, etc. All values MUST use the same column name. @@ -162,7 +162,7 @@ Note: the value specified in this field should not be confused with the top-leve ### Bounding Box Columns -A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax` representing the geometry's coordinate range. For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. +A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax` representing the geometry's coordinate range. For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE` and all columns MUST use the same type. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. 
In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. The bounding box column MUST be at the root of the schema. The bounding box column MUST NOT be nested in a group. From 4a3c59cf03503bef7025ae304d2a229e4ff3934b Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Fri, 2 Feb 2024 15:10:22 -0500 Subject: [PATCH 12/15] "row groups or page indexes" -> "row groups and page indexes" --- format-specs/geoparquet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index c0b582b..9b4416f 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -154,7 +154,7 @@ Example: ##### bbox covering encoding -Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group or page index bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group and page index bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. The format of the `bbox` encoding is `{"xmin": ["column_name", "xmin"], "ymin": ["column_name", "ymin"], "xmax": ["column_name", "xmax"], "ymax": ["column_name", "ymax"]}`. The arrays represent Parquet schema paths for nested groups. In this example, `column_name` is a Parquet group with fields `xmin`, `ymin`, `xmax`, `ymax`. 
The value in `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. In order to constrain this value to a single bounding group field, the second item in each element MUST be `xmin`, `ymin`, etc. All values MUST use the same column name. From c540208412dd18cb8e627de9aa071500c6147025 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Thu, 15 Feb 2024 13:00:59 -0500 Subject: [PATCH 13/15] Mention anti-meridian crossing --- format-specs/geoparquet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 9b4416f..518030f 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -162,7 +162,7 @@ Note: the value specified in this field should not be confused with the top-leve ### Bounding Box Columns -A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax` representing the geometry's coordinate range. For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE` and all columns MUST use the same type. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. +A bounding box column MUST be a Parquet group field with 4 child fields named `xmin`, `xmax`, `ymin`, and `ymax` representing the geometry's coordinate range. As with the top-level [`bbox`](#bbox) column, the values follow the GeoJSON specification (RFC 7946, section 5), which also describes how to represent the bbox for geometries that cross the antimeridian. 
For three dimensions the additional fields `zmin` and `zmax` MAY be present but are not required. The fields MUST be of Parquet type `FLOAT` or `DOUBLE` and all columns MUST use the same type. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. The bounding box column MUST be at the root of the schema. The bounding box column MUST NOT be nested in a group. From f3edcaf970e58b8f67b1e6f74fc50227a9e32c61 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 11 Mar 2024 09:06:41 -0400 Subject: [PATCH 14/15] Mention not supporting antimeridian crossings --- format-specs/geoparquet.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 518030f..4f01b2f 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -154,11 +154,13 @@ Example: ##### bbox covering encoding -Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group and page index bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group and page index bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. 
The format of the `bbox` encoding is `{"xmin": ["column_name", "xmin"], "ymin": ["column_name", "ymin"], "xmax": ["column_name", "xmax"], "ymax": ["column_name", "ymax"]}`. The arrays represent Parquet schema paths for nested groups. In this example, `column_name` is a Parquet group with fields `xmin`, `ymin`, `xmax`, `ymax`. The value in `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. In order to constrain this value to a single bounding group field, the second item in each element MUST be `xmin`, `ymin`, etc. All values MUST use the same column name. -Note: the value specified in this field should not be confused with the top-level [`bbox`](#bbox) field which contains the single bounding box of this geometry over the whole GeoParquet file. +The value specified in this field should not be confused with the top-level [`bbox`](#bbox) field which contains the single bounding box of this geometry over the whole GeoParquet file. + +Note: This technique to use the bounding box to improve spatial queries does not apply to geometries that cross the antimeridian. Such geometries are unsupported by this method. ### Bounding Box Columns From eb6d3e02116306ea356d39f72315d7305096f0f6 Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 11 Mar 2024 09:11:25 -0400 Subject: [PATCH 15/15] Remove trailing whitespace for linter --- format-specs/geoparquet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 4f01b2f..8b2e79d 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -154,7 +154,7 @@ Example: ##### bbox covering encoding -Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group and page index bounding box summary statistics. 
Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group and page index bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the column name and fields containing the bounding box of the geometry for every row. The format of the `bbox` encoding is `{"xmin": ["column_name", "xmin"], "ymin": ["column_name", "ymin"], "xmax": ["column_name", "xmax"], "ymax": ["column_name", "ymax"]}`. The arrays represent Parquet schema paths for nested groups. In this example, `column_name` is a Parquet group with fields `xmin`, `ymin`, `xmax`, `ymax`. The value in `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. In order to constrain this value to a single bounding group field, the second item in each element MUST be `xmin`, `ymin`, etc. All values MUST use the same column name.