From 634f886740460d9e8eec8ad15b9fcc87024aad89 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 10 Dec 2024 17:39:48 -0500 Subject: [PATCH 1/3] [c++] Fix upgrade-shape for dataframes with non-standard dimensions --- libtiledbsoma/src/soma/soma_array.cc | 202 +++++++++++++-------------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e25958b460..339f207a49 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -29,10 +29,10 @@ * This file defines the SOMAArray class. */ -#include "soma_array.h" #include #include "../utils/logger.h" #include "../utils/util.h" +#include "soma_array.h" #include @@ -1039,107 +1039,107 @@ void SOMAArray::_set_soma_joinid_shape_helper( } ndrect.set_range(dim_name, 0, newshape - 1); continue; + } - switch (dim.type()) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: - // See comments in soma_array.h. - ndrect.set_range(dim_name, "", "\x7f"); - break; - - case TILEDB_INT8: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_BOOL: - case TILEDB_UINT8: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_INT16: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_UINT16: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_INT32: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_UINT32: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_INT64: - case TILEDB_DATETIME_YEAR: - case TILEDB_DATETIME_MONTH: - case TILEDB_DATETIME_WEEK: - case TILEDB_DATETIME_DAY: - case TILEDB_DATETIME_HR: - case TILEDB_DATETIME_MIN: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - case TILEDB_DATETIME_PS: - case TILEDB_DATETIME_FS: - case TILEDB_DATETIME_AS: - case TILEDB_TIME_HR: - case TILEDB_TIME_MIN: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_TIME_PS: - case TILEDB_TIME_FS: - case TILEDB_TIME_AS: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_UINT64: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_FLOAT32: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_FLOAT64: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - default: - throw TileDBSOMAError(std::format( - "{}: internal error: unhandled type {} for {}.", - function_name_for_messages, - tiledb::impl::type_to_str(dim.type()), - dim_name)); - } + switch (dim.type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + case TILEDB_GEOM_WKB: + case TILEDB_GEOM_WKT: + // See comments in soma_array.h. + ndrect.set_range(dim_name, "", "\x7f"); + break; + + case TILEDB_INT8: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_BOOL: + case TILEDB_UINT8: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_INT16: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_UINT16: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_INT32: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_UINT32: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_INT64: + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_TIME_HR: + case TILEDB_TIME_MIN: + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_TIME_PS: + case TILEDB_TIME_FS: + case TILEDB_TIME_AS: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_UINT64: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_FLOAT32: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_FLOAT64: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + default: + throw TileDBSOMAError(std::format( + "{}: internal error: unhandled type {} for {}.", + function_name_for_messages, + tiledb::impl::type_to_str(dim.type()), + dim_name)); } } From 1521201766d3d35b59decf6860ffa071760cf53e Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 10 Dec 2024 18:25:06 -0500 Subject: [PATCH 2/3] unit testing --- .../nonstandard-dataframe-without-shapes.tgz | Bin 0 -> 1645 bytes apis/python/tests/test_shape.py | 59 ++++++++++++++++++ libtiledbsoma/src/soma/soma_array.cc | 2 +- 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 apis/python/testdata/nonstandard-dataframe-without-shapes.tgz diff --git a/apis/python/testdata/nonstandard-dataframe-without-shapes.tgz b/apis/python/testdata/nonstandard-dataframe-without-shapes.tgz new file mode 100644 index 0000000000000000000000000000000000000000..486c8ac3448fd8be2eb901af70487f9b8db680d7 GIT binary patch literal 1645 zcmV-z29o(7iwFQ<%2;Or1MOT5Y!g)+e>b|W`>tas9R%tUaZKp$`&t&3fteqxhB$(V zF=?;u73R7POOc@=j06ZXiX?;&Kca|)hzaE1kk(o zy?J-5YrFMs6yGm7es}*p_`Q3-`~UCl-5stvU^qyU6gbWqXFkCs%_bPaNwEUWQ!GbP zq?4p5hGKw|Grns8EM627qjFc;q-bQYc5<_Dc#xmN^XTUsfve8vYnNi8k(WW{{?i;I zVE@g4;(tU8NRdUxT>+W<&yXC>e^a3NAC8Li+og_}QH3CL|5=u%vHxa3@juwv-rgEB zt^f=n|7o7Z{+j{C|3$$CQoDgOfFad?3j1#c)b-z}3NQr!IRVfAOo7&apN|q4#x2kk z!+R)?{)m@?q+6hr6O`N%6OtH)=fgsnrvn~}cKc{L=@pc*|1`t#9QNM~WZVA`<%)#@ zS`E25&$5H~F9)^$vvL1PGX~&)?mq$9CU1Ain_U%0v6@foZaQr<0HE^$*pfd2y~HpI zSCDI{2wWx{8#ec0krv#V)6X9uK=J3{;<;AK4KV$tmi+SdHz#^Fd+X08t1CN=gWnw$axo89XQt&SZY&-HWH zn>>?2{jU5Lr59-F0R3y}_Cjf23 z2#R$P>-|po5+3y{a2PmNer2refVgucjB|Y4c#vlSSFz)_nFOW!E162@9Y?Wv8L`&N zcDx6|Ie!E6$TvX34xEY#GYpmTBr4@drQ7?Ay$V?W1V5?A%a);xEy;MHnBseP{Mg7# zE7={>NdBDI{jYCVMiJ*aN$#mOwG3uy?O0ESo}3`qElpo1NdxkP8o-bC$#!UT3Xy z%O14MIa6^@?c4TpuyxER!K5>7fK4>3U?itIg3Hu2=N8;SM&KAx2#b7( z!3cU#isERYbAc3zbS~+PMnb5wdFY&g{%wcpy7+RC9t6nur`aB1HROwsAA?=|0CENF z>SD;JAlHvd@LP}xOM*j?4?(VlgYXLE%aEV6CU`I8@dXK94*4wPCU_mZ54o%`!OubN zgDk*7e-jOUae{)7zlJ=yB*AMTUx7TgG{O5J-&>a8m5?t$ZZ1#oPRMq6y={W*|JT~v z7}ye<4Vj z|E&J`FVFHk&VMtYo`)zmaCQEp7y)wPR)#SIWdHpSIsc>DIY8?BzdrvtmL+lin*o&n z%4B&o(B?mi#qa+`PpE%=8G)W+7y;GyKR%z-vAA7|im}$tj%BQt{2I+0(DDZ8+##`atiLs#o;Mea~6vp13uy&+%KyyuMG?H0=3(Vc~Z_yz8nzKko49>hqhU6PiDpTrhuM zXjx_3k2iK7CMH%L`Rig6H7&BQ@6a1pFKk@Baaxh>%E>clPh70p_tey7(e4{hTaLd& zZo4+=rd6C?cg^6T%?YT#{~h-1&$T2POQMl-{x|UZFG0Y+|CxgR|Nld6zS~XfkC>3W zLxzHG)+0L~2$L);N(>v~!XCFPt9ZfxZcYxCqDw7ox3=9kmjI7`v*+Ob~08Rh^H$6iA literal 0 HcmV?d00001 diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py index 7b0437c0c2..0894003a02 100644 --- a/apis/python/tests/test_shape.py +++ b/apis/python/tests/test_shape.py @@ -671,3 +671,62 @@ def _check_ndarray(ndarray, has_shapes, expected_shape): _check_ndarray(exp.ms["RNA"].obsm["X_pca"], True, (2639, 50)) _check_ndarray(exp.ms["RNA"].obsp["connectivities"], True, (2639, 2639)) _check_ndarray(exp.ms["RNA"].varm["PCs"], True, (1839, 50)) + + +def test_canned_nonstandard_dataframe_upgrade(tmp_path): + uri = tmp_path.as_posix() + + tgz = TESTDATA / "nonstandard-dataframe-without-shapes.tgz" + + with tarfile.open(tgz) as handle: + handle.extractall(uri) + + # ---------------------------------------------------------------- + # As of tiledbsoma 1.15 we no longer write dataframes/arrays without + # core current domain (soma domain/shape) so it is crucial that in order + # to test upgrade-shape, we test saved-off data written before 1.15.0. + # + # Here is a dataframe created with tiledbsoma 1.14.5: + # ---------------------------------------------------------------- + # import tiledbsoma + # import pyarrow as pa + # + # schema = pa.schema([ + # ("soma_joinid", pa.int64()), + # ("mystring", pa.string()), + # ("myint", pa.int32()), + # ("myfloat", pa.float32()), + # ]) + # + # data = pa.Table.from_pydict({ + # "soma_joinid": [10, 20], + # "mystring": ["hello", "world"], + # "myint": [1330, 1440], + # "myfloat": [4.5, 5.5], + # }) + # + # with tiledbsoma.DataFrame.create( + # uri="data-sdf-dom-multi-py", + # schema=schema, + # index_column_names=["soma_joinid", "myint", "mystring"], + # domain=None, + # ) as sdf: + # sdf.write(data) + # ---------------------------------------------------------------- + + with tiledbsoma.DataFrame.open(uri) as sdf: + assert not sdf.tiledbsoma_has_upgraded_domain + assert sdf.non_empty_domain() == ((10, 20), (1330, 1440), ("hello", "world")) + assert sdf.domain == ((0, 2147483646), (-2147483648, 2147481598), ("", "")) + assert sdf.maxdomain == ((0, 2147483646), (-2147483648, 2147481598), ("", "")) + + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + ok, msg = sdf.tiledbsoma_upgrade_soma_joinid_shape(1, check_only=True) + + sdf.tiledbsoma_upgrade_soma_joinid_shape(100) + + with tiledbsoma.DataFrame.open(uri) as sdf: + assert sdf.tiledbsoma_has_upgraded_domain + assert sdf.non_empty_domain() == ((10, 20), (1330, 1440), ("hello", "world")) + assert sdf.domain == ((0, 99), (-2147483648, 2147481598), ("", "")) + assert sdf.maxdomain == ((0, 2147483646), (-2147483648, 2147481598), ("", "")) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 339f207a49..68cd1e9187 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -29,10 +29,10 @@ * This file defines the SOMAArray class. */ +#include "soma_array.h" #include #include "../utils/logger.h" #include "../utils/util.h" -#include "soma_array.h" #include From e048cce3658c8d2b2e7003547ad45884e8a24546 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 10 Dec 2024 20:32:41 -0500 Subject: [PATCH 3/3] code-review feedback --- libtiledbsoma/src/soma/soma_array.cc | 203 +++++++++++++-------------- 1 file changed, 101 insertions(+), 102 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 68cd1e9187..337b1c86da 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -1038,108 +1038,107 @@ void SOMAArray::_set_soma_joinid_shape_helper( tiledb::impl::type_to_str(dim.type()))); } ndrect.set_range(dim_name, 0, newshape - 1); - continue; - } - - switch (dim.type()) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: - // See comments in soma_array.h. - ndrect.set_range(dim_name, "", "\x7f"); - break; - - case TILEDB_INT8: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_BOOL: - case TILEDB_UINT8: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_INT16: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_UINT16: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_INT32: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_UINT32: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_INT64: - case TILEDB_DATETIME_YEAR: - case TILEDB_DATETIME_MONTH: - case TILEDB_DATETIME_WEEK: - case TILEDB_DATETIME_DAY: - case TILEDB_DATETIME_HR: - case TILEDB_DATETIME_MIN: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - case TILEDB_DATETIME_PS: - case TILEDB_DATETIME_FS: - case TILEDB_DATETIME_AS: - case TILEDB_TIME_HR: - case TILEDB_TIME_MIN: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_TIME_PS: - case TILEDB_TIME_FS: - case TILEDB_TIME_AS: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_UINT64: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_FLOAT32: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - case TILEDB_FLOAT64: - ndrect.set_range( - dim_name, - dim.domain().first, - dim.domain().second); - break; - default: - throw TileDBSOMAError(std::format( - "{}: internal error: unhandled type {} for {}.", - function_name_for_messages, - tiledb::impl::type_to_str(dim.type()), - dim_name)); + } else { + switch (dim.type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + case TILEDB_GEOM_WKB: + case TILEDB_GEOM_WKT: + // See comments in soma_array.h. + ndrect.set_range(dim_name, "", "\x7f"); + break; + + case TILEDB_INT8: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_BOOL: + case TILEDB_UINT8: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_INT16: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_UINT16: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_INT32: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_UINT32: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_INT64: + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_TIME_HR: + case TILEDB_TIME_MIN: + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_TIME_PS: + case TILEDB_TIME_FS: + case TILEDB_TIME_AS: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_UINT64: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_FLOAT32: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + case TILEDB_FLOAT64: + ndrect.set_range( + dim_name, + dim.domain().first, + dim.domain().second); + break; + default: + throw TileDBSOMAError(std::format( + "{}: internal error: unhandled type {} for {}.", + function_name_for_messages, + tiledb::impl::type_to_str(dim.type()), + dim_name)); + } } }