From 6d2715fc1e3e2ab353190ba524ca0d0f95899b89 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 10 Dec 2024 20:59:46 -0500 Subject: [PATCH] [c++] Fix upgrade-soma-joinid-shape for dataframes having non-standard dimensions (#3416) * [c++] Fix upgrade-shape for dataframes with non-standard dimensions * unit testing * code-review feedback --- .../nonstandard-dataframe-without-shapes.tgz | Bin 0 -> 1645 bytes apis/python/tests/test_shape.py | 59 ++++++++++++++++++ libtiledbsoma/src/soma/soma_array.cc | 3 +- 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 apis/python/testdata/nonstandard-dataframe-without-shapes.tgz diff --git a/apis/python/testdata/nonstandard-dataframe-without-shapes.tgz b/apis/python/testdata/nonstandard-dataframe-without-shapes.tgz new file mode 100644 index 0000000000000000000000000000000000000000..486c8ac3448fd8be2eb901af70487f9b8db680d7 GIT binary patch literal 1645 zcmV-z29o(7iwFQ<%2;Or1MOT5Y!g)+e>b|W`>tas9R%tUaZKp$`&t&3fteqxhB$(V zF=?;u73R7POOc@=j06ZXiX?;&Kca|)hzaE1kk(o zy?J-5YrFMs6yGm7es}*p_`Q3-`~UCl-5stvU^qyU6gbWqXFkCs%_bPaNwEUWQ!GbP zq?4p5hGKw|Grns8EM627qjFc;q-bQYc5<_Dc#xmN^XTUsfve8vYnNi8k(WW{{?i;I zVE@g4;(tU8NRdUxT>+W<&yXC>e^a3NAC8Li+og_}QH3CL|5=u%vHxa3@juwv-rgEB zt^f=n|7o7Z{+j{C|3$$CQoDgOfFad?3j1#c)b-z}3NQr!IRVfAOo7&apN|q4#x2kk z!+R)?{)m@?q+6hr6O`N%6OtH)=fgsnrvn~}cKc{L=@pc*|1`t#9QNM~WZVA`<%)#@ zS`E25&$5H~F9)^$vvL1PGX~&)?mq$9CU1Ain_U%0v6@foZaQr<0HE^$*pfd2y~HpI zSCDI{2wWx{8#ec0krv#V)6X9uK=J3{;<;AK4KV$tmi+SdHz#^Fd+X08t1CN=gWnw$axo89XQt&SZY&-HWH zn>>?2{jU5Lr59-F0R3y}_Cjf23 z2#R$P>-|po5+3y{a2PmNer2refVgucjB|Y4c#vlSSFz)_nFOW!E162@9Y?Wv8L`&N zcDx6|Ie!E6$TvX34xEY#GYpmTBr4@drQ7?Ay$V?W1V5?A%a);xEy;MHnBseP{Mg7# zE7={>NdBDI{jYCVMiJ*aN$#mOwG3uy?O0ESo}3`qElpo1NdxkP8o-bC$#!UT3Xy z%O14MIa6^@?c4TpuyxER!K5>7fK4>3U?itIg3Hu2=N8;SM&KAx2#b7( z!3cU#isERYbAc3zbS~+PMnb5wdFY&g{%wcpy7+RC9t6nur`aB1HROwsAA?=|0CENF z>SD;JAlHvd@LP}xOM*j?4?(VlgYXLE%aEV6CU`I8@dXK94*4wPCU_mZ54o%`!OubN zgDk*7e-jOUae{)7zlJ=yB*AMTUx7TgG{O5J-&>a8m5?t$ZZ1#oPRMq6y={W*|JT~v z7}ye<4Vj z|E&J`FVFHk&VMtYo`)zmaCQEp7y)wPR)#SIWdHpSIsc>DIY8?BzdrvtmL+lin*o&n z%4B&o(B?mi#qa+`PpE%=8G)W+7y;GyKR%z-vAA7|im}$tj%BQt{2I+0(DDZ8+##`atiLs#o;Mea~6vp13uy&+%KyyuMG?H0=3(Vc~Z_yz8nzKko49>hqhU6PiDpTrhuM zXjx_3k2iK7CMH%L`Rig6H7&BQ@6a1pFKk@Baaxh>%E>clPh70p_tey7(e4{hTaLd& zZo4+=rd6C?cg^6T%?YT#{~h-1&$T2POQMl-{x|UZFG0Y+|CxgR|Nld6zS~XfkC>3W zLxzHG)+0L~2$L);N(>v~!XCFPt9ZfxZcYxCqDw7ox3=9kmjI7`v*+Ob~08Rh^H$6iA literal 0 HcmV?d00001 diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py index 7b0437c0c2..0894003a02 100644 --- a/apis/python/tests/test_shape.py +++ b/apis/python/tests/test_shape.py @@ -671,3 +671,62 @@ def _check_ndarray(ndarray, has_shapes, expected_shape): _check_ndarray(exp.ms["RNA"].obsm["X_pca"], True, (2639, 50)) _check_ndarray(exp.ms["RNA"].obsp["connectivities"], True, (2639, 2639)) _check_ndarray(exp.ms["RNA"].varm["PCs"], True, (1839, 50)) + + +def test_canned_nonstandard_dataframe_upgrade(tmp_path): + uri = tmp_path.as_posix() + + tgz = TESTDATA / "nonstandard-dataframe-without-shapes.tgz" + + with tarfile.open(tgz) as handle: + handle.extractall(uri) + + # ---------------------------------------------------------------- + # As of tiledbsoma 1.15 we no longer write dataframes/arrays without + # core current domain (soma domain/shape) so it is crucial that in order + # to test upgrade-shape, we test saved-off data written before 1.15.0. + # + # Here is a dataframe created with tiledbsoma 1.14.5: + # ---------------------------------------------------------------- + # import tiledbsoma + # import pyarrow as pa + # + # schema = pa.schema([ + # ("soma_joinid", pa.int64()), + # ("mystring", pa.string()), + # ("myint", pa.int32()), + # ("myfloat", pa.float32()), + # ]) + # + # data = pa.Table.from_pydict({ + # "soma_joinid": [10, 20], + # "mystring": ["hello", "world"], + # "myint": [1330, 1440], + # "myfloat": [4.5, 5.5], + # }) + # + # with tiledbsoma.DataFrame.create( + # uri="data-sdf-dom-multi-py", + # schema=schema, + # index_column_names=["soma_joinid", "myint", "mystring"], + # domain=None, + # ) as sdf: + # sdf.write(data) + # ---------------------------------------------------------------- + + with tiledbsoma.DataFrame.open(uri) as sdf: + assert not sdf.tiledbsoma_has_upgraded_domain + assert sdf.non_empty_domain() == ((10, 20), (1330, 1440), ("hello", "world")) + assert sdf.domain == ((0, 2147483646), (-2147483648, 2147481598), ("", "")) + assert sdf.maxdomain == ((0, 2147483646), (-2147483648, 2147481598), ("", "")) + + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + ok, msg = sdf.tiledbsoma_upgrade_soma_joinid_shape(1, check_only=True) + + sdf.tiledbsoma_upgrade_soma_joinid_shape(100) + + with tiledbsoma.DataFrame.open(uri) as sdf: + assert sdf.tiledbsoma_has_upgraded_domain + assert sdf.non_empty_domain() == ((10, 20), (1330, 1440), ("hello", "world")) + assert sdf.domain == ((0, 99), (-2147483648, 2147481598), ("", "")) + assert sdf.maxdomain == ((0, 2147483646), (-2147483648, 2147481598), ("", "")) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e1d5a74f6e..84c6dae73c 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -1036,8 +1036,7 @@ void SOMAArray::_set_soma_joinid_shape_helper( tiledb::impl::type_to_str(dim.type()))); } ndrect.set_range(dim_name, 0, newshape - 1); - continue; - + } else { switch (dim.type()) { case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: