Skip to content

Commit

Permalink
WIP add platform config in C++
Browse files Browse the repository at this point in the history
  • Loading branch information
nguyenv committed Mar 26, 2024
1 parent 0b029d7 commit 5417c17
Show file tree
Hide file tree
Showing 15 changed files with 147 additions and 26 deletions.
19 changes: 18 additions & 1 deletion apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,23 @@ def create(
domains = pa.StructArray.from_arrays(domains, names=index_column_names)
extents = pa.StructArray.from_arrays(extents, names=index_column_names)

print(platform_config)
plt_cfg = None
if platform_config:
ops = TileDBCreateOptions.from_platform_config(platform_config)
plt_cfg = clib.PlatformConfig()
plt_cfg.dataframe_dim_zstd_level = ops.dataframe_dim_zstd_level
plt_cfg.sparse_nd_array_dim_zstd_level = ops.sparse_nd_array_dim_zstd_level
plt_cfg.write_X_chunked = ops.write_X_chunked
plt_cfg.goal_chunk_nnz = ops.goal_chunk_nnz
plt_cfg.capacity = ops.capacity
if ops.offsets_filters:
plt_cfg.offsets_filters = [info["_type"] for info in ops.offsets_filters]
if ops.validity_filters:
plt_cfg.validity_filters = [info["_type"] for info in ops.validity_filters]
plt_cfg.allows_duplicates = ops.allows_duplicates
plt_cfg.tile_order = ops.tile_order
plt_cfg.cell_order = ops.cell_order
plt_cfg.consolidate_and_vacuum = ops.consolidate_and_vacuum

# TODO add as kw args
clib.SOMADataFrame.create(
Expand All @@ -254,6 +270,7 @@ def create(
domains,
extents,
context.native_context,
plt_cfg,
)

handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp)
Expand Down
20 changes: 20 additions & 0 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,26 @@ PYBIND11_MODULE(pytiledbsoma, m) {
},
"Print TileDB internal statistics. Lifecycle: experimental.");

// Register PlatformConfig with the Python module under the name
// "PlatformConfig". It is default-constructible from Python (py::init<>())
// and each member listed below is exposed as a read/write attribute with the
// same name as the C++ field.
py::class_<PlatformConfig>(m, "PlatformConfig")
.def(py::init<>())
.def_readwrite(
"dataframe_dim_zstd_level",
&PlatformConfig::dataframe_dim_zstd_level)
.def_readwrite(
"sparse_nd_array_dim_zstd_level",
&PlatformConfig::sparse_nd_array_dim_zstd_level)
.def_readwrite("write_X_chunked", &PlatformConfig::write_X_chunked)
.def_readwrite("goal_chunk_nnz", &PlatformConfig::goal_chunk_nnz)
.def_readwrite("remote_cap_nbytes", &PlatformConfig::remote_cap_nbytes)
.def_readwrite("capacity", &PlatformConfig::capacity)
.def_readwrite("offsets_filters", &PlatformConfig::offsets_filters)
.def_readwrite("validity_filters", &PlatformConfig::validity_filters)
.def_readwrite("allows_duplicates", &PlatformConfig::allows_duplicates)
.def_readwrite("tile_order", &PlatformConfig::tile_order)
.def_readwrite("cell_order", &PlatformConfig::cell_order)
.def_readwrite(
"consolidate_and_vacuum", &PlatformConfig::consolidate_and_vacuum);

load_soma_context(m);
load_soma_object(m);
load_soma_array(m);
Expand Down
6 changes: 4 additions & 2 deletions apis/python/src/tiledbsoma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ void load_soma_dataframe(py::module& m) {
std::vector<std::string> index_columns_names,
py::object py_domains,
py::object py_extents,
std::shared_ptr<SOMAContext> context) {
std::shared_ptr<SOMAContext> context,
std::optional<PlatformConfig> platform_config) {
ArrowSchema schema;
uintptr_t schema_ptr = (uintptr_t)(&schema);
py_schema.attr("_export_to_c")(schema_ptr);
Expand Down Expand Up @@ -94,7 +95,8 @@ void load_soma_dataframe(py::module& m) {
index_columns_names,
std::make_shared<ArrowArray>(domains),
std::make_shared<ArrowArray>(extents)),
context);
context,
platform_config);
} catch (const std::out_of_range& e) {
throw py::type_error(e.what());
} catch (const std::exception& e) {
Expand Down
10 changes: 6 additions & 4 deletions libtiledbsoma/src/soma/soma_collection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,9 @@ std::shared_ptr<SOMAExperiment> SOMACollection::add_new_experiment(
URIType uri_type,
std::shared_ptr<SOMAContext> ctx,
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns) {
SOMAExperiment::create(uri, schema, index_columns, ctx);
ColumnIndexInfo index_columns,
std::optional<PlatformConfig> platform_config) {
SOMAExperiment::create(uri, schema, index_columns, ctx, platform_config);
std::shared_ptr<SOMAExperiment> member = SOMAExperiment::open(
uri, OpenMode::read, ctx);
this->set(std::string(uri), uri_type, std::string(key));
Expand Down Expand Up @@ -141,8 +142,9 @@ std::shared_ptr<SOMADataFrame> SOMACollection::add_new_dataframe(
URIType uri_type,
std::shared_ptr<SOMAContext> ctx,
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns) {
SOMADataFrame::create(uri, schema, index_columns, ctx);
ColumnIndexInfo index_columns,
std::optional<PlatformConfig> platform_config) {
SOMADataFrame::create(uri, schema, index_columns, ctx, platform_config);
std::shared_ptr<SOMADataFrame> member = SOMADataFrame::open(
uri, OpenMode::read, ctx);
this->set(std::string(uri), uri_type, std::string(key));
Expand Down
6 changes: 4 additions & 2 deletions libtiledbsoma/src/soma/soma_collection.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,8 @@ class SOMACollection : public SOMAGroup {
URIType uri_type,
std::shared_ptr<SOMAContext> ctx,
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns);
ColumnIndexInfo index_columns,
std::optional<PlatformConfig> platform_config = std::nullopt);

/**
* Create and add a SOMAMeasurement to the SOMACollection.
Expand Down Expand Up @@ -190,7 +191,8 @@ class SOMACollection : public SOMAGroup {
URIType uri_type,
std::shared_ptr<SOMAContext> ctx,
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns);
ColumnIndexInfo index_columns,
std::optional<PlatformConfig> platform_config = std::nullopt);

/**
* Create and add a SOMADenseNDArray to the SOMACollection.
Expand Down
5 changes: 2 additions & 3 deletions libtiledbsoma/src/soma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,10 @@ void SOMADataFrame::create(
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns,
std::shared_ptr<SOMAContext> ctx,
std::optional<PlatformConfig> platform_config,
std::optional<std::pair<uint64_t, uint64_t>> timestamp) {
PlatformConfig platform_cfg = {
{"tiledb", {{"create", {{"allow_duplicates", false}}}}}};
auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema(
ctx->tiledb_ctx(), schema, index_columns, platform_cfg);
ctx->tiledb_ctx(), schema, index_columns, platform_config);
SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame", timestamp);
}

Expand Down
1 change: 1 addition & 0 deletions libtiledbsoma/src/soma/soma_dataframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class SOMADataFrame : public SOMAArray {
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns,
std::shared_ptr<SOMAContext> ctx,
std::optional<PlatformConfig> platform_config = std::nullopt,
std::optional<std::pair<uint64_t, uint64_t>> timestamp = std::nullopt);

/**
Expand Down
8 changes: 7 additions & 1 deletion libtiledbsoma/src/soma/soma_experiment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,18 @@ void SOMAExperiment::create(
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns,
std::shared_ptr<SOMAContext> ctx,
std::optional<PlatformConfig> platform_config,
std::optional<TimestampRange> timestamp) {
std::string exp_uri(uri);

SOMAGroup::create(ctx, exp_uri, "SOMAExperiment", timestamp);
SOMADataFrame::create(
exp_uri + "/obs", schema, index_columns, ctx, timestamp);
exp_uri + "/obs",
schema,
index_columns,
ctx,
platform_config,
timestamp);
SOMACollection::create(exp_uri + "/ms", ctx, timestamp);

auto name = std::string(std::filesystem::path(uri).filename());
Expand Down
1 change: 1 addition & 0 deletions libtiledbsoma/src/soma/soma_experiment.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class SOMAExperiment : public SOMACollection {
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns,
std::shared_ptr<SOMAContext> ctx,
std::optional<PlatformConfig> platform_config = std::nullopt,
std::optional<TimestampRange> timestamp = std::nullopt);

/**
Expand Down
8 changes: 7 additions & 1 deletion libtiledbsoma/src/soma/soma_measurement.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,18 @@ void SOMAMeasurement::create(
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns,
std::shared_ptr<SOMAContext> ctx,
std::optional<PlatformConfig> platform_config,
std::optional<TimestampRange> timestamp) {
std::string exp_uri(uri);

SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement", timestamp);
SOMADataFrame::create(
exp_uri + "/var", schema, index_columns, ctx, timestamp);
exp_uri + "/var",
schema,
index_columns,
ctx,
platform_config,
timestamp);
SOMACollection::create(exp_uri + "/X", ctx, timestamp);
SOMACollection::create(exp_uri + "/obsm", ctx, timestamp);
SOMACollection::create(exp_uri + "/obsp", ctx, timestamp);
Expand Down
1 change: 1 addition & 0 deletions libtiledbsoma/src/soma/soma_measurement.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class SOMAMeasurement : public SOMACollection {
std::shared_ptr<ArrowSchema> schema,
ColumnIndexInfo index_columns,
std::shared_ptr<SOMAContext> ctx,
std::optional<PlatformConfig> platform_config = std::nullopt,
std::optional<TimestampRange> timestamp = std::nullopt);

/**
Expand Down
62 changes: 56 additions & 6 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -245,17 +245,67 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
std::shared_ptr<Context> ctx,
std::shared_ptr<ArrowSchema> arrow_schema,
ColumnIndexInfo index_column_info,
PlatformConfig platform_config) {
std::optional<PlatformConfig> platform_config) {
auto [index_column_names, domains, extents] = index_column_info;

std::cout << (platform_config["tiledb"]["create"]["allows_duplicates"] ?
"yes" :
"No")
<< std::endl;

ArraySchema schema(*ctx, TILEDB_SPARSE);
Domain domain(*ctx);

// Apply the caller-supplied creation options to the schema. When no
// PlatformConfig was provided the schema keeps its TileDB defaults.
if (platform_config) {
    // Filter name -> TileDB filter type. Built once and kept const so a
    // lookup can never insert a new entry by accident.
    static const std::map<std::string, tiledb_filter_type_t> convert_filter = {
        {"GzipFilter", TILEDB_FILTER_GZIP},
        {"ZstdFilter", TILEDB_FILTER_ZSTD},
        {"LZ4Filter", TILEDB_FILTER_LZ4},
        {"Bzip2Filter", TILEDB_FILTER_BZIP2},
        {"RleFilter", TILEDB_FILTER_RLE},
        {"DeltaFilter", TILEDB_FILTER_DELTA},
        {"DoubleDeltaFilter", TILEDB_FILTER_DOUBLE_DELTA},
        {"BitWidthReductionFilter", TILEDB_FILTER_BIT_WIDTH_REDUCTION},
        {"BitShuffleFilter", TILEDB_FILTER_BITSHUFFLE},
        {"ByteShuffleFilter", TILEDB_FILTER_BYTESHUFFLE},
        {"PositiveDeltaFilter", TILEDB_FILTER_POSITIVE_DELTA},
        {"ChecksumMD5Filter", TILEDB_FILTER_CHECKSUM_MD5},
        {"ChecksumSHA256Filter", TILEDB_FILTER_CHECKSUM_SHA256},
        {"DictionaryFilter", TILEDB_FILTER_DICTIONARY},
        {"FloatScaleFilter", TILEDB_FILTER_SCALE_FLOAT},
        {"XORFilter", TILEDB_FILTER_XOR},
        {"WebpFilter", TILEDB_FILTER_WEBP},
        {"NoOpFilter", TILEDB_FILTER_NONE},
    };

    // Builds a FilterList holding one filter per name, in the given order.
    // std::map::at throws std::out_of_range for a name that is not in the
    // table, so a misspelled filter is reported instead of being silently
    // replaced by a value-initialized filter type (what operator[] did).
    const auto make_filter_list =
        [&ctx](const std::vector<std::string>& filter_names) {
            FilterList filter_list(*ctx);
            for (const auto& filter_name : filter_names) {
                filter_list.add_filter(
                    Filter(*ctx, convert_filter.at(filter_name)));
            }
            return filter_list;
        };

    schema.set_capacity(platform_config->capacity);

    // An empty name list means "leave the schema's existing filter list".
    if (!platform_config->offsets_filters.empty()) {
        schema.set_offsets_filter_list(
            make_filter_list(platform_config->offsets_filters));
    }

    if (!platform_config->validity_filters.empty()) {
        schema.set_validity_filter_list(
            make_filter_list(platform_config->validity_filters));
    }

    schema.set_allows_dups(platform_config->allows_duplicates);

    // Orders are only set when a value is present. "row" selects row-major;
    // every other string selects column-major.
    // NOTE(review): any non-"row" spelling falls through to column-major --
    // confirm this matches the strings the bindings actually send.
    if (platform_config->tile_order) {
        schema.set_tile_order(
            platform_config->tile_order == "row" ? TILEDB_ROW_MAJOR :
                                                   TILEDB_COL_MAJOR);
    }

    if (platform_config->cell_order) {
        schema.set_cell_order(
            platform_config->cell_order == "row" ? TILEDB_ROW_MAJOR :
                                                   TILEDB_COL_MAJOR);
    }
}

std::map<std::string, Dimension> dims;

for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) {
Expand Down
20 changes: 17 additions & 3 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,22 @@ using ColumnIndexInfo = std::tuple<
std::shared_ptr<ArrowArray> // tile extent
>;

using PlatformConfig =
std::map<std::string, std::map<std::string, std::map<std::string, bool>>>;
/**
 * @brief Bundle of array-creation settings, each with a built-in default.
 *
 * A default-constructed instance is fully populated. An optional instance is
 * accepted by ArrowAdapter::tiledb_schema_from_arrow_schema (declared below).
 * Per-field notes reflect the field names and defaults only; the exact
 * consumer of each value is not visible from this header.
 */
class PlatformConfig {
   public:
    // Compression-level knobs (presumably Zstd levels for dimension
    // filters, going by the names -- TODO confirm against the consumer).
    uint64_t dataframe_dim_zstd_level{3};
    uint64_t sparse_nd_array_dim_zstd_level{3};

    // Write-path tuning values.
    bool write_X_chunked{true};
    uint64_t goal_chunk_nnz{100000000};
    uint64_t remote_cap_nbytes{2400000000};

    // Schema capacity value.
    uint64_t capacity{100000};

    // Filter names such as "ZstdFilter", applied in the listed order.
    std::vector<std::string> offsets_filters = {
        "DoubleDeltaFilter", "BitWidthReductionFilter", "ZstdFilter"};
    // Empty by default.
    std::vector<std::string> validity_filters;

    bool allows_duplicates{false};

    // Unset by default; when unset no explicit order is requested.
    std::optional<std::string> tile_order;
    std::optional<std::string> cell_order;

    bool consolidate_and_vacuum{false};
};

class ArrowAdapter {
public:
Expand Down Expand Up @@ -74,7 +88,7 @@ class ArrowAdapter {
std::shared_ptr<Context> ctx,
std::shared_ptr<ArrowSchema> arrow_schema,
ColumnIndexInfo index_column_info,
PlatformConfig platform_config);
std::optional<PlatformConfig> platform_config);

/**
* @brief Get Arrow format string from TileDB datatype.
Expand Down
4 changes: 2 additions & 2 deletions libtiledbsoma/test/unit_soma_collection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ TEST_CASE("SOMAExperiment: metadata") {
std::string uri = "mem://unit-test-experiment";
auto [schema, index_columns] = helper::create_arrow_schema();
SOMAExperiment::create(
uri, schema, index_columns, ctx, TimestampRange(0, 2));
uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2));
auto soma_experiment = SOMAExperiment::open(
uri, OpenMode::write, ctx, std::pair<uint64_t, uint64_t>(1, 1));

Expand Down Expand Up @@ -316,7 +316,7 @@ TEST_CASE("SOMAMeasurement: metadata") {
std::string uri = "mem://unit-test-measurement";
auto [schema, index_columns] = helper::create_arrow_schema();
SOMAMeasurement::create(
uri, schema, index_columns, ctx, TimestampRange(0, 2));
uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2));

auto soma_measurement = SOMAMeasurement::open(
uri, OpenMode::write, ctx, std::pair<uint64_t, uint64_t>(1, 1));
Expand Down
2 changes: 1 addition & 1 deletion libtiledbsoma/test/unit_soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ TEST_CASE("SOMADataFrame: metadata") {
std::string uri = "mem://unit-test-collection";
auto [schema, index_columns] = helper::create_arrow_schema();
SOMADataFrame::create(
uri, schema, index_columns, ctx, TimestampRange(0, 2));
uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2));

auto soma_dataframe = SOMADataFrame::open(
uri,
Expand Down

0 comments on commit 5417c17

Please sign in to comment.