-
Notifications
You must be signed in to change notification settings - Fork 25
/
soma_array.cc
1660 lines (1474 loc) · 57.1 KB
/
soma_array.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/** @file soma_array.cc
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2022-2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* @section DESCRIPTION
*
* This file defines the SOMAArray class.
*/
#include "soma_array.h"
#include <tiledb/array_experimental.h>
#include "../utils/logger.h"
#include "../utils/util.h"
#include <format>
namespace tiledbsoma {
using namespace tiledb;
//===================================================================
//= public static
//===================================================================
/**
 * Create a new TileDB array at `uri` from `schema`, write the SOMA
 * object-type and encoding-version metadata entries, and wrap the array
 * handle (left open in write mode) in a SOMAArray.
 *
 * @param ctx SOMA context supplying the TileDB context used to open the array
 * @param uri URI of the array to create
 * @param schema TileDB schema of the new array
 * @param soma_type Value stored under SOMA_OBJECT_TYPE_KEY
 * @param timestamp Optional (start, end) timestamp range used when opening
 * @return Owning pointer to the SOMAArray wrapping the new array
 */
std::unique_ptr<SOMAArray> SOMAArray::create(
    std::shared_ptr<SOMAContext> ctx,
    std::string_view uri,
    ArraySchema schema,
    std::string soma_type,
    std::optional<TimestampRange> timestamp) {
    const std::string array_uri(uri);
    Array::create(array_uri, schema);

    // Open the freshly created array for writing, honoring the timestamp
    // range when one was given.
    std::shared_ptr<Array> handle =
        timestamp ? std::make_shared<Array>(
                        *ctx->tiledb_ctx(),
                        array_uri,
                        TILEDB_WRITE,
                        TemporalPolicy(
                            TimestampStartEnd,
                            timestamp->first,
                            timestamp->second)) :
                    std::make_shared<Array>(
                        *ctx->tiledb_ctx(), array_uri, TILEDB_WRITE);

    // Record what kind of SOMA object this array is.
    const auto type_length = static_cast<uint32_t>(soma_type.length());
    handle->put_metadata(
        SOMA_OBJECT_TYPE_KEY,
        TILEDB_STRING_UTF8,
        type_length,
        soma_type.c_str());

    // Record the encoding version.
    const auto version_length = static_cast<uint32_t>(
        ENCODING_VERSION_VAL.length());
    handle->put_metadata(
        ENCODING_VERSION_KEY,
        TILEDB_STRING_UTF8,
        version_length,
        ENCODING_VERSION_VAL.c_str());

    return std::make_unique<SOMAArray>(ctx, handle, timestamp);
}
/**
 * Open an existing array, building a fresh SOMAContext from
 * `platform_config`.
 *
 * @param mode Read or write
 * @param uri URI of the array
 * @param name Name forwarded to the SOMAArray constructor
 * @param platform_config Key/value configuration for the new SOMAContext
 * @param column_names Columns to select (forwarded to the constructor)
 * @param batch_size Batch size setting (forwarded to the constructor)
 * @param result_order Result layout (forwarded to the constructor)
 * @param timestamp Optional (start, end) timestamp range
 * @return Owning pointer to the opened SOMAArray
 */
std::unique_ptr<SOMAArray> SOMAArray::open(
    OpenMode mode,
    std::string_view uri,
    std::string_view name,
    std::map<std::string, std::string> platform_config,
    std::vector<std::string> column_names,
    std::string_view batch_size,
    ResultOrder result_order,
    std::optional<TimestampRange> timestamp) {
    LOG_DEBUG(
        std::format("[SOMAArray] static method 'cfg' opening array '{}'", uri));

    // Build the context from the supplied configuration, then construct the
    // array through the context-taking constructor.
    auto new_ctx = std::make_shared<SOMAContext>(platform_config);
    return std::make_unique<SOMAArray>(
        mode,
        uri,
        new_ctx,
        name,
        column_names,
        batch_size,
        result_order,
        timestamp);
}
/**
 * Open an existing array using a caller-supplied SOMAContext.
 *
 * @param mode Read or write
 * @param uri URI of the array
 * @param ctx Shared SOMA context
 * @param name Name forwarded to the SOMAArray constructor
 * @param column_names Columns to select (forwarded to the constructor)
 * @param batch_size Batch size setting (forwarded to the constructor)
 * @param result_order Result layout (forwarded to the constructor)
 * @param timestamp Optional (start, end) timestamp range
 * @return Owning pointer to the opened SOMAArray
 */
std::unique_ptr<SOMAArray> SOMAArray::open(
    OpenMode mode,
    std::string_view uri,
    std::shared_ptr<SOMAContext> ctx,
    std::string_view name,
    std::vector<std::string> column_names,
    std::string_view batch_size,
    ResultOrder result_order,
    std::optional<TimestampRange> timestamp) {
    LOG_DEBUG(
        std::format("[SOMAArray] static method 'ctx' opening array '{}'", uri));

    auto opened = std::make_unique<SOMAArray>(
        mode,
        uri,
        ctx,
        name,
        column_names,
        batch_size,
        result_order,
        timestamp);
    return opened;
}
//===================================================================
//= public non-static
//===================================================================
/**
 * Construct a SOMAArray by opening the array at `uri`, creating a new
 * SOMAContext from `platform_config`.
 *
 * @param mode Read or write
 * @param uri URI of the array (passed through util::rstrip_uri before use)
 * @param name Name handed to the managed query created in validate()
 * @param platform_config Key/value configuration for the new SOMAContext
 * @param column_names Columns to select; an empty list selects nothing
 * explicitly (see reset())
 * @param batch_size Batch size setting stored by reset()
 * @param result_order Result layout applied to the managed query
 * @param timestamp Optional (start, end) timestamp range for opening
 */
SOMAArray::SOMAArray(
OpenMode mode,
std::string_view uri,
std::string_view name,
std::map<std::string, std::string> platform_config,
std::vector<std::string> column_names,
std::string_view batch_size,
ResultOrder result_order,
std::optional<TimestampRange> timestamp)
: uri_(util::rstrip_uri(uri))
, result_order_(result_order)
, timestamp_(timestamp) {
// ctx_ must be set before validate(), which opens the array through it.
ctx_ = std::make_shared<SOMAContext>(platform_config);
// Opens the array, loads the schema and creates the managed query.
validate(mode, name, timestamp);
// Applies column selection and layout to the managed query.
reset(column_names, batch_size, result_order);
// Loads the array metadata into metadata_.
fill_metadata_cache();
}
/**
 * Construct a SOMAArray by opening the array at `uri` with a caller-supplied
 * SOMAContext.
 *
 * @param mode Read or write
 * @param uri URI of the array (passed through util::rstrip_uri before use)
 * @param ctx Shared SOMA context used to open the array
 * @param name Name handed to the managed query created in validate()
 * @param column_names Columns to select; an empty list selects nothing
 * explicitly (see reset())
 * @param batch_size Batch size setting stored by reset()
 * @param result_order Result layout applied to the managed query
 * @param timestamp Optional (start, end) timestamp range for opening
 */
SOMAArray::SOMAArray(
OpenMode mode,
std::string_view uri,
std::shared_ptr<SOMAContext> ctx,
std::string_view name,
std::vector<std::string> column_names,
std::string_view batch_size,
ResultOrder result_order,
std::optional<TimestampRange> timestamp)
: uri_(util::rstrip_uri(uri))
, ctx_(ctx)
, result_order_(result_order)
, timestamp_(timestamp) {
// Opens the array, loads the schema and creates the managed query.
validate(mode, name, timestamp);
// Applies column selection and layout to the managed query.
reset(column_names, batch_size, result_order);
// Loads the array metadata into metadata_.
fill_metadata_cache();
}
/**
 * Construct a SOMAArray around an already-opened TileDB array handle.
 *
 * The batch size is set to "auto" and the result order to
 * ResultOrder::automatic; the schema is read from `arr`.
 *
 * @param ctx Shared SOMA context
 * @param arr Already-open TileDB array handle to wrap
 * @param timestamp Optional (start, end) timestamp range associated with
 * the handle
 */
SOMAArray::SOMAArray(
std::shared_ptr<SOMAContext> ctx,
std::shared_ptr<Array> arr,
std::optional<TimestampRange> timestamp)
: uri_(util::rstrip_uri(arr->uri()))
, ctx_(ctx)
, batch_size_("auto")
, result_order_(ResultOrder::automatic)
, timestamp_(timestamp)
// NOTE(review): name_ is not set in this initializer list, and ctx_ is read
// here; this presumably relies on both being declared before mq_ in the
// class -- confirm against the header.
, mq_(std::make_unique<ManagedQuery>(arr, ctx_->tiledb_ctx(), name_))
, arr_(arr)
, schema_(std::make_shared<ArraySchema>(arr->schema())) {
// No explicit column selection; apply the default batch size and layout.
reset({}, batch_size_, result_order_);
// Loads the array metadata into metadata_.
fill_metadata_cache();
}
void SOMAArray::fill_metadata_cache() {
if (arr_->query_type() == TILEDB_WRITE) {
meta_cache_arr_ = std::make_shared<Array>(
*ctx_->tiledb_ctx(),
uri_,
TILEDB_READ,
TemporalPolicy(
TimestampStartEnd, timestamp()->first, timestamp()->second));
} else {
meta_cache_arr_ = arr_;
}
metadata_.clear();
for (uint64_t idx = 0; idx < meta_cache_arr_->metadata_num(); ++idx) {
std::string key;
tiledb_datatype_t value_type;
uint32_t value_num;
const void* value;
meta_cache_arr_->get_metadata_from_index(
idx, &key, &value_type, &value_num, &value);
MetadataValue mdval(value_type, value_num, value);
std::pair<std::string, const MetadataValue> mdpair(key, mdval);
metadata_.insert(mdpair);
}
}
const std::string SOMAArray::uri() const {
return uri_;
};
std::shared_ptr<SOMAContext> SOMAArray::ctx() {
return ctx_;
};
/**
 * (Re)open this array in the given mode, keeping the current name, column
 * selection, batch size and result order.
 *
 * @param mode Read or write
 * @param timestamp Optional (start, end) timestamp range; replaces the
 * stored one
 */
void SOMAArray::open(OpenMode mode, std::optional<TimestampRange> timestamp) {
    // Remember the new timestamp first: fill_metadata_cache() reads it.
    timestamp_ = timestamp;

    // Open the array and rebuild the managed query ...
    validate(mode, name_, timestamp);

    // ... re-apply the previous query settings ...
    reset(column_names(), batch_size_, result_order_);

    // ... and refresh the metadata cache.
    fill_metadata_cache();
}
/**
 * Build a new SOMAArray on the same URI and context, with the same name,
 * column selection, batch size and result order, but opened with the given
 * mode and timestamp. This object is left untouched.
 *
 * @param mode Read or write
 * @param timestamp Optional (start, end) timestamp range for the new handle
 * @return Owning pointer to the newly opened SOMAArray
 */
std::unique_ptr<SOMAArray> SOMAArray::reopen(
    OpenMode mode, std::optional<TimestampRange> timestamp) {
    auto reopened = std::make_unique<SOMAArray>(
        mode,
        uri_,
        ctx_,
        name_,
        column_names(),
        batch_size_,
        result_order_,
        timestamp);
    return reopened;
}
/**
 * Close the array and drop the cached metadata.
 */
void SOMAArray::close() {
    // In write mode the metadata cache uses its own read handle (see
    // fill_metadata_cache), which has to be closed separately.
    const bool opened_for_write = arr_->query_type() == TILEDB_WRITE;
    if (opened_for_write) {
        meta_cache_arr_->close();
    }

    // Close the array through the managed query to ensure any pending queries
    // are completed.
    mq_->close();

    metadata_.clear();
}
/**
 * Reset the managed query and the read state of this object.
 *
 * @param column_names Columns to select; when empty, no explicit selection
 * is applied to the managed query
 * @param batch_size Batch size setting to store
 * @param result_order Result layout to apply and store
 */
void SOMAArray::reset(
    std::vector<std::string> column_names,
    std::string_view batch_size,
    ResultOrder result_order) {
    // Start from a clean managed query.
    mq_->reset();

    const bool has_selection = !column_names.empty();
    if (has_selection) {
        mq_->select_columns(column_names);
    }
    mq_->set_layout(result_order);

    // Bookkeeping for subsequent reads.
    result_order_ = result_order;
    batch_size_ = batch_size;
    submitted_ = false;
    first_read_next_ = true;
}
/**
 * Read the next batch of results.
 *
 * @return The next (possibly incomplete) batch of results, or `std::nullopt`
 * once the query is complete. An empty query yields one (empty) result on
 * the first call and `std::nullopt` afterwards.
 */
std::optional<std::shared_ptr<ArrayBuffers>> SOMAArray::read_next() {
    // Nothing left to read.
    if (mq_->is_complete(true)) {
        return std::nullopt;
    }

    // Configure query and allocate result buffers
    mq_->setup_read();

    if (mq_->is_empty_query()) {
        // An empty query is never submitted: hand back the empty results
        // exactly once.
        if (!first_read_next_) {
            return std::nullopt;
        }
        first_read_next_ = false;
        return mq_->results();
    }

    first_read_next_ = false;
    mq_->submit_read();

    // The results may be incomplete; the caller keeps calling until nullopt.
    return mq_->results();
}
/**
 * Register a column buffer for the next write (64-bit offsets overload).
 * All arguments are forwarded unchanged to the managed query.
 *
 * @param name Column name
 * @param num_elems Number of elements in the column
 * @param data Pointer to the column data
 * @param offsets Offsets buffer (64-bit)
 * @param validity Validity buffer
 */
void SOMAArray::set_column_data(
    std::string_view name,
    uint64_t num_elems,
    const void* data,
    uint64_t* offsets,
    uint8_t* validity) {
    auto& query = *mq_;
    query.setup_write_column(name, num_elems, data, offsets, validity);
}
/**
 * Register a column buffer for the next write (32-bit offsets overload).
 * All arguments are forwarded unchanged to the managed query.
 *
 * @param name Column name
 * @param num_elems Number of elements in the column
 * @param data Pointer to the column data
 * @param offsets Offsets buffer (32-bit)
 * @param validity Validity buffer
 */
void SOMAArray::set_column_data(
    std::string_view name,
    uint64_t num_elems,
    const void* data,
    uint32_t* offsets,
    uint8_t* validity) {
    auto& query = *mq_;
    query.setup_write_column(name, num_elems, data, offsets, validity);
}
/**
 * @return Number of dimensions in the array schema's domain.
 */
uint64_t SOMAArray::ndim() const {
    const auto domain = tiledb_schema()->domain();
    return domain.ndim();
}
/**
 * @return Names of the schema's dimensions, in domain order.
 */
std::vector<std::string> SOMAArray::dimension_names() const {
    const auto dims = tiledb_schema()->domain().dimensions();

    std::vector<std::string> names;
    names.reserve(dims.size());
    for (size_t i = 0; i < dims.size(); ++i) {
        names.push_back(dims[i].name());
    }
    return names;
}
/**
 * @param name Dimension name to look for
 * @return true if the schema's domain has a dimension called `name`
 */
bool SOMAArray::has_dimension_name(const std::string& name) const {
    const auto dims = tiledb_schema()->domain().dimensions();

    bool found = false;
    for (auto it = dims.begin(); it != dims.end() && !found; ++it) {
        found = (it->name() == name);
    }
    return found;
}
/**
 * @return Names of the schema's attributes, in schema index order.
 */
std::vector<std::string> SOMAArray::attribute_names() const {
    auto array_schema = tiledb_schema();
    const unsigned attr_count = array_schema->attribute_num();

    std::vector<std::string> names;
    names.reserve(attr_count);
    for (unsigned idx = 0; idx != attr_count; ++idx) {
        names.push_back(array_schema->attribute(idx).name());
    }
    return names;
}
/**
 * Submit the pending write and reset the managed query afterwards.
 *
 * @param sort_coords Forwarded to ManagedQuery::submit_write
 * @throws TileDBSOMAError if the array is not open in write mode
 */
void SOMAArray::write(bool sort_coords) {
    const bool writable = arr_->query_type() == TILEDB_WRITE;
    if (!writable) {
        throw TileDBSOMAError("[SOMAArray] array must be opened in write mode");
    }

    mq_->submit_write(sort_coords);

    // Leave the query clean for the next write.
    mq_->reset();
}
/**
 * Consolidate and then vacuum the array once per requested mode.
 *
 * @param modes Values to set for "sm.consolidation.mode", processed in order
 */
void SOMAArray::consolidate_and_vacuum(std::vector<std::string> modes) {
    for (const auto& consolidation_mode : modes) {
        // Work on a per-mode copy of the configuration so the array's own
        // context is not modified.
        auto mode_cfg = ctx_->tiledb_ctx()->config();
        mode_cfg["sm.consolidation.mode"] = consolidation_mode;

        Array::consolidate(Context(mode_cfg), uri_);
        Array::vacuum(Context(mode_cfg), uri_);
    }
}
/**
 * Write a metadata entry to the array and mirror it in the metadata cache.
 *
 * @param key Metadata key
 * @param value_type TileDB datatype of the value
 * @param value_num Number of elements in the value
 * @param value Pointer to the value data
 * @param force When false, the SOMA object-type and encoding-version keys
 * are protected and cannot be modified
 * @throws TileDBSOMAError if `key` is a protected key and `force` is false
 */
void SOMAArray::set_metadata(
    const std::string& key,
    tiledb_datatype_t value_type,
    uint32_t value_num,
    const void* value,
    bool force) {
    if (!force && key.compare(SOMA_OBJECT_TYPE_KEY) == 0) {
        throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be modified.");
    }
    if (!force && key.compare(ENCODING_VERSION_KEY) == 0) {
        throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be modified.");
    }

    arr_->put_metadata(key, value_type, value_num, value);

    // Use insert_or_assign rather than insert: insert() leaves an existing
    // entry untouched, so overwriting a key would update the array but keep
    // serving the stale value from the cache.
    // NOTE(review): the cache stores the `value` pointer as given; presumably
    // the caller must keep that buffer alive while the entry is cached --
    // confirm against MetadataValue's definition.
    metadata_.insert_or_assign(key, MetadataValue(value_type, value_num, value));
}
/**
 * Delete a metadata entry from the array and from the metadata cache.
 *
 * @param key Metadata key to delete
 * @param force When false, the SOMA object-type and encoding-version keys
 * are protected and cannot be deleted
 * @throws TileDBSOMAError if `key` is a protected key and `force` is false
 */
void SOMAArray::delete_metadata(const std::string& key, bool force) {
    if (!force) {
        // Protected keys may only be removed when forced.
        if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) {
            throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be deleted.");
        }
        if (key.compare(ENCODING_VERSION_KEY) == 0) {
            throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be deleted.");
        }
    }

    arr_->delete_metadata(key);
    metadata_.erase(key);
}
/**
 * Look up a single metadata entry in the cache.
 *
 * @param key Metadata key
 * @return The cached value, or `std::nullopt` if the key is not cached
 */
std::optional<MetadataValue> SOMAArray::get_metadata(const std::string& key) {
    const auto entry = metadata_.find(key);
    if (entry == metadata_.end()) {
        return std::nullopt;
    }
    return entry->second;
}
std::map<std::string, MetadataValue> SOMAArray::get_metadata() {
return metadata_;
}
/**
 * @param key Metadata key
 * @return true if `key` is present in the metadata cache
 */
bool SOMAArray::has_metadata(const std::string& key) {
    return metadata_.find(key) != metadata_.end();
}
/**
 * @return Number of entries in the metadata cache.
 */
uint64_t SOMAArray::metadata_num() const {
    const auto cached_entries = metadata_.size();
    return cached_entries;
}
/**
 * Open the TileDB array at `uri_`, load its enumerations, cache its schema
 * and create the managed query.
 *
 * @param mode Read or write
 * @param name Name handed to the managed query
 * @param timestamp Optional (start, end) timestamp range to open at
 * @throws TileDBSOMAError wrapping any std::exception raised while opening
 */
void SOMAArray::validate(
    OpenMode mode,
    std::string_view name,
    std::optional<TimestampRange> timestamp) {
    // Map the SOMA open mode onto the TileDB query type.
    const auto query_type = (mode == OpenMode::read) ? TILEDB_READ :
                                                       TILEDB_WRITE;

    try {
        LOG_DEBUG(std::format("[SOMAArray] opening array '{}'", uri_));

        if (!timestamp) {
            arr_ = std::make_shared<Array>(
                *ctx_->tiledb_ctx(), uri_, query_type);
        } else {
            arr_ = std::make_shared<Array>(
                *ctx_->tiledb_ctx(),
                uri_,
                query_type,
                TemporalPolicy(
                    TimestampStartEnd, timestamp->first, timestamp->second));
        }

        LOG_TRACE(std::format("[SOMAArray] loading enumerations"));
        ArrayExperimental::load_all_enumerations(*ctx_->tiledb_ctx(), *arr_);

        schema_ = std::make_shared<ArraySchema>(arr_->schema());
        mq_ = std::make_unique<ManagedQuery>(arr_, ctx_->tiledb_ctx(), name);
    } catch (const std::exception& err) {
        // Re-throw as a SOMA error that names the array.
        throw TileDBSOMAError(
            std::format("Error opening array: '{}'\n {}", uri_, err.what()));
    }
}
std::optional<TimestampRange> SOMAArray::timestamp() {
return timestamp_;
}
// ArrowTable is libtiledbsoma's pairing of an ArrowArray with an ArrowSchema
// (both from nanoarrow).
//
// The Domainish enum selects which of core domain, core current domain, or
// core non-empty domain is reported, so that the code common to all three
// lives in one place.
//
// Returns a table with one column per array dimension, named after the
// dimension and typed per the dimension's TileDB datatype; each column holds
// whatever _core_domainish_slot (or its string variant) returns for that
// dimension. Throws TileDBSOMAError for a dimension type that is not handled.
ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) {
    int array_ndim = this->ndim();
    auto dimensions = tiledb_schema()->domain().dimensions();

    // Schema: one field per dimension.
    std::vector<std::string> names(array_ndim);
    std::vector<tiledb_datatype_t> tiledb_datatypes(array_ndim);
    for (int d = 0; d < array_ndim; ++d) {
        const Dimension& schema_dim = dimensions[d];
        names[d] = schema_dim.name();
        tiledb_datatypes[d] = schema_dim.type();
    }
    auto arrow_schema = ArrowAdapter::make_arrow_schema(
        names, tiledb_datatypes);

    // Data: one child array per dimension.
    auto arrow_array = ArrowAdapter::make_arrow_array_parent(array_ndim);
    for (int d = 0; d < array_ndim; ++d) {
        const Dimension& dim = dimensions[d];
        const std::string dim_name = dim.name();
        const auto dim_type = dim.type();

        ArrowArray* column = nullptr;
        switch (dim_type) {
            // int64 and every datetime/time type share the int64 slot.
            case TILEDB_INT64:
            case TILEDB_DATETIME_YEAR:
            case TILEDB_DATETIME_MONTH:
            case TILEDB_DATETIME_WEEK:
            case TILEDB_DATETIME_DAY:
            case TILEDB_DATETIME_HR:
            case TILEDB_DATETIME_MIN:
            case TILEDB_DATETIME_SEC:
            case TILEDB_DATETIME_MS:
            case TILEDB_DATETIME_US:
            case TILEDB_DATETIME_NS:
            case TILEDB_DATETIME_PS:
            case TILEDB_DATETIME_FS:
            case TILEDB_DATETIME_AS:
            case TILEDB_TIME_HR:
            case TILEDB_TIME_MIN:
            case TILEDB_TIME_SEC:
            case TILEDB_TIME_MS:
            case TILEDB_TIME_US:
            case TILEDB_TIME_NS:
            case TILEDB_TIME_PS:
            case TILEDB_TIME_FS:
            case TILEDB_TIME_AS:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int64_t>(dim_name, which_kind));
                break;

            case TILEDB_UINT64:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint64_t>(dim_name, which_kind));
                break;

            case TILEDB_INT32:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int32_t>(dim_name, which_kind));
                break;

            case TILEDB_UINT32:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint32_t>(dim_name, which_kind));
                break;

            case TILEDB_INT16:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int16_t>(dim_name, which_kind));
                break;

            case TILEDB_UINT16:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint16_t>(dim_name, which_kind));
                break;

            case TILEDB_INT8:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int8_t>(dim_name, which_kind));
                break;

            case TILEDB_UINT8:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint8_t>(dim_name, which_kind));
                break;

            case TILEDB_FLOAT64:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<double>(dim_name, which_kind));
                break;

            case TILEDB_FLOAT32:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<float>(dim_name, which_kind));
                break;

            // String-like dimensions go through the string slot helper.
            case TILEDB_STRING_ASCII:
            case TILEDB_CHAR:
            case TILEDB_GEOM_WKB:
            case TILEDB_GEOM_WKT:
                column = ArrowAdapter::make_arrow_array_child_string(
                    _core_domainish_slot_string(dim_name, which_kind));
                break;

            default:
                throw TileDBSOMAError(std::format(
                    "SOMAArray::_get_core_domainish:dim {} has unhandled type "
                    "{}",
                    dim_name,
                    tiledb::impl::type_to_str(dim_type)));
        }

        arrow_array->children[d] = column;
    }

    return ArrowTable(std::move(arrow_array), std::move(arrow_schema));
}
/**
 * Count the cells of a sparse array, restricted to the read timestamp range
 * when one is set.
 *
 * Fast path: sum the per-fragment cell counts taken from the fragment info.
 * That is only done when every relevant fragment lies fully inside the
 * timestamp range, none is a consolidated fragment (unless the schema allows
 * duplicates), dim 0 is an int64 "soma_joinid" or "soma_dim_0", and the
 * fragments' non-empty domains on dim 0 do not overlap. In every other case
 * the count is delegated to _nnz_slow().
 *
 * @return Number of cells; 0 when no fragment falls in the timestamp range
 * @throws TileDBSOMAError if the array is not sparse
 */
uint64_t SOMAArray::nnz() {
// Verify array is sparse
if (schema_->array_type() != TILEDB_SPARSE) {
throw TileDBSOMAError(
"[SOMAArray] nnz is only supported for sparse arrays");
}
// Load fragment info
FragmentInfo fragment_info(*ctx_->tiledb_ctx(), uri_);
fragment_info.load();
LOG_DEBUG(std::format("[SOMAArray] Fragment info for array '{}'", uri_));
// Only dump the fragment info when debug logging is enabled.
if (LOG_DEBUG_ENABLED()) {
fragment_info.dump();
}
// Find the subset of fragments contained within the read timestamp range
// [if any]
std::vector<uint32_t> relevant_fragments;
for (uint32_t fid = 0; fid < fragment_info.fragment_num(); fid++) {
auto frag_ts = fragment_info.timestamp_range(fid);
assert(frag_ts.first <= frag_ts.second);
if (timestamp_) {
if (frag_ts.first > timestamp_->second ||
frag_ts.second < timestamp_->first) {
// fragment is fully outside the read timestamp range: skip it
continue;
} else if (!(frag_ts.first >= timestamp_->first &&
frag_ts.second <= timestamp_->second)) {
// fragment overlaps read timestamp range, but isn't fully
// contained within: fall back to count_cells to sort that out.
return _nnz_slow();
}
}
// fall through: fragment is fully contained within the read timestamp
// range
relevant_fragments.push_back(fid);
// If any relevant fragment is a consolidated fragment, fall back to
// counting cells, because the fragment may contain duplicates.
// If the application is allowing duplicates (in which case it's the
// application's job to otherwise ensure uniqueness), then
// sum-over-fragments is the right thing to do.
// (A fragment whose start and end timestamps differ is treated as
// consolidated here.)
if (!schema_->allows_dups() && frag_ts.first != frag_ts.second) {
return _nnz_slow();
}
}
auto fragment_count = relevant_fragments.size();
if (fragment_count == 0) {
// No data have been written [in the read timestamp range]
return 0;
}
if (fragment_count == 1) {
// Only one fragment; return its cell_num
return fragment_info.cell_num(relevant_fragments[0]);
}
// Check for overlapping fragments on the first dimension and
// compute total_cell_num while going through the loop
uint64_t total_cell_num = 0;
// One [lo, hi] pair per relevant fragment, for dim 0 only.
std::vector<std::array<uint64_t, 2>> non_empty_domains(fragment_count);
// The loop after this only works if dim 0 is int64 soma_joinid or
// soma_dim_0. That's the case for _almost_ all SOMADataFrame objects, but
// not the "variant-indexed" ones: the SOMA spec only requires
// that soma_joinid be present as a dim or an attr. It's true for all
// SOMASparseNDArray objects.
auto dim = tiledb_schema()->domain().dimension(0);
auto dim_name = dim.name();
auto type_code = dim.type();
if ((dim_name != "soma_joinid" && dim_name != "soma_dim_0") ||
type_code != TILEDB_INT64) {
LOG_DEBUG(std::format(
"[SOMAArray::nnz] dim 0 (type={} name={}) isn't int64 "
"soma_joinid or int64 soma_dim_0: using _nnz_slow",
tiledb::impl::type_to_str(type_code),
dim_name));
return _nnz_slow();
}
for (uint32_t i = 0; i < fragment_count; i++) {
// TODO[perf]: Reading fragment info is not supported on TileDB Cloud
// yet, but reading one fragment at a time will be slow. Is there
// another way?
total_cell_num += fragment_info.cell_num(relevant_fragments[i]);
fragment_info.get_non_empty_domain(
relevant_fragments[i], 0, &non_empty_domains[i]);
LOG_DEBUG(std::format(
"[SOMAArray] fragment {} non-empty domain = [{}, {}]",
i,
non_empty_domains[i][0],
non_empty_domains[i][1]));
}
// Sort non-empty domains by the start of their ranges
std::sort(non_empty_domains.begin(), non_empty_domains.end());
// After sorting, if the end of a non-empty domain is >= the beginning of
// the next non-empty domain, there is an overlap
bool overlap = false;
// fragment_count is at least 2 here, so fragment_count - 1 cannot wrap.
for (uint32_t i = 0; i < fragment_count - 1; i++) {
LOG_DEBUG(std::format(
"[SOMAArray] Checking {} < {}",
non_empty_domains[i][1],
non_empty_domains[i + 1][0]));
if (non_empty_domains[i][1] >= non_empty_domains[i + 1][0]) {
overlap = true;
break;
}
}
// If relevant fragments do not overlap, return the total cell_num
if (!overlap) {
return total_cell_num;
}
// Found relevant fragments with overlap, count cells
return _nnz_slow();
}
/**
 * Count cells by actually reading the array: open a second read handle that
 * selects only dimension 0, and add up the row counts of every batch.
 *
 * @return Total number of rows read
 */
uint64_t SOMAArray::_nnz_slow() {
    LOG_DEBUG(
        "[SOMAArray] nnz() found consolidated or overlapping fragments, "
        "counting cells...");

    // Reading a single column is enough to count rows.
    auto counter = SOMAArray::open(
        OpenMode::read,
        uri_,
        ctx_,
        "count_cells",
        {schema_->domain().dimension(0).name()},
        batch_size_,
        result_order_,
        timestamp_);

    uint64_t rows_seen = 0;
    while (auto chunk = counter->read_next()) {
        rows_seen += chunk.value()->num_rows();
    }
    return rows_seen;
}
/**
 * @return The array's shape: taken from the current domain when one is set,
 * otherwise from the (max) domain.
 */
std::vector<int64_t> SOMAArray::shape() {
    // The fallback to the domain is long-term code, for two reasons:
    // * the new-shape feature is being developed in careful phases;
    // * even once it is fully released, arrays created before the feature
    //   existed will remain on disk.
    if (_get_current_domain().is_empty()) {
        return _tiledb_domain();
    }
    return _tiledb_current_domain();
}
std::vector<int64_t> SOMAArray::maxshape() {
return _tiledb_domain();
}
// Shared implementation for can_upgrade_shape and can_resize, which overlap
// heavily.
//
// Returns (true, "") when `newshape` could be applied, or (false, reason)
// otherwise. `must_already_have` is true for resize (the array must already
// have a shape) and false for upgrade_shape (it must not).
// `function_name_for_messages` prefixes every reason string.
StatusAndReason SOMAArray::_can_set_shape_helper(
    const std::vector<int64_t>& newshape,
    bool must_already_have,
    std::string function_name_for_messages) {
    // The requested shape must have the array's dimensionality: e.g. a 3-D
    // shape cannot be applied to a 2-D array.
    auto requested_ndim = newshape.size();
    auto schema_ndim = schema_->domain().ndim();
    if (schema_ndim != requested_ndim) {
        return std::pair(
            false,
            std::format(
                "{}: provided shape has ndim {}, while the array has {}",
                function_name_for_messages,
                requested_ndim,
                schema_ndim));
    }

    // tiledbsoma_upgrade_shape is only for arrays that have no shape yet;
    // resize is only for arrays that do.
    const bool has_shape = has_current_domain();
    if (must_already_have && !has_shape) {
        // resize on an array without a shape
        return std::pair(
            false,
            std::format(
                "{}: array currently has no shape: please upgrade the array.",
                function_name_for_messages));
    }
    if (!must_already_have && has_shape) {
        // upgrade_shape on an array that already has a shape
        return std::pair(
            false,
            std::format(
                "{}: array already has a shape: please use resize",
                function_name_for_messages));
    }

    // * Old-style arrays without shape: the core domain (soma maxdomain) may
    //   be small (like 100) or big (like 2 billionish).
    // * New-style arrays with shape: the core current domain (soma domain)
    //   will probably be small and the core domain (soma maxdomain) huge.
    //
    // Either way the requested shape must not exceed the core domain, which
    // is immutable. For old-style arrays, fitting inside the core domain is
    // all that is required.
    auto max_domain_result = _can_set_shape_domainish_subhelper(
        newshape, false, function_name_for_messages);
    if (!max_domain_result.first) {
        return max_domain_result;
    }

    // For new-style arrays, we additionally need to check that the requested
    // shape (core current domain) isn't a downsize of the current one.
    if (has_shape) {
        auto current_domain_result = _can_set_shape_domainish_subhelper(
            newshape, true, function_name_for_messages);
        if (!current_domain_result.first) {
            return current_domain_result;
        }
    }

    return std::pair(true, "");
}
// Helper for _can_set_shape_helper: compares the user's requested shape, one
// dimension at a time, against either the core current domain
// (`check_current_domain` true: the new shape must not be smaller) or the core
// (max) domain (`check_current_domain` false: the new shape must not be
// larger).
//
// Returns (true, "") if every dimension passes, else (false, reason) for the
// first dimension that fails. Throws TileDBSOMAError if a dimension is not
// int64. The caller is expected to have checked that `newshape` has one entry
// per dimension.
StatusAndReason SOMAArray::_can_set_shape_domainish_subhelper(
    const std::vector<int64_t>& newshape,
    bool check_current_domain,
    std::string function_name_for_messages) {
    Domain domain = schema_->domain();

    for (unsigned i = 0; i < domain.ndim(); i++) {
        const auto& dim = domain.dimension(i);
        const std::string& dim_name = dim.name();

        // These methods are only for SOMA NDArrays, and any other arrays for
        // which the indices are entirely int64. SOMA DataFrame objects, with
        // multi-type dims, need to go through upgrade_domain -- and this is
        // library-internal code, it's not the user's fault if we got here.
        if (dim.type() != TILEDB_INT64) {
            throw TileDBSOMAError(std::format(
                "{}: internal error: expected {} dim to be {}; got {}",
                function_name_for_messages,
                dim_name,
                tiledb::impl::type_to_str(TILEDB_INT64),
                tiledb::impl::type_to_str(dim.type())));
        }

        if (check_current_domain) {
            // The slot holds inclusive (lo, hi) bounds, so the shape is
            // hi + 1.
            std::pair<int64_t, int64_t>
                cap = _core_current_domain_slot<int64_t>(dim_name);
            int64_t old_dim_shape = cap.second + 1;

            if (newshape[i] < old_dim_shape) {
                return std::pair(
                    false,
                    std::format(
                        "{} for {}: new {} < existing shape {}",
                        function_name_for_messages,
                        dim_name,
                        newshape[i],
                        old_dim_shape));
            }
        } else {
            std::pair<int64_t, int64_t> cap = _core_domain_slot<int64_t>(
                dim_name);
            int64_t max_dim_shape = cap.second + 1;

            if (newshape[i] > max_dim_shape) {
                // The failing condition is "new > maxshape"; the message now
                // says so (it previously printed '<').
                return std::pair(
                    false,
                    std::format(
                        "{} for {}: new {} > maxshape {}",
                        function_name_for_messages,
                        dim_name,
                        newshape[i],
                        max_dim_shape));
            }
        }
    }

    return std::pair(true, "");
}
/**
 * Check whether the soma_joinid shape of a dataframe could be set to
 * `newshape`.
 *
 * @param newshape Requested shape (number of soma_joinid slots)
 * @param must_already_have true when resizing (a current domain must already
 * exist), false when upgrading (it must not exist yet)
 * @param function_name_for_messages Prefix for the reason strings
 * @return (true, "") if the change is allowed, else (false, reason)
 */
StatusAndReason SOMAArray::_can_set_soma_joinid_shape_helper(
    int64_t newshape,
    bool must_already_have,
    std::string function_name_for_messages) {
    const bool has_domain = has_current_domain();

    if (!must_already_have) {
        // Upgrading an array to give it a current domain: it must not have
        // one yet.
        if (has_domain) {
            return std::pair(
                false,
                std::format(
                    "{}: dataframe already has its domain set.",
                    function_name_for_messages));
        }
    } else {
        // Resizing an array's existing current domain: it must have one.
        if (!has_domain) {
            return std::pair(
                false,
                std::format(
                    "{}: dataframe currently has no domain set.",
                    function_name_for_messages));
        }
    }

    // OK if soma_joinid isn't a dim.
    if (!has_dimension_name("soma_joinid")) {
        return std::pair(true, "");
    }

    // Fail if the new shape would shrink the core current domain. The slot
    // holds inclusive (lo, hi) bounds, so the existing shape is hi + 1; the
    // comparison uses that same value so that it agrees with the message and
    // with _can_set_shape_domainish_subhelper.
    if (must_already_have) {
        const auto cur_dom_lo_hi = _core_current_domain_slot<int64_t>(
            "soma_joinid");
        const int64_t existing_shape = cur_dom_lo_hi.second + 1;
        if (newshape < existing_shape) {
            return std::pair(
                false,
                std::format(
                    "{}: new soma_joinid shape {} < existing shape {}",
                    function_name_for_messages,
                    newshape,
                    existing_shape));
        }
    }

    // Fail if the new shape exceeds the core (max) domain. A new shape equal
    // to maxshape is allowed.
    const auto dom_lo_hi = _core_domain_slot<int64_t>("soma_joinid");
    const int64_t max_shape = dom_lo_hi.second + 1;
    if (newshape > max_shape) {
        return std::pair(
            false,
            std::format(
                "{}: new soma_joinid shape {} > maxshape {}",
                function_name_for_messages,
                newshape,
                max_shape));
    }

    // Success otherwise.
    return std::pair(true, "");
}
/**
 * Set the array's current domain so that each dimension spans
 * [0, newshape[i] - 1], via a schema evolution on `uri_`.
 *
 * @param newshape Requested shape, one entry per dimension
 * @param must_already_have true when resizing (a current domain must already
 * exist), false when upgrading (it must not exist yet)
 * @param function_name_for_messages Prefix for the error messages
 * @throws TileDBSOMAError if the array is not open in write mode, if the
 * has-shape precondition is violated, or if `newshape` has the wrong number
 * of dimensions
 */
void SOMAArray::_set_shape_helper(
    const std::vector<int64_t>& newshape,
    bool must_already_have,
    std::string function_name_for_messages) {
    if (arr_->query_type() != TILEDB_WRITE) {
        throw TileDBSOMAError(std::format(
            "{}: array must be opened in write mode",
            function_name_for_messages));
    }

    if (!must_already_have) {
        // Upgrading an array to install a current domain
        if (!_get_current_domain().is_empty()) {
            // The array already has a shape, so the remedy is to resize it,
            // not to upgrade it.
            throw TileDBSOMAError(std::format(
                "{}: array must not already have a shape: please use resize",
                function_name_for_messages));
        }
    } else {
        // Expanding an array's current domain
        if (_get_current_domain().is_empty()) {
            throw TileDBSOMAError(std::format(
                "{}: array must already have a shape: please upgrade it",
                function_name_for_messages));
        }
    }

    // Variant-indexed dataframes must use a separate path
    _check_dims_are_int64();

    auto tctx = ctx_->tiledb_ctx();
    ArraySchema schema = arr_->schema();
    Domain domain = schema.domain();

    // Check the dimension count before indexing newshape below. The message
    // carries the caller's name, since this helper serves both the upgrade
    // and the resize paths.
    unsigned n = domain.ndim();
    if ((unsigned)newshape.size() != n) {
        throw TileDBSOMAError(std::format(
            "{}: newshape has dimension count {}; array has {}",
            function_name_for_messages,
            newshape.size(),
            n));
    }

    ArraySchemaEvolution schema_evolution(*tctx);
    CurrentDomain new_current_domain(*tctx);
    NDRectangle ndrect(*tctx, domain);

    // A shape of N corresponds to the inclusive range [0, N - 1].
    for (unsigned i = 0; i < n; i++) {
        ndrect.set_range<int64_t>(
            domain.dimension(i).name(), 0, newshape[i] - 1);
    }

    new_current_domain.set_ndrectangle(ndrect);
    schema_evolution.expand_current_domain(new_current_domain);
    schema_evolution.array_evolve(uri_);
}
void SOMAArray::_set_soma_joinid_shape_helper(
int64_t newshape,
bool must_already_have,
std::string function_name_for_messages) {
if (arr_->query_type() != TILEDB_WRITE) {
throw TileDBSOMAError(std::format(
"{}: array must be opened in write mode",
function_name_for_messages));
}
if (!must_already_have) {