From cb88f560570aab66d61c6955697df3766c2a2184 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Thu, 18 Dec 2014 15:03:20 -0800 Subject: [PATCH] Issue #25: Make ORDER BY DESC faster fbshipit-source-id: 1151e93b99f --- storage/rocksdb/ha_rocksdb.cc | 251 +++++++++++++++++++++++------- storage/rocksdb/ha_rocksdb.h | 7 +- storage/rocksdb/rdb_applyiter.cc | 9 +- storage/rocksdb/rdb_applyiter.h | 8 +- storage/rocksdb/rdb_cf_manager.cc | 12 ++ storage/rocksdb/rdb_cf_options.h | 2 + storage/rocksdb/rdb_datadic.cc | 2 +- storage/rocksdb/rdb_datadic.h | 10 +- storage/rocksdb/rdb_debug.cc | 33 ++-- storage/rocksdb/rdb_rowmods.cc | 145 ++++++++++------- storage/rocksdb/rdb_rowmods.h | 59 +++++-- 11 files changed, 396 insertions(+), 142 deletions(-) diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 396c9d82b6fb..bdef953bfd15 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -560,6 +560,25 @@ static void init_rocksdb_psi_keys() #endif +static int bytewise_compare(const rocksdb::Slice& a, const rocksdb::Slice& b) +{ + size_t a_size= a.size(); + size_t b_size= b.size(); + size_t len= (a_size < b_size) ? a_size : b_size; + int res; + + if ((res= memcmp(a.data(), b.data(), len))) + return res; + + /* Ok, res== 0 */ + if (a_size != b_size) + { + return a_size < b_size? -1 : 1; + } + return 0; +} + + /* The keys are in form: {index_number} {mem-comparable-key} @@ -572,25 +591,12 @@ class Primary_key_comparator : public rocksdb::Comparator public: int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const { - size_t a_size= a.size(); - size_t b_size= b.size(); - size_t len= (a_size < b_size) ? a_size : b_size; - int res; - - if ((res= memcmp(a.data(), b.data(), len))) - return res; - - /* Ok, res== 0 */ - if (a_size != b_size) - { - return a_size < b_size? -1 : 1; - } - return 0; + return bytewise_compare(a,b); } /* The following is not needed by RocksDB, but conceptually should be here: */ static ulong get_hashnr(const char *key, size_t key_len); - const char* Name() const { return "RocksDB_SE_v3.0"; } + const char* Name() const { return "RocksDB_SE_v3.1"; } //TODO: advanced funcs: // - FindShortestSeparator @@ -600,8 +606,26 @@ class Primary_key_comparator : public rocksdb::Comparator void FindShortSuccessor(std::string* key) const {} }; + +class Reverse_comparator : public rocksdb::Comparator +{ + int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const + { + return -bytewise_compare(a,b); + } + const char* Name() const { return "rev:RocksDB_SE_v3.1"; } + void FindShortestSeparator(std::string* start, const rocksdb::Slice& limit) const {} + void FindShortSuccessor(std::string* key) const {} +}; + + Primary_key_comparator rocksdb_pk_comparator; +Reverse_comparator rocksdb_rev_pk_comparator; +/* + This function doesn't support reverse comparisons. They are handled by the + caller. +*/ int compare_mem_comparable_keys(const uchar *a, size_t a_len, const uchar *b, size_t b_len) { rocksdb::Slice a_slice((char*)a, a_len); @@ -704,17 +728,20 @@ class Rdb_transaction if (changes.is_empty()) return false; - Row_table_iter iter(&changes); - - for (iter.SeekToFirst(); iter.Valid(); iter.Next()) + for (int reverse=0; reverse <= 1; reverse++) { - if (iter.is_tombstone()) - { - batch.Delete(iter.cf_handle(), iter.key()); - } - else + Row_table_iter iter(&changes, (bool)reverse); + + for (iter.SeekToFirst(); iter.Valid(); iter.Next()) { - batch.Put(iter.cf_handle(), iter.key(), iter.value()); + if (iter.is_tombstone()) + { + batch.Delete(iter.keydef()->get_cf(), iter.key()); + } + else + { + batch.Put(iter.keydef()->get_cf(), iter.key(), iter.value()); + } } } rocksdb::Status s= rdb->Write(rocksdb::WriteOptions(), &batch); @@ -895,6 +922,12 @@ void get_cf_options(const std::string &cf_name, rocksdb::ColumnFamilyOptions *op { *opts = default_cf_opts; rocksdb_cf_options_map.Get(cf_name, opts); + + // Set the comparator according to 'rev:' + if (is_cf_name_reverse(cf_name.c_str())) + opts->comparator= &rocksdb_rev_pk_comparator; + else + opts->comparator= &rocksdb_pk_comparator; } /* @@ -1477,7 +1510,11 @@ int ha_rocksdb::open(const char *name, int mode, uint test_if_locked) StringBuffer<64> fullname; make_dbname_tablename(&fullname, table); if (!(tbl_def= ddl_manager.find((uchar*)fullname.c_ptr(), fullname.length()))) + { + my_error(ER_INTERNAL_ERROR, MYF(0), + "Attempt to open a table that is not present in RocksDB-SE data dictionary"); DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } key_descr= tbl_def->key_descr; pk_descr= key_descr[table->s->primary_key]; @@ -1495,7 +1532,8 @@ int ha_rocksdb::open(const char *name, int mode, uint test_if_locked) DBUG_RETURN(HA_ERR_INTERNAL_ERROR); pk_descr->setup(table); // move this into get_share() ?? - pk_descr->set_cf_handle(cf_handle, is_auto_cf); + pk_descr->set_cf_handle(cf_handle, is_cf_name_reverse(comment), + is_auto_cf); uint packed_key_len= pk_descr->max_storage_fmt_length(); @@ -1517,7 +1555,8 @@ int ha_rocksdb::open(const char *name, int mode, uint test_if_locked) DBUG_RETURN(HA_ERR_INTERNAL_ERROR); key_descr[i]->setup(table); // move this into get_share() ?? - key_descr[i]->set_cf_handle(cf_handle, is_auto_cf); + key_descr[i]->set_cf_handle(cf_handle, is_cf_name_reverse(comment), + is_auto_cf); uint packed_len= key_descr[i]->max_storage_fmt_length(); if (packed_len > max_packed_sec_key_len) @@ -1655,7 +1694,9 @@ int ha_rocksdb::create_key_defs(TABLE *table_arg, const char *db_table, uint len goto error; if (!(key_descr[i]= new RDBSE_KEYDEF(ddl_manager.get_next_number(), i, - cf_handle, is_auto_cf))) + cf_handle, + is_cf_name_reverse(comment), + is_auto_cf))) goto error; } pk_descr= key_descr[table_arg->s->primary_key]; @@ -1956,7 +1997,7 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key, This will open the iterator and position it at a record that's equal or greater than the lookup tuple. */ - setup_index_scan(kd->get_cf(), &slice); + setup_index_scan(kd, &slice); bool move_forward= true; @@ -2400,7 +2441,7 @@ int ha_rocksdb::get_row_by_rowid(uchar *buf, const char *rowid, uint rowid_size) options.snapshot= trx->snapshot; bool found; - if (!trx->changes.Get(key_slice, &retrieved_record, &found)) + if (!trx->changes.Get(pk_descr, key_slice, &retrieved_record, &found)) { rocksdb::Status s= rdb->Get(options, pk_descr->get_cf(), key_slice, &retrieved_record); @@ -2429,13 +2470,21 @@ int ha_rocksdb::get_row_by_rowid(uchar *buf, const char *rowid, uint rowid_size) int ha_rocksdb::index_next(uchar *buf) { - return index_next_with_direction(buf, true); + bool moves_forward= true; + if (key_descr[active_index]->is_reverse_cf) + moves_forward= false; + + return index_next_with_direction(buf, moves_forward); } int ha_rocksdb::index_prev(uchar *buf) { - return index_next_with_direction(buf, false); + bool moves_forward= false; + if (key_descr[active_index]->is_reverse_cf) + moves_forward= true; + + return index_next_with_direction(buf, moves_forward); } @@ -2466,6 +2515,51 @@ int ha_rocksdb::index_next_with_direction(uchar *buf, bool move_forward) int ha_rocksdb::index_first(uchar *buf) +{ + return key_descr[active_index]->is_reverse_cf ? index_last_intern(buf): + index_first_intern(buf); +} + + +int ha_rocksdb::index_last(uchar *buf) +{ + return key_descr[active_index]->is_reverse_cf ? index_first_intern(buf): + index_last_intern(buf); +} + + +/* + Start scanning from the "first" value. + + The 'first' here means "the first from start of the key space". + For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'. + + An picture of a forward-ordered keyspace (remember, the keys have form + 'indexnr-keyval'. Suppose the index we are at has number n) + + (n-1) - ... + ( n ) <--- 1. (n) doesn't exist in the db but it would be here. + ( n ) - aaa <--- 2. Seek("n") will put us here on the first index + ( n ) - bbb record. + ( n ) - cc + + So, need to do: Seek(n); + + A backward-ordered keyspace: + + (n+1) - bbb + (n+1) - aaa + (n+1) <--- (n+1) doesn't exist in the db but would be here. + ( n ) - ccc <--- 1. We need to be here. + ( n ) - bbb + ( n ) - aaa + ( n ) + + So, need to: Seek(n+1); + +*/ + +int ha_rocksdb::index_first_intern(uchar *buf) { uchar *key; uint key_size; @@ -2474,10 +2568,15 @@ int ha_rocksdb::index_first(uchar *buf) key= (active_index == table->s->primary_key)? pk_packed_tuple : sec_key_packed_tuple; - key_descr[active_index]->get_infimum_key(key, &key_size); + + if (key_descr[active_index]->is_reverse_cf) + key_descr[active_index]->get_supremum_key(key, &key_size); + else + key_descr[active_index]->get_infimum_key(key, &key_size); + rocksdb::Slice index_key((const char*)key, key_size); - setup_index_scan(key_descr[active_index]->get_cf(), &index_key); + setup_index_scan(key_descr[active_index], &index_key); skip_scan_it_next_call= TRUE; rc= index_next(buf); @@ -2485,7 +2584,43 @@ int ha_rocksdb::index_first(uchar *buf) } -int ha_rocksdb::index_last(uchar *buf) +/* + Start scanning from the "last" value + + The 'last' here means "the last from start of the key space". + For reverse-ordered key spaces, we will actually read the smallest value. + + An picture of a forward-ordered keyspace (remember, the keys have form + 'indexnr-keyval'. Suppose the we are at a key that has number n) + + (n-1)-something + ( n )-aaa + ( n )-bbb + ( n )-ccc <----------- Need to seek to here. + (n+1) <---- Doesn't exist, but would be here. + (n+1)-smth, or no value at all + + RocksDB's Iterator::Seek($val) seeks to "at $val or first value that's + greater". We can't see to "(n)-ccc" directly, because we don't know what + is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek + to "(n+1)", which is the least possible value that's greater than any value + in index #n. Then we step one record back. + + So, need to: it->Seek(n+1) || it->SeekToLast(); it->Prev(); + + A backward-ordered keyspace: + + (n+1)-something + ( n ) - ccc + ( n ) - bbb + ( n ) - aaa <---------------- (*) Need to seek here. + ( n ) <--- Doesn't exist, but would be here. + (n-1)-smth, or no value at all + + So, need to: it->Seek(n) || it->SeekToLast(); it->Prev(); +*/ + +int ha_rocksdb::index_last_intern(uchar *buf) { uchar *key; uint key_size; @@ -2494,10 +2629,15 @@ int ha_rocksdb::index_last(uchar *buf) key= (active_index == table->s->primary_key)? pk_packed_tuple : sec_key_packed_tuple; - key_descr[active_index]->get_supremum_key(key, &key_size); + + if (key_descr[active_index]->is_reverse_cf) + key_descr[active_index]->get_infimum_key(key, &key_size); + else + key_descr[active_index]->get_supremum_key(key, &key_size); + rocksdb::Slice index_key((const char*)key, key_size); - setup_index_scan(key_descr[active_index]->get_cf(), &index_key); + setup_index_scan(key_descr[active_index], &index_key); if (!scan_it->Valid()) { @@ -2540,8 +2680,6 @@ void ha_rocksdb::unlock_row() } -//static const char* empty_str=""; - int ha_rocksdb::write_row(uchar *buf) { DBUG_ENTER("ha_rocksdb::write_row"); @@ -2592,7 +2730,7 @@ int ha_rocksdb::write_row(uchar *buf) rocksdb::ReadOptions options; bool found; - if (!trx->changes.Get(key_slice, &retrieved_record, &found)) + if (!trx->changes.Get(pk_descr, key_slice, &retrieved_record, &found)) { rocksdb::Status s= rdb->Get(options, pk_descr->get_cf(), key_slice, &retrieved_record); @@ -2628,7 +2766,7 @@ int ha_rocksdb::write_row(uchar *buf) rocksdb::Slice secondary_key_slice((char*)sec_key_packed_tuple, packed_size); rocksdb::Slice value_slice((const char*)sec_key_tails, tail_size); - trx->changes.Put(key_descr[i]->get_cf(), secondary_key_slice, value_slice); + trx->changes.Put(key_descr[i], secondary_key_slice, value_slice); trx->n_writes++; } } @@ -2640,7 +2778,7 @@ int ha_rocksdb::write_row(uchar *buf) rocksdb::Slice value_slice; convert_record_to_storage_format(&value_slice); - trx->changes.Put(pk_descr->get_cf(), key_slice, value_slice); + trx->changes.Put(pk_descr, key_slice, value_slice); trx->n_writes++; if (!do_locking && trx->n_writes > THDVAR(table->in_use, bulk_load_size)) @@ -2658,7 +2796,7 @@ int ha_rocksdb::write_row(uchar *buf) Open a cursor and position it at the passed record */ -void ha_rocksdb::setup_index_scan(rocksdb::ColumnFamilyHandle *cf, +void ha_rocksdb::setup_index_scan(RDBSE_KEYDEF *keydef, rocksdb::Slice *slice) { Rdb_transaction *trx= get_or_create_trx(table->in_use); @@ -2672,9 +2810,9 @@ void ha_rocksdb::setup_index_scan(rocksdb::ColumnFamilyHandle *cf, rocksdb::ReadOptions options; if (!lock_rows) options.snapshot= trx->snapshot; - rocksdb::Iterator* rocksdb_it= rdb->NewIterator(options, cf); + rocksdb::Iterator* rocksdb_it= rdb->NewIterator(options, keydef->get_cf()); scan_it= new Apply_changes_iter; - scan_it->init(&trx->changes, rocksdb_it); + scan_it->init(keydef, &trx->changes, rocksdb_it); } /* @@ -2692,10 +2830,14 @@ int ha_rocksdb::rnd_init(bool scan) if (scan) { uint key_size; - pk_descr->get_infimum_key(pk_packed_tuple, &key_size); + if (pk_descr->is_reverse_cf) + pk_descr->get_supremum_key(pk_packed_tuple, &key_size); + else + pk_descr->get_infimum_key(pk_packed_tuple, &key_size); + rocksdb::Slice table_key((const char*)pk_packed_tuple, key_size); - setup_index_scan(pk_descr->get_cf(), &table_key); + setup_index_scan(pk_descr, &table_key); skip_scan_it_next_call= true; } else @@ -2775,7 +2917,7 @@ int ha_rocksdb::rnd_next_with_direction(uchar *buf, bool move_forward) rocksdb::ReadOptions options; /* lock_rows==TRUE, so not setting options.snapshot */ bool found; - if (!trx->changes.Get(key, &retrieved_record, &found)) + if (!trx->changes.Get(pk_descr, key, &retrieved_record, &found)) { rocksdb::Status s= rdb->Get(options, pk_descr->get_cf(), key, &retrieved_record); @@ -2867,7 +3009,7 @@ int ha_rocksdb::delete_row(const uchar *buf) rocksdb::Slice key_slice(last_rowkey.ptr(), last_rowkey.length()); Rdb_transaction *trx= get_or_create_trx(table->in_use); trx->prepare_for_write(); - trx->changes.Delete(pk_descr->get_cf(), key_slice); + trx->changes.Delete(pk_descr, key_slice); DBUG_ASSERT(buf == table->record[0]); // Delete the record for every secondary index @@ -2881,7 +3023,7 @@ int ha_rocksdb::delete_row(const uchar *buf) NULL, NULL); rocksdb::Slice secondary_key_slice((char*)sec_key_packed_tuple, packed_size); - trx->changes.Delete(key_descr[i]->get_cf(), secondary_key_slice); + trx->changes.Delete(key_descr[i], secondary_key_slice); } } @@ -2995,7 +3137,7 @@ int ha_rocksdb::update_row(const uchar *old_data, uchar *new_data) DBUG_RETURN(return_lock_failure(timed_out)); bool found; - if (!trx->changes.Get(key_slice, &retrieved_record, &found)) + if (!trx->changes.Get(pk_descr, key_slice, &retrieved_record, &found)) { rocksdb::ReadOptions options; /* Not setting options.snapshot, we need to check the real contents */ @@ -3012,7 +3154,7 @@ int ha_rocksdb::update_row(const uchar *old_data, uchar *new_data) trx->prepare_for_write(); trx->add_lock(lock); - trx->changes.Delete(pk_descr->get_cf(), old_key_slice); + trx->changes.Delete(pk_descr, old_key_slice); } trx->prepare_for_write(); @@ -3033,7 +3175,7 @@ int ha_rocksdb::update_row(const uchar *old_data, uchar *new_data) NULL, NULL); rocksdb::Slice secondary_key_slice((char*)sec_key_packed_tuple, packed_size); - trx->changes.Delete(key_descr[i]->get_cf(), secondary_key_slice); + trx->changes.Delete(key_descr[i], secondary_key_slice); } // Then, Put(). @@ -3044,8 +3186,7 @@ int ha_rocksdb::update_row(const uchar *old_data, uchar *new_data) rocksdb::Slice secondary_key_slice((char*)sec_key_packed_tuple, packed_size); rocksdb::Slice value_slice((const char*)sec_key_tails, tail_size); - trx->changes.Put(key_descr[i]->get_cf(), secondary_key_slice, - value_slice); + trx->changes.Put(key_descr[i], secondary_key_slice, value_slice); } } } @@ -3055,7 +3196,7 @@ int ha_rocksdb::update_row(const uchar *old_data, uchar *new_data) /* Write the table record */ rocksdb::Slice value_slice; convert_record_to_storage_format(&value_slice); - trx->changes.Put(pk_descr->get_cf(), key_slice, value_slice); + trx->changes.Put(pk_descr, key_slice, value_slice); DBUG_RETURN(0); } diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index 1974b44b2710..1b0723f275a1 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -169,8 +169,7 @@ class ha_rocksdb: public handler int create_key_defs(TABLE *table_arg, const char *db_table, uint len); int secondary_index_read(int keyno, uchar *buf); - void setup_index_scan(rocksdb::ColumnFamilyHandle *cf, - rocksdb::Slice *slice); + void setup_index_scan(RDBSE_KEYDEF *keydef, rocksdb::Slice *slice); int get_row_by_rowid(uchar *buf, const char *pk_tuple, uint pk_tuple_size); void update_auto_incr_val(); @@ -316,6 +315,10 @@ class ha_rocksdb: public handler int index_first(uchar *buf); int index_last(uchar *buf); +private: + int index_first_intern(uchar *buf); + int index_last_intern(uchar *buf); +public: int index_end(); void unlock_row(); diff --git a/storage/rocksdb/rdb_applyiter.cc b/storage/rocksdb/rdb_applyiter.cc index 7074776bef61..fbf0b3a6f5cd 100644 --- a/storage/rocksdb/rdb_applyiter.cc +++ b/storage/rocksdb/rdb_applyiter.cc @@ -41,11 +41,13 @@ Apply_changes_iter::~Apply_changes_iter() } -void Apply_changes_iter::init(Row_table *trx_arg, rocksdb::Iterator *rdb_arg) +void Apply_changes_iter::init(bool is_reverse_arg, Row_table *trx_arg, + rocksdb::Iterator *rdb_arg) { delete trx; delete rdb; - trx= new Row_table_iter(trx_arg); + is_reverse= is_reverse_arg; + trx= new Row_table_iter(trx_arg, is_reverse); rdb= rdb_arg; valid= false; } @@ -136,6 +138,9 @@ void Apply_changes_iter::advance(int direction) int cmp= direction * compare_mem_comparable_keys((const uchar*)trx_key.data(), trx_key.size(), (const uchar*)rdb_key.data(), rdb_key.size()); + if (is_reverse) + cmp *= -1; + if (!cmp) // keys are equal { if (trx->is_tombstone()) diff --git a/storage/rocksdb/rdb_applyiter.h b/storage/rocksdb/rdb_applyiter.h index cc664c58b426..d8f8095abdc7 100644 --- a/storage/rocksdb/rdb_applyiter.h +++ b/storage/rocksdb/rdb_applyiter.h @@ -17,6 +17,8 @@ class Row_table; class Row_table_iter; +class RDBSE_KEYDEF; + /* A class that looks like RocksDB's iterator, but internally it takes into account the changes made by the transaction. @@ -33,10 +35,14 @@ class Apply_changes_iter /* These are the iterators we're merging. We own them, so should free them */ Row_table_iter *trx; rocksdb::Iterator* rdb; + + /* If true, we're scanning reverse-ordered data */ + bool is_reverse; public: Apply_changes_iter(); ~Apply_changes_iter(); - void init(Row_table *trx_arg, rocksdb::Iterator *rdb_arg); + void init(bool is_reverse_arg, Row_table *trx_arg, + rocksdb::Iterator *rdb_arg); void Next(); void Prev(); diff --git a/storage/rocksdb/rdb_cf_manager.cc b/storage/rocksdb/rdb_cf_manager.cc index d926b27ecbe1..22c969303655 100644 --- a/storage/rocksdb/rdb_cf_manager.cc +++ b/storage/rocksdb/rdb_cf_manager.cc @@ -30,6 +30,18 @@ #endif #include + +/* Check if ColumnFamily name says it's a reverse-ordered CF */ +bool is_cf_name_reverse(const char *name) +{ + /* NULL means the default CF is used.. (TODO: can the default CF be reverse?) */ + if (name && !strncmp(name, "rev:", 4)) + return true; + else + return false; +} + + void Column_family_manager::init(std::vector *names, std::vector *handles) { diff --git a/storage/rocksdb/rdb_cf_options.h b/storage/rocksdb/rdb_cf_options.h index 659d1937a956..357fc0fa4122 100644 --- a/storage/rocksdb/rdb_cf_options.h +++ b/storage/rocksdb/rdb_cf_options.h @@ -21,6 +21,8 @@ namespace rocksdb { class ColumnFamilyOptions; } +bool is_cf_name_reverse(const char *name); + /* Per-column family options configs. diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc index 186306d5a730..f41ef30aeec9 100644 --- a/storage/rocksdb/rdb_datadic.cc +++ b/storage/rocksdb/rdb_datadic.cc @@ -962,7 +962,7 @@ bool Table_ddl_manager::init(rocksdb::DB *rdb_dict) look at Field* objects and set max_length and other attributes */ tdef->key_descr[keyno]= new RDBSE_KEYDEF(index_number, keyno, NULL, - false); + false, false); /* Keep track of what was the last index number we saw */ if (max_number < index_number) diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h index 71fad9b50e81..2491e47755ea 100644 --- a/storage/rocksdb/rdb_datadic.h +++ b/storage/rocksdb/rdb_datadic.h @@ -199,9 +199,11 @@ class RDBSE_KEYDEF } RDBSE_KEYDEF(uint indexnr_arg, uint keyno_arg, - rocksdb::ColumnFamilyHandle* cf_handle_arg, bool is_auto_cf_arg) : + rocksdb::ColumnFamilyHandle* cf_handle_arg, + bool is_reverse_cf_arg, bool is_auto_cf_arg) : index_number(indexnr_arg), cf_handle(cf_handle_arg), + is_reverse_cf(is_reverse_cf_arg), is_auto_cf(is_auto_cf_arg), pk_part_no(NULL), pack_info(NULL), @@ -219,9 +221,11 @@ class RDBSE_KEYDEF void setup(TABLE *table); void set_cf_handle(rocksdb::ColumnFamilyHandle* cf_handle_arg, + bool is_reverse_cf_arg, bool is_auto_cf_arg) { cf_handle= cf_handle_arg; + is_reverse_cf= is_reverse_cf_arg; is_auto_cf= is_auto_cf_arg; } @@ -235,7 +239,11 @@ class RDBSE_KEYDEF uchar index_number_storage_form[INDEX_NUMBER_SIZE]; rocksdb::ColumnFamilyHandle* cf_handle; + public: + /* If true, the column family stores data in the reverse order */ + bool is_reverse_cf; + bool is_auto_cf; private: diff --git a/storage/rocksdb/rdb_debug.cc b/storage/rocksdb/rdb_debug.cc index 8dd145084638..99e6dd981885 100644 --- a/storage/rocksdb/rdb_debug.cc +++ b/storage/rocksdb/rdb_debug.cc @@ -83,23 +83,26 @@ void dump_value(FILE *out, const rocksdb::Slice &val) void dump_trx_changes(FILE *out, Row_table &changes) { - Row_table_iter iter(&changes); - fprintf(out, "TRX %p\n", current_thd); - for (iter.SeekToFirst(); iter.Valid(); iter.Next()) + for (int reverse= 0; reverse <= 1; reverse++) + Row_table_iter iter(&changes, (bool)reverse); { - if (iter.is_tombstone()) + fprintf(out, "TRX %p\n", current_thd); + for (iter.SeekToFirst(); iter.Valid(); iter.Next()) { - fprintf(out, "DEL "); - dump_value(out, iter.key()); - fputs("\n", out); - } - else - { - fprintf(out, "PUT "); - dump_value(out, iter.key()); - fputs(" ", out); - dump_value(out, iter.value()); - fputs("\n", out); + if (iter.is_tombstone()) + { + fprintf(out, "DEL "); + dump_value(out, iter.key()); + fputs("\n", out); + } + else + { + fprintf(out, "PUT "); + dump_value(out, iter.key()); + fputs(" ", out); + dump_value(out, iter.value()); + fputs("\n", out); + } } } } diff --git a/storage/rocksdb/rdb_rowmods.cc b/storage/rocksdb/rdb_rowmods.cc index f680978ae4e5..019235b609e6 100644 --- a/storage/rocksdb/rdb_rowmods.cc +++ b/storage/rocksdb/rdb_rowmods.cc @@ -18,18 +18,28 @@ #include "my_base.h" /* ha_rows */ #include "my_sys.h" #include "my_tree.h" +#include +#include "ha_rocksdb.h" +#include "sql_class.h" #include "rocksdb/db.h" #include "rocksdb/comparator.h" #include "rocksdb/write_batch.h" #include "rdb_rowmods.h" +#include "rdb_datadic.h" void Row_table::init() { - init_tree(&tree, 512 /*default_alloc_size*/, 0 /*memory_limit*/, + init_tree(&fw_tree, 512 /*default_alloc_size*/, 0 /*memory_limit*/, sizeof(void*)/*size*/, Row_table::compare_rows, 1 /*with_delete*/, NULL /*free_element*/, NULL/*custom_arg*/); - tree.flag |= TREE_NO_DUPS; + fw_tree.flag |= TREE_NO_DUPS; + + init_tree(&bw_tree, 512 /*default_alloc_size*/, 0 /*memory_limit*/, + sizeof(void*)/*size*/, Row_table::compare_rows_rev, 1 /*with_delete*/, + NULL /*free_element*/, NULL/*custom_arg*/); + bw_tree.flag |= TREE_NO_DUPS; + init_alloc_root(&mem_root, 512, 512); stmt_id= 1; change_id= 0; @@ -38,7 +48,7 @@ void Row_table::init() void Row_table::reinit() { - if (tree.elements_in_tree > 0) + if (fw_tree.elements_in_tree > 0 || bw_tree.elements_in_tree > 0) { cleanup(); init(); @@ -48,7 +58,8 @@ void Row_table::reinit() void Row_table::cleanup() { - delete_tree(&tree); + delete_tree(&fw_tree); + delete_tree(&bw_tree); free_root(&mem_root, MYF(0)); } @@ -66,10 +77,13 @@ void Row_table::cleanup() false - means found nothing */ -bool Row_table::Get(rocksdb::Slice &key, std::string *record, bool *found) +bool Row_table::Get(RDBSE_KEYDEF *keydef, rocksdb::Slice &key, + std::string *record, bool *found) { ROW_DATA **row_ptr; - if ((row_ptr= (ROW_DATA**)tree_search(&tree, &key, &key))) + TREE *tree= keydef->is_reverse_cf? &bw_tree : &fw_tree; + + if ((row_ptr= (ROW_DATA**)tree_search(tree, &key, &key))) { ROW_DATA *row= *row_ptr; if (row->value_len == DATA_IS_TOMBSTONE) @@ -86,6 +100,12 @@ bool Row_table::Get(rocksdb::Slice &key, std::string *record, bool *found) } +int Row_table::compare_rows_rev(const void* arg, const void *a,const void *b) +{ + return -compare_rows(arg, a, b); +} + + int Row_table::compare_rows(const void* arg, const void *a, const void *b) { uchar *pa, *pb; @@ -133,7 +153,7 @@ int Row_table::compare_rows(const void* arg, const void *a, const void *b) return res; } -bool Row_table::Put(rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice& key, +bool Row_table::Put(RDBSE_KEYDEF *keydef, rocksdb::Slice& key, rocksdb::Slice& val) { uchar *data = (uchar*)alloc_root(&mem_root, ROW_DATA_SIZE + key.size() + @@ -142,18 +162,20 @@ bool Row_table::Put(rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice& key, ROW_DATA *rdata= (ROW_DATA*)data; rdata->key_len= key.size(); rdata->value_len= val.size(); - rdata->cf= cf; + rdata->keydef= keydef; rdata->stmt_id= stmt_id; rdata->prev_version= NULL; memcpy(data + ROW_DATA_SIZE, key.data(), key.size()); memcpy(data + ROW_DATA_SIZE + key.size(), val.data(), val.size()); change_id++; - if (!tree_insert(&tree, &data, /*key_size*/0, NULL/*custom_arg*/)) + TREE *tree= keydef->is_reverse_cf? &bw_tree : &fw_tree; + + if (!tree_insert(tree, &data, /*key_size*/0, NULL/*custom_arg*/)) { /* There is already a record with this key (or Out-Of-Memory) */ ROW_DATA **row_ptr; - row_ptr= (ROW_DATA**)tree_search(&tree, &key, &key); + row_ptr= (ROW_DATA**)tree_search(tree, &key, &key); if (!row_ptr) return true; @@ -175,24 +197,25 @@ bool Row_table::Put(rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice& key, Put a tombstone into the table */ -bool Row_table::Delete(rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice& key) +bool Row_table::Delete(RDBSE_KEYDEF *keydef, rocksdb::Slice& key) { uchar *data = (uchar*)alloc_root(&mem_root, ROW_DATA_SIZE + key.size()); ROW_DATA *rdata= (ROW_DATA*)data; rdata->key_len= key.size(); rdata->value_len= DATA_IS_TOMBSTONE; - rdata->cf= cf; + rdata->keydef= keydef; rdata->stmt_id= stmt_id; rdata->prev_version= NULL; memcpy(data + ROW_DATA_SIZE, key.data(), key.size()); change_id++; + TREE *tree= keydef->is_reverse_cf? &bw_tree : &fw_tree; - if (!tree_insert(&tree, &data, /*key_size*/0, NULL/*custom_arg*/)) + if (!tree_insert(tree, &data, /*key_size*/0, NULL/*custom_arg*/)) { /* There is already a record with this key (or Out-Of-Memory) */ ROW_DATA **row_ptr; - row_ptr= (ROW_DATA**)tree_search(&tree, &key, &key); + row_ptr= (ROW_DATA**)tree_search(tree, &key, &key); if (!row_ptr) return true; /* OOM */ @@ -223,44 +246,49 @@ void Row_table::start_stmt() */ void Row_table::rollback_stmt() { - ROW_DATA *delete_list= NULL; - Row_table_iter iter(this); - - /* - To avoid invalidating the iterator, first collect all items that need to be - deleted in a linked list, and then actually do the deletes. - */ - for (iter.SeekToFirst(); iter.Valid(); iter.Next()) + for (int reverse=0; reverse <= 1; reverse++) { - if ((*iter.row_ptr)->stmt_id == stmt_id) + ROW_DATA *delete_list= NULL; + Row_table_iter iter(this, (bool)reverse); + + /* + To avoid invalidating the iterator, first collect all items that need to be + deleted in a linked list, and then actually do the deletes. + */ + for (iter.SeekToFirst(); iter.Valid(); iter.Next()) { - if ((*iter.row_ptr)->prev_version) + if ((*iter.row_ptr)->stmt_id == stmt_id) { - /* - This element has a previous version (the previous version is what the - element was before the current statement). - Replace the element with the its previous version. They have the same - key value, so there is no need to re-balance the tree. - */ - *iter.row_ptr= (*iter.row_ptr)->prev_version; - } - else - { - /* No previous version. Record for removal */ - (*iter.row_ptr)->prev_version= delete_list; - delete_list= (*iter.row_ptr); + if ((*iter.row_ptr)->prev_version) + { + /* + This element has a previous version (the previous version is what the + element was before the current statement). + Replace the element with the its previous version. They have the same + key value, so there is no need to re-balance the tree. + */ + *iter.row_ptr= (*iter.row_ptr)->prev_version; + } + else + { + /* No previous version. Record for removal */ + (*iter.row_ptr)->prev_version= delete_list; + delete_list= (*iter.row_ptr); + } } } - } - /* Do all of the recorded deletes */ - while (delete_list) - { - ROW_DATA *next= delete_list->prev_version; + /* Do all of the recorded deletes */ + TREE *tree= reverse? &bw_tree : &fw_tree; + + while (delete_list) + { + ROW_DATA *next= delete_list->prev_version; - tree_delete(&tree, &delete_list, /*key_size*/ 0, NULL); + tree_delete(tree, &delete_list, /*key_size*/ 0, NULL); - delete_list= next; + delete_list= next; + } } change_id++; @@ -271,14 +299,16 @@ void Row_table::rollback_stmt() * Row_table_iter ***************************************************************************/ -Row_table_iter::Row_table_iter(Row_table *rtable_arg) : - rtable(rtable_arg), row_ptr(NULL), change_id(rtable_arg->change_id) +Row_table_iter::Row_table_iter(Row_table *rtable_arg, bool is_reverse_arg) : + rtable(rtable_arg), is_reverse(is_reverse_arg), row_ptr(NULL), + change_id(rtable_arg->change_id) {} void Row_table_iter::Seek(const rocksdb::Slice &slice) { - row_ptr= (ROW_DATA**)tree_search_key(&rtable->tree, &slice, parents, &last_pos, + TREE *tree= is_reverse? &rtable->bw_tree : &rtable->fw_tree; + row_ptr= (ROW_DATA**)tree_search_key(tree, &slice, parents, &last_pos, HA_READ_KEY_OR_NEXT, &slice/*custom_arg*/); change_id= rtable->change_id; } @@ -286,7 +316,8 @@ void Row_table_iter::Seek(const rocksdb::Slice &slice) void Row_table_iter::SeekToFirst() { - row_ptr= (ROW_DATA**)tree_search_edge(&rtable->tree, parents, &last_pos, + TREE *tree= is_reverse? &rtable->bw_tree : &rtable->fw_tree; + row_ptr= (ROW_DATA**)tree_search_edge(tree, parents, &last_pos, offsetof(TREE_ELEMENT, left)); change_id= rtable->change_id; } @@ -294,7 +325,8 @@ void Row_table_iter::SeekToFirst() void Row_table_iter::SeekToLast() { - row_ptr= (ROW_DATA**)tree_search_edge(&rtable->tree, parents, &last_pos, + TREE *tree= is_reverse? &rtable->bw_tree : &rtable->fw_tree; + row_ptr= (ROW_DATA**)tree_search_edge(tree, parents, &last_pos, offsetof(TREE_ELEMENT, right)); change_id= rtable->change_id; } @@ -302,16 +334,17 @@ void Row_table_iter::SeekToLast() void Row_table_iter::Next() { + TREE *tree= is_reverse? &rtable->bw_tree : &rtable->fw_tree; if (rtable->change_id != change_id) { change_id= rtable->change_id; - row_ptr= (ROW_DATA**)tree_search_key(&rtable->tree, row_ptr, parents, + row_ptr= (ROW_DATA**)tree_search_key(tree, row_ptr, parents, &last_pos, HA_READ_AFTER_KEY, NULL/*custom_arg*/); } else { - row_ptr= (ROW_DATA**)tree_search_next(&rtable->tree, &last_pos, + row_ptr= (ROW_DATA**)tree_search_next(tree, &last_pos, offsetof(TREE_ELEMENT, left), offsetof(TREE_ELEMENT, right)); } @@ -320,16 +353,17 @@ void Row_table_iter::Next() void Row_table_iter::Prev() { + TREE *tree= is_reverse? &rtable->bw_tree : &rtable->fw_tree; if (rtable->change_id != change_id) { change_id= rtable->change_id; - row_ptr= (ROW_DATA**)tree_search_key(&rtable->tree, row_ptr, parents, + row_ptr= (ROW_DATA**)tree_search_key(tree, row_ptr, parents, &last_pos, HA_READ_BEFORE_KEY, NULL/*custom_arg*/); } else { - row_ptr= (ROW_DATA**)tree_search_next(&rtable->tree, &last_pos, + row_ptr= (ROW_DATA**)tree_search_next(tree, &last_pos, offsetof(TREE_ELEMENT, right), offsetof(TREE_ELEMENT, left)); } @@ -365,9 +399,10 @@ rocksdb::Slice Row_table_iter::value() } -rocksdb::ColumnFamilyHandle *Row_table_iter::cf_handle() +RDBSE_KEYDEF *Row_table_iter::keydef() { DBUG_ASSERT(Valid()); ROW_DATA *row= *row_ptr; - return row->cf; + return row->keydef; } + diff --git a/storage/rocksdb/rdb_rowmods.h b/storage/rocksdb/rdb_rowmods.h index f160067ee97e..150f9442c0f4 100644 --- a/storage/rocksdb/rdb_rowmods.h +++ b/storage/rocksdb/rdb_rowmods.h @@ -16,6 +16,7 @@ #include "my_tree.h" +class RDBSE_KEYDEF; typedef struct st_row_data { @@ -24,9 +25,10 @@ typedef struct st_row_data /* Can have a special value: DATA_IS_TOMBSTONE */ size_t value_len; - rocksdb::ColumnFamilyHandle *cf; + /* RocksDB-SE index this row is from. This allows to get the Column Family */ + RDBSE_KEYDEF *keydef; - /* Previous version */ + /* Previous version of the row with this key */ struct st_row_data *prev_version; /* Number of the statement that inserted this row/tombstone */ @@ -59,6 +61,9 @@ class Row_table_iter { Row_table *rtable; /* Table this iterator is for*/ + /* Whether we are iterating through forward or reverse keyspace */ + bool is_reverse; + /* The following are for tree iteration: */ TREE_ELEMENT *parents[MAX_TREE_HEIGHT+1]; TREE_ELEMENT **last_pos; @@ -71,7 +76,7 @@ class Row_table_iter int change_id; friend class Row_table; public: - Row_table_iter(Row_table *rtable_arg); + Row_table_iter(Row_table *rtable_arg, bool is_reverse_arg); /* Scanning functions */ void Seek(const rocksdb::Slice &slice); @@ -86,19 +91,48 @@ class Row_table_iter bool is_tombstone(); rocksdb::Slice key(); rocksdb::Slice value(); - - rocksdb::ColumnFamilyHandle *cf_handle(); + /* + RocksDB-SE index this row belongs to (this also allows to get the column + family) + */ + RDBSE_KEYDEF *keydef(); }; /* A storage for rows, or their tombstones. One can use rocksdb-like iterators to traverse the rows. + + == Relationship with Column Families == + There is only one Row_table object that stores rows from all Column Families. + We rely on the fact that no two rows have the same key, even if they are in + different column families. + + The rows store pointer to their RDBSE_KEYDEF, one can find out which Column + Family the row is from by calling Row_table_iter::keydef(). + + == Forward/backward ordered CFs == + We use two trees - one for rows from forward-ordered CFs, and one for rows + from backward-ordered CFs. + + (we could theoretically use a separate tree for each CF, but TREE stucture is + too heavy for this. + + I've also considered a solution where both forward an backward-ordered data + is kept in the same tree. We could compare by index_no first, and then + compare the remainder in either forward or reverse sorting. This defines an + ordering sufficient for TREE object to operate, but causes difficult issues + at table start/end. + ) */ class Row_table { - TREE tree; + /* Tree for column families using forward ordering */ + TREE fw_tree; + /* Tree for column families using backward ordering */ + TREE bw_tree; + MEM_ROOT mem_root; /* Current statement id */ @@ -107,6 +141,7 @@ class Row_table /* This is incremented on every change, so iterators can know if they were invalidated and should re-position themselves. + (todo: can have a separate change_id for every tree) */ int change_id; @@ -119,12 +154,13 @@ class Row_table void reinit(); /* Operations to put a row, or a tombstone */ - bool Put(rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice& key, + bool Put(RDBSE_KEYDEF *keydef, rocksdb::Slice& key, rocksdb::Slice& val); - bool Delete(rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice& key); + bool Delete(RDBSE_KEYDEF *keydef, rocksdb::Slice& key); /* Lookup may find nothing, find row, of find a tombstone */ - bool Get(rocksdb::Slice &key, std::string *record, bool *found); + bool Get(RDBSE_KEYDEF *keydef, rocksdb::Slice &key, + std::string *record, bool *found); /* Statement support. It is possible to rollback all changes made by the @@ -136,8 +172,11 @@ class Row_table /* This may return false when there are really no changes (TODO: still true?) */ bool is_empty() { - return (tree.elements_in_tree == 0); + return (fw_tree.elements_in_tree == 0 && bw_tree.elements_in_tree == 0); }; private: static int compare_rows(const void* arg, const void *a,const void *b); + static int compare_rows_rev(const void* arg, const void *a,const void *b); }; + +