release-3.5: backport etcd-io#17263 Fix tx buffer inconsistency if there are unordered key writes in one tx.

etcd-io#17263

Notes:
1. batch_tx_test.go is not incorporated into this backport because main and release-3.5 diverge a lot on buckets/schema/etc.
It is also not the core of the bug fix.
2. verify in tx_buffer.go is removed, mainly because the dependency is not present in release-3.5.

Signed-off-by: Chao Chen <[email protected]>
chaochn47 committed Nov 11, 2024
1 parent 4726460 commit fc2db8e
Showing 3 changed files with 264 additions and 9 deletions.
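
The following is not part of the commit; it is a minimal standalone Go sketch (plain string slices instead of etcd's kv buffer types) of the failure mode the commit title describes: bucketBuffer.Range locates keys with sort.Search, i.e. binary search, so if a bucket buffer is published to readers while unsorted, point reads can miss keys that are actually present.

```go
package main

import (
	"fmt"
	"sort"
)

func main() {
	// Keys written within one tx in non-increasing order, then exposed to
	// readers without sorting -- the pre-fix situation this commit addresses.
	unsorted := []string{"foo5", "foo2", "foo1", "foo3", "foo", "foo4"}

	// Stand-in for bucketBuffer.Range's lookup: binary search for the first
	// key >= target, which is only correct on sorted data.
	find := func(buf []string, target string) (int, bool) {
		i := sort.Search(len(buf), func(j int) bool { return buf[j] >= target })
		return i, i < len(buf) && buf[i] == target
	}

	_, ok := find(unsorted, "foo")
	fmt.Println("lookup of existing key on unsorted buffer:", ok) // false: the key is missed

	sorted := append([]string(nil), unsorted...)
	sort.Strings(sorted)
	_, ok = find(sorted, "foo")
	fmt.Println("lookup of existing key on sorted buffer:  ", ok) // true
}
```

The new tests below (e.g. TestRangeAfterUnorderedKeyWriteMatch) exercise exactly this write pattern against the real backend.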
185 changes: 180 additions & 5 deletions server/mvcc/backend/batch_tx_test.go
@@ -15,11 +15,14 @@
package backend_test

import (
"fmt"
"math/rand"
"reflect"
"testing"
"time"

"github.com/google/go-cmp/cmp"

bolt "go.etcd.io/bbolt"
"go.etcd.io/etcd/server/v3/mvcc/backend"
betesting "go.etcd.io/etcd/server/v3/mvcc/backend/testing"
@@ -239,24 +242,113 @@ func TestRangeAfterDeleteMatch(t *testing.T) {
tx.Unlock()
tx.Commit()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), []byte("foo"), nil, 0)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo")}, [][]byte{[]byte("bar")})

tx.Lock()
tx.UnsafeDelete(buckets.Test, []byte("foo"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), []byte("foo"), nil, 0)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), nil, nil)
}

func checkRangeResponseMatch(t *testing.T, tx backend.BatchTx, rtx backend.ReadTx, key, endKey []byte, limit int64) {
func TestRangeAfterUnorderedKeyWriteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafePut(buckets.Test, []byte("foo5"), []byte("bar5"))
tx.UnsafePut(buckets.Test, []byte("foo2"), []byte("bar2"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar1"))
tx.UnsafePut(buckets.Test, []byte("foo3"), []byte("bar3"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar"))
tx.UnsafePut(buckets.Test, []byte("foo4"), []byte("bar4"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 1)
}

func TestRangeAfterAlternatingBucketWriteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()

tx.Lock()
tx.UnsafeCreateBucket(buckets.Key)
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafeSeqPut(buckets.Key, []byte("key1"), []byte("val1"))
tx.Unlock()

tx.Lock()
tx.UnsafeSeqPut(buckets.Key, []byte("key2"), []byte("val2"))
tx.Unlock()
tx.Commit()
// Only in the 2nd commit is the buckets.Key bucket removed from readBuffer.buckets.
// This makes sure to test the case where an empty writeBuffer bucket
// is used to replace the read buffer bucket.
tx.Commit()
tx.Lock()
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar"))
tx.Unlock()
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Key, []byte("key"), []byte("key5"), 100)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), []byte("foo3"), 1)
}

func TestRangeAfterOverwriteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar2"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar0"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar10"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar1"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar11"))
tx.Unlock()
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), []byte("foo3"), 1)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo"), []byte("foo1")}, [][]byte{[]byte("bar1"), []byte("bar11")})
}

func TestRangeAfterOverwriteAndDeleteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()

tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar2"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar0"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar10"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar1"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar11"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo"), []byte("foo1")}, [][]byte{[]byte("bar1"), []byte("bar11")})

tx.Lock()
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar3"))
tx.UnsafeDelete(buckets.Test, []byte("foo1"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo1"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo")}, [][]byte{[]byte("bar3")})
}

func checkRangeResponseMatch(t *testing.T, tx backend.BatchTx, rtx backend.ReadTx, bucket backend.Bucket, key, endKey []byte, limit int64) {
tx.Lock()
ks1, vs1 := tx.UnsafeRange(buckets.Test, key, endKey, limit)
ks1, vs1 := tx.UnsafeRange(bucket, key, endKey, limit)
tx.Unlock()

rtx.RLock()
ks2, vs2 := rtx.UnsafeRange(buckets.Test, key, endKey, limit)
ks2, vs2 := rtx.UnsafeRange(bucket, key, endKey, limit)
rtx.RUnlock()

if diff := cmp.Diff(ks1, ks2); diff != "" {
@@ -292,3 +384,86 @@ func checkUnsafeForEach(t *testing.T, tx backend.ReadTx, expectedKeys, expectedV
t.Errorf("values on transaction doesn't match expected, diff: %s", diff)
}
}

// runWriteback is used to test the txWriteBuffer.writeback function, which is called inside tx.Unlock().
// The parameters are chosen based on defaultBatchLimit = 10000.
func runWriteback(t testing.TB, kss, vss [][]string, isSeq bool) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()

tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafeCreateBucket(buckets.Key)
tx.Unlock()
for i, ks := range kss {
vs := vss[i]
tx.Lock()
for j := 0; j < len(ks); j++ {
if isSeq {
tx.UnsafeSeqPut(buckets.Key, []byte(ks[j]), []byte(vs[j]))
} else {
tx.UnsafePut(buckets.Test, []byte(ks[j]), []byte(vs[j]))
}
}
tx.Unlock()
}
}

func BenchmarkWritebackSeqBatches1BatchSize10000(b *testing.B) { benchmarkWriteback(b, 1, 10000, true) }

func BenchmarkWritebackSeqBatches10BatchSize1000(b *testing.B) { benchmarkWriteback(b, 10, 1000, true) }

func BenchmarkWritebackSeqBatches100BatchSize100(b *testing.B) { benchmarkWriteback(b, 100, 100, true) }

func BenchmarkWritebackSeqBatches1000BatchSize10(b *testing.B) { benchmarkWriteback(b, 1000, 10, true) }

func BenchmarkWritebackNonSeqBatches1000BatchSize1(b *testing.B) {
// for non-sequential writes, the batch size is usually small: 1, or on the order of the cluster size.
benchmarkWriteback(b, 1000, 1, false)
}

func BenchmarkWritebackNonSeqBatches10000BatchSize1(b *testing.B) {
benchmarkWriteback(b, 10000, 1, false)
}

func BenchmarkWritebackNonSeqBatches100BatchSize10(b *testing.B) {
benchmarkWriteback(b, 100, 10, false)
}

func BenchmarkWritebackNonSeqBatches1000BatchSize10(b *testing.B) {
benchmarkWriteback(b, 1000, 10, false)
}

func benchmarkWriteback(b *testing.B, batches, batchSize int, isSeq bool) {
// kss and vss are the key and value batches to write, batches*batchSize keys in total
var kss, vss [][]string
for i := 0; i < batches; i++ {
var ks, vs []string
for j := i * batchSize; j < (i+1)*batchSize; j++ {
k := fmt.Sprintf("key%d", j)
v := fmt.Sprintf("val%d", j)
ks = append(ks, k)
vs = append(vs, v)
}
if !isSeq {
// make sure each batch is shuffled differently, but deterministically across test runs.
shuffleList(ks, i*batchSize)
}
kss = append(kss, ks)
vss = append(vss, vs)
}
b.ResetTimer()
for n := 0; n < b.N; n++ {
runWriteback(b, kss, vss, isSeq)
}
}

func shuffleList(l []string, seed int) {
r := rand.New(rand.NewSource(int64(seed)))
for i := 0; i < len(l); i++ {
j := r.Intn(i + 1)
l[i], l[j] = l[j], l[i]
}
}
17 changes: 13 additions & 4 deletions server/mvcc/backend/tx_buffer.go
@@ -50,7 +50,8 @@ func (txw *txWriteBuffer) put(bucket Bucket, k, v []byte) {
}

func (txw *txWriteBuffer) putSeq(bucket Bucket, k, v []byte) {
// TODO: Add (in tests?) verification whether k>b[len(b)]
// putSeq is only called for data in the Key bucket. The keys
// in the Key bucket should be monotonically increasing revisions.
txw.putInternal(bucket, k, v)
}
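
Not part of the diff — a small illustrative check of the invariant this comment relies on. The key encoding below (an 8-byte big-endian counter) is only a stand-in for real revision keys, but it shows why the sequential path can skip any re-sorting: appending monotonically increasing keys, as putSeq callers do for the Key bucket, produces a buffer that is already sorted and duplicate-free.

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"sort"
)

func main() {
	// revKey is a hypothetical revision-style key: an 8-byte big-endian
	// counter, so lexicographic byte order matches numeric order.
	revKey := func(rev uint64) []byte {
		b := make([]byte, 8)
		binary.BigEndian.PutUint64(b, rev)
		return b
	}

	// Sequential puts append keys for strictly increasing revisions.
	var buf [][]byte
	for rev := uint64(1); rev <= 5; rev++ {
		buf = append(buf, revKey(rev))
	}

	sorted := sort.SliceIsSorted(buf, func(i, j int) bool {
		return bytes.Compare(buf[i], buf[j]) < 0
	})
	fmt.Println("seq buffer already sorted:", sorted) // true -> dedupe not needed on this path
}
```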

@@ -80,6 +81,9 @@ func (txw *txWriteBuffer) writeback(txr *txReadBuffer) {
rb, ok := txr.buckets[k]
if !ok {
delete(txw.buckets, k)
if seq, ok := txw.bucket2seq[k]; ok && !seq {
wb.dedupe()
}
txr.buckets[k] = wb
continue
}
@@ -148,7 +152,7 @@ func newBucketBuffer() *bucketBuffer {
func (bb *bucketBuffer) Range(key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte) {
f := func(i int) bool { return bytes.Compare(bb.buf[i].key, key) >= 0 }
idx := sort.Search(bb.used, f)
if idx < 0 {
if idx < 0 || idx >= bb.used {
return nil, nil
}
if len(endKey) == 0 {
Expand Down Expand Up @@ -201,10 +205,15 @@ func (bb *bucketBuffer) merge(bbsrc *bucketBuffer) {
if bytes.Compare(bb.buf[(bb.used-bbsrc.used)-1].key, bbsrc.buf[0].key) < 0 {
return
}
bb.dedupe()
}

// dedupe removes duplicates, using only newest update
func (bb *bucketBuffer) dedupe() {
if bb.used <= 1 {
return
}
sort.Stable(bb)

// remove duplicates, using only newest update
widx := 0
for ridx := 1; ridx < bb.used; ridx++ {
if !bytes.Equal(bb.buf[ridx].key, bb.buf[widx].key) {
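
Again not part of the diff — a condensed sketch, with simplified map and struct types standing in for txWriteBuffer, txReadBuffer and bucketBuffer, of what the writeback change above amounts to: before a write-buffer bucket replaces a read-buffer bucket that does not exist yet, it is sorted and deduplicated, but only if the bucket is non-sequential. The stable sort matters because it keeps the newest of several writes to the same key.

```go
package main

import (
	"fmt"
	"sort"
)

// kv and bucket are simplified stand-ins for etcd's kv/bucketBuffer types.
type kv struct{ key, val string }
type bucket struct{ kvs []kv }

// dedupe mirrors the idea of bucketBuffer.dedupe: stable-sort by key, then
// keep only the last (newest) entry for each key.
func (b *bucket) dedupe() {
	if len(b.kvs) <= 1 {
		return
	}
	sort.SliceStable(b.kvs, func(i, j int) bool { return b.kvs[i].key < b.kvs[j].key })
	w := 0
	for r := 1; r < len(b.kvs); r++ {
		if b.kvs[r].key != b.kvs[w].key {
			w++
		}
		b.kvs[w] = b.kvs[r]
	}
	b.kvs = b.kvs[:w+1]
}

// writeback sketches the fixed path: a write bucket that is about to become
// a read bucket is deduped first, unless it is a sequential bucket.
func writeback(read, write map[string]*bucket, seq map[string]bool) {
	for name, wb := range write {
		if _, ok := read[name]; !ok {
			if !seq[name] {
				wb.dedupe() // the core of the fix: never publish an unsorted bucket
			}
			read[name] = wb
			continue
		}
		// merge path omitted in this sketch
	}
}

func main() {
	write := map[string]*bucket{
		"test": {kvs: []kv{{"foo5", "bar5"}, {"foo", "bar"}, {"foo", "bar2"}, {"foo1", "bar1"}}},
	}
	read := map[string]*bucket{}
	writeback(read, write, map[string]bool{"key": true})
	fmt.Println(read["test"].kvs) // [{foo bar2} {foo1 bar1} {foo5 bar5}]
}
```

TestDedupe in the new tx_buffer_test.go below pins down exactly this newest-wins behavior for duplicate and unordered keys.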
71 changes: 71 additions & 0 deletions server/mvcc/backend/tx_buffer_test.go
@@ -0,0 +1,71 @@
// Copyright 2023 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package backend

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestDedupe(t *testing.T) {
tests := []struct {
name string
keys, vals, expectedKeys, expectedVals []string
}{
{
name: "empty",
keys: []string{},
vals: []string{},
expectedKeys: []string{},
expectedVals: []string{},
},
{
name: "single kv",
keys: []string{"key1"},
vals: []string{"val1"},
expectedKeys: []string{"key1"},
expectedVals: []string{"val1"},
},
{
name: "duplicate key",
keys: []string{"key1", "key1"},
vals: []string{"val1", "val2"},
expectedKeys: []string{"key1"},
expectedVals: []string{"val2"},
},
{
name: "unordered keys",
keys: []string{"key3", "key1", "key4", "key2", "key1", "key4"},
vals: []string{"val1", "val5", "val3", "val4", "val2", "val6"},
expectedKeys: []string{"key1", "key2", "key3", "key4"},
expectedVals: []string{"val2", "val4", "val1", "val6"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
bb := &bucketBuffer{buf: make([]kv, 10), used: 0}
for i := 0; i < len(tt.keys); i++ {
bb.add([]byte(tt.keys[i]), []byte(tt.vals[i]))
}
bb.dedupe()
assert.Len(t, tt.expectedKeys, bb.used)
for i := 0; i < bb.used; i++ {
assert.Equal(t, bb.buf[i].key, []byte(tt.expectedKeys[i]))
assert.Equal(t, bb.buf[i].val, []byte(tt.expectedVals[i]))
}
})
}
}
