release-3.5: backport etcd-io#17263 Fix tx buffer inconsistency if there are unordered key writes in one tx.

etcd-io#17263

Notes:
1. batch_tx_test.go is not incorporated into this backport because main and release-3.5 diverge a lot on buckets/schema/etc.
It is also not the core of the bug fix.
2. verify in tx_buffer.go is removed, mainly because the dependency is not present in release-3.5.

Signed-off-by: Chao Chen <[email protected]>
chaochn47 committed Nov 11, 2024
1 parent 4726460 commit fc2db8e
Showing 3 changed files with 264 additions and 9 deletions.
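
The following is not part of the commit; it is a minimal standalone Go sketch (plain string slices instead of etcd's kv buffer types) of the failure mode the commit title describes: bucketBuffer.Range locates keys with sort.Search, i.e. binary search, so if a bucket buffer is published to readers while unsorted, point reads can miss keys that are actually present.

```go
package main

import (
	"fmt"
	"sort"
)

func main() {
	// Keys written within one tx in non-increasing order, then exposed to
	// readers without sorting -- the pre-fix situation this commit addresses.
	unsorted := []string{"foo5", "foo2", "foo1", "foo3", "foo", "foo4"}

	// Stand-in for bucketBuffer.Range's lookup: binary search for the first
	// key >= target, which is only correct on sorted data.
	find := func(buf []string, target string) (int, bool) {
		i := sort.Search(len(buf), func(j int) bool { return buf[j] >= target })
		return i, i < len(buf) && buf[i] == target
	}

	_, ok := find(unsorted, "foo")
	fmt.Println("lookup of existing key on unsorted buffer:", ok) // false: the key is missed

	sorted := append([]string(nil), unsorted...)
	sort.Strings(sorted)
	_, ok = find(sorted, "foo")
	fmt.Println("lookup of existing key on sorted buffer:  ", ok) // true
}
```

The new tests below (e.g. TestRangeAfterUnorderedKeyWriteMatch) exercise exactly this write pattern against the real backend.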
185 changes: 180 additions & 5 deletions server/mvcc/backend/batch_tx_test.go
@@ -15,11 +15,14 @@
package backend_test

import (
"fmt"
"math/rand"
"reflect"
"testing"
"time"

"github.com/google/go-cmp/cmp"

bolt "go.etcd.io/bbolt"
"go.etcd.io/etcd/server/v3/mvcc/backend"
betesting "go.etcd.io/etcd/server/v3/mvcc/backend/testing"
@@ -239,24 +242,113 @@ func TestRangeAfterDeleteMatch(t *testing.T) {
tx.Unlock()
tx.Commit()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), []byte("foo"), nil, 0)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo")}, [][]byte{[]byte("bar")})

tx.Lock()
tx.UnsafeDelete(buckets.Test, []byte("foo"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), []byte("foo"), nil, 0)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), nil, nil)
}

func checkRangeResponseMatch(t *testing.T, tx backend.BatchTx, rtx backend.ReadTx, key, endKey []byte, limit int64) {
func TestRangeAfterUnorderedKeyWriteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafePut(buckets.Test, []byte("foo5"), []byte("bar5"))
tx.UnsafePut(buckets.Test, []byte("foo2"), []byte("bar2"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar1"))
tx.UnsafePut(buckets.Test, []byte("foo3"), []byte("bar3"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar"))
tx.UnsafePut(buckets.Test, []byte("foo4"), []byte("bar4"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 1)
}

func TestRangeAfterAlternatingBucketWriteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()

tx.Lock()
tx.UnsafeCreateBucket(buckets.Key)
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafeSeqPut(buckets.Key, []byte("key1"), []byte("val1"))
tx.Unlock()

tx.Lock()
tx.UnsafeSeqPut(buckets.Key, []byte("key2"), []byte("val2"))
tx.Unlock()
tx.Commit()
// Only in the 2nd commit is the buckets.Key bucket removed from readBuffer.buckets.
// This makes sure to test the case where an empty writeBuffer bucket
// is used to replace the read buffer bucket.
tx.Commit()
tx.Lock()
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar"))
tx.Unlock()
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Key, []byte("key"), []byte("key5"), 100)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), []byte("foo3"), 1)
}

func TestRangeAfterOverwriteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar2"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar0"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar10"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar1"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar11"))
tx.Unlock()
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), []byte("foo3"), 1)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo"), []byte("foo1")}, [][]byte{[]byte("bar1"), []byte("bar11")})
}

func TestRangeAfterOverwriteAndDeleteMatch(t *testing.T) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()

tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar2"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar0"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar10"))
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar1"))
tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar11"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo"), []byte("foo1")}, [][]byte{[]byte("bar1"), []byte("bar11")})

tx.Lock()
tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar3"))
tx.UnsafeDelete(buckets.Test, []byte("foo1"))
tx.Unlock()

checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo1"), nil, 0)
checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo")}, [][]byte{[]byte("bar3")})
}

func checkRangeResponseMatch(t *testing.T, tx backend.BatchTx, rtx backend.ReadTx, bucket backend.Bucket, key, endKey []byte, limit int64) {
tx.Lock()
ks1, vs1 := tx.UnsafeRange(buckets.Test, key, endKey, limit)
ks1, vs1 := tx.UnsafeRange(bucket, key, endKey, limit)
tx.Unlock()

rtx.RLock()
ks2, vs2 := rtx.UnsafeRange(buckets.Test, key, endKey, limit)
ks2, vs2 := rtx.UnsafeRange(bucket, key, endKey, limit)
rtx.RUnlock()

if diff := cmp.Diff(ks1, ks2); diff != "" {
@@ -292,3 +384,86 @@ func checkUnsafeForEach(t *testing.T, tx backend.ReadTx, expectedKeys, expectedV
t.Errorf("values on transaction doesn't match expected, diff: %s", diff)
}
}

// runWriteback is used to test the txWriteBuffer.writeback function, which is called inside tx.Unlock().
// The parameters are chosen based on defaultBatchLimit = 10000.
func runWriteback(t testing.TB, kss, vss [][]string, isSeq bool) {
b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
defer betesting.Close(t, b)

tx := b.BatchTx()

tx.Lock()
tx.UnsafeCreateBucket(buckets.Test)
tx.UnsafeCreateBucket(buckets.Key)
tx.Unlock()
for i, ks := range kss {
vs := vss[i]
tx.Lock()
for j := 0; j < len(ks); j++ {
if isSeq {
tx.UnsafeSeqPut(buckets.Key, []byte(ks[j]), []byte(vs[j]))
} else {
tx.UnsafePut(buckets.Test, []byte(ks[j]), []byte(vs[j]))
}
}
tx.Unlock()
}
}

func BenchmarkWritebackSeqBatches1BatchSize10000(b *testing.B) { benchmarkWriteback(b, 1, 10000, true) }

func BenchmarkWritebackSeqBatches10BatchSize1000(b *testing.B) { benchmarkWriteback(b, 10, 1000, true) }

func BenchmarkWritebackSeqBatches100BatchSize100(b *testing.B) { benchmarkWriteback(b, 100, 100, true) }

func BenchmarkWritebackSeqBatches1000BatchSize10(b *testing.B) { benchmarkWriteback(b, 1000, 10, true) }

func BenchmarkWritebackNonSeqBatches1000BatchSize1(b *testing.B) {
// for non-sequential writes, the batch size is usually small: 1, or on the order of the cluster size.
benchmarkWriteback(b, 1000, 1, false)
}

func BenchmarkWritebackNonSeqBatches10000BatchSize1(b *testing.B) {
benchmarkWriteback(b, 10000, 1, false)
}

func BenchmarkWritebackNonSeqBatches100BatchSize10(b *testing.B) {
benchmarkWriteback(b, 100, 10, false)
}

func BenchmarkWritebackNonSeqBatches1000BatchSize10(b *testing.B) {
benchmarkWriteback(b, 1000, 10, false)
}

func benchmarkWriteback(b *testing.B, batches, batchSize int, isSeq bool) {
// kss and vss are the key and value batches to write, batches*batchSize keys in total
var kss, vss [][]string
for i := 0; i < batches; i++ {
var ks, vs []string
for j := i * batchSize; j < (i+1)*batchSize; j++ {
k := fmt.Sprintf("key%d", j)
v := fmt.Sprintf("val%d", j)
ks = append(ks, k)
vs = append(vs, v)
}
if !isSeq {
// make sure each batch is shuffled differently, but deterministically across test runs.
shuffleList(ks, i*batchSize)
}
kss = append(kss, ks)
vss = append(vss, vs)
}
b.ResetTimer()
for n := 0; n < b.N; n++ {
runWriteback(b, kss, vss, isSeq)
}
}

func shuffleList(l []string, seed int) {
r := rand.New(rand.NewSource(int64(seed)))
for i := 0; i < len(l); i++ {
j := r.Intn(i + 1)
l[i], l[j] = l[j], l[i]
}
}
17 changes: 13 additions & 4 deletions server/mvcc/backend/tx_buffer.go
@@ -50,7 +50,8 @@ func (txw *txWriteBuffer) put(bucket Bucket, k, v []byte) {
}

func (txw *txWriteBuffer) putSeq(bucket Bucket, k, v []byte) {
// TODO: Add (in tests?) verification whether k>b[len(b)]
// putSeq is only called for data in the Key bucket. The keys
// in the Key bucket should be monotonically increasing revisions.
txw.putInternal(bucket, k, v)
}
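
Not part of the diff — a small illustrative check of the invariant this comment relies on. The key encoding below (an 8-byte big-endian counter) is only a stand-in for real revision keys, but it shows why the sequential path can skip any re-sorting: appending monotonically increasing keys, as putSeq callers do for the Key bucket, produces a buffer that is already sorted and duplicate-free.

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"sort"
)

func main() {
	// revKey is a hypothetical revision-style key: an 8-byte big-endian
	// counter, so lexicographic byte order matches numeric order.
	revKey := func(rev uint64) []byte {
		b := make([]byte, 8)
		binary.BigEndian.PutUint64(b, rev)
		return b
	}

	// Sequential puts append keys for strictly increasing revisions.
	var buf [][]byte
	for rev := uint64(1); rev <= 5; rev++ {
		buf = append(buf, revKey(rev))
	}

	sorted := sort.SliceIsSorted(buf, func(i, j int) bool {
		return bytes.Compare(buf[i], buf[j]) < 0
	})
	fmt.Println("seq buffer already sorted:", sorted) // true -> dedupe not needed on this path
}
```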

@@ -80,6 +81,9 @@ func (txw *txWriteBuffer) writeback(txr *txReadBuffer) {
rb, ok := txr.buckets[k]
if !ok {
delete(txw.buckets, k)
if seq, ok := txw.bucket2seq[k]; ok && !seq {
wb.dedupe()
}
txr.buckets[k] = wb
continue
}
@@ -148,7 +152,7 @@ func newBucketBuffer() *bucketBuffer {
func (bb *bucketBuffer) Range(key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte) {
f := func(i int) bool { return bytes.Compare(bb.buf[i].key, key) >= 0 }
idx := sort.Search(bb.used, f)
if idx < 0 {
if idx < 0 || idx >= bb.used {
return nil, nil
}
if len(endKey) == 0 {
Expand Down Expand Up @@ -201,10 +205,15 @@ func (bb *bucketBuffer) merge(bbsrc *bucketBuffer) {
if bytes.Compare(bb.buf[(bb.used-bbsrc.used)-1].key, bbsrc.buf[0].key) < 0 {
return
}
bb.dedupe()
}

// dedupe removes duplicates, using only newest update
func (bb *bucketBuffer) dedupe() {
if bb.used <= 1 {
return
}
sort.Stable(bb)

// remove duplicates, using only newest update
widx := 0
for ridx := 1; ridx < bb.used; ridx++ {
if !bytes.Equal(bb.buf[ridx].key, bb.buf[widx].key) {
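
Again not part of the diff — a condensed sketch, with simplified map and struct types standing in for txWriteBuffer, txReadBuffer and bucketBuffer, of what the writeback change above amounts to: before a write-buffer bucket replaces a read-buffer bucket that does not exist yet, it is sorted and deduplicated, but only if the bucket is non-sequential. The stable sort matters because it keeps the newest of several writes to the same key.

```go
package main

import (
	"fmt"
	"sort"
)

// kv and bucket are simplified stand-ins for etcd's kv/bucketBuffer types.
type kv struct{ key, val string }
type bucket struct{ kvs []kv }

// dedupe mirrors the idea of bucketBuffer.dedupe: stable-sort by key, then
// keep only the last (newest) entry for each key.
func (b *bucket) dedupe() {
	if len(b.kvs) <= 1 {
		return
	}
	sort.SliceStable(b.kvs, func(i, j int) bool { return b.kvs[i].key < b.kvs[j].key })
	w := 0
	for r := 1; r < len(b.kvs); r++ {
		if b.kvs[r].key != b.kvs[w].key {
			w++
		}
		b.kvs[w] = b.kvs[r]
	}
	b.kvs = b.kvs[:w+1]
}

// writeback sketches the fixed path: a write bucket that is about to become
// a read bucket is deduped first, unless it is a sequential bucket.
func writeback(read, write map[string]*bucket, seq map[string]bool) {
	for name, wb := range write {
		if _, ok := read[name]; !ok {
			if !seq[name] {
				wb.dedupe() // the core of the fix: never publish an unsorted bucket
			}
			read[name] = wb
			continue
		}
		// merge path omitted in this sketch
	}
}

func main() {
	write := map[string]*bucket{
		"test": {kvs: []kv{{"foo5", "bar5"}, {"foo", "bar"}, {"foo", "bar2"}, {"foo1", "bar1"}}},
	}
	read := map[string]*bucket{}
	writeback(read, write, map[string]bool{"key": true})
	fmt.Println(read["test"].kvs) // [{foo bar2} {foo1 bar1} {foo5 bar5}]
}
```

TestDedupe in the new tx_buffer_test.go below pins down exactly this newest-wins behavior for duplicate and unordered keys.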
71 changes: 71 additions & 0 deletions server/mvcc/backend/tx_buffer_test.go
@@ -0,0 +1,71 @@
// Copyright 2023 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package backend

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestDedupe(t *testing.T) {
tests := []struct {
name string
keys, vals, expectedKeys, expectedVals []string
}{
{
name: "empty",
keys: []string{},
vals: []string{},
expectedKeys: []string{},
expectedVals: []string{},
},
{
name: "single kv",
keys: []string{"key1"},
vals: []string{"val1"},
expectedKeys: []string{"key1"},
expectedVals: []string{"val1"},
},
{
name: "duplicate key",
keys: []string{"key1", "key1"},
vals: []string{"val1", "val2"},
expectedKeys: []string{"key1"},
expectedVals: []string{"val2"},
},
{
name: "unordered keys",
keys: []string{"key3", "key1", "key4", "key2", "key1", "key4"},
vals: []string{"val1", "val5", "val3", "val4", "val2", "val6"},
expectedKeys: []string{"key1", "key2", "key3", "key4"},
expectedVals: []string{"val2", "val4", "val1", "val6"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
bb := &bucketBuffer{buf: make([]kv, 10), used: 0}
for i := 0; i < len(tt.keys); i++ {
bb.add([]byte(tt.keys[i]), []byte(tt.vals[i]))
}
bb.dedupe()
assert.Len(t, tt.expectedKeys, bb.used)
for i := 0; i < bb.used; i++ {
assert.Equal(t, bb.buf[i].key, []byte(tt.expectedKeys[i]))
assert.Equal(t, bb.buf[i].val, []byte(tt.expectedVals[i]))
}
})
}
}
