Commit

Merge pull request #428 from tdakkota/perf/improve-colstr-writing

perf(proto): improve `ColStr` writing performance for small strings

ernado authored Oct 9, 2024
2 parents e4a2d07 + 308ecb3 commit 780d68d
Showing 2 changed files with 80 additions and 22 deletions.
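For context, the wire layout that both the old and the new ColStr.WriteColumn paths produce is the same: each String row is a uvarint length prefix followed by the raw bytes, appended back to back (see proto/col_str.go below). A minimal standalone sketch of that framing, using only the standard library; encodeStrings is a hypothetical helper, not part of the package:

package main

import (
	"encoding/binary"
	"fmt"
)

// encodeStrings mirrors the framing ColStr produces on the wire: for every
// row, a uvarint length prefix followed by the raw string bytes, all
// appended back to back. The helper itself is illustrative only.
func encodeStrings(rows []string) []byte {
	var (
		out []byte
		tmp [binary.MaxVarintLen64]byte
	)
	for _, s := range rows {
		n := binary.PutUvarint(tmp[:], uint64(len(s)))
		out = append(out, tmp[:n]...)
		out = append(out, s...)
	}
	return out
}

func main() {
	fmt.Printf("% x\n", encodeStrings([]string{"ab", "cde"}))
	// Prints: 02 61 62 03 63 64 65
}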
insert_bench_test.go (90 changes: 73 additions & 17 deletions)

@@ -3,6 +3,7 @@ package ch
 import (
 	"context"
 	"fmt"
+	"strings"
 	"testing"
 
 	"github.com/go-faster/errors"
@@ -15,7 +16,7 @@ func BenchmarkInsert(b *testing.B) {
 	cht.Skip(b)
 	srv := cht.New(b)
 
-	bench := func(rows int) func(b *testing.B) {
+	bench := func(data proto.ColInput) func(b *testing.B) {
 		return func(b *testing.B) {
 			ctx := context.Background()
 			c, err := Dial(ctx, Options{
@@ -25,42 +26,97 @@ func BenchmarkInsert(b *testing.B) {
 			if err != nil {
 				b.Fatal(errors.Wrap(err, "dial"))
 			}
-			defer func() { _ = c.Close() }()
 
+			b.Cleanup(func() {
+				if err := c.Do(ctx, Query{
+					Body: "DROP TABLE IF EXISTS test_table",
+				}); err != nil {
+					b.Logf("Cleanup failed: %+v", err)
+				}
+				_ = c.Close()
+			})
 			if err := c.Do(ctx, Query{
-				Body: "CREATE TABLE IF NOT EXISTS test_table (id Int64) ENGINE = Null",
+				Body: fmt.Sprintf("CREATE TABLE IF NOT EXISTS test_table (row %s) ENGINE = Null", data.Type()),
 			}); err != nil {
 				b.Fatal(err)
 			}
 
-			var id proto.ColInt64
-			for i := 0; i < rows; i++ {
-				id = append(id, 1)
-			}
+			var tmp proto.Buffer
+			data.EncodeColumn(&tmp)
 
-			b.SetBytes(int64(rows) * 8)
+			b.SetBytes(int64(len(tmp.Buf)))
 			b.ResetTimer()
 			b.ReportAllocs()
 
 			for i := 0; i < b.N; i++ {
 				if err := c.Do(ctx, Query{
 					Body: "INSERT INTO test_table VALUES",
 					Input: []proto.InputColumn{
-						{Name: "id", Data: id},
+						{Name: "row", Data: data},
 					},
 				}); err != nil {
-					b.Fatal()
+					b.Fatal(err)
 				}
 			}
 		}
 	}
-	for _, rows := range []int{
-		10_000,
-		100_000,
-		1_000_000,
-		10_000_000,
-		100_000_000,
+	for _, gen := range []struct {
+		name    string
+		getData func(rows int) proto.ColInput
+		maxRows int
+	}{
+		{
+			"ColInt64",
+			func(rows int) proto.ColInput {
+				var data proto.ColInt64
+				for i := 0; i < rows; i++ {
+					data.Append(int64(i))
+				}
+				return data
+			},
+			-1,
+		},
+		{
+			"SmallColStr",
+			func(rows int) proto.ColInput {
+				var data proto.ColStr
+				for i := 0; i < rows; i++ {
+					data.Append(fmt.Sprintf("%016x", i))
+				}
+				return data
+			},
+			1_000_000,
+		},
+		{
+			"BigColStr",
+			func(rows int) proto.ColInput {
+				var (
+					data    proto.ColStr
+					scratch = strings.Repeat("abcd", 1024)
+				)
+				for i := 0; i < rows; i++ {
+					data.Append(scratch)
+				}
+				return data
+			},
+			100_000,
+		},
 	} {
-		b.Run(fmt.Sprintf("Rows%d", rows), bench(rows))
+		b.Run(gen.name, func(b *testing.B) {
+			for _, rows := range []int{
+				10_000,
+				100_000,
+				1_000_000,
+				10_000_000,
+				100_000_000,
+			} {
+				if gen.maxRows > 0 && rows > gen.maxRows {
+					continue
+				}
+				data := gen.getData(rows)
+
+				b.Run(fmt.Sprintf("Rows%d", rows), bench(data))
+			}
+		})
 	}
 }
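Assuming a ClickHouse server is available to the cht test harness (cht.Skip(b) is expected to skip the run otherwise), the new cases can presumably be exercised on their own with the standard benchmark flags, for example:

go test -run '^$' -bench 'BenchmarkInsert/SmallColStr' -benchmem

This hypothetical invocation relies only on the subtest names introduced here (BenchmarkInsert/<case>/Rows<N>). Note also that b.SetBytes is now fed the encoded column size, so the reported throughput reflects the actual payload rather than a fixed 8 bytes per row.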
proto/col_str.go (12 changes: 7 additions & 5 deletions)

@@ -79,13 +79,15 @@ func (c ColStr) EncodeColumn(b *Buffer) {
 // WriteColumn writes String rows to *Writer.
 func (c ColStr) WriteColumn(w *Writer) {
 	buf := make([]byte, binary.MaxVarintLen64)
-	for _, p := range c.Pos {
-		w.ChainBuffer(func(b *Buffer) {
+	// Writing values from c.Buf directly might improve performance if [ColStr] contains a few rows of very long strings.
+	// However, most of the time it is quite opposite, so we copy data.
+	w.ChainBuffer(func(b *Buffer) {
+		for _, p := range c.Pos {
 			n := binary.PutUvarint(buf, uint64(p.End-p.Start))
 			b.PutRaw(buf[:n])
-		})
-		w.ChainWrite(c.Buf[p.Start:p.End])
-	}
+			b.PutRaw(c.Buf[p.Start:p.End])
+		}
+	})
 }
 
 // ForEach calls f on each string from column.
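The comment in the new code states the trade-off; structurally, the old loop issued two chained operations per row (a buffered uvarint plus a ChainWrite of the string slice), while the new version copies lengths and bytes for all rows into a single chained buffer, so the number of chained segments no longer grows with the row count. A self-contained sketch of why that matters for many small rows, using only the standard library; countingWriter, perRowWrites, and batchedWrite are hypothetical stand-ins, not proto.Writer internals:

package main

import (
	"encoding/binary"
	"fmt"
	"io"
)

// countingWriter counts Write calls; it stands in for whatever sink the
// chained writes are eventually flushed to. It is purely illustrative.
type countingWriter struct{ calls int }

func (w *countingWriter) Write(p []byte) (int, error) {
	w.calls++
	return len(p), nil
}

// perRowWrites mimics the old shape: one length write plus one payload write
// per row, so the number of writes grows with the row count.
func perRowWrites(w io.Writer, rows []string) {
	buf := make([]byte, binary.MaxVarintLen64)
	for _, s := range rows {
		n := binary.PutUvarint(buf, uint64(len(s)))
		_, _ = w.Write(buf[:n])
		_, _ = w.Write([]byte(s))
	}
}

// batchedWrite mimics the new shape: lengths and payloads are copied into one
// buffer and handed over in a single write, regardless of the row count.
func batchedWrite(w io.Writer, rows []string) {
	buf := make([]byte, binary.MaxVarintLen64)
	var out []byte
	for _, s := range rows {
		n := binary.PutUvarint(buf, uint64(len(s)))
		out = append(out, buf[:n]...)
		out = append(out, s...)
	}
	_, _ = w.Write(out)
}

func main() {
	rows := make([]string, 1000)
	for i := range rows {
		rows[i] = fmt.Sprintf("%016x", i) // 16-byte rows, like the SmallColStr case
	}
	perRow, batched := &countingWriter{}, &countingWriter{}
	perRowWrites(perRow, rows)
	batchedWrite(batched, rows)
	fmt.Println(perRow.calls, batched.calls) // 2000 1
}

Fewer chained segments means less per-write bookkeeping in the many-small-rows case the commit targets; for a handful of very long strings, the extra copy is the cost the in-code comment calls out.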