From 0ac9fb4e17ea08a791700380b55ea5c49e776624 Mon Sep 17 00:00:00 2001 From: Andrew Kimball Date: Wed, 23 Oct 2024 22:09:14 -0700 Subject: [PATCH] num32: add num32 package Move float32 numeric functions into new util/num32 package. This package will be used by built-in SQL functions as well as the vector indexing library. It deliberately uses simple float32 input/output types in order to stay as decoupled as possible from CRDB-specific types. Epic: CRDB-42943 Release note: None --- pkg/BUILD.bazel | 3 + pkg/util/num32/BUILD.bazel | 19 ++++++ pkg/util/num32/doc.go | 22 +++++++ pkg/util/num32/vec.go | 55 +++++++++++++++++ pkg/util/num32/vec_test.go | 77 ++++++++++++++++++++++++ pkg/util/vector/BUILD.bazel | 1 + pkg/util/vector/vector.go | 61 ++++++++++--------- pkg/util/vector/vector_set.go | 9 ++- pkg/util/vector/vector_set_test.go | 12 ++-- pkg/util/vector/vector_test.go | 96 +++++++++++++----------------- 10 files changed, 260 insertions(+), 95 deletions(-) create mode 100644 pkg/util/num32/BUILD.bazel create mode 100644 pkg/util/num32/doc.go create mode 100644 pkg/util/num32/vec.go create mode 100644 pkg/util/num32/vec_test.go diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index f3bc4fc7a43d..1f6d90f4b3da 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -727,6 +727,7 @@ ALL_TESTS = [ "//pkg/util/mon:mon_test", "//pkg/util/netutil/addr:addr_test", "//pkg/util/netutil:netutil_test", + "//pkg/util/num32:num32_test", "//pkg/util/optional:optional_test", "//pkg/util/parquet:parquet_test", "//pkg/util/pprofutil:pprofutil_test", @@ -2543,6 +2544,8 @@ GO_TARGETS = [ "//pkg/util/netutil/addr:addr_test", "//pkg/util/netutil:netutil", "//pkg/util/netutil:netutil_test", + "//pkg/util/num32:num32", + "//pkg/util/num32:num32_test", "//pkg/util/optional:optional", "//pkg/util/optional:optional_test", "//pkg/util/parquet:parquet", diff --git a/pkg/util/num32/BUILD.bazel b/pkg/util/num32/BUILD.bazel new file mode 100644 index 000000000000..6ae398e9d666 --- /dev/null +++ 
b/pkg/util/num32/BUILD.bazel @@ -0,0 +1,19 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "num32", + srcs = [ + "doc.go", + "vec.go", + ], + importpath = "github.com/cockroachdb/cockroach/pkg/util/num32", + visibility = ["//visibility:public"], + deps = ["@com_github_cockroachdb_errors//:errors"], +) + +go_test( + name = "num32_test", + srcs = ["vec_test.go"], + embed = [":num32"], + deps = ["@com_github_stretchr_testify//require"], +) diff --git a/pkg/util/num32/doc.go b/pkg/util/num32/doc.go new file mode 100644 index 000000000000..e2b94a62fd25 --- /dev/null +++ b/pkg/util/num32/doc.go @@ -0,0 +1,22 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +/* +Package num32 contains basic numeric functions that operate on scalar, vector, +and matrix float32 values. Inputs and outputs deliberately use simple float +types so that they can be used in multiple contexts. It uses the gonum library +when possible, since it offers assembly language implementations of various +useful primitives. + +Using the same convention as gonum, when a slice is being modified in place, it +has the name dst and the function does not return a value. + +Where possible, functions in this package are written with the assumption that +the caller prevents bad input. They will panic with assertion errors if this is +not the case, rather than returning error values. Callers should generally have +panic recovery logic further up the stack to gracefully handle these assertions, +as they indicate buggy code. +*/ +package num32 diff --git a/pkg/util/num32/vec.go b/pkg/util/num32/vec.go new file mode 100644 index 000000000000..8db42ed64bbf --- /dev/null +++ b/pkg/util/num32/vec.go @@ -0,0 +1,55 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. 
+ +package num32 + +import ( + "math" + + "github.com/cockroachdb/errors" +) + +// L1Distance returns the L1 norm of s - t, which is the Manhattan distance +// between the two vectors. +func L1Distance(s []float32, t []float32) float32 { + checkDims(s, t) + var distance float32 + for i := range s { + diff := s[i] - t[i] + distance += float32(math.Abs(float64(diff))) + } + return distance +} + +// L2SquaredDistance returns the squared L2 norm of s - t, which is the squared +// Euclidean distance between the two vectors. Comparing squared distance is +// equivalent to comparing distance, but the squared distance avoids an +// expensive square-root operation. +func L2SquaredDistance(s, t []float32) float32 { + checkDims(s, t) + var distance float32 + for i := range s { + diff := s[i] - t[i] + distance += diff * diff + } + return distance +} + +// InnerProduct returns the inner product of s and t, also called the dot +// product. +func InnerProduct(s []float32, t []float32) float32 { + checkDims(s, t) + var distance float32 + for i := range s { + distance += s[i] * t[i] + } + return distance +} + +func checkDims(v []float32, v2 []float32) { + if len(v) != len(v2) { + panic(errors.AssertionFailedf("different vector dimensions %d and %d", len(v), len(v2))) + } +} diff --git a/pkg/util/num32/vec_test.go b/pkg/util/num32/vec_test.go new file mode 100644 index 000000000000..bfc336f685bd --- /dev/null +++ b/pkg/util/num32/vec_test.go @@ -0,0 +1,77 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +package num32 + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" +) + +var NaN32 = float32(math.NaN()) +var Inf32 = float32(math.Inf(1)) + +func TestDistances(t *testing.T) { + // Test L1 and squared L2 distance. 
+ testCases := []struct { + v1 []float32 + v2 []float32 + l1 float32 + l2s float32 + panics bool + }{ + {v1: []float32{}, v2: []float32{}, l1: 0, l2s: 0}, + {v1: []float32{1, 2, 3}, v2: []float32{4, 5, 6}, l1: 9, l2s: 27}, + {v1: []float32{-1, -2, -3}, v2: []float32{-4, -5, -6}, l1: 9, l2s: 27}, + {v1: []float32{1, 2, 3}, v2: []float32{1, 2, 3}, l1: 0, l2s: 0}, + {v1: []float32{1, 2, 3}, v2: []float32{1, 2, 4}, l1: 1, l2s: 1}, + {v1: []float32{NaN32}, v2: []float32{1}, l1: NaN32, l2s: NaN32}, + {v1: []float32{Inf32}, v2: []float32{1}, l1: Inf32, l2s: Inf32}, + {v1: []float32{1, 2}, v2: []float32{3, 4, 5}, panics: true}, + } + + for _, tc := range testCases { + if !tc.panics { + l1 := L1Distance(tc.v1, tc.v2) + l2s := L2SquaredDistance(tc.v1, tc.v2) + require.InDelta(t, tc.l1, l1, 0.000001) + require.InDelta(t, tc.l2s, l2s, 0.000001) + } else { + require.Panics(t, func() { L1Distance(tc.v1, tc.v2) }) + require.Panics(t, func() { L2SquaredDistance(tc.v1, tc.v2) }) + } + } +} + +func TestInnerProduct(t *testing.T) { + // Test inner product and negative inner product + testCases := []struct { + v1 []float32 + v2 []float32 + ip float32 + panics bool + }{ + {v1: []float32{}, v2: []float32{}, ip: 0}, + {v1: []float32{1, 2, 3}, v2: []float32{4, 5, 6}, ip: 32}, + {v1: []float32{-1, -2, -3}, v2: []float32{-4, -5, -6}, ip: 32}, + {v1: []float32{0, 0, 0}, v2: []float32{0, 0, 0}, ip: 0}, + {v1: []float32{1, 2, 3}, v2: []float32{1, 2, 3}, ip: 14}, + {v1: []float32{1, 2, 3}, v2: []float32{1, 2, 4}, ip: 17}, + {v1: []float32{NaN32}, v2: []float32{1}, ip: NaN32}, + {v1: []float32{Inf32}, v2: []float32{1}, ip: Inf32}, + {v1: []float32{1, 2}, v2: []float32{3, 4, 5}, panics: true}, + } + + for _, tc := range testCases { + if !tc.panics { + ip := InnerProduct(tc.v1, tc.v2) + require.InDelta(t, tc.ip, ip, 0.000001) + } else { + require.Panics(t, func() { InnerProduct(tc.v1, tc.v2) }) + } + } +} diff --git a/pkg/util/vector/BUILD.bazel b/pkg/util/vector/BUILD.bazel index 
6fd11f8f27cb..df2ab07dfce5 100644 --- a/pkg/util/vector/BUILD.bazel +++ b/pkg/util/vector/BUILD.bazel @@ -15,6 +15,7 @@ go_library( "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", "//pkg/util/encoding", + "//pkg/util/num32", "@com_github_cockroachdb_errors//:errors", ], ) diff --git a/pkg/util/vector/vector.go b/pkg/util/vector/vector.go index 2c3f51857cd6..51789e1b589e 100644 --- a/pkg/util/vector/vector.go +++ b/pkg/util/vector/vector.go @@ -14,6 +14,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/util/encoding" + "github.com/cockroachdb/cockroach/pkg/util/num32" ) // MaxDim is the maximum number of dimensions a vector can have. @@ -137,24 +138,12 @@ func Decode(b []byte) (ret T, err error) { return ret, nil } -func checkDims(t T, t2 T) error { - if len(t) != len(t2) { - return pgerror.Newf(pgcode.DataException, "different vector dimensions %d and %d", len(t), len(t2)) - } - return nil -} - // L1Distance returns the L1 (Manhattan) distance between t and t2. func L1Distance(t T, t2 T) (float64, error) { if err := checkDims(t, t2); err != nil { return 0, err } - var distance float32 - for i := range len(t) { - diff := t[i] - t2[i] - distance += float32(math.Abs(float64(diff))) - } - return float64(distance), nil + return float64(num32.L1Distance(t, t2)), nil } // L2Distance returns the Euclidean distance between t and t2. @@ -162,29 +151,36 @@ func L2Distance(t T, t2 T) (float64, error) { if err := checkDims(t, t2); err != nil { return 0, err } - var distance float32 - for i := range len(t) { - diff := t[i] - t2[i] - distance += diff * diff - } // TODO(queries): check for overflow and validate intermediate result if needed. - return math.Sqrt(float64(distance)), nil + return math.Sqrt(float64(num32.L2SquaredDistance(t, t2))), nil } -// CosDistance returns the cosine distance between t and t2. 
+// CosDistance returns the cosine distance between t and t2. This represents the +// similarity between the two vectors, ranging from 0 (most similar) to 2 (least +// similar). Only the angle between the vectors matters; the norms (magnitudes) +// are irrelevant. func CosDistance(t T, t2 T) (float64, error) { if err := checkDims(t, t2); err != nil { return 0, err } - var distance, normA, normB float32 - for i := range len(t) { - distance += t[i] * t2[i] + + // Compute the cosine of the angle between the two vectors as their dot + // product divided by the product of their norms: + // t·t2 + // ----------- + // ||t|| ||t2|| + var dot, normA, normB float32 + for i := range t { + dot += t[i] * t2[i] normA += t[i] * t[i] normB += t2[i] * t2[i] } - // Use sqrt(a * b) over sqrt(a) * sqrt(b) - similarity := float64(distance) / math.Sqrt(float64(normA)*float64(normB)) - /* Keep in range */ + + // Use sqrt(a * b) over sqrt(a) * sqrt(b) to compute norms. + similarity := float64(dot) / math.Sqrt(float64(normA)*float64(normB)) + + // Cosine distance = 1 - cosine similarity. Ensure that similarity always + // stays within [-1, 1] despite any floating point arithmetic error. if similarity > 1 { similarity = 1 } else if similarity < -1 { @@ -198,11 +194,7 @@ func InnerProduct(t T, t2 T) (float64, error) { if err := checkDims(t, t2); err != nil { return 0, err } - var distance float32 - for i := range len(t) { - distance += t[i] * t2[i] - } - return float64(distance), nil + return float64(num32.InnerProduct(t, t2)), nil } // NegInnerProduct returns the negative inner product of t1 and t2. 
@@ -290,3 +282,10 @@ func Random(rng *rand.Rand) T { } return v } + +func checkDims(t T, t2 T) error { + if len(t) != len(t2) { + return pgerror.Newf(pgcode.DataException, "different vector dimensions %d and %d", len(t), len(t2)) + } + return nil +} diff --git a/pkg/util/vector/vector_set.go b/pkg/util/vector/vector_set.go index 1b8939fef4e3..e2fa666704e1 100644 --- a/pkg/util/vector/vector_set.go +++ b/pkg/util/vector/vector_set.go @@ -99,11 +99,10 @@ func (vs *Set) AddZero(count int) { } } -// Remove removes the vector at the given offset from the set. This is an O(1) -// operation. -// NB: This operation changes the ordering of vectors in the set, with the last -// vector moved to the removed vector's offset. -func (vs *Set) Remove(offset int) { +// ReplaceWithLast removes the vector at the given offset from the set, +// replacing it with the last vector in the set. The modified set has one less +// element and the last vector's position changes. +func (vs *Set) ReplaceWithLast(offset int) { targetStart := offset * vs.Dims sourceEnd := len(vs.Data) copy(vs.Data[targetStart:targetStart+vs.Dims], vs.Data[sourceEnd-vs.Dims:sourceEnd]) diff --git a/pkg/util/vector/vector_set_test.go b/pkg/util/vector/vector_set_test.go index 2bb0ab41a352..6d2a47990fd4 100644 --- a/pkg/util/vector/vector_set_test.go +++ b/pkg/util/vector/vector_set_test.go @@ -35,10 +35,10 @@ func TestVectorSet(t *testing.T) { require.Equal(t, 8, vs.Count) require.Equal(t, []float32{1, 2, 5, 3, 6, 6, 1, 2, 5, 3, 6, 6, 0, 0, 0, 0}, vs.Data) - // Remove. - vs.Remove(1) - vs.Remove(4) - vs.Remove(5) + // ReplaceWithLast. 
+ vs.ReplaceWithLast(1) + vs.ReplaceWithLast(4) + vs.ReplaceWithLast(5) require.Equal(t, 5, vs.Count) require.Equal(t, []float32{1, 2, 0, 0, 6, 6, 1, 2, 0, 0}, vs.Data) @@ -95,12 +95,12 @@ func TestVectorSet(t *testing.T) { require.Panics(t, func() { vs11.SplitAt(-1) }) require.Panics(t, func() { vs11.AddZero(-1) }) require.Panics(t, func() { vs11.AddSet(nil) }) - require.Panics(t, func() { vs11.Remove(-1) }) + require.Panics(t, func() { vs11.ReplaceWithLast(-1) }) vs12 := MakeSet(2) require.Panics(t, func() { vs12.At(0) }) require.Panics(t, func() { vs12.SplitAt(1) }) - require.Panics(t, func() { vs12.Remove(0) }) + require.Panics(t, func() { vs12.ReplaceWithLast(0) }) vs13 := MakeSet(-1) require.Panics(t, func() { vs13.Add(v1) }) diff --git a/pkg/util/vector/vector_test.go b/pkg/util/vector/vector_test.go index a42784c8a05f..bff535de9736 100644 --- a/pkg/util/vector/vector_test.go +++ b/pkg/util/vector/vector_test.go @@ -12,8 +12,12 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) +var NaN32 = float32(math.NaN()) +var Inf32 = float32(math.Inf(1)) + func TestParseVector(t *testing.T) { testCases := []struct { input string @@ -74,84 +78,42 @@ func TestRoundtripRandomPGVector(t *testing.T) { } } -func TestDistances(t *testing.T) { +func TestCosDistance(t *testing.T) { // Test L1, L2, Cosine distance. 
testCases := []struct { - v1 T - v2 T - l1 float64 - l2 float64 + v1 []float32 + v2 []float32 cos float64 err bool }{ - {v1: T{1, 2, 3}, v2: T{4, 5, 6}, l1: 9, l2: 5.196152422, cos: 0.02536815, err: false}, - {v1: T{-1, -2, -3}, v2: T{-4, -5, -6}, l1: 9, l2: 5.196152422, cos: 0.02536815, err: false}, - {v1: T{0, 0, 0}, v2: T{0, 0, 0}, l1: 0, l2: 0, cos: math.NaN(), err: false}, - {v1: T{1, 2, 3}, v2: T{1, 2, 3}, l1: 0, l2: 0, cos: 0, err: false}, - {v1: T{1, 2, 3}, v2: T{1, 2, 4}, l1: 1, l2: 1, cos: 0.008539, err: false}, + {v1: []float32{}, v2: []float32{}, cos: math.NaN(), err: false}, + {v1: []float32{1, 2, 3}, v2: []float32{4, 5, 6}, cos: 0.02536815, err: false}, + {v1: []float32{-1, -2, -3}, v2: []float32{-4, -5, -6}, cos: 0.02536815, err: false}, + {v1: []float32{1, 2, 3}, v2: []float32{1, 2, 3}, cos: 0, err: false}, + {v1: []float32{1, 2, 3}, v2: []float32{1, 2, 4}, cos: 0.008539, err: false}, + {v1: []float32{NaN32}, v2: []float32{1}, cos: math.NaN(), err: false}, + {v1: []float32{Inf32}, v2: []float32{1}, cos: math.NaN(), err: false}, // Different vector sizes errors. 
{v1: T{1, 2, 3}, v2: T{4, 5}, err: true}, } for _, tc := range testCases { - l1, l1Err := L1Distance(tc.v1, tc.v2) - l2, l2Err := L2Distance(tc.v1, tc.v2) cos, cosErr := CosDistance(tc.v1, tc.v2) - if tc.err { - assert.Error(t, l1Err) - assert.Error(t, l2Err) assert.Error(t, cosErr) } else { - assert.NoError(t, l1Err) - assert.NoError(t, l2Err) assert.NoError(t, cosErr) - assert.InDelta(t, tc.l1, l1, 0.000001) - assert.InDelta(t, tc.l2, l2, 0.000001) assert.InDelta(t, tc.cos, cos, 0.000001) } } } -func TestProducts(t *testing.T) { - // Test inner product and negative inner product - testCases := []struct { - v1 T - v2 T - ip float64 - negIp float64 - err bool - }{ - {v1: T{1, 2, 3}, v2: T{4, 5, 6}, ip: 32, negIp: -32, err: false}, - {v1: T{-1, -2, -3}, v2: T{-4, -5, -6}, ip: 32, negIp: -32, err: false}, - {v1: T{0, 0, 0}, v2: T{0, 0, 0}, ip: 0, negIp: 0, err: false}, - {v1: T{1, 2, 3}, v2: T{1, 2, 3}, ip: 14, negIp: -14, err: false}, - {v1: T{1, 2, 3}, v2: T{1, 2, 4}, ip: 17, negIp: -17, err: false}, - // Different vector sizes errors. - {v1: T{1, 2, 3}, v2: T{4, 5}, err: true}, - } - - for _, tc := range testCases { - ip, ipErr := InnerProduct(tc.v1, tc.v2) - negIp, negIpErr := NegInnerProduct(tc.v1, tc.v2) - - if tc.err { - assert.Error(t, ipErr) - assert.Error(t, negIpErr) - } else { - assert.NoError(t, ipErr) - assert.NoError(t, negIpErr) - assert.InDelta(t, tc.ip, ip, 0.000001) - assert.InDelta(t, tc.negIp, negIp, 0.000001) - } - } -} - func TestNorm(t *testing.T) { testCases := []struct { v T norm float64 }{ + {v: T{}, norm: 0}, {v: T{1, 2, 3}, norm: 3.7416573867739413}, {v: T{0, 0, 0}, norm: 0}, {v: T{-1, -2, -3}, norm: 3.7416573867739413}, @@ -163,6 +125,34 @@ func TestNorm(t *testing.T) { } } +// While the real work of these functions is done by the num32 package, test +// that the wrapper functions are working. 
+func TestNum32Functions(t *testing.T) { + _, err := L1Distance(T{1, 2}, T{3, 4, 5}) + require.Error(t, err) + res, err := L1Distance(T{1, 2, 3}, T{4, 5, 6}) + require.NoError(t, err) + require.Equal(t, float64(9), res) + + _, err = L2Distance(T{1, 2}, T{3, 4, 5}) + require.Error(t, err) + res, err = L2Distance(T{1, 2, 3}, T{4, 5, 6}) + require.NoError(t, err) + require.InDelta(t, float64(5.196152422), res, 0.000001) + + _, err = InnerProduct(T{1, 2}, T{3, 4, 5}) + require.Error(t, err) + res, err = InnerProduct(T{1, 2, 3}, T{4, 5, 6}) + require.NoError(t, err) + require.Equal(t, float64(32), res) + + _, err = NegInnerProduct(T{1, 2}, T{3, 4, 5}) + require.Error(t, err) + res, err = NegInnerProduct(T{1, 2, 3}, T{4, 5, 6}) + require.NoError(t, err) + require.Equal(t, float64(-32), res) +} + func TestPointwiseOps(t *testing.T) { // Test L1, L2, Cosine distance. testCases := []struct {