Skip to content

Commit

Permalink
Update hash algo to 64bit to reduce collisions (flyteorg#283)
Browse files Browse the repository at this point in the history
* Update Encoding Length hash to 64bit to reduce collisions

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* Allow customization of hashing algorithm

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* Fix tests and add an algorithm test

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* docs

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* Support 128-bit fnv as well

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* tab -> spaces

Signed-off-by: Haytham Abuelfutuh <[email protected]>

Signed-off-by: Haytham Abuelfutuh <[email protected]>
  • Loading branch information
EngHabu authored Sep 8, 2022
1 parent 4f13b4b commit 9ccfa70
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 20 deletions.
78 changes: 67 additions & 11 deletions go/tasks/pluginmachinery/encoding/encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package encoding
import (
"encoding/base32"
"fmt"
"hash"
"hash/fnv"
"strings"
)
Expand All @@ -11,29 +12,84 @@ const specialEncoderKey = "abcdefghijklmnopqrstuvwxyz123456"

var Base32Encoder = base32.NewEncoding(specialEncoderKey).WithPadding(base32.NoPadding)

// Creates a new UniqueID that is based on the inputID and of a specified length, if the given id is longer than the
// maxLength.
func FixedLengthUniqueID(inputID string, maxLength int) (string, error) {
// Algorithm defines an enum for the encoding algorithm to use.
type Algorithm uint32

const (
// Algorithm32 uses fnv32 bit encoder.
Algorithm32 Algorithm = iota

// Algorithm64 uses fnv64 bit encoder.
Algorithm64

// Algorithm128 uses fnv128 bit encoder.
Algorithm128
)

type Option interface {
option()
}

// AlgorithmOption defines a wrapper to pass the algorithm to encoding functions.
type AlgorithmOption struct {
Option
algo Algorithm
}

// NewAlgorithmOption wraps the Algorithm into an AlgorithmOption to pass to the encoding functions.
func NewAlgorithmOption(algo Algorithm) AlgorithmOption {
return AlgorithmOption{
algo: algo,
}
}

// FixedLengthUniqueID creates a new UniqueID that is based on the inputID and of a specified length, if the given id is
// longer than the maxLength.
func FixedLengthUniqueID(inputID string, maxLength int, options ...Option) (string, error) {
if len(inputID) <= maxLength {
return inputID, nil
}

hasher := fnv.New32a()
// Using 32a an error can never happen, so this will always remain not covered by a unit test
var hasher hash.Hash
for _, option := range options {
if algoOption, casted := option.(AlgorithmOption); casted {
switch algoOption.algo {
case Algorithm32:
hasher = fnv.New32a()
case Algorithm64:
hasher = fnv.New64a()
case Algorithm128:
hasher = fnv.New128a()
}
}
}

if hasher == nil {
hasher = fnv.New32a()
}

// Using 32a/64a an error can never happen, so this will always remain not covered by a unit test
_, _ = hasher.Write([]byte(inputID)) // #nosec
b := hasher.Sum(nil)
// expected length after this step is 8 chars (1 + 7 chars from Base32Encoder.EncodeToString(b))

// Encoding Length Calculation:
// Base32 Encoder will encode every 5 bits into an output character (2 ^ 5 = 32)
// output length = ciel(<input bits> / 5)
// for 32a hashing = ceil(32 / 5) = 7
// for 64a hashing = ceil(64 / 5) = 13
// We prefix with character `f` so the final output is 8 or 14

finalStr := "f" + Base32Encoder.EncodeToString(b)
if len(finalStr) > maxLength {
return finalStr, fmt.Errorf("max Length is too small, cannot create an encoded string that is so small")
}
return finalStr, nil
}

// Creates a new uniqueID using the parts concatenated using `-` and ensures that the uniqueID is not longer than the
// maxLength. In case a simple concatenation yields a longer string, a new hashed ID is created which is always
// around 8 characters in length
func FixedLengthUniqueIDForParts(maxLength int, parts ...string) (string, error) {
// FixedLengthUniqueIDForParts creates a new uniqueID using the parts concatenated using `-` and ensures that the
// uniqueID is not longer than the maxLength. In case a simple concatenation yields a longer string, a new hashed ID is
// created which is always around 8 characters in length.
func FixedLengthUniqueIDForParts(maxLength int, parts []string, options ...Option) (string, error) {
b := strings.Builder{}
for i, p := range parts {
if i > 0 && b.Len() > 0 {
Expand All @@ -45,5 +101,5 @@ func FixedLengthUniqueIDForParts(maxLength int, parts ...string) (string, error)
_, _ = b.WriteString(p) // #nosec
}

return FixedLengthUniqueID(b.String(), maxLength)
return FixedLengthUniqueID(b.String(), maxLength, options...)
}
61 changes: 52 additions & 9 deletions go/tasks/pluginmachinery/encoding/encoder_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package encoding

import (
"hash"
"hash/fnv"
"testing"

"github.com/stretchr/testify/assert"
Expand All @@ -18,7 +20,7 @@ func TestFixedLengthUniqueID(t *testing.T) {
{"veryLowLimit", "xx", 1, "flfryc2i", true},
{"highLimit", "xxxxxx", 5, "fufiti6i", true},
{"higherLimit", "xxxxx", 10, "xxxxx", false},
{"largeID", "xxxxxxxxxxx", 10, "fggddjly", false},
{"largeID", "xxxxxxxxxxxxxxxxxxxxxxxx", 20, "fuaa3aji", false},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
Expand All @@ -28,7 +30,8 @@ func TestFixedLengthUniqueID(t *testing.T) {
} else {
assert.NoError(t, err)
}
assert.Equal(t, i, test.output)

assert.Equal(t, test.output, i)
})
}
}
Expand All @@ -38,24 +41,64 @@ func TestFixedLengthUniqueIDForParts(t *testing.T) {
name string
parts []string
maxLength int
algorithm Algorithm
output string
expectError bool
}{
{"smallerThanMax", []string{"x", "y", "z"}, 10, "x-y-z", false},
{"veryLowLimit", []string{"x", "y"}, 1, "fz2jizji", true},
{"fittingID", []string{"x"}, 2, "x", false},
{"highLimit", []string{"x", "y", "z"}, 4, "fxzsoqrq", true},
{"largeID", []string{"x", "y", "z", "m", "n"}, 8, "fsigbmty", false},
{"smallerThanMax", []string{"x", "y", "z"}, 10, Algorithm32, "x-y-z", false},
{"veryLowLimit", []string{"x", "y"}, 1, Algorithm32, "fz2jizji", true},
{"fittingID", []string{"x"}, 2, Algorithm32, "x", false},
{"highLimit", []string{"x", "y", "z"}, 4, Algorithm32, "fxzsoqrq", true},
{"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n"}, 15, Algorithm32, "fe63sz6y", false},
{"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n"}, 15, Algorithm64, "fwp4bky2kucex5", false},
{"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n", "z", "m", "n", "y", "z", "m", "n"}, 30, Algorithm128, "fbmesl15enghpjepzjm5cp1zfqe", false},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
i, err := FixedLengthUniqueIDForParts(test.maxLength, test.parts...)
i, err := FixedLengthUniqueIDForParts(test.maxLength, test.parts, NewAlgorithmOption(test.algorithm))
if test.expectError {
assert.Error(t, err)
} else {
assert.NoError(t, err)
}
assert.Equal(t, i, test.output)
assert.Equal(t, test.output, i)
})
}
}

func benchmarkKB(b *testing.B, h hash.Hash) {
b.SetBytes(1024)
data := make([]byte, 1024)
for i := range data {
data[i] = byte(i)
}

in := make([]byte, 0, h.Size())

b.ResetTimer()
for i := 0; i < b.N; i++ {
h.Reset()
h.Write(data)
h.Sum(in)
}
}

// Documented Results:
// goos: darwin
// goarch: amd64
// pkg: github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/encoding
// cpu: Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
// BenchmarkFixedLengthUniqueID
// BenchmarkFixedLengthUniqueID/New32a
// BenchmarkFixedLengthUniqueID/New32a-16 1000000 1088 ns/op 941.25 MB/s
// BenchmarkFixedLengthUniqueID/New64a
// BenchmarkFixedLengthUniqueID/New64a-16 1239402 951.3 ns/op 1076.39 MB/s
func BenchmarkFixedLengthUniqueID(b *testing.B) {
b.Run("New32a", func(b *testing.B) {
benchmarkKB(b, fnv.New32a())
})

b.Run("New64a", func(b *testing.B) {
benchmarkKB(b, fnv.New64a())
})
}

0 comments on commit 9ccfa70

Please sign in to comment.