From 9ccfa70359c27b5096d9c6dfa25bb834a0b6c03e Mon Sep 17 00:00:00 2001 From: Haytham Abuelfutuh Date: Thu, 8 Sep 2022 15:54:26 -0700 Subject: [PATCH] Update hash algo to 64bit to reduce collisions (#283) * Update Encoding Length hash to 64bit to reduce collisions Signed-off-by: Haytham Abuelfutuh * Allow customization of hashing algorithm Signed-off-by: Haytham Abuelfutuh * Fix tests and add an algorithm test Signed-off-by: Haytham Abuelfutuh * docs Signed-off-by: Haytham Abuelfutuh * Support 128-bit fnv as well Signed-off-by: Haytham Abuelfutuh * tab -> spaces Signed-off-by: Haytham Abuelfutuh Signed-off-by: Haytham Abuelfutuh --- go/tasks/pluginmachinery/encoding/encoder.go | 78 ++++++++++++++++--- .../pluginmachinery/encoding/encoder_test.go | 61 ++++++++++++--- 2 files changed, 119 insertions(+), 20 deletions(-) diff --git a/go/tasks/pluginmachinery/encoding/encoder.go b/go/tasks/pluginmachinery/encoding/encoder.go index 303eee2de8..8e31650edc 100644 --- a/go/tasks/pluginmachinery/encoding/encoder.go +++ b/go/tasks/pluginmachinery/encoding/encoder.go @@ -3,6 +3,7 @@ package encoding import ( "encoding/base32" "fmt" + "hash" "hash/fnv" "strings" ) @@ -11,18 +12,73 @@ const specialEncoderKey = "abcdefghijklmnopqrstuvwxyz123456" var Base32Encoder = base32.NewEncoding(specialEncoderKey).WithPadding(base32.NoPadding) -// Creates a new UniqueID that is based on the inputID and of a specified length, if the given id is longer than the -// maxLength. -func FixedLengthUniqueID(inputID string, maxLength int) (string, error) { +// Algorithm defines an enum for the encoding algorithm to use. +type Algorithm uint32 + +const ( + // Algorithm32 uses fnv32 bit encoder. + Algorithm32 Algorithm = iota + + // Algorithm64 uses fnv64 bit encoder. + Algorithm64 + + // Algorithm128 uses fnv128 bit encoder. + Algorithm128 +) + +type Option interface { + option() +} + +// AlgorithmOption defines a wrapper to pass the algorithm to encoding functions. +type AlgorithmOption struct { + Option + algo Algorithm +} + +// NewAlgorithmOption wraps the Algorithm into an AlgorithmOption to pass to the encoding functions. +func NewAlgorithmOption(algo Algorithm) AlgorithmOption { + return AlgorithmOption{ + algo: algo, + } +} + +// FixedLengthUniqueID creates a new UniqueID that is based on the inputID and of a specified length, if the given id is +// longer than the maxLength. +func FixedLengthUniqueID(inputID string, maxLength int, options ...Option) (string, error) { if len(inputID) <= maxLength { return inputID, nil } - hasher := fnv.New32a() - // Using 32a an error can never happen, so this will always remain not covered by a unit test + var hasher hash.Hash + for _, option := range options { + if algoOption, casted := option.(AlgorithmOption); casted { + switch algoOption.algo { + case Algorithm32: + hasher = fnv.New32a() + case Algorithm64: + hasher = fnv.New64a() + case Algorithm128: + hasher = fnv.New128a() + } + } + } + + if hasher == nil { + hasher = fnv.New32a() + } + + // Using 32a/64a an error can never happen, so this will always remain not covered by a unit test _, _ = hasher.Write([]byte(inputID)) // #nosec b := hasher.Sum(nil) - // expected length after this step is 8 chars (1 + 7 chars from Base32Encoder.EncodeToString(b)) + + // Encoding Length Calculation: + // Base32 Encoder will encode every 5 bits into an output character (2 ^ 5 = 32) + // output length = ciel( / 5) + // for 32a hashing = ceil(32 / 5) = 7 + // for 64a hashing = ceil(64 / 5) = 13 + // We prefix with character `f` so the final output is 8 or 14 + finalStr := "f" + Base32Encoder.EncodeToString(b) if len(finalStr) > maxLength { return finalStr, fmt.Errorf("max Length is too small, cannot create an encoded string that is so small") @@ -30,10 +86,10 @@ func FixedLengthUniqueID(inputID string, maxLength int) (string, error) { return finalStr, nil } -// Creates a new uniqueID using the parts concatenated using `-` and ensures that the uniqueID is not longer than the -// maxLength. In case a simple concatenation yields a longer string, a new hashed ID is created which is always -// around 8 characters in length -func FixedLengthUniqueIDForParts(maxLength int, parts ...string) (string, error) { +// FixedLengthUniqueIDForParts creates a new uniqueID using the parts concatenated using `-` and ensures that the +// uniqueID is not longer than the maxLength. In case a simple concatenation yields a longer string, a new hashed ID is +// created which is always around 8 characters in length. +func FixedLengthUniqueIDForParts(maxLength int, parts []string, options ...Option) (string, error) { b := strings.Builder{} for i, p := range parts { if i > 0 && b.Len() > 0 { @@ -45,5 +101,5 @@ func FixedLengthUniqueIDForParts(maxLength int, parts ...string) (string, error) _, _ = b.WriteString(p) // #nosec } - return FixedLengthUniqueID(b.String(), maxLength) + return FixedLengthUniqueID(b.String(), maxLength, options...) } diff --git a/go/tasks/pluginmachinery/encoding/encoder_test.go b/go/tasks/pluginmachinery/encoding/encoder_test.go index 097288fc66..9dc807f154 100644 --- a/go/tasks/pluginmachinery/encoding/encoder_test.go +++ b/go/tasks/pluginmachinery/encoding/encoder_test.go @@ -1,6 +1,8 @@ package encoding import ( + "hash" + "hash/fnv" "testing" "github.com/stretchr/testify/assert" @@ -18,7 +20,7 @@ func TestFixedLengthUniqueID(t *testing.T) { {"veryLowLimit", "xx", 1, "flfryc2i", true}, {"highLimit", "xxxxxx", 5, "fufiti6i", true}, {"higherLimit", "xxxxx", 10, "xxxxx", false}, - {"largeID", "xxxxxxxxxxx", 10, "fggddjly", false}, + {"largeID", "xxxxxxxxxxxxxxxxxxxxxxxx", 20, "fuaa3aji", false}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -28,7 +30,8 @@ func TestFixedLengthUniqueID(t *testing.T) { } else { assert.NoError(t, err) } - assert.Equal(t, i, test.output) + + assert.Equal(t, test.output, i) }) } } @@ -38,24 +41,64 @@ func TestFixedLengthUniqueIDForParts(t *testing.T) { name string parts []string maxLength int + algorithm Algorithm output string expectError bool }{ - {"smallerThanMax", []string{"x", "y", "z"}, 10, "x-y-z", false}, - {"veryLowLimit", []string{"x", "y"}, 1, "fz2jizji", true}, - {"fittingID", []string{"x"}, 2, "x", false}, - {"highLimit", []string{"x", "y", "z"}, 4, "fxzsoqrq", true}, - {"largeID", []string{"x", "y", "z", "m", "n"}, 8, "fsigbmty", false}, + {"smallerThanMax", []string{"x", "y", "z"}, 10, Algorithm32, "x-y-z", false}, + {"veryLowLimit", []string{"x", "y"}, 1, Algorithm32, "fz2jizji", true}, + {"fittingID", []string{"x"}, 2, Algorithm32, "x", false}, + {"highLimit", []string{"x", "y", "z"}, 4, Algorithm32, "fxzsoqrq", true}, + {"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n"}, 15, Algorithm32, "fe63sz6y", false}, + {"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n"}, 15, Algorithm64, "fwp4bky2kucex5", false}, + {"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n", "z", "m", "n", "y", "z", "m", "n"}, 30, Algorithm128, "fbmesl15enghpjepzjm5cp1zfqe", false}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - i, err := FixedLengthUniqueIDForParts(test.maxLength, test.parts...) + i, err := FixedLengthUniqueIDForParts(test.maxLength, test.parts, NewAlgorithmOption(test.algorithm)) if test.expectError { assert.Error(t, err) } else { assert.NoError(t, err) } - assert.Equal(t, i, test.output) + assert.Equal(t, test.output, i) }) } } + +func benchmarkKB(b *testing.B, h hash.Hash) { + b.SetBytes(1024) + data := make([]byte, 1024) + for i := range data { + data[i] = byte(i) + } + + in := make([]byte, 0, h.Size()) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + h.Reset() + h.Write(data) + h.Sum(in) + } +} + +// Documented Results: +// goos: darwin +// goarch: amd64 +// pkg: github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/encoding +// cpu: Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +// BenchmarkFixedLengthUniqueID +// BenchmarkFixedLengthUniqueID/New32a +// BenchmarkFixedLengthUniqueID/New32a-16 1000000 1088 ns/op 941.25 MB/s +// BenchmarkFixedLengthUniqueID/New64a +// BenchmarkFixedLengthUniqueID/New64a-16 1239402 951.3 ns/op 1076.39 MB/s +func BenchmarkFixedLengthUniqueID(b *testing.B) { + b.Run("New32a", func(b *testing.B) { + benchmarkKB(b, fnv.New32a()) + }) + + b.Run("New64a", func(b *testing.B) { + benchmarkKB(b, fnv.New64a()) + }) +}