Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Update hash algo to 64bit to reduce collisions #283

Merged
merged 6 commits into from
Sep 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 67 additions & 11 deletions go/tasks/pluginmachinery/encoding/encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package encoding
import (
"encoding/base32"
"fmt"
"hash"
"hash/fnv"
"strings"
)
Expand All @@ -11,29 +12,84 @@ const specialEncoderKey = "abcdefghijklmnopqrstuvwxyz123456"

var Base32Encoder = base32.NewEncoding(specialEncoderKey).WithPadding(base32.NoPadding)

// Creates a new UniqueID that is based on the inputID and of a specified length, if the given id is longer than the
// maxLength.
func FixedLengthUniqueID(inputID string, maxLength int) (string, error) {
// Algorithm defines an enum for the encoding algorithm to use.
type Algorithm uint32

const (
// Algorithm32 uses fnv32 bit encoder.
Algorithm32 Algorithm = iota

// Algorithm64 uses fnv64 bit encoder.
Algorithm64

// Algorithm128 uses fnv128 bit encoder.
Algorithm128
)

type Option interface {
option()
}

// AlgorithmOption defines a wrapper to pass the algorithm to encoding functions.
type AlgorithmOption struct {
Option
algo Algorithm
}

// NewAlgorithmOption wraps the Algorithm into an AlgorithmOption to pass to the encoding functions.
func NewAlgorithmOption(algo Algorithm) AlgorithmOption {
return AlgorithmOption{
algo: algo,
}
}

// FixedLengthUniqueID creates a new UniqueID that is based on the inputID and of a specified length, if the given id is
// longer than the maxLength.
func FixedLengthUniqueID(inputID string, maxLength int, options ...Option) (string, error) {
if len(inputID) <= maxLength {
return inputID, nil
}

hasher := fnv.New32a()
// Using 32a an error can never happen, so this will always remain not covered by a unit test
var hasher hash.Hash
for _, option := range options {
if algoOption, casted := option.(AlgorithmOption); casted {
switch algoOption.algo {
case Algorithm32:
hasher = fnv.New32a()
case Algorithm64:
hasher = fnv.New64a()
case Algorithm128:
hasher = fnv.New128a()
}
}
}

if hasher == nil {
hasher = fnv.New32a()
}

// Using 32a/64a an error can never happen, so this will always remain not covered by a unit test
_, _ = hasher.Write([]byte(inputID)) // #nosec
b := hasher.Sum(nil)
// expected length after this step is 8 chars (1 + 7 chars from Base32Encoder.EncodeToString(b))

// Encoding Length Calculation:
// Base32 Encoder will encode every 5 bits into an output character (2 ^ 5 = 32)
// output length = ciel(<input bits> / 5)
// for 32a hashing = ceil(32 / 5) = 7
// for 64a hashing = ceil(64 / 5) = 13
// We prefix with character `f` so the final output is 8 or 14

finalStr := "f" + Base32Encoder.EncodeToString(b)
if len(finalStr) > maxLength {
return finalStr, fmt.Errorf("max Length is too small, cannot create an encoded string that is so small")
}
return finalStr, nil
}

// Creates a new uniqueID using the parts concatenated using `-` and ensures that the uniqueID is not longer than the
// maxLength. In case a simple concatenation yields a longer string, a new hashed ID is created which is always
// around 8 characters in length
func FixedLengthUniqueIDForParts(maxLength int, parts ...string) (string, error) {
// FixedLengthUniqueIDForParts creates a new uniqueID using the parts concatenated using `-` and ensures that the
// uniqueID is not longer than the maxLength. In case a simple concatenation yields a longer string, a new hashed ID is
// created which is always around 8 characters in length.
func FixedLengthUniqueIDForParts(maxLength int, parts []string, options ...Option) (string, error) {
b := strings.Builder{}
for i, p := range parts {
if i > 0 && b.Len() > 0 {
Expand All @@ -45,5 +101,5 @@ func FixedLengthUniqueIDForParts(maxLength int, parts ...string) (string, error)
_, _ = b.WriteString(p) // #nosec
}

return FixedLengthUniqueID(b.String(), maxLength)
return FixedLengthUniqueID(b.String(), maxLength, options...)
}
61 changes: 52 additions & 9 deletions go/tasks/pluginmachinery/encoding/encoder_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package encoding

import (
"hash"
"hash/fnv"
"testing"

"github.com/stretchr/testify/assert"
Expand All @@ -18,7 +20,7 @@ func TestFixedLengthUniqueID(t *testing.T) {
{"veryLowLimit", "xx", 1, "flfryc2i", true},
{"highLimit", "xxxxxx", 5, "fufiti6i", true},
{"higherLimit", "xxxxx", 10, "xxxxx", false},
{"largeID", "xxxxxxxxxxx", 10, "fggddjly", false},
{"largeID", "xxxxxxxxxxxxxxxxxxxxxxxx", 20, "fuaa3aji", false},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
Expand All @@ -28,7 +30,8 @@ func TestFixedLengthUniqueID(t *testing.T) {
} else {
assert.NoError(t, err)
}
assert.Equal(t, i, test.output)

assert.Equal(t, test.output, i)
})
}
}
Expand All @@ -38,24 +41,64 @@ func TestFixedLengthUniqueIDForParts(t *testing.T) {
name string
parts []string
maxLength int
algorithm Algorithm
output string
expectError bool
}{
{"smallerThanMax", []string{"x", "y", "z"}, 10, "x-y-z", false},
{"veryLowLimit", []string{"x", "y"}, 1, "fz2jizji", true},
{"fittingID", []string{"x"}, 2, "x", false},
{"highLimit", []string{"x", "y", "z"}, 4, "fxzsoqrq", true},
{"largeID", []string{"x", "y", "z", "m", "n"}, 8, "fsigbmty", false},
{"smallerThanMax", []string{"x", "y", "z"}, 10, Algorithm32, "x-y-z", false},
{"veryLowLimit", []string{"x", "y"}, 1, Algorithm32, "fz2jizji", true},
{"fittingID", []string{"x"}, 2, Algorithm32, "x", false},
{"highLimit", []string{"x", "y", "z"}, 4, Algorithm32, "fxzsoqrq", true},
{"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n"}, 15, Algorithm32, "fe63sz6y", false},
{"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n"}, 15, Algorithm64, "fwp4bky2kucex5", false},
{"largeID", []string{"x", "y", "z", "m", "n", "y", "z", "m", "n", "y", "z", "m", "n", "z", "m", "n", "y", "z", "m", "n"}, 30, Algorithm128, "fbmesl15enghpjepzjm5cp1zfqe", false},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
i, err := FixedLengthUniqueIDForParts(test.maxLength, test.parts...)
i, err := FixedLengthUniqueIDForParts(test.maxLength, test.parts, NewAlgorithmOption(test.algorithm))
if test.expectError {
assert.Error(t, err)
} else {
assert.NoError(t, err)
}
assert.Equal(t, i, test.output)
assert.Equal(t, test.output, i)
})
}
}

func benchmarkKB(b *testing.B, h hash.Hash) {
b.SetBytes(1024)
data := make([]byte, 1024)
for i := range data {
data[i] = byte(i)
}

in := make([]byte, 0, h.Size())

b.ResetTimer()
for i := 0; i < b.N; i++ {
h.Reset()
h.Write(data)
h.Sum(in)
}
}

// Documented Results:
// goos: darwin
// goarch: amd64
// pkg: github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/encoding
// cpu: Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
// BenchmarkFixedLengthUniqueID
// BenchmarkFixedLengthUniqueID/New32a
// BenchmarkFixedLengthUniqueID/New32a-16 1000000 1088 ns/op 941.25 MB/s
// BenchmarkFixedLengthUniqueID/New64a
// BenchmarkFixedLengthUniqueID/New64a-16 1239402 951.3 ns/op 1076.39 MB/s
func BenchmarkFixedLengthUniqueID(b *testing.B) {
b.Run("New32a", func(b *testing.B) {
benchmarkKB(b, fnv.New32a())
})

b.Run("New64a", func(b *testing.B) {
benchmarkKB(b, fnv.New64a())
})
}