Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(query): Update CompressedBin IntersectionAlgo #9000

Merged
merged 10 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 34 additions & 27 deletions algo/uidlist.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ import (
"github.com/dgraph-io/dgraph/protos/pb"
)

const jump = 32 // Jump size in InsersectWithJump.
const jump = 32 // Jump size in InsersectWithJump.
const linVsBinRatio = 10 // When is linear search better than binary

// ApplyFilter applies a filter to our UIDList.
func ApplyFilter(u *pb.List, f func(uint64, int) bool) {
Expand Down Expand Up @@ -60,7 +61,7 @@ func IntersectCompressedWith(pack *pb.UidPack, afterUID uint64, v, o *pb.List) {

// Select appropriate function based on heuristics.
ratio := float64(m) / float64(n)
if ratio < 500 {
if ratio < linVsBinRatio {
IntersectCompressedWithLinJump(&dec, v.Uids, &dst)
} else {
IntersectCompressedWithBin(&dec, v.Uids, &dst)
Expand Down Expand Up @@ -94,7 +95,7 @@ func IntersectCompressedWithLinJump(dec *codec.Decoder, v []uint64, o *[]uint64)
// https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3
// Call seek on dec before calling this function
func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) {
ld := dec.ApproxLen()
ld := codec.ExactLen(dec.Pack)
lq := len(q)

if lq == 0 {
Expand All @@ -105,46 +106,50 @@ func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) {
}

// Pick the shorter list and do binary search
if ld < lq {
if ld <= lq {
for {
blockUids := dec.Uids()
if len(blockUids) == 0 {
break
}
IntersectWithBin(blockUids, q, o)
lastUid := blockUids[len(blockUids)-1]
qidx := sort.Search(len(q), func(idx int) bool {
return q[idx] >= lastUid
})
if qidx >= len(q) {
if ld*linVsBinRatio < len(q) {
q = q[IntersectWithBin(blockUids, q, o):]
} else {
// For small enough difference between two arrays, we should just
// do lin intersect
_, off := IntersectWithLin(blockUids, q, o)
q = q[off:]
}
if len(q) == 0 {
return
}
q = q[qidx:]
dec.Next()
}
return
}

var uids []uint64
for _, u := range q {
uids := dec.Uids()
qidx := 0
for {
if qidx >= len(q) {
return
}
u := q[qidx]
if len(uids) == 0 || u > uids[len(uids)-1] {
uids = dec.Seek(u, codec.SeekStart)
if lq*linVsBinRatio < ld {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix this condition

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried updating the condition, but it made performance much worse. I am guessing its because that the decision to do binary or linear really depends on the total size of the arrays. If the array ratio is too high, then the numbers would be far apart.

uids = dec.LinearSeek(u)
} else {
uids = dec.SeekToBlock(u, codec.SeekCurrent)
harshil-goel marked this conversation as resolved.
Show resolved Hide resolved
}
if len(uids) == 0 {
return
}
}
uidIdx := sort.Search(len(uids), func(idx int) bool {
return uids[idx] >= u
})
if uidIdx >= len(uids) {
// We know that u < max(uids). If we didn't find it here, it's not here.
continue
}
if uids[uidIdx] == u {
*o = append(*o, u)
uidIdx++
_, off := IntersectWithJump(uids, q[qidx:], o)
if off == 0 {
off = 1 // if v[k] isn't in u, move forward
}
uids = uids[uidIdx:]
qidx += off
}
}

Expand Down Expand Up @@ -233,7 +238,8 @@ func IntersectWithJump(u, v []uint64, o *[]uint64) (int, int) {
// IntersectWithBin is based on the paper
// "Fast Intersection Algorithms for Sorted Sequences"
// https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3
func IntersectWithBin(d, q []uint64, o *[]uint64) {
// Returns where to move the second array(q) to. O means not found
func IntersectWithBin(d, q []uint64, o *[]uint64) int {
harshil-goel marked this conversation as resolved.
Show resolved Hide resolved
ld := len(d)
lq := len(q)

Expand All @@ -242,7 +248,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) {
d, q = q, d
}
if ld == 0 || lq == 0 || d[ld-1] < q[0] || q[lq-1] < d[0] {
return
return 0
}

val := d[0]
Expand All @@ -256,6 +262,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) {
})

binIntersect(d, q[minq:maxq], o)
return maxq
}

// binIntersect is the recursive function used.
Expand Down
60 changes: 53 additions & 7 deletions algo/uidlist_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ func BenchmarkListIntersectCompressBin(b *testing.B) {
for _, r := range rs {
sz1 := sz
sz2 := int(float64(sz) * r)
if sz2 > 1000000 || sz2 == 0 {
if sz2 > 10000000 || sz2 == 0 {
break
}

Expand All @@ -389,8 +389,18 @@ func BenchmarkListIntersectCompressBin(b *testing.B) {
sort.Slice(v1, func(i, j int) bool { return v1[i] < v1[j] })

dst2 := &pb.List{}
dst1 := &pb.List{}
compressedUids := codec.Encode(v1, 256)

b.Run(fmt.Sprintf("linJump:IntersectWith:ratio=%v:size=%d:overlap=%.2f:", r, sz, overlap),
func(b *testing.B) {
for k := 0; k < b.N; k++ {
dec := codec.Decoder{Pack: compressedUids}
dec.Seek(0, codec.SeekStart)
IntersectCompressedWithLinJump(&dec, u1, &dst1.Uids)
}
})

b.Run(fmt.Sprintf("compressed:IntersectWith:ratio=%v:size=%d:overlap=%.2f:", r, sz, overlap),
func(b *testing.B) {
for k := 0; k < b.N; k++ {
Expand All @@ -399,7 +409,6 @@ func BenchmarkListIntersectCompressBin(b *testing.B) {
IntersectCompressedWithBin(&dec, u1, &dst2.Uids)
}
})
fmt.Println()

codec.FreePack(compressedUids)
}
Expand Down Expand Up @@ -493,6 +502,43 @@ func sortUint64(nums []uint64) {
sort.Slice(nums, func(i, j int) bool { return nums[i] < nums[j] })
}

func fillNumsDiff(N1, N2, N3 int) ([]uint64, []uint64, []uint64) {
rand.Seed(time.Now().UnixNano())

commonNums := make([]uint64, N1)
blockNums := make([]uint64, N1+N2)
otherNums := make([]uint64, N1+N3)
allC := make(map[uint64]bool)

for i := 0; i < N1; i++ {
val := rand.Uint64() % 1000
commonNums[i] = val
blockNums[i] = val
otherNums[i] = val
allC[val] = true
}

for i := N1; i < N1+N2; i++ {
val := rand.Uint64() % 1000
blockNums[i] = val
allC[val] = true
}

for i := N1; i < N1+N3; i++ {
val := rand.Uint64()
for ok := true; ok; _, ok = allC[val] {
val = rand.Uint64() % 1000
}
otherNums[i] = val
}

sortUint64(commonNums)
sortUint64(blockNums)
sortUint64(otherNums)

return commonNums, blockNums, otherNums
}

func fillNums(N1, N2 int) ([]uint64, []uint64, []uint64) {
rand.Seed(time.Now().UnixNano())

Expand Down Expand Up @@ -545,12 +591,12 @@ func TestIntersectCompressedWithLinJump(t *testing.T) {
}

func TestIntersectCompressedWithBin(t *testing.T) {
lengths := []int{0, 1, 3, 11, 100}
//lengths := []int{0, 1, 3, 11, 100, 500, 1000}

for _, N1 := range lengths {
for _, N2 := range lengths {
for _, N1 := range []int{11} {
for _, N2 := range []int{3} {
// Intersection of blockNums and otherNums is commonNums.
commonNums, blockNums, otherNums := fillNums(N1, N2)
commonNums, blockNums, otherNums := fillNumsDiff(N1/10, N1, N2)

enc := codec.Encoder{BlockSize: 10}
for _, num := range blockNums {
Expand All @@ -570,7 +616,7 @@ func TestIntersectCompressedWithBin(t *testing.T) {
}

func TestIntersectCompressedWithBinMissingSize(t *testing.T) {
lengths := []int{0, 1, 3, 11, 100}
lengths := []int{0, 1, 3, 11, 100, 500, 1000}

for _, N1 := range lengths {
for _, N2 := range lengths {
Expand Down
58 changes: 58 additions & 0 deletions codec/codec.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,64 @@ func (d *Decoder) ApproxLen() int {

type searchFunc func(int) bool

// SeekToBlock will find the nearest block, and unpack it. Unlike Seek, it doesn't
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can improve this explanation

// apply search in the resulting uid list and then move the pointer forward. When we are going
// to intersect the list later, this function is useful.
func (d *Decoder) SeekToBlock(uid uint64, whence seekPos) []uint64 {
if d.Pack == nil {
return []uint64{}
}
prevBlockIdx := d.blockIdx
d.blockIdx = 0
if uid == 0 {
return d.UnpackBlock()
}

// If for some reason we are searching an older uid, we need to search the entire pack
if prevBlockIdx > 0 && uid < d.Pack.Blocks[prevBlockIdx].Base {
prevBlockIdx = 0
}

pack := d.Pack
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove this assignment

blocksFunc := func() searchFunc {
var f searchFunc
switch whence {
case SeekStart:
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base >= uid }
case SeekCurrent:
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base > uid }
}
return f
}

idx := sort.Search(len(pack.Blocks[prevBlockIdx:]), blocksFunc()) + prevBlockIdx
// The first block.Base >= uid.
if idx == 0 {
return d.UnpackBlock()
}
// The uid is the first entry in the block.
if idx < len(pack.Blocks) && pack.Blocks[idx].Base == uid {
d.blockIdx = idx
return d.UnpackBlock()
}

// Either the idx = len(pack.Blocks) that means it wasn't found in any of the block's base. Or,
// we found the first block index whose base is greater than uid. In these cases, go to the
// previous block and search there.
d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it.
if d.blockIdx != prevBlockIdx {
d.UnpackBlock() // And get all their uids.
}

if uid <= d.uids[len(d.uids)-1] {
return d.uids
}

// Could not find any uid in the block, which is >= uid. The next block might still have valid
// entries > uid.
return d.Next()
}

// Seek will search for uid in a packed block using the specified whence position.
// The value of whence must be one of the predefined values SeekStart or SeekCurrent.
// SeekStart searches uid and includes it as part of the results.
Expand Down