From ca934738743d70c7bc3ee8893826959a928cb861 Mon Sep 17 00:00:00 2001
From: Jakub Sztandera <kubuxu@protonmail.ch>
Date: Fri, 20 Sep 2019 21:28:22 +0200
Subject: [PATCH] Add lazy RLE+ decoding

License: MIT
Signed-off-by: Jakub Sztandera <kubuxu@protonmail.ch>
---
 lib/rlepluslazy/internal/bitvector.go      | 154 +++++++++++++++++++++
 lib/rlepluslazy/internal/bitvector_test.go | 136 ++++++++++++++++++
 lib/rlepluslazy/rleminus.go                |  42 ++++++
 lib/rlepluslazy/rleplus.go                 | 110 +++++++++++++++
 lib/rlepluslazy/rleplus_test.go            |  44 ++++++
 5 files changed, 486 insertions(+)
 create mode 100644 lib/rlepluslazy/internal/bitvector.go
 create mode 100644 lib/rlepluslazy/internal/bitvector_test.go
 create mode 100644 lib/rlepluslazy/rleminus.go
 create mode 100644 lib/rlepluslazy/rleplus.go
 create mode 100644 lib/rlepluslazy/rleplus_test.go

diff --git a/lib/rlepluslazy/internal/bitvector.go b/lib/rlepluslazy/internal/bitvector.go
new file mode 100644
index 00000000000..65bae074e09
--- /dev/null
+++ b/lib/rlepluslazy/internal/bitvector.go
@@ -0,0 +1,154 @@
+package bitvector
+
+import (
+	"errors"
+	"log"
+)
+
+var (
+	// ErrOutOfRange - the index passed is out of range for the BitVector
+	ErrOutOfRange = errors.New("index out of range")
+)
+
+// BitNumbering indicates the ordering of bits, either
+// least-significant bit in position 0, or most-significant bit
+// in position 0.
+//
+// It is used in 3 ways with BitVector:
+// 1. Ordering of bits within the Buf []byte structure
+// 2. What order to add bits when using Extend()
+// 3. What order to read bits when using Take()
+//
+// https://en.wikipedia.org/wiki/Bit_numbering
+type BitNumbering int
+
+const (
+	// LSB0 - bit ordering starts with the low-order bit
+	LSB0 BitNumbering = iota
+
+	// MSB0 - bit ordering starts with the high-order bit
+	MSB0
+)
+
+// BitVector is used to manipulate ordered collections of bits
+type BitVector struct {
+	// Buf is the backing storage; bit i of the vector lives in Buf[i/8].
+	Buf []byte
+	// BytePacking is the bit ordering within bytes
+	BytePacking BitNumbering
+
+	// Len is the logical number of bits in the vector.
+	// The last byte in Buf may have undefined bits if Len is not a multiple of 8
+	Len uint
+}
+
+// NewBitVector wraps buf in a BitVector whose Len covers every bit of buf.
+//
+// bytePacking tells how the bits are ordered inside each byte of buf.
+func NewBitVector(buf []byte, bytePacking BitNumbering) *BitVector {
+	v := new(BitVector)
+	v.Buf = buf
+	v.BytePacking = bytePacking
+	v.Len = uint(len(buf) * 8)
+	return v
+}
+
+// Push appends one bit at the end of the BitVector.
+//
+// Only the low-order bit of val is taken into account, so pass 0 or 1.
+func (v *BitVector) Push(val byte) {
+	bitPos := v.Len % 8
+	if bitPos == 0 {
+		// Len sits on a byte boundary: grow Buf by one zeroed byte.
+		v.Buf = append(v.Buf, 0)
+	}
+
+	// LSB0 fills each byte from bit 0 upwards, any other packing from bit 7 downwards.
+	shift := bitPos
+	if v.BytePacking != LSB0 {
+		shift = 7 - bitPos
+	}
+	v.Buf[v.Len/8] |= (val & 1) << shift
+	v.Len++
+}
+
+// Get reads the bit at position idx and returns it as a byte, either 0 or 1.
+// It fails with ErrOutOfRange when idx is not below v.Len.
+func (v *BitVector) Get(idx uint) (byte, error) {
+	if idx >= v.Len {
+		return 0, ErrOutOfRange
+	}
+
+	shift := idx % 8
+	if v.BytePacking != LSB0 {
+		// Non-LSB0 packing: position 0 of a byte is its highest bit.
+		shift = 7 - shift
+	}
+	return (v.Buf[idx/8] >> shift) & 1, nil
+}
+
+// Extend appends up to 8 bits taken from val to the receiver.
+//
+// Given a byte b == 0b11010101
+// v.Extend(b, 4, LSB0) would add < 1, 0, 1, 0 >
+// v.Extend(b, 4, MSB0) would add < 1, 1, 0, 1 >
+//
+// Panics if count is greater than 8.
+func (v *BitVector) Extend(val byte, count uint, order BitNumbering) {
+	if count > 8 {
+		log.Panicf("invalid count")
+	}
+
+	for i := uint(0); i < count; i++ {
+		// LSB0 consumes val from its lowest bit, any other order from its highest.
+		shift := i
+		if order != LSB0 {
+			shift = 7 - i
+		}
+		v.Push((val >> shift) & 1)
+	}
+}
+
+// Take reads up to 8 bits starting at the given index.
+//
+// Given a BitVector < 1, 1, 0, 1, 0, 1, 0, 1 >
+// v.Take(0, 4, LSB0) would return 0b00001011
+// v.Take(0, 4, MSB0) would return 0b11010000
+//
+// Bits past the end of the vector read as 0. Panics if count is greater than 8.
+func (v *BitVector) Take(index uint, count uint, order BitNumbering) (out byte) {
+	if count > 8 {
+		log.Panicf("invalid count")
+	}
+
+	for i := uint(0); i < count; i++ {
+		// An out-of-range Get yields 0, which is the value wanted past the end.
+		bit, _ := v.Get(index + i)
+
+		shift := i
+		if order != LSB0 {
+			shift = 7 - i
+		}
+		out |= bit << shift
+	}
+	return out
+}
+
+// Iterator returns a closure that hands out consecutive bits of the vector:
+// each call returns the requested number of bits and advances an internal
+// cursor by that amount.
+//
+// Once the end of the BitVector is passed, the closure returns zeroes forever.
+// It panics if count is greater than 8.
+func (v *BitVector) Iterator(order BitNumbering) func(uint) byte {
+	var pos uint
+	return func(count uint) byte {
+		if count > 8 {
+			log.Panicf("invalid count")
+		}
+
+		bits := v.Take(pos, count, order)
+		pos += count
+		return bits
+	}
+}
diff --git a/lib/rlepluslazy/internal/bitvector_test.go b/lib/rlepluslazy/internal/bitvector_test.go
new file mode 100644
index 00000000000..a98c00a8e55
--- /dev/null
+++ b/lib/rlepluslazy/internal/bitvector_test.go
@@ -0,0 +1,136 @@
+package bitvector_test
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal"
+)
+
+// TestBitVector runs one subtest per BitVector operation.
+// NOTE(review): the bitvector import above points at extern/rleplus/internal, not lib/rlepluslazy/internal — confirm.
+func TestBitVector(t *testing.T) {
+	t.Run("zero value", func(t *testing.T) {
+		var v bitvector.BitVector
+
+		assert.Equal(t, bitvector.LSB0, v.BytePacking)
+	})
+
+	t.Run("Push", func(t *testing.T) {
+		// MSB0 bit numbering
+		v := bitvector.BitVector{BytePacking: bitvector.MSB0}
+		v.Push(1)
+		v.Push(0)
+		v.Push(1)
+		v.Push(1)
+
+		assert.Equal(t, byte(176), v.Buf[0]) // 176 == 0b10110000
+
+		// LSB0 bit numbering
+		v = bitvector.BitVector{BytePacking: bitvector.LSB0}
+		v.Push(1)
+		v.Push(0)
+		v.Push(1)
+		v.Push(1)
+
+		assert.Equal(t, byte(13), v.Buf[0]) // 13 == 0b00001101
+	})
+
+	t.Run("Get", func(t *testing.T) {
+		bits := []byte{1, 0, 1, 1, 0, 0, 1, 0}
+
+		for _, numbering := range []bitvector.BitNumbering{bitvector.MSB0, bitvector.LSB0} {
+			v := bitvector.BitVector{BytePacking: numbering}
+
+			for _, bit := range bits {
+				v.Push(bit)
+			}
+
+			for idx, expected := range bits {
+				actual, _ := v.Get(uint(idx))
+				assert.Equal(t, expected, actual)
+			}
+		}
+	})
+
+	t.Run("Extend", func(t *testing.T) {
+		val := byte(171) // 0b10101011
+
+		var v bitvector.BitVector
+
+		// MSB0 bit numbering
+		v = bitvector.BitVector{}
+		v.Extend(val, 4, bitvector.MSB0)
+		assertBitVector(t, []byte{1, 0, 1, 0}, v)
+		v.Extend(val, 5, bitvector.MSB0)
+		assertBitVector(t, []byte{1, 0, 1, 0, 1, 0, 1, 0, 1}, v)
+
+		// LSB0 bit numbering
+		v = bitvector.BitVector{}
+		v.Extend(val, 4, bitvector.LSB0)
+		assertBitVector(t, []byte{1, 1, 0, 1}, v)
+		v.Extend(val, 5, bitvector.LSB0)
+		assertBitVector(t, []byte{1, 1, 0, 1, 1, 1, 0, 1, 0}, v)
+	})
+
+	t.Run("invalid counts to Take/Extend/Iterator cause panics", func(t *testing.T) {
+		v := bitvector.BitVector{BytePacking: bitvector.LSB0}
+
+		assert.Panics(t, func() { v.Extend(0xff, 9, bitvector.LSB0) })
+		assert.Panics(t, func() { v.Take(0, 9, bitvector.LSB0) })
+		next := v.Iterator(bitvector.LSB0)
+		assert.Panics(t, func() { next(9) })
+	})
+
+	t.Run("Take", func(t *testing.T) {
+		var v bitvector.BitVector
+
+		bits := []byte{1, 0, 1, 0, 1, 0, 1, 1}
+		for _, bit := range bits {
+			v.Push(bit)
+		}
+
+		assert.Equal(t, byte(176), v.Take(4, 4, bitvector.MSB0)) // bits 4..7 are < 1, 0, 1, 1 >: 0b10110000
+		assert.Equal(t, byte(13), v.Take(4, 4, bitvector.LSB0))  // same bits packed LSB0: 0b00001101
+	})
+
+	t.Run("Iterator", func(t *testing.T) {
+		var buf []byte
+
+		// make a bitvector of 256 sample bits
+		for i := 0; i < 32; i++ {
+			buf = append(buf, 128+32) // 0b10100000
+		}
+
+		v := bitvector.NewBitVector(buf, bitvector.LSB0)
+
+		next := v.Iterator(bitvector.LSB0)
+
+		// compare to Get()
+		for i := uint(0); i < v.Len; i++ {
+			expected, _ := v.Get(i)
+			assert.Equal(t, expected, next(1))
+		}
+
+		// out of range should return zero
+		assert.Equal(t, byte(0), next(1))
+		assert.Equal(t, byte(0), next(8))
+
+		// compare to Take()
+		next = v.Iterator(bitvector.LSB0)
+		assert.Equal(t, next(5), v.Take(0, 5, bitvector.LSB0))
+		assert.Equal(t, next(8), v.Take(5, 8, bitvector.LSB0))
+	})
+}
+
+// assertBitVector checks that actual holds exactly the bits of expectedBits.
+// Note: expectedBits should *only* contain 0s and 1s.
+func assertBitVector(t *testing.T, expectedBits []byte, actual bitvector.BitVector) {
+	assert.Equal(t, uint(len(expectedBits)), actual.Len)
+	for i := range expectedBits {
+		got, err := actual.Get(uint(i))
+		assert.NoError(t, err)
+		assert.Equal(t, expectedBits[i], got)
+	}
+}
diff --git a/lib/rlepluslazy/rleminus.go b/lib/rlepluslazy/rleminus.go
new file mode 100644
index 00000000000..0bafbb6489a
--- /dev/null
+++ b/lib/rlepluslazy/rleminus.go
@@ -0,0 +1,42 @@
+package rlepluslazy
+
+/*
+
+const version = 0
+
+// uncompressed 1: ceil(7/8x + 1/8)
+// uncompressed 2: ceil(log(x*2)/log(128)) + x
+
+func Encode(first byte, runs []uint64) []byte {
+	varBuf := make([]byte, binary.MaxVarintLen64)
+	outBuf := make([]byte, 0, 1024)
+
+	n := binary.PutUvarint(varBuf, version)
+	outBuf = append(outBuf, varBuf[:n]...)
+
+	curBit := first
+
+	carryOver := uint64(0)
+	carryOverLen := uint(0)
+	for x, run := range runs {
+		if carryOverLen != 1 {
+			diff := carryOverLen % 7
+
+			if diff > run {
+				diff = run
+			}
+			run = run - diff
+
+			carryOver = carryOver>>diff | (math.MaxUint64 << (64 - diff))
+			carryOver = carryOver >> (64 - carryOverLen)
+			n = binary.PutUvarint(varBuf, carryOver)
+			outBuf = append(outBuf, varBuf[:n]...)
+		} else {
+
+		}
+
+		curBit = 1 - curBit
+	}
+	return nil
+}
+*/
diff --git a/lib/rlepluslazy/rleplus.go b/lib/rlepluslazy/rleplus.go
new file mode 100644
index 00000000000..d24e1ed51b9
--- /dev/null
+++ b/lib/rlepluslazy/rleplus.go
@@ -0,0 +1,110 @@
+package rlepluslazy
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+
+	bitvector "github.com/filecoin-project/go-lotus/lib/rlepluslazy/internal"
+	"golang.org/x/xerrors"
+)
+
+const Version = 0
+
+var (
+	ErrWrongVersion = errors.New("invalid RLE+ version")
+	ErrDecode       = fmt.Errorf("invalid encoding for RLE+ version %d", Version)
+)
+
+type RLE struct {
+	vec *bitvector.BitVector // LSB0 view of the encoded buffer, version bits included
+}
+
+// FromBuf wraps buf as an RLE+ bitfield; buffers with an unknown version are rejected.
+func FromBuf(buf []byte) (*RLE, error) {
+	decoded := RLE{vec: bitvector.NewBitVector(buf, bitvector.LSB0)}
+	if err := decoded.check(); err != nil {
+		return nil, xerrors.Errorf("could not create RLE+ for a buffer: %w", err)
+	}
+	return &decoded, nil
+}
+
+// check verifies that the two leading bits of the buffer encode Version.
+func (rle *RLE) check() error {
+	if rle.vec.Take(0, 2, bitvector.LSB0) != Version {
+		return ErrWrongVersion
+	}
+	return nil
+}
+
+// Iterator returns an iterator over the indexes of the set bits, in increasing order.
+func (rle *RLE) Iterator() (*iterator, error) {
+	bits := rle.vec.Iterator(bitvector.LSB0)
+	bits(2) // skip the 2-bit version header
+	it := &iterator{next: bits}
+	if err := it.prep(bits(1)); err != nil { // bits(1) is the value of the first run
+		return nil, err
+	}
+	return it, nil
+}
+
+// iterator lazily decodes the runs of an RLE+ buffer into the indexes of its set bits.
+type iterator struct {
+	next   func(uint) byte // reads the given number of bits (at most 8) from the buffer
+	curIdx uint64          // index returned by the upcoming call to Next
+	rep    uint64          // set-bit indexes still pending in the current run
+}
+
+func (it *iterator) HasNext() bool {
+	return it.rep != 0 // rep counts the set-bit indexes still pending in the current run
+}
+
+// prep advances the iterator until it.rep holds the length of the next run of
+// set bits, or leaves it.rep at 0 once a zero-length run marks the end.
+// curBit is the value of the run read next; runs of zeroes only move curIdx.
+// A malformed varint run length yields an error wrapping ErrDecode.
+func (it *iterator) prep(curBit byte) error {
+	for it.rep == 0 {
+		if it.next(1) == 1 {
+			// Prefix "1": a run of exactly one bit.
+			it.rep = 1
+		} else if it.next(1) == 1 {
+			// Prefix "01": the run length is stored in the next 4 bits.
+			it.rep = uint64(it.next(4))
+		} else {
+			// Prefix "00": the run length is a varint, read one byte at a time.
+			buf := make([]byte, 0, binary.MaxVarintLen64)
+			for more := true; more; {
+				if len(buf) == binary.MaxVarintLen64 {
+					return xerrors.Errorf("run too long: %w", ErrDecode)
+				}
+				b := it.next(8)
+				buf = append(buf, b)
+				more = b&0x80 != 0
+			}
+			var n int
+			if it.rep, n = binary.Uvarint(buf); n <= 0 {
+				// n < 0 signals overflow; it must not pass for the end marker.
+				return xerrors.Errorf("run length overflow: %w", ErrDecode)
+			}
+		}
+		// run with 0 length means end
+		if it.rep == 0 {
+			return nil
+		}
+		if curBit == 0 {
+			// A run of zeroes is skipped; the run after it is made of ones.
+			curBit = 1
+			it.curIdx += it.rep
+			it.rep = 0
+		}
+	}
+	return nil
+}
+
+// Next returns the index of the current set bit and moves on to the following one.
+func (it *iterator) Next() (uint64, error) {
+	idx := it.curIdx
+	it.curIdx, it.rep = idx+1, it.rep-1
+	return idx, it.prep(0)
+}
diff --git a/lib/rlepluslazy/rleplus_test.go b/lib/rlepluslazy/rleplus_test.go
new file mode 100644
index 00000000000..90aaee2e467
--- /dev/null
+++ b/lib/rlepluslazy/rleplus_test.go
@@ -0,0 +1,44 @@
+package rlepluslazy
+
+import (
+	"testing"
+
+	"github.com/filecoin-project/go-lotus/extern/rleplus"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestDecode checks that the lazy iterator decodes a reference encoding back into the original integer set.
+func TestDecode(t *testing.T) {
+	// Encoding bitvec![LittleEndian; 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+	// in the Rust reference implementation gives an encoding of [223, 145, 136, 0] (without version field)
+	// The bit vector is equivalent to the integer set { 0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }
+
+	// This is the above reference output with a version header "00" manually added
+	referenceEncoding := []byte{124, 71, 34, 2}
+
+	expectedNumbers := []uint64{0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}
+
+	encoded, _, err := rleplus.Encode(expectedNumbers)
+	assert.NoError(t, err)
+
+	// Our encoded bytes are the same as the ref bytes
+	assert.Equal(t, len(referenceEncoding), len(encoded))
+	for idx, expected := range referenceEncoding {
+		assert.Equal(t, expected, encoded[idx])
+	}
+
+	rle, err := FromBuf(referenceEncoding)
+	assert.NoError(t, err)
+	decoded := make([]uint64, 0, len(expectedNumbers))
+
+	it, err := rle.Iterator()
+	assert.NoError(t, err)
+	for it.HasNext() {
+		bit, err := it.Next()
+		assert.NoError(t, err)
+		decoded = append(decoded, bit)
+	}
+
+	// Our decoded integers are the same as expected
+	assert.Equal(t, expectedNumbers, decoded)
+}