// Provenance: "[PATCH] Add lazy RLE+ decoding" by Jakub Sztandera
// (commit ca934738743d70c7bc3ee8893826959a928cb861, 2019-09-20, License: MIT,
// Signed-off-by: Jakub Sztandera).
//
// The declarations below are the contents of
// lib/rlepluslazy/internal/bitvector.go (package bitvector, which imports
// "errors" and "log").

var (
	// ErrOutOfRange - the index passed is out of range for the BitVector
	ErrOutOfRange = errors.New("index out of range")
)

// BitNumbering indicates the ordering of bits, either
// least-significant bit in position 0, or most-significant bit
// in position 0.
//
// It is used in 3 ways with BitVector:
// 1. Ordering of bits within the Buf []byte structure
// 2. What order to add bits when using Extend()
// 3. What order to read bits when using Take()
//
// https://en.wikipedia.org/wiki/Bit_numbering
type BitNumbering int

const (
	// LSB0 - bit ordering starts with the low-order bit
	LSB0 BitNumbering = iota

	// MSB0 - bit ordering starts with the high-order bit
	MSB0
)

// BitVector is used to manipulate ordered collections of bits
type BitVector struct {
	Buf []byte

	// BytePacking is the bit ordering within bytes
	BytePacking BitNumbering

	// Len is the logical number of bits in the vector.
	// The last byte in Buf may have undefined bits if Len is not a multiple of 8
	Len uint
}

// NewBitVector constructs a new BitVector from a slice of bytes.
//
// The bytePacking parameter is required to know how to interpret the bit ordering within the bytes.
// The slice is used directly (not copied) and every bit of it is considered
// part of the vector, so Len is set to 8*len(buf).
func NewBitVector(buf []byte, bytePacking BitNumbering) *BitVector {
	return &BitVector{
		BytePacking: bytePacking,
		Buf:         buf,
		Len:         uint(len(buf) * 8),
	}
}

// bitShift returns the shift, within a single byte, that selects bit position
// pos%8 under the given numbering. Any value other than LSB0 is treated as MSB0.
func bitShift(pos uint, order BitNumbering) uint {
	if order == LSB0 {
		return pos % 8
	}
	return 7 - pos%8
}

// Push adds a single bit to the BitVector.
//
// Although it takes a byte, only the low-order bit is used, so just use 0 or 1.
//
// The target bit is overwritten rather than OR-ed in: the unused bits of the
// last byte are documented as undefined, so a stale 1 must not survive a
// Push(0). Buf is grown as needed so that the byte holding bit Len exists.
func (v *BitVector) Push(val byte) {
	byteIdx := v.Len / 8
	for uint(len(v.Buf)) <= byteIdx {
		v.Buf = append(v.Buf, 0)
	}

	mask := byte(1) << bitShift(v.Len, v.BytePacking)
	if val&1 == 1 {
		v.Buf[byteIdx] |= mask
	} else {
		v.Buf[byteIdx] &^= mask
	}

	v.Len++
}

// Get returns a single bit as a byte -- either 0 or 1
//
// It returns ErrOutOfRange when idx is not below Len, or when Buf is too
// short to contain the requested bit.
func (v *BitVector) Get(idx uint) (byte, error) {
	if idx >= v.Len {
		return 0, ErrOutOfRange
	}
	blockIdx := idx / 8
	if blockIdx >= uint(len(v.Buf)) {
		// Len and Buf are exported and can disagree; report it instead of
		// panicking on the slice index.
		return 0, ErrOutOfRange
	}

	return (v.Buf[blockIdx] >> bitShift(idx, v.BytePacking)) & 1, nil
}

// Extend adds up to 8 bits to the receiver
//
// Given a byte b == 0b11010101
// v.Extend(b, 4, LSB0) would add < 1, 0, 1, 0 >
// v.Extend(b, 4, MSB0) would add < 1, 1, 0, 1 >
//
// Panics if count is greater than 8
func (v *BitVector) Extend(val byte, count uint, order BitNumbering) {
	if count > 8 {
		log.Panicf("invalid count")
	}

	for i := uint(0); i < count; i++ {
		// Push only looks at the low-order bit, so no extra masking is needed.
		v.Push(val >> bitShift(i, order))
	}
}

// Take reads up to 8 bits at the given index.
//
// Given a BitVector < 1, 1, 0, 1, 0, 1, 0, 1 >
// v.Take(0, 4, LSB0) would return 0b00001011
// v.Take(0, 4, MSB0) would return 0b11010000
//
// Bits that lie past the end of the vector read as 0.
//
// Panics if count is greater than 8
func (v *BitVector) Take(index uint, count uint, order BitNumbering) (out byte) {
	if count > 8 {
		log.Panicf("invalid count")
	}

	for i := uint(0); i < count; i++ {
		val, err := v.Get(index + i)
		if err != nil {
			// Every later position is out of range as well; the remaining
			// output bits stay 0. Stopping here also keeps index+i from
			// wrapping around for indexes near the maximum uint.
			break
		}
		out |= val << bitShift(i, order)
	}
	return out
}

// Iterator returns a function, which when invoked, returns the number
// of bits requested, and increments an internal cursor.
+// +// When the end of the BitVector is reached, it returns zeroes indefinitely +// +// Panics if count is greater than 8 +func (v *BitVector) Iterator(order BitNumbering) func(uint) byte { + cursor := uint(0) + return func(count uint) (out byte) { + if count > 8 { + log.Panicf("invalid count") + } + + out = v.Take(cursor, count, order) + cursor += count + return + } +} diff --git a/lib/rlepluslazy/internal/bitvector_test.go b/lib/rlepluslazy/internal/bitvector_test.go new file mode 100644 index 00000000000..a98c00a8e55 --- /dev/null +++ b/lib/rlepluslazy/internal/bitvector_test.go @@ -0,0 +1,136 @@ +package bitvector_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + bitvector "github.com/filecoin-project/go-lotus/extern/rleplus/internal" +) + +func TestBitVector(t *testing.T) { + t.Run("zero value", func(t *testing.T) { + var v bitvector.BitVector + + assert.Equal(t, bitvector.LSB0, v.BytePacking) + }) + + t.Run("Push", func(t *testing.T) { + // MSB0 bit numbering + v := bitvector.BitVector{BytePacking: bitvector.MSB0} + v.Push(1) + v.Push(0) + v.Push(1) + v.Push(1) + + assert.Equal(t, byte(176), v.Buf[0]) + + // LSB0 bit numbering + v = bitvector.BitVector{BytePacking: bitvector.LSB0} + v.Push(1) + v.Push(0) + v.Push(1) + v.Push(1) + + assert.Equal(t, byte(13), v.Buf[0]) + }) + + t.Run("Get", func(t *testing.T) { + bits := []byte{1, 0, 1, 1, 0, 0, 1, 0} + + for _, numbering := range []bitvector.BitNumbering{bitvector.MSB0, bitvector.LSB0} { + v := bitvector.BitVector{BytePacking: numbering} + + for _, bit := range bits { + v.Push(bit) + } + + for idx, expected := range bits { + actual, _ := v.Get(uint(idx)) + assert.Equal(t, expected, actual) + } + } + }) + + t.Run("Extend", func(t *testing.T) { + val := byte(171) // 0b10101011 + + var v bitvector.BitVector + + // MSB0 bit numbering + v = bitvector.BitVector{} + v.Extend(val, 4, bitvector.MSB0) + assertBitVector(t, []byte{1, 0, 1, 0}, v) + v.Extend(val, 5, bitvector.MSB0) + 
assertBitVector(t, []byte{1, 0, 1, 0, 1, 0, 1, 0, 1}, v) + + // LSB0 bit numbering + v = bitvector.BitVector{} + v.Extend(val, 4, bitvector.LSB0) + assertBitVector(t, []byte{1, 1, 0, 1}, v) + v.Extend(val, 5, bitvector.LSB0) + assertBitVector(t, []byte{1, 1, 0, 1, 1, 1, 0, 1, 0}, v) + }) + + t.Run("invalid counts to Take/Extend/Iterator cause panics", func(t *testing.T) { + v := bitvector.BitVector{BytePacking: bitvector.LSB0} + + assert.Panics(t, func() { v.Extend(0xff, 9, bitvector.LSB0) }) + + assert.Panics(t, func() { v.Take(0, 9, bitvector.LSB0) }) + + next := v.Iterator(bitvector.LSB0) + assert.Panics(t, func() { next(9) }) + }) + + t.Run("Take", func(t *testing.T) { + var v bitvector.BitVector + + bits := []byte{1, 0, 1, 0, 1, 0, 1, 1} + for _, bit := range bits { + v.Push(bit) + } + + assert.Equal(t, byte(176), v.Take(4, 4, bitvector.MSB0)) + assert.Equal(t, byte(13), v.Take(4, 4, bitvector.LSB0)) + }) + + t.Run("Iterator", func(t *testing.T) { + var buf []byte + + // make a bitvector of 256 sample bits + for i := 0; i < 32; i++ { + buf = append(buf, 128+32) + } + + v := bitvector.NewBitVector(buf, bitvector.LSB0) + + next := v.Iterator(bitvector.LSB0) + + // compare to Get() + for i := uint(0); i < v.Len; i++ { + expected, _ := v.Get(i) + assert.Equal(t, expected, next(1)) + } + + // out of range should return zero + assert.Equal(t, byte(0), next(1)) + assert.Equal(t, byte(0), next(8)) + + // compare to Take() + next = v.Iterator(bitvector.LSB0) + assert.Equal(t, next(5), v.Take(0, 5, bitvector.LSB0)) + assert.Equal(t, next(8), v.Take(5, 8, bitvector.LSB0)) + }) +} + +// Note: When using this helper assertion, expectedBits should *only* be 0s and 1s. 
+func assertBitVector(t *testing.T, expectedBits []byte, actual bitvector.BitVector) { + assert.Equal(t, uint(len(expectedBits)), actual.Len) + + for idx, bit := range expectedBits { + actualBit, err := actual.Get(uint(idx)) + assert.NoError(t, err) + assert.Equal(t, bit, actualBit) + } +} diff --git a/lib/rlepluslazy/rleminus.go b/lib/rlepluslazy/rleminus.go new file mode 100644 index 00000000000..0bafbb6489a --- /dev/null +++ b/lib/rlepluslazy/rleminus.go @@ -0,0 +1,42 @@ +package rlepluslazy + +/* + +const version = 0 + +// uncompressed 1: ceil(7/8x + 1/8) +// uncompressed 2: ceil(log(x*2)/log(128)) + x + +func Encode(first byte, runs []uint64) []byte { + varBuf := make([]byte, binary.MaxVarintLen64) + outBuf := make([]byte, 0, 1024) + + n := binary.PutUvarint(varBuf, version) + outBuf = append(outBuf, varBuf[:n]...) + + curBit := first + + carryOver := uint64(0) + carryOverLen := uint(0) + for x, run := range runs { + if carryOverLen != 1 { + diff := carryOverLen % 7 + + if diff > run { + diff = run + } + run = run - diff + + carryOver = carryOver>>diff | (math.MaxUint64 << (64 - diff)) + carryOver = carryOver >> (64 - carryOverLen) + n = binary.PutUvarint(varBuf, carryOver) + outBuf = append(outBuf, varBuf[:n]...) 
+ } else { + + } + + curBit = 1 - curBit + } + return nil +} +*/ diff --git a/lib/rlepluslazy/rleplus.go b/lib/rlepluslazy/rleplus.go new file mode 100644 index 00000000000..d24e1ed51b9 --- /dev/null +++ b/lib/rlepluslazy/rleplus.go @@ -0,0 +1,110 @@ +package rlepluslazy + +import ( + "encoding/binary" + "errors" + "fmt" + + bitvector "github.com/filecoin-project/go-lotus/lib/rlepluslazy/internal" + "golang.org/x/xerrors" +) + +const Version = 0 + +var ( + ErrWrongVersion = errors.New("invalid RLE+ version") + ErrDecode = fmt.Errorf("invalid encoding for RLE+ version %d", Version) +) + +type RLE struct { + vec *bitvector.BitVector +} + +func FromBuf(buf []byte) (*RLE, error) { + rle := &RLE{vec: bitvector.NewBitVector(buf, bitvector.LSB0)} + + if err := rle.check(); err != nil { + return nil, xerrors.Errorf("could not create RLE+ for a buffer: %w", err) + } + return rle, nil +} + +func (rle *RLE) check() error { + ver := rle.vec.Take(0, 2, bitvector.LSB0) + if ver != Version { + return ErrWrongVersion + } + return nil +} + +func (rle *RLE) Iterator() (*iterator, error) { + vit := rle.vec.Iterator(bitvector.LSB0) + vit(2) // Take version + + it := &iterator{next: vit} + if err := it.prep(vit(1)); err != nil { + return nil, err + } + return it, nil +} + +type iterator struct { + next func(uint) byte + + curIdx uint64 + rep uint64 +} + +func (it *iterator) HasNext() bool { + return it.rep != 0 +} + +func (it *iterator) prep(curBit byte) error { + +loop: + for it.rep == 0 { + x := it.next(1) + switch x { + case 1: + it.rep = 1 + case 0: + y := it.next(1) + switch y { + case 1: + it.rep = uint64(it.next(4)) + case 0: + var buf = make([]byte, 0, 10) + for { + b := it.next(8) + buf = append(buf, b) + if b&0x80 == 0 { + break + } + if len(buf) > 10 { + return xerrors.Errorf("run too long: %w", ErrDecode) + } + } + it.rep, _ = binary.Uvarint(buf) + } + + // run with 0 length means end + if it.rep == 0 { + break loop + } + } + + if curBit == 0 { + curBit = 1 + it.curIdx = 
it.curIdx + it.rep + it.rep = 0 + } + } + return nil +} + +func (it *iterator) Next() (uint64, error) { + it.rep-- + res := it.curIdx + it.curIdx++ + return res, it.prep(0) +} diff --git a/lib/rlepluslazy/rleplus_test.go b/lib/rlepluslazy/rleplus_test.go new file mode 100644 index 00000000000..90aaee2e467 --- /dev/null +++ b/lib/rlepluslazy/rleplus_test.go @@ -0,0 +1,44 @@ +package rlepluslazy + +import ( + "testing" + + "github.com/filecoin-project/go-lotus/extern/rleplus" + "github.com/stretchr/testify/assert" +) + +func TestDecode(t *testing.T) { + // Encoding bitvec![LittleEndian; 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + // in the Rust reference implementation gives an encoding of [223, 145, 136, 0] (without version field) + // The bit vector is equivalent to the integer set { 0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 } + + // This is the above reference output with a version header "00" manually added + referenceEncoding := []byte{124, 71, 34, 2} + + expectedNumbers := []uint64{0, 2, 4, 5, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27} + + encoded, _, err := rleplus.Encode(expectedNumbers) + assert.NoError(t, err) + + // Our encoded bytes are the same as the ref bytes + assert.Equal(t, len(referenceEncoding), len(encoded)) + for idx, expected := range referenceEncoding { + assert.Equal(t, expected, encoded[idx]) + } + + rle, err := FromBuf(referenceEncoding) + assert.NoError(t, err) + decoded := make([]uint64, 0, len(expectedNumbers)) + + it, err := rle.Iterator() + assert.NoError(t, err) + for it.HasNext() { + bit, err := it.Next() + assert.NoError(t, err) + decoded = append(decoded, bit) + } + + // Our decoded integers are the same as expected + assert.Equal(t, expectedNumbers, decoded) + +}