From 21fcea2731d1de356b89a27a75b5c6af164a4a68 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 9 Apr 2021 14:37:40 +0200 Subject: [PATCH 1/3] s2: More accurate scoring in best mode Use a more accurate scoring function for best mode. More often a benefit than not. ``` BEFORE/AFTER: Reading nyc-taxi-data-10M.csv... Compressing... 3325605752 -> 794873465 [23.90%]; 6.172s, 513.9MB/s Compressing... 3325605752 -> 786648492 [23.65%]; 7.389s, 429.2MB/s Reading adresser.json... Compressing... 7983034785 -> 375370404 [4.70%]; 3.527s, 2158.6MB/s Compressing... 7983034785 -> 380912248 [4.77%]; 3.924s, 1940.0MB/s Reading 10gb.tar... Compressing... 10065157632 -> 5246634524 [52.13%]; 24.294s, 395.1MB/s Compressing... 10065157632 -> 5215462149 [51.82%]; 29.462s, 325.8MB/s Reading enwik9... Compressing... 1000000000 -> 375981068 [37.60%]; 3.562s, 267.7MB/s Compressing... 1000000000 -> 373289535 [37.33%]; 4.047s, 235.6MB/s Reading sample.tar... Compressing... 808796160 -> 297514564 [36.78%]; 1.458s, 529.0MB/s Compressing... 808796160 -> 277822539 [34.35%]; 2.043s, 377.5MB/s ``` --- s2/encode_best.go | 108 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 88 insertions(+), 20 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index 9a9587036c..5ebb366d2c 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -103,31 +103,39 @@ func encodeBlockBest(dst, src []byte) (d int) { m.length -= offset return m } - + score := func(m match, otherS int) int { + // Matches that start further forward are penalized since the skipped bytes must be emitted as literals.
+ score := m.length - (m.s - otherS) + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + if m.rep { + return score - emitRepeatSize(offset, m.length) + } + return score - emitCopySize(offset, m.length) + } bestOf := func(a, b match) match { - aScore := b.s - a.s + a.length - bScore := a.s - b.s + b.length - if !a.rep { - // Estimate bytes needed to store offset. - offset := a.s - a.offset - if offset >= 65536 { - aScore -= 5 - } else { - aScore -= 3 - } + if b.length == 0 { + return a } - if !b.rep { - // Estimate bytes needed to store offset. - offset := b.s - b.offset - if offset >= 65536 { - bScore -= 5 - } else { - bScore -= 3 - } + if a.length == 0 { + return b } - if aScore >= bScore { + as := score(a, b.s) + bs := score(b, a.s) + if as >= bs { + if as <= 0 { + // Eliminate if no savings, we might find a better one. + a.length = 0 + } return a } + if bs <= 0 { + // Eliminate if no savings, we might find a better one. + b.length = 0 + } return b } @@ -251,3 +259,63 @@ emitRemainder: } return d } + +// emitCopySize returns the size to encode the offset+length +// +// It assumes that: +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopySize(offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeatSize(offset, length) + } + i = 5 + } + if length == 0 { + return i + } + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitRepeatSize(offset, length-60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + +// emitRepeatSize returns the number of bytes required to encode a repeat. 
+// Length must be at least 4 and < 1<<24 +func emitRepeatSize(offset, length int) int { + // Repeat offset, make length cheaper + if length <= 4+4 || (length < 8+4 && offset < 2048) { + return 2 + } + if length < (1<<8)+4+4 { + return 3 + } + if length < (1<<16)+(1<<8)+4 { + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= (1 << 16) - 4 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + if left > 0 { + return 5 + emitRepeatSize(offset, left) + } + return 5 +} From 62a8b1fa8731035a21bb94476f04de44f6c7c952 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 9 Apr 2021 14:46:07 +0200 Subject: [PATCH 2/3] Update travis. --- .travis.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index df68ea4370..59d225d8e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,7 +55,5 @@ deploy: tags: true condition: ($TRAVIS_OS_NAME = linux) && ($TRAVIS_CPU_ARCH = amd64) go: 1.16.x -branches: - only: - - master - - /^v\d+\.\d+(\.\d+)?(-\S*)?$/ + branch: master + From 8dbe101b350b3121b578d6de74a5097fdd2b4814 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 9 Apr 2021 14:50:12 +0200 Subject: [PATCH 3/3] Revert "Update travis." This reverts commit 62a8b1fa8731035a21bb94476f04de44f6c7c952. --- .travis.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 59d225d8e3..df68ea4370 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,5 +55,7 @@ deploy: tags: true condition: ($TRAVIS_OS_NAME = linux) && ($TRAVIS_CPU_ARCH = amd64) go: 1.16.x - branch: master - +branches: + only: + - master + - /^v\d+\.\d+(\.\d+)?(-\S*)?$/