diff --git a/src/SharpCompress/Common/CompressionType.cs b/src/SharpCompress/Common/CompressionType.cs index 11263245..b26e5a95 100644 --- a/src/SharpCompress/Common/CompressionType.cs +++ b/src/SharpCompress/Common/CompressionType.cs @@ -14,5 +14,6 @@ public enum CompressionType LZip, Xz, Unknown, - Deflate64 + Deflate64, + Shrink } diff --git a/src/SharpCompress/Common/Zip/ZipCompressionMethod.cs b/src/SharpCompress/Common/Zip/ZipCompressionMethod.cs index 13c2dbe7..a98ed667 100644 --- a/src/SharpCompress/Common/Zip/ZipCompressionMethod.cs +++ b/src/SharpCompress/Common/Zip/ZipCompressionMethod.cs @@ -3,6 +3,7 @@ namespace SharpCompress.Common.Zip; internal enum ZipCompressionMethod { None = 0, + Shrink = 1, Deflate = 8, Deflate64 = 9, BZip2 = 12, diff --git a/src/SharpCompress/Common/Zip/ZipEntry.cs b/src/SharpCompress/Common/Zip/ZipEntry.cs index 2c544b94..61e1b6af 100644 --- a/src/SharpCompress/Common/Zip/ZipEntry.cs +++ b/src/SharpCompress/Common/Zip/ZipEntry.cs @@ -52,6 +52,10 @@ public override CompressionType CompressionType { return CompressionType.None; } + case ZipCompressionMethod.Shrink: + { + return CompressionType.Shrink; + } default: { return CompressionType.Unknown; diff --git a/src/SharpCompress/Common/Zip/ZipFilePart.cs b/src/SharpCompress/Common/Zip/ZipFilePart.cs index faefdf15..f8a0e8a8 100644 --- a/src/SharpCompress/Common/Zip/ZipFilePart.cs +++ b/src/SharpCompress/Common/Zip/ZipFilePart.cs @@ -9,6 +9,7 @@ using SharpCompress.Compressors.Deflate64; using SharpCompress.Compressors.LZMA; using SharpCompress.Compressors.PPMd; +using SharpCompress.Compressors.Shrink; using SharpCompress.Compressors.Xz; using SharpCompress.IO; using ZstdSharp; @@ -79,6 +80,15 @@ protected Stream CreateDecompressionStream(Stream stream, ZipCompressionMethod m return new DataDescriptorStream(stream); } + case ZipCompressionMethod.Shrink: + { + return new ShrinkStream( + stream, + CompressionMode.Decompress, + Header.CompressedSize, + Header.UncompressedSize + ); + } case ZipCompressionMethod.Deflate: { return new DeflateStream(stream, CompressionMode.Decompress); @@ -192,6 +202,7 @@ protected Stream GetCryptoStream(Stream plainStream) switch (Header.CompressionMethod) { case ZipCompressionMethod.None: + case ZipCompressionMethod.Shrink: case ZipCompressionMethod.Deflate: case ZipCompressionMethod.Deflate64: case ZipCompressionMethod.BZip2: diff --git a/src/SharpCompress/Compressors/Shrink/BitStream.cs b/src/SharpCompress/Compressors/Shrink/BitStream.cs new file mode 100644 index 00000000..ef01e2af --- /dev/null +++ b/src/SharpCompress/Compressors/Shrink/BitStream.cs @@ -0,0 +1,85 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace SharpCompress.Compressors.Shrink +{ + internal class BitStream + { + private byte[] _src; + private int _srcLen; + private int _byteIdx; + private int _bitIdx; + private int _bitsLeft; + private ulong _bitBuffer; + private static uint[] _maskBits = new uint[17] + { + 0U, + 1U, + 3U, + 7U, + 15U, + 31U, + 63U, + (uint)sbyte.MaxValue, + (uint)byte.MaxValue, + 511U, + 1023U, + 2047U, + 4095U, + 8191U, + 16383U, + (uint)short.MaxValue, + (uint)ushort.MaxValue + }; + + public BitStream(byte[] src, int srcLen) + { + _src = src; + _srcLen = srcLen; + _byteIdx = 0; + _bitIdx = 0; + } + + public int BytesRead => (_byteIdx << 3) + _bitIdx; + + private int NextByte() + { + if (_byteIdx >= _srcLen) + { + return 0; + } + + return _src[_byteIdx++]; + } + + public int NextBits(int nbits) + { + 
int result = 0; + if (nbits > _bitsLeft) + { + int num; + while (_bitsLeft <= 24 && (num = NextByte()) != 1234) + { + _bitBuffer |= (ulong)num << _bitsLeft; + _bitsLeft += 8; + } + } + result = (int)((long)_bitBuffer & (long)_maskBits[nbits]); + _bitBuffer >>= nbits; + _bitsLeft -= nbits; + return result; + } + + public bool Advance(int count) + { + if (_byteIdx > _srcLen) + { + return false; + } + return true; + } + } +} diff --git a/src/SharpCompress/Compressors/Shrink/HwUnshrink.cs b/src/SharpCompress/Compressors/Shrink/HwUnshrink.cs new file mode 100644 index 00000000..04f38c2b --- /dev/null +++ b/src/SharpCompress/Compressors/Shrink/HwUnshrink.cs @@ -0,0 +1,433 @@ +using System; + +namespace SharpCompress.Compressors.Shrink +{ + public class HwUnshrink + { + private const int MIN_CODE_SIZE = 9; + private const int MAX_CODE_SIZE = 13; + + private const ushort MAX_CODE = (ushort)((1U << MAX_CODE_SIZE) - 1); + private const ushort INVALID_CODE = ushort.MaxValue; + private const ushort CONTROL_CODE = 256; + private const ushort INC_CODE_SIZE = 1; + private const ushort PARTIAL_CLEAR = 2; + + private const int HASH_BITS = MAX_CODE_SIZE + 1; // For a load factor of 0.5. + private const int HASHTAB_SIZE = 1 << HASH_BITS; + private const ushort UNKNOWN_LEN = ushort.MaxValue; + + private struct CodeTabEntry + { + public int prefixCode; // INVALID_CODE means the entry is invalid. + public byte extByte; + public ushort len; + public int lastDstPos; + } + + private static void CodeTabInit(CodeTabEntry[] codeTab) + { + for (int i = 0; i <= byte.MaxValue; i++) + { + codeTab[i].prefixCode = (ushort)i; + codeTab[i].extByte = (byte)i; + codeTab[i].len = 1; + } + + for (int i = byte.MaxValue + 1; i <= MAX_CODE; i++) + { + codeTab[i].prefixCode = INVALID_CODE; + } + } + + private static void UnshrinkPartialClear(CodeTabEntry[] codeTab, ref CodeQueue queue) + { + bool[] isPrefix = new bool[MAX_CODE + 1]; + int codeQueueSize; + + // Scan for codes that have been used as a prefix. + for (int i = CONTROL_CODE + 1; i <= MAX_CODE; i++) + { + if (codeTab[i].prefixCode != INVALID_CODE) + { + isPrefix[codeTab[i].prefixCode] = true; + } + } + + // Clear "non-prefix" codes in the table; populate the code queue. + codeQueueSize = 0; + for (int i = CONTROL_CODE + 1; i <= MAX_CODE; i++) + { + if (!isPrefix[i]) + { + codeTab[i].prefixCode = INVALID_CODE; + queue.codes[codeQueueSize++] = (ushort)i; + } + } + + queue.codes[codeQueueSize] = INVALID_CODE; // End-of-queue marker. + queue.nextIdx = 0; + } + + private static bool ReadCode( + BitStream stream, + ref int codeSize, + CodeTabEntry[] codeTab, + ref CodeQueue queue, + out int nextCode + ) + { + int code, + controlCode; + + code = (int)stream.NextBits(codeSize); + if (!stream.Advance(codeSize)) + { + nextCode = INVALID_CODE; + return false; + } + + // Handle regular codes (the common case). + if (code != CONTROL_CODE) + { + nextCode = code; + return true; + } + + // Handle control codes. 
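+            // Code 256 is followed by a second code: INC_CODE_SIZE (1) widens the code
+            // size by one bit (up to MAX_CODE_SIZE), while PARTIAL_CLEAR (2) drops every
+            // table entry that is not currently used as a prefix and recycles those codes.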
+ controlCode = (ushort)stream.NextBits(codeSize); + if (!stream.Advance(codeSize)) + { + nextCode = INVALID_CODE; + return true; + } + + if (controlCode == INC_CODE_SIZE && codeSize < MAX_CODE_SIZE) + { + codeSize++; + return ReadCode(stream, ref codeSize, codeTab, ref queue, out nextCode); + } + + if (controlCode == PARTIAL_CLEAR) + { + UnshrinkPartialClear(codeTab, ref queue); + return ReadCode(stream, ref codeSize, codeTab, ref queue, out nextCode); + } + + nextCode = INVALID_CODE; + return true; + } + + private static void CopyFromPrevPos(byte[] dst, int prevPos, int dstPos, int len) + { + if (dstPos + len > dst.Length) + { + // Not enough room in dst for the sloppy copy below. + Array.Copy(dst, prevPos, dst, dstPos, len); + return; + } + + if (prevPos + len > dstPos) + { + // Benign one-byte overlap possible in the KwKwK case. + //assert(prevPos + len == dstPos + 1); + //assert(dst[prevPos] == dst[prevPos + len - 1]); + } + + Buffer.BlockCopy(dst, prevPos, dst, dstPos, len); + } + + private static UnshrnkStatus OutputCode( + int code, + byte[] dst, + int dstPos, + int dstCap, + int prevCode, + CodeTabEntry[] codeTab, + ref CodeQueue queue, + out byte firstByte, + out int len + ) + { + int prefixCode; + + //assert(code <= MAX_CODE && code != CONTROL_CODE); + //assert(dstPos < dstCap); + firstByte = 0; + if (code <= byte.MaxValue) + { + // Output literal byte. + firstByte = (byte)code; + len = 1; + dst[dstPos] = (byte)code; + return UnshrnkStatus.Ok; + } + + if (codeTab[code].prefixCode == INVALID_CODE || codeTab[code].prefixCode == code) + { + // Reject invalid codes. Self-referential codes may exist in the table but cannot be used. + firstByte = 0; + len = 0; + return UnshrnkStatus.Error; + } + + if (codeTab[code].len != UNKNOWN_LEN) + { + // Output string with known length (the common case). + if (dstCap - dstPos < codeTab[code].len) + { + firstByte = 0; + len = 0; + return UnshrnkStatus.Full; + } + + CopyFromPrevPos(dst, codeTab[code].lastDstPos, dstPos, codeTab[code].len); + firstByte = dst[dstPos]; + len = codeTab[code].len; + return UnshrnkStatus.Ok; + } + + // Output a string of unknown length. + //assert(codeTab[code].len == UNKNOWN_LEN); + prefixCode = codeTab[code].prefixCode; + // assert(prefixCode > CONTROL_CODE); + + if (prefixCode == queue.codes[queue.nextIdx]) + { + // The prefix code hasn't been added yet, but we were just about to: the KwKwK case. + //assert(codeTab[prevCode].prefixCode != INVALID_CODE); + codeTab[prefixCode].prefixCode = prevCode; + codeTab[prefixCode].extByte = firstByte; + codeTab[prefixCode].len = (ushort)(codeTab[prevCode].len + 1); + codeTab[prefixCode].lastDstPos = codeTab[prevCode].lastDstPos; + dst[dstPos] = firstByte; + } + else if (codeTab[prefixCode].prefixCode == INVALID_CODE) + { + // The prefix code is still invalid. + firstByte = 0; + len = 0; + return UnshrnkStatus.Error; + } + + // Output the prefix string, then the extension byte. + len = codeTab[prefixCode].len + 1; + if (dstCap - dstPos < len) + { + firstByte = 0; + len = 0; + return UnshrnkStatus.Full; + } + + CopyFromPrevPos(dst, codeTab[prefixCode].lastDstPos, dstPos, codeTab[prefixCode].len); + dst[dstPos + len - 1] = codeTab[code].extByte; + firstByte = dst[dstPos]; + + // Update the code table now that the string has a length and pos. 
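+            // Recording the resolved length and position lets later occurrences of this
+            // code be copied straight from dst via CopyFromPrevPos instead of walking
+            // the prefix chain again.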
+ //assert(prevCode != code); + codeTab[code].len = (ushort)len; + codeTab[code].lastDstPos = dstPos; + + return UnshrnkStatus.Ok; + } + + public static UnshrnkStatus Unshrink( + byte[] src, + int srcLen, + out int srcUsed, + byte[] dst, + int dstCap, + out int dstUsed + ) + { + CodeTabEntry[] codeTab = new CodeTabEntry[HASHTAB_SIZE]; + CodeQueue queue = new CodeQueue(); + var stream = new BitStream(src, srcLen); + int codeSize, + dstPos, + len; + int currCode, + prevCode, + newCode; + byte firstByte; + + CodeTabInit(codeTab); + CodeQueueInit(ref queue); + codeSize = MIN_CODE_SIZE; + dstPos = 0; + + // Handle the first code separately since there is no previous code. + if (!ReadCode(stream, ref codeSize, codeTab, ref queue, out currCode)) + { + srcUsed = stream.BytesRead; + dstUsed = 0; + return UnshrnkStatus.Ok; + } + + //assert(currCode != CONTROL_CODE); + if (currCode > byte.MaxValue) + { + srcUsed = stream.BytesRead; + dstUsed = 0; + return UnshrnkStatus.Error; // The first code must be a literal. + } + + if (dstPos == dstCap) + { + srcUsed = stream.BytesRead; + dstUsed = 0; + return UnshrnkStatus.Full; + } + + firstByte = (byte)currCode; + dst[dstPos] = (byte)currCode; + codeTab[currCode].lastDstPos = dstPos; + dstPos++; + + prevCode = currCode; + while (ReadCode(stream, ref codeSize, codeTab, ref queue, out currCode)) + { + if (currCode == INVALID_CODE) + { + srcUsed = stream.BytesRead; + dstUsed = 0; + return UnshrnkStatus.Error; + } + + if (dstPos == dstCap) + { + srcUsed = stream.BytesRead; + dstUsed = 0; + return UnshrnkStatus.Full; + } + + // Handle KwKwK: next code used before being added. + if (currCode == queue.codes[queue.nextIdx]) + { + if (codeTab[prevCode].prefixCode == INVALID_CODE) + { + // The previous code is no longer valid. + srcUsed = stream.BytesRead; + dstUsed = 0; + return UnshrnkStatus.Error; + } + + // Extend the previous code with its first byte. + //assert(currCode != prevCode); + codeTab[currCode].prefixCode = prevCode; + codeTab[currCode].extByte = firstByte; + codeTab[currCode].len = (ushort)(codeTab[prevCode].len + 1); + codeTab[currCode].lastDstPos = codeTab[prevCode].lastDstPos; + //assert(dstPos < dstCap); + dst[dstPos] = firstByte; + } + + // Output the string represented by the current code. + UnshrnkStatus status = OutputCode( + currCode, + dst, + dstPos, + dstCap, + prevCode, + codeTab, + ref queue, + out firstByte, + out len + ); + if (status != UnshrnkStatus.Ok) + { + srcUsed = stream.BytesRead; + dstUsed = 0; + return status; + } + + // Verify that the output matches walking the prefixes. + var c = currCode; + for (int i = 0; i < len; i++) + { + // assert(codeTab[c].len == len - i); + //assert(codeTab[c].extByte == dst[dstPos + len - i - 1]); + c = codeTab[c].prefixCode; + } + + // Add a new code to the string table if there's room. + // The string is the previous code's string extended with the first byte of the current code's string. + newCode = CodeQueueRemoveNext(ref queue); + if (newCode != INVALID_CODE) + { + //assert(codeTab[prevCode].lastDstPos < dstPos); + codeTab[newCode].prefixCode = prevCode; + codeTab[newCode].extByte = firstByte; + codeTab[newCode].len = (ushort)(codeTab[prevCode].len + 1); + codeTab[newCode].lastDstPos = codeTab[prevCode].lastDstPos; + + if (codeTab[prevCode].prefixCode == INVALID_CODE) + { + // prevCode was invalidated in a partial clearing. Until that code is re-used, the + // string represented by newCode is indeterminate. 
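+                        // The real length is recovered lazily: the UNKNOWN_LEN branch in
+                        // OutputCode rebuilds the string from its prefix when the code is
+                        // next emitted.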
+                        codeTab[newCode].len = UNKNOWN_LEN;
+                    }
+                    // If prevCode was invalidated in a partial clearing, it's possible that newCode == prevCode,
+                    // in which case it will never be used or cleared.
+                }
+
+                codeTab[currCode].lastDstPos = dstPos;
+                dstPos += len;
+
+                prevCode = currCode;
+            }
+
+            srcUsed = stream.BytesRead;
+            dstUsed = dstPos;
+
+            return UnshrnkStatus.Ok;
+        }
+
+        public enum UnshrnkStatus
+        {
+            Ok,
+            Full,
+            Error
+        }
+
+        private struct CodeQueue
+        {
+            public int nextIdx;
+            public ushort[] codes;
+        }
+
+        private static void CodeQueueInit(ref CodeQueue q)
+        {
+            int codeQueueSize;
+            ushort code;
+
+            codeQueueSize = 0;
+            q.codes = new ushort[MAX_CODE - CONTROL_CODE + 2];
+
+            for (code = CONTROL_CODE + 1; code <= MAX_CODE; code++)
+            {
+                q.codes[codeQueueSize++] = code;
+            }
+
+            //assert(codeQueueSize < q.codes.Length);
+            q.codes[codeQueueSize] = INVALID_CODE; // End-of-queue marker.
+            q.nextIdx = 0;
+        }
+
+        private static ushort CodeQueueNext(ref CodeQueue q)
+        {
+            //assert(q.nextIdx < q.codes.Length);
+            return q.codes[q.nextIdx];
+        }
+
+        private static ushort CodeQueueRemoveNext(ref CodeQueue q)
+        {
+            ushort code = CodeQueueNext(ref q);
+            if (code != INVALID_CODE)
+            {
+                q.nextIdx++;
+            }
+            return code;
+        }
+    }
+}
diff --git a/src/SharpCompress/Compressors/Shrink/ShrinkStream.cs b/src/SharpCompress/Compressors/Shrink/ShrinkStream.cs
new file mode 100644
index 00000000..52bba1ad
--- /dev/null
+++ b/src/SharpCompress/Compressors/Shrink/ShrinkStream.cs
@@ -0,0 +1,89 @@
+using System;
+using System.IO;
+
+namespace SharpCompress.Compressors.Shrink;
+
+internal class ShrinkStream : Stream
+{
+    private readonly Stream inStream;
+    private readonly CompressionMode _compressionMode;
+
+    private readonly ulong _compressedSize;
+    private readonly long _uncompressedSize;
+    private readonly byte[] _byteOut;
+    private long _outBytesCount;
+    private bool _decompressed;
+
+    public ShrinkStream(
+        Stream stream,
+        CompressionMode compressionMode,
+        long compressedSize,
+        long uncompressedSize
+    )
+    {
+        inStream = stream;
+        _compressionMode = compressionMode;
+
+        _compressedSize = (ulong)compressedSize;
+        _uncompressedSize = uncompressedSize;
+        _byteOut = new byte[_uncompressedSize];
+        _outBytesCount = 0L;
+    }
+
+    public override bool CanRead => true;
+
+    public override bool CanSeek => false;
+
+    public override bool CanWrite => false;
+
+    public override long Length => _uncompressedSize;
+
+    public override long Position
+    {
+        get => _outBytesCount;
+        set => throw new NotSupportedException();
+    }
+
+    // Read-only stream: there is never anything to flush.
+    public override void Flush() { }
+
+    public override int Read(byte[] buffer, int offset, int count)
+    {
+        if (!_decompressed)
+        {
+            // Decompress the whole entry up front; Stream.Read may return fewer bytes
+            // than requested, so loop until the compressed data is fully consumed.
+            var src = new byte[_compressedSize];
+            var total = 0;
+            while (total < src.Length)
+            {
+                var read = inStream.Read(src, total, src.Length - total);
+                if (read == 0)
+                {
+                    break;
+                }
+                total += read;
+            }
+
+            HwUnshrink.Unshrink(src, total, out _, _byteOut, (int)_uncompressedSize, out _);
+            _decompressed = true;
+        }
+
+        // Serve the buffered output in chunks so callers with small buffers still work.
+        var remaining = (int)(_byteOut.Length - _outBytesCount);
+        if (remaining <= 0)
+        {
+            return 0;
+        }
+        var toCopy = Math.Min(count, remaining);
+        Buffer.BlockCopy(_byteOut, (int)_outBytesCount, buffer, offset, toCopy);
+        _outBytesCount += toCopy;
+        return toCopy;
+    }
+
+    public override long Seek(long offset, SeekOrigin origin) =>
+        throw new NotSupportedException();
+
+    public override void SetLength(long value) => throw new NotSupportedException();
+
+    public override void Write(byte[] buffer, int offset, int count) =>
+        throw new NotSupportedException();
+} diff --git a/tests/SharpCompress.Test/Zip/ZipArchiveTests.cs b/tests/SharpCompress.Test/Zip/ZipArchiveTests.cs index fde87dc6..ebcb4243 100644 --- a/tests/SharpCompress.Test/Zip/ZipArchiveTests.cs +++ b/tests/SharpCompress.Test/Zip/ZipArchiveTests.cs @@ -16,6 +16,14 @@ public class ZipArchiveTests : ArchiveTests { public ZipArchiveTests() => UseExtensionInsteadOfNameToVerify = true; + [Fact] + public void Zip_Shrink_ArchiveStreamRead() + { + UseExtensionInsteadOfNameToVerify = true; + UseCaseInsensitiveToVerify = true; + ArchiveStreamRead("Zip.shrink.zip"); + } + [Fact] public void Zip_ZipX_ArchiveStreamRead() => ArchiveStreamRead("Zip.zipx"); diff --git a/tests/TestArchives/Archives/Zip.shrink.zip b/tests/TestArchives/Archives/Zip.shrink.zip new file mode 100644 index 00000000..697c7e00 Binary files /dev/null and b/tests/TestArchives/Archives/Zip.shrink.zip differ
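Usage sketch (illustrative, not part of the patch): reading a shrink-compressed zip
through the existing public archive API, following the documented SharpCompress
extraction pattern. The archive and output paths below are placeholders.

    using System.Linq;
    using SharpCompress.Archives;
    using SharpCompress.Archives.Zip;
    using SharpCompress.Common;

    using var archive = ZipArchive.Open("Zip.shrink.zip");
    foreach (var entry in archive.Entries.Where(e => !e.IsDirectory))
    {
        // With this change, method-1 entries report CompressionType.Shrink
        // and decompress through ShrinkStream like any other entry.
        entry.WriteToDirectory(
            "extracted",
            new ExtractionOptions { ExtractFullPath = true, Overwrite = true }
        );
    }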