diff --git a/Codecs/Image/Jpeg/Classes/Block.cs b/Codecs/Image/Jpeg/Classes/Block.cs index b5acb611..5a0cff10 100644 --- a/Codecs/Image/Jpeg/Classes/Block.cs +++ b/Codecs/Image/Jpeg/Classes/Block.cs @@ -1,12 +1,10 @@ -using Media.Codec.Jpeg; -using Media.Common; +using Media.Common; using System; -using System.Drawing; using System.Numerics; -using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using System.Text; namespace Media.Codec.Jpeg.Classes; @@ -20,10 +18,194 @@ namespace Media.Codec.Jpeg.Classes; /// internal class Block : MemorySegment { + /// + /// By default, how many coefficients are in a block. + /// public const int DefaultSize = JpegCodec.BlockSize * JpegCodec.BlockSize; + /// + /// Gets a value indicating whether code is being JIT-ed to AVX2 instructions + /// where both float and integer registers are of size 256 byte. + /// + public static bool HasVector8 { get; } = + Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; + #region Static Functions + /// + /// Transform all scalars in 'v' in a way that converting them to would have rounding semantics. + /// + /// The vector + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector4 PseudoRound(Vector4 v) + { + Vector4 sign = Vector4.Clamp(v, new Vector4(-1), new Vector4(1)); + + return v + (sign * 0.5f); + } + + /// + /// Rounds all values in 'v' to the nearest integer following semantics. + /// Source: + /// + /// https://github.com/g-truc/glm/blob/master/glm/simd/common.h#L110 + /// + /// + /// The vector + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector FastRound(Vector v) + { + if (Avx2.IsSupported) + { + ref Vector256 v256 = ref Unsafe.As, Vector256>(ref v); + Vector256 vRound = Avx.RoundToNearestInteger(v256); + return Unsafe.As, Vector>(ref vRound); + } + else + { + var magic0 = new Vector(int.MinValue); // 0x80000000 + var sgn0 = Vector.AsVectorSingle(magic0); + var and0 = Vector.BitwiseAnd(sgn0, v); + var or0 = Vector.BitwiseOr(and0, new Vector(8388608.0f)); + var add0 = Vector.Add(v, or0); + return Vector.Subtract(add0, or0); + } + } + + /// + /// + /// + /// + /// + /// + /// + private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) + { + row += off; + row = Vector.Max(row, Vector.Zero); + row = Vector.Min(row, max); + return FastRound(row); + } + + private static void MultiplyIntoInt16_Avx2(Block a, Block b, Block dest) + { + Vector256 aBase = a.V0f; + Vector256 bBase = b.V0f; + + Vector256 destRef = dest.V01; + Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); + + for (nuint i = 0; i < JpegCodec.BlockSize; i += 2) + { + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); + + Unsafe.Add(ref destRef, i / 2) = row; + } + } + + private static void MultiplyIntoInt16_Sse2(Block a, Block b, Block dest) + { + //Must redo as we can't use unsafe As with classes easily (without chaning the layout of the base class) + + ref Vector128 aBase = ref Unsafe.As>(ref a); + ref Vector128 bBase = ref Unsafe.As>(ref b); + + ref Vector128 destBase = ref Unsafe.As>(ref dest); + + for (nuint i = 0; i < 16; i += 2) + { + Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector128 row = Sse2.PackSignedSaturate(left, right); + Unsafe.Add(ref destBase, i / 2) = row; + } + } + + private void TransposeInplace_Avx() + { + // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 + var vector = V4L; + Vector256 r0 = Avx.InsertVector128( + V0f, + Unsafe.As>(ref vector), + 1); + + vector = V5L; + Vector256 r1 = Avx.InsertVector128( + V1f, + Unsafe.As>(ref vector), + 1); + + vector = V6L; + Vector256 r2 = Avx.InsertVector128( + V2f, + Unsafe.As>(ref vector), + 1); + + vector = V7L; + Vector256 r3 = Avx.InsertVector128( + V3f, + Unsafe.As>(ref vector), + 1); + + vector = V0R; + var right = V4R; + Vector256 r4 = Avx.InsertVector128( + Unsafe.As>(ref vector).ToVector256(), + Unsafe.As>(ref right), + 1); + + vector = V1R; + right = V5R; + Vector256 r5 = Avx.InsertVector128( + Unsafe.As>(ref vector).ToVector256(), + Unsafe.As>(ref right), + 1); + + vector = V2R; + right = V6R; + Vector256 r6 = Avx.InsertVector128( + Unsafe.As>(ref vector).ToVector256(), + Unsafe.As>(ref right), + 1); + + vector = V3R; + right = V7R; + Vector256 r7 = Avx.InsertVector128( + Unsafe.As>(ref vector).ToVector256(), + Unsafe.As>(ref right), + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackLow(r2, r3); + Vector256 v = Avx.Shuffle(t0, t2, 0x4E); + V0f = Avx.Blend(t0, v, 0xCC); + V1f = Avx.Blend(t2, v, 0x33); + + Vector256 t4 = Avx.UnpackLow(r4, r5); + Vector256 t6 = Avx.UnpackLow(r6, r7); + v = Avx.Shuffle(t4, t6, 0x4E); + V4f = Avx.Blend(t4, v, 0xCC); + V5f = Avx.Blend(t6, v, 0x33); + + Vector256 t1 = Avx.UnpackHigh(r0, r1); + Vector256 t3 = Avx.UnpackHigh(r2, r3); + v = Avx.Shuffle(t1, t3, 0x4E); + V2f = Avx.Blend(t1, v, 0xCC); + V3f = Avx.Blend(t3, v, 0x33); + + Vector256 t5 = Avx.UnpackHigh(r4, r5); + Vector256 t7 = Avx.UnpackHigh(r6, r7); + v = Avx.Shuffle(t5, t7, 0x4E); + V6f = Avx.Blend(t5, v, 0xCC); + V7f = Avx.Blend(t7, v, 0x33); + } + /// /// Calculate the total sum of absolute differences of elements in 'a' and 'b'. /// @@ -53,6 +235,19 @@ public static Block Load(Span data) return block; } + /// + /// Loads a block from a span of float data + /// + /// + /// + public static Block Load(Span data) + { + var block = new Block(); + var bytes = MemoryMarshal.Cast(data); + bytes.CopyTo(block.Array); + return block; + } + #endregion #region Constructor @@ -80,78 +275,176 @@ public Block(int coefficientCount) #region Vector Properties + #region Vector 256 + + public Vector256 V0f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset], value); + } + + public Vector256 V1f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset + 32]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset + 32], value); + } + + public Vector256 V2f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset + 64]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset + 64], value); + } + + public Vector256 V3f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset + 96]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset + 96], value); + } + + public Vector256 V4f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset + 128]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset + 128], value); + } + + public Vector256 V5f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset + 160]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset + 160], value); + } + + public Vector256 V6f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset + 192]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset + 192], value); + } + + public Vector256 V7f + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Unsafe.ReadUnaligned>(ref Array[Offset + 224]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set => Unsafe.WriteUnaligned(ref Array[Offset + 224], value); + } + + #endregion + + #region Vector 128 + public Vector128 V0 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset], value); } public Vector128 V1 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 16]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 16], value); } public Vector128 V2 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 32]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 32], value); } public Vector128 V3 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 48]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 48], value); } public Vector128 V4 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 64]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 64], value); } public Vector128 V5 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 80]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 80], value); } public Vector128 V6 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 96]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 96], value); } public Vector128 V7 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 112]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 112], value); } public Vector256 V01 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset], value); } public Vector256 V23 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 32]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 32], value); } public Vector256 V45 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 64]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 64], value); } public Vector256 V67 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Unsafe.ReadUnaligned>(ref Array[Offset + 96]); + [MethodImpl(MethodImplOptions.AggressiveInlining)] set => Unsafe.WriteUnaligned(ref Array[Offset + 96], value); } + #endregion + + #region Vector4 + public Vector4 V0L { get => new Vector4(GetFourFloats(0)); @@ -250,10 +543,18 @@ public Vector4 V7R #endregion + #endregion + #region Properties - + + /// + /// The length of the block in values + /// public int ShortLength => Count / Binary.BytesPerShort; + /// + /// The length of the block in values + /// public int FloatLength => Count / Binary.BytesPerInteger; #endregion @@ -343,6 +644,280 @@ public float this[uint index] #region Methods + public void NormalizeColorsAndRoundInPlaceVector8(float maximum) + { + var off = new Vector(MathF.Ceiling(maximum * 0.5F)); + var max = new Vector(maximum); + + var v0l = V0L; + ref Vector row0 = ref Unsafe.As>(ref v0l); + row0 = NormalizeAndRound(row0, off, max); + + var v1l = V1L; + ref Vector row1 = ref Unsafe.As>(ref v1l); + row1 = NormalizeAndRound(row1, off, max); + + var v2l = V2L; + ref Vector row2 = ref Unsafe.As>(ref v2l); + row2 = NormalizeAndRound(row2, off, max); + + var v3l = V3L; + ref Vector row3 = ref Unsafe.As>(ref v3l); + row3 = NormalizeAndRound(row3, off, max); + + var v4l = V4L; + ref Vector row4 = ref Unsafe.As>(ref v4l); + row4 = NormalizeAndRound(row4, off, max); + + var v5l = V5L; + ref Vector row5 = ref Unsafe.As>(ref v5l); + row5 = NormalizeAndRound(row5, off, max); + + var v6l = V6L; + ref Vector row6 = ref Unsafe.As>(ref v6l); + row6 = NormalizeAndRound(row6, off, max); + + var v7l = V7L; + ref Vector row7 = ref Unsafe.As>(ref v7l); + row7 = NormalizeAndRound(row7, off, max); + + } + + /// + /// Level shift by +maximum/2, clip to [0, maximum] + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void NormalizeColorsInPlace(float maximum) + { + var CMin4 = new Vector4(0F); + var CMax4 = new Vector4(maximum); + var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F)); + + V0L = Vector4.Clamp(V0L + COff4, CMin4, CMax4); + V0R = Vector4.Clamp(V0R + COff4, CMin4, CMax4); + V1L = Vector4.Clamp(V1L + COff4, CMin4, CMax4); + V1R = Vector4.Clamp(V1R + COff4, CMin4, CMax4); + V2L = Vector4.Clamp(V2L + COff4, CMin4, CMax4); + V2R = Vector4.Clamp(V2R + COff4, CMin4, CMax4); + V3L = Vector4.Clamp(V3L + COff4, CMin4, CMax4); + V3R = Vector4.Clamp(V3R + COff4, CMin4, CMax4); + V4L = Vector4.Clamp(V4L + COff4, CMin4, CMax4); + V4R = Vector4.Clamp(V4R + COff4, CMin4, CMax4); + V5L = Vector4.Clamp(V5L + COff4, CMin4, CMax4); + V5R = Vector4.Clamp(V5R + COff4, CMin4, CMax4); + V6L = Vector4.Clamp(V6L + COff4, CMin4, CMax4); + V6R = Vector4.Clamp(V6R + COff4, CMin4, CMax4); + V7L = Vector4.Clamp(V7L + COff4, CMin4, CMax4); + V7R = Vector4.Clamp(V7R + COff4, CMin4, CMax4); + } + + /// + /// Multiply all elements of the block. + /// + /// The value to multiply by. + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void MultiplyInPlace(float value) + { + if (Avx.IsSupported) + { + Vector256 valueVec = Vector256.Create(value); + V0f = Avx.Multiply(V0f, valueVec); + V1f = Avx.Multiply(V1f, valueVec); + V2f = Avx.Multiply(V2f, valueVec); + V3f = Avx.Multiply(V3f, valueVec); + V4f = Avx.Multiply(V4f, valueVec); + V5f = Avx.Multiply(V5f, valueVec); + V6f = Avx.Multiply(V6f, valueVec); + V7f = Avx.Multiply(V7f, valueVec); + } + else + { + Vector4 valueVec = new(value); + V0L *= valueVec; + V0R *= valueVec; + V1L *= valueVec; + V1R *= valueVec; + V2L *= valueVec; + V2R *= valueVec; + V3L *= valueVec; + V3R *= valueVec; + V4L *= valueVec; + V4R *= valueVec; + V5L *= valueVec; + V5R *= valueVec; + V6L *= valueVec; + V6R *= valueVec; + V7L *= valueVec; + V7R *= valueVec; + } + } + + /// + /// Multiply all elements of the block by the corresponding elements of 'other'. + /// + /// The other block. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void MultiplyInPlace(Block other) + { + if (Avx.IsSupported) + { + V0f = Avx.Multiply(V0f, other.V0f); + V1f = Avx.Multiply(V1f, other.V1f); + V2f = Avx.Multiply(V2f, other.V2f); + V3f = Avx.Multiply(V3f, other.V3f); + V4f = Avx.Multiply(V4f, other.V4f); + V5f = Avx.Multiply(V5f, other.V5f); + V6f = Avx.Multiply(V6f, other.V6f); + V7f = Avx.Multiply(V7f, other.V7f); + } + else + { + V0L *= other.V0L; + V0R *= other.V0R; + V1L *= other.V1L; + V1R *= other.V1R; + V2L *= other.V2L; + V2R *= other.V2R; + V3L *= other.V3L; + V3R *= other.V3R; + V4L *= other.V4L; + V4R *= other.V4R; + V5L *= other.V5L; + V5R *= other.V5R; + V6L *= other.V6L; + V6R *= other.V6R; + V7L *= other.V7L; + V7R *= other.V7R; + } + } + + /// + /// Adds a vector to all elements of the block. + /// + /// The added vector. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void AddInPlace(float value) + { + if (Avx.IsSupported) + { + Vector256 valueVec = Vector256.Create(value); + V0f = Avx.Add(V0f, valueVec); + V1f = Avx.Add(V1f, valueVec); + V2f = Avx.Add(V2f, valueVec); + V3f = Avx.Add(V3f, valueVec); + V4f = Avx.Add(V4f, valueVec); + V5f = Avx.Add(V5f, valueVec); + V6f = Avx.Add(V6f, valueVec); + V7f = Avx.Add(V7f, valueVec); + } + else + { + Vector4 valueVec = new(value); + V0L += valueVec; + V0R += valueVec; + V1L += valueVec; + V1R += valueVec; + V2L += valueVec; + V2R += valueVec; + V3L += valueVec; + V3R += valueVec; + V4L += valueVec; + V4R += valueVec; + V5L += valueVec; + V5R += valueVec; + V6L += valueVec; + V6R += valueVec; + V7L += valueVec; + V7R += valueVec; + } + } + + /// + /// Quantize input block, transpose, apply zig-zag ordering and store as . + /// + /// Source block. + /// Destination block. + /// The quantization table. + public static void Quantize(Block block, ref Block dest, ref Block qt) + { + if (Avx2.IsSupported) + { + MultiplyIntoInt16_Avx2(block, qt, dest); + //ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest); + } + else if (Ssse3.IsSupported) + { + MultiplyIntoInt16_Sse2(block, qt, dest); + //ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest); + } + else + { + for (int i = 0, e = block.FloatLength; i < e; i++) + { + //int idx = ZigZag.TransposingOrder[i]; + int idx = 0; + float quantizedVal = block[idx] * qt[idx]; + quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f; + dest[i] = (short)quantizedVal; + } + } + } + + public void RoundInto(Block dest) + { + for (int i = 0, e = FloatLength; i < e; i++) + { + float val = this[i]; + + if (val < 0) + { + val -= 0.5f; + } + else + { + val += 0.5f; + } + + dest[i] = (short)val; + } + } + + public Block RoundAsInt16Block() + { + Block result = new Block(); + RoundInto(result); + return result; + } + + /// + /// Level shift by +maximum/2, clip to [0..maximum], and round all the values in the block. + /// + /// The maximum value. + public void NormalizeColorsAndRoundInPlace(float maximum) + { + if (HasVector8) + { + NormalizeColorsAndRoundInPlaceVector8(maximum); + } + else + { + NormalizeColorsInPlace(maximum); + RoundInPlace(); + } + } + + /// + /// Rounds all values in the block. + /// + public void RoundInPlace() + { + for (uint i = 0, e = (uint)FloatLength; i < e; i++) + { + this[i] = MathF.Round(this[i]); + } + } + /// /// Gets a of length 4 in the block at the given index /// diff --git a/Codecs/Image/Jpeg/JpegUnitTests.cs b/Codecs/Image/Jpeg/JpegUnitTests.cs index 6d3a997d..3d600ccb 100644 --- a/Codecs/Image/Jpeg/JpegUnitTests.cs +++ b/Codecs/Image/Jpeg/JpegUnitTests.cs @@ -34,7 +34,6 @@ public static void TestBlockSetGetVector4Properties() throw new Exception("Indexer is not aligned"); } } - } public static void TestBlockTotalDifference() @@ -55,17 +54,6 @@ public static void TestBlockTotalDifference() { throw new Exception("TotalDifference calculation failed."); } - - block1 = new Block(); - block1.V0L = new Vector4(1, 2, 3, 4); - block2.V0L = new Vector4(1, 2, 3, 5); - - difference = Block.TotalDifference(ref block1, ref block2); - - if (difference != 1) - { - throw new Exception("TotalDifference calculation failed."); - } } public static void TestBlockLoad()