diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 073580d40c..83227ff079 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -3,6 +3,7 @@ #if SUPPORTS_RUNTIME_INTRINSICS using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; @@ -38,7 +39,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0, 1, 4, 5, 2, 3, 6, 7 }; - private static unsafe void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); @@ -53,8 +54,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components for (int i = 0; i < 8; i += 2) { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); Vector256 row = Avx2.PackSignedSaturate(row0, row1); row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16(); @@ -64,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } - private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); @@ -75,13 +76,81 
@@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components for (int i = 0; i < 16; i += 2) { - Vector128 left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector128 right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); Vector128 row = Sse2.PackSignedSaturate(left, right); Unsafe.Add(ref destBase, i / 2) = row; } } + + private void TransposeAvx() + { + // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 + Vector256 r0 = Avx.InsertVector128( + this.V0, + Unsafe.As>(ref this.V4L), + 1); + + Vector256 r1 = Avx.InsertVector128( + this.V1, + Unsafe.As>(ref this.V5L), + 1); + + Vector256 r2 = Avx.InsertVector128( + this.V2, + Unsafe.As>(ref this.V6L), + 1); + + Vector256 r3 = Avx.InsertVector128( + this.V3, + Unsafe.As>(ref this.V7L), + 1); + + Vector256 r4 = Avx.InsertVector128( + Unsafe.As>(ref this.V0R).ToVector256(), + Unsafe.As>(ref this.V4R), + 1); + + Vector256 r5 = Avx.InsertVector128( + Unsafe.As>(ref this.V1R).ToVector256(), + Unsafe.As>(ref this.V5R), + 1); + + Vector256 r6 = Avx.InsertVector128( + Unsafe.As>(ref this.V2R).ToVector256(), + Unsafe.As>(ref this.V6R), + 1); + + Vector256 r7 = Avx.InsertVector128( + Unsafe.As>(ref this.V3R).ToVector256(), + Unsafe.As>(ref this.V7R), + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackLow(r2, r3); + Vector256 v = Avx.Shuffle(t0, t2, 0x4E); + this.V0 = Avx.Blend(t0, v, 0xCC); + this.V1 = Avx.Blend(t2, v, 0x33); + + Vector256 t4 = Avx.UnpackLow(r4, r5); + Vector256 t6 = Avx.UnpackLow(r6, r7); + v = Avx.Shuffle(t4, t6, 0x4E); + this.V4 = Avx.Blend(t4, v, 0xCC); + this.V5 = Avx.Blend(t6, v, 
0x33); + + Vector256 t1 = Avx.UnpackHigh(r0, r1); + Vector256 t3 = Avx.UnpackHigh(r2, r3); + v = Avx.Shuffle(t1, t3, 0x4E); + this.V2 = Avx.Blend(t1, v, 0xCC); + this.V3 = Avx.Blend(t3, v, 0x33); + + Vector256 t5 = Avx.UnpackHigh(r4, r5); + Vector256 t7 = Avx.UnpackHigh(r6, r7); + v = Avx.Shuffle(t5, t7, 0x4E); + this.V6 = Avx.Blend(t5, v, 0xCC); + this.V7 = Avx.Blend(t7, v, 0x33); + } } } #endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 2656f07cab..0b78735852 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -413,41 +413,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { - DivideIntoInt16_Avx2(ref block, ref qt, ref dest); + MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest); } else if (Ssse3.IsSupported) { - DivideIntoInt16_Sse2(ref block, ref qt, ref dest); + MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest); ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest); } else #endif { - Divide(ref block, ref qt); + Multiply(ref block, ref qt); block.RoundInto(ref dest); } } [MethodImpl(InliningOptions.ShortMethod)] - private static void Divide(ref Block8x8F a, ref Block8x8F b) - { - a.V0L /= b.V0L; - a.V0R /= b.V0R; - a.V1L /= b.V1L; - a.V1R /= b.V1R; - a.V2L /= b.V2L; - a.V2R /= b.V2R; - a.V3L /= b.V3L; - a.V3R /= b.V3R; - a.V4L /= b.V4L; - a.V4R /= b.V4R; - a.V5L /= b.V5L; - a.V5R /= b.V5R; - a.V6L /= b.V6L; - a.V6R /= b.V6R; - a.V7L /= b.V7L; - a.V7R /= b.V7R; + private static void Multiply(ref Block8x8F a, ref Block8x8F b) + { + a.V0L *= b.V0L; + a.V0R *= b.V0R; + a.V1L *= b.V1L; + a.V1R *= b.V1R; + a.V2L *= b.V2L; + a.V2R *= b.V2R; + a.V3L *= b.V3L; + a.V3R *= b.V3R; + a.V4L *= b.V4L; + a.V4R *= b.V4R; + a.V5L *= b.V5L; + a.V5R *= b.V5R; + a.V6L *= b.V6L; + a.V6R *= b.V6R; + a.V7L *= b.V7L; + 
a.V7R *= b.V7R; } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -608,154 +608,45 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } /// - /// Transpose the block into the destination block. + /// Transpose the block inplace. /// - /// The destination block [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInto(ref Block8x8F d) + public void Transpose() { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref this.V0L).ToVector256(), - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref this.V1L).ToVector256(), - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref this.V2L).ToVector256(), - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref this.V3L).ToVector256(), - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); - - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackLow(r2, r3); - Vector256 v = Avx.Shuffle(t0, t2, 0x4E); - d.V0 = Avx.Blend(t0, v, 0xCC); - d.V1 = Avx.Blend(t2, v, 0x33); - - Vector256 t4 = Avx.UnpackLow(r4, r5); - Vector256 t6 = Avx.UnpackLow(r6, r7); - v = Avx.Shuffle(t4, t6, 0x4E); - d.V4 = Avx.Blend(t4, v, 0xCC); - d.V5 = Avx.Blend(t6, v, 0x33); - - Vector256 t1 = Avx.UnpackHigh(r0, r1); - Vector256 t3 = Avx.UnpackHigh(r2, r3); - v = Avx.Shuffle(t1, t3, 0x4E); - d.V2 = Avx.Blend(t1, v, 0xCC); - 
d.V3 = Avx.Blend(t3, v, 0x33); - - Vector256 t5 = Avx.UnpackHigh(r4, r5); - Vector256 t7 = Avx.UnpackHigh(r6, r7); - v = Avx.Shuffle(t5, t7, 0x4E); - d.V6 = Avx.Blend(t5, v, 0xCC); - d.V7 = Avx.Blend(t7, v, 0x33); + this.TransposeAvx(); } else #endif { - d.V0L.X = this.V0L.X; - d.V1L.X = this.V0L.Y; - d.V2L.X = this.V0L.Z; - d.V3L.X = this.V0L.W; - d.V4L.X = this.V0R.X; - d.V5L.X = this.V0R.Y; - d.V6L.X = this.V0R.Z; - d.V7L.X = this.V0R.W; - - d.V0L.Y = this.V1L.X; - d.V1L.Y = this.V1L.Y; - d.V2L.Y = this.V1L.Z; - d.V3L.Y = this.V1L.W; - d.V4L.Y = this.V1R.X; - d.V5L.Y = this.V1R.Y; - d.V6L.Y = this.V1R.Z; - d.V7L.Y = this.V1R.W; - - d.V0L.Z = this.V2L.X; - d.V1L.Z = this.V2L.Y; - d.V2L.Z = this.V2L.Z; - d.V3L.Z = this.V2L.W; - d.V4L.Z = this.V2R.X; - d.V5L.Z = this.V2R.Y; - d.V6L.Z = this.V2R.Z; - d.V7L.Z = this.V2R.W; - - d.V0L.W = this.V3L.X; - d.V1L.W = this.V3L.Y; - d.V2L.W = this.V3L.Z; - d.V3L.W = this.V3L.W; - d.V4L.W = this.V3R.X; - d.V5L.W = this.V3R.Y; - d.V6L.W = this.V3R.Z; - d.V7L.W = this.V3R.W; - - d.V0R.X = this.V4L.X; - d.V1R.X = this.V4L.Y; - d.V2R.X = this.V4L.Z; - d.V3R.X = this.V4L.W; - d.V4R.X = this.V4R.X; - d.V5R.X = this.V4R.Y; - d.V6R.X = this.V4R.Z; - d.V7R.X = this.V4R.W; - - d.V0R.Y = this.V5L.X; - d.V1R.Y = this.V5L.Y; - d.V2R.Y = this.V5L.Z; - d.V3R.Y = this.V5L.W; - d.V4R.Y = this.V5R.X; - d.V5R.Y = this.V5R.Y; - d.V6R.Y = this.V5R.Z; - d.V7R.Y = this.V5R.W; - - d.V0R.Z = this.V6L.X; - d.V1R.Z = this.V6L.Y; - d.V2R.Z = this.V6L.Z; - d.V3R.Z = this.V6L.W; - d.V4R.Z = this.V6R.X; - d.V5R.Z = this.V6R.Y; - d.V6R.Z = this.V6R.Z; - d.V7R.Z = this.V6R.W; - - d.V0R.W = this.V7L.X; - d.V1R.W = this.V7L.Y; - d.V2R.W = this.V7L.Z; - d.V3R.W = this.V7L.W; - d.V4R.W = this.V7R.X; - d.V5R.W = this.V7R.Y; - d.V6R.W = this.V7R.Z; - d.V7R.W = this.V7R.W; + this.TransposeScalar(); + } + } + + /// + /// Scalar inplace transpose implementation for + /// + [MethodImpl(InliningOptions.ShortMethod)] + private void TransposeScalar() + { + float tmp; + 
int horIndex, verIndex; + + // We don't care about the last row as it consists of a single element + // Which won't be swapped with anything + for (int i = 0; i < 7; i++) + { + // We don't care about the first element in each row as it's not swapped + for (int j = i + 1; j < 8; j++) + { + horIndex = (i * 8) + j; + verIndex = (j * 8) + i; + tmp = this[horIndex]; + this[horIndex] = this[verIndex]; + this[verIndex] = tmp; + } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs index cf5fdd2dfb..085cd4a291 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs @@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder // Dequantize: block.MultiplyInPlace(ref this.DequantiazationTable); - FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock); + FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock); // To conform better to libjpeg we actually NEED TO loose precision here. // This is because they store blocks as Int16 between all the operations. diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index db0bc32ae5..da4723e217 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private int bitCount; - private Block8x8F temporalBlock; - private Block8x8 temporalShortBlock; + private Block8x8 tempBlock; /// /// The output stream. All attempted writes after the first error become no-ops. 
@@ -130,6 +129,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode444(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -190,6 +196,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -256,6 +269,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void EncodeGrayscale(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -301,6 +320,12 @@ namespace 
SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -365,14 +390,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref Block8x8F block, ref Block8x8F quant) { - ref Block8x8F refTemp = ref this.temporalBlock; - ref Block8x8 spectralBlock = ref this.temporalShortBlock; + ref Block8x8 spectralBlock = ref this.tempBlock; // Shifting level from 0..255 to -128..127 block.AddInPlace(-128f); // Discrete cosine transform - FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp); + FastFloatingPointDCT.TransformFDCT(ref block); // Quantization Block8x8F.Quantize(ref block, ref spectralBlock, ref quant); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs new file mode 100644 index 0000000000..eb60445d3f --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -0,0 +1,210 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Collections.Generic; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Text; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components +{ + internal static partial class FastFloatingPointDCT + { + /// + /// Gets reciprocal coefficients for jpeg quantization tables calculation. 
+ /// + /// + /// Current FDCT implementation expects its results to be multiplied by + /// a reciprocal quantization table. Values in this table must be divided + /// by quantization table values scaled with quality settings. + /// + /// + /// These values were calculated with this formula: + /// + /// value[row * 8 + col] = 1 / (scalefactor[row] * scalefactor[col] * 8); + /// + /// Where: + /// + /// scalefactor[0] = 1 + /// + /// + /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + /// + /// Values are also scaled by 8 so DCT code won't do unnecessary division. + /// + /// + public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] + { + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, + 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, + 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, + 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, + 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + }; + +#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); + private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); + private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); + + private static readonly Vector128 mm128_F_0_7071 = 
Vector128.Create(0.707106781f); + private static readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); + private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); + private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); +#pragma warning restore SA1310, SA1311, IDE1006 + + /// + /// Apply floating point FDCT inplace using simd operations. + /// + /// Input matrix. + private static void ForwardTransformSimd(ref Block8x8F block) + { + DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation."); + + // First pass - process rows + block.Transpose(); + if (Avx.IsSupported) + { + FDCT8x8_avx(ref block); + } + else if (Sse.IsSupported) + { + // Left part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0L)); + + // Right part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0R)); + } + + // Second pass - process columns + block.Transpose(); + if (Avx.IsSupported) + { + FDCT8x8_avx(ref block); + } + else if (Sse.IsSupported) + { + // Left part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0L)); + + // Right part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0R)); + } + } + + /// + /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix. + /// + /// + /// Requires Sse support. + /// Must be called on both 8x4 matrix parts for the full FDCT transform. 
+ /// + /// Input reference to the first + public static void FDCT8x4_sse(ref Vector128 blockRef) + { + DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation."); + + Vector128 tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); + Vector128 tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); + Vector128 tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); + Vector128 tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); + Vector128 tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); + Vector128 tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); + Vector128 tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); + Vector128 tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); + + // Even part + Vector128 tmp10 = Sse.Add(tmp0, tmp3); + Vector128 tmp13 = Sse.Subtract(tmp0, tmp3); + Vector128 tmp11 = Sse.Add(tmp1, tmp2); + Vector128 tmp12 = Sse.Subtract(tmp1, tmp2); + + Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11); + Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11); + + Vector128 z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071); + Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1); + Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1); + + // Odd part + tmp10 = Sse.Add(tmp4, tmp5); + tmp11 = Sse.Add(tmp5, tmp6); + tmp12 = Sse.Add(tmp6, tmp7); + + Vector128 z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826); + Vector128 z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5); + Vector128 z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5); + Vector128 z3 = Sse.Multiply(tmp11, mm128_F_0_7071); + + Vector128 z11 = Sse.Add(tmp7, z3); + Vector128 z13 = Sse.Subtract(tmp7, z3); + + Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2); + Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2); + Unsafe.Add(ref blockRef, 2) = 
Sse.Add(z11, z4); + Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4); + } + + /// + /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix. + /// + /// + /// Requires Avx support. + /// + /// Input matrix. + public static void FDCT8x8_avx(ref Block8x8F block) + { + DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); + + Vector256 tmp0 = Avx.Add(block.V0, block.V7); + Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); + Vector256 tmp1 = Avx.Add(block.V1, block.V6); + Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); + Vector256 tmp2 = Avx.Add(block.V2, block.V5); + Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); + Vector256 tmp3 = Avx.Add(block.V3, block.V4); + Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); + + // Even part + Vector256 tmp10 = Avx.Add(tmp0, tmp3); + Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); + Vector256 tmp11 = Avx.Add(tmp1, tmp2); + Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); + + block.V0 = Avx.Add(tmp10, tmp11); + block.V4 = Avx.Subtract(tmp10, tmp11); + + Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); + block.V2 = Avx.Add(tmp13, z1); + block.V6 = Avx.Subtract(tmp13, z1); + + // Odd part + tmp10 = Avx.Add(tmp4, tmp5); + tmp11 = Avx.Add(tmp5, tmp6); + tmp12 = Avx.Add(tmp6, tmp7); + + Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); + Vector256 z2 = Avx.Add(Avx.Multiply(mm256_F_0_5411, tmp10), z5); + Vector256 z4 = Avx.Add(Avx.Multiply(mm256_F_1_3065, tmp12), z5); + Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); + + Vector256 z11 = Avx.Add(tmp7, z3); + Vector256 z13 = Avx.Subtract(tmp7, z3); + + block.V5 = Avx.Add(z13, z2); + block.V3 = Avx.Subtract(z13, z2); + block.V1 = Avx.Add(z11, z4); + block.V7 = Avx.Subtract(z11, z4); + } + } +} +#endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index dd46a83e3b..a554e8577b 100644 --- 
a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -46,11 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components #if SUPPORTS_RUNTIME_INTRINSICS private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f); - private static readonly Vector256 C_V_1_3065 = Vector256.Create(1.306563f); private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f); - private static readonly Vector256 C_V_0_7856 = Vector256.Create(0.785695f); - private static readonly Vector256 C_V_1_3870 = Vector256.Create(1.387040f); - private static readonly Vector256 C_V_0_2758 = Vector256.Create(0.275899f); private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f); private static readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f); @@ -62,250 +58,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f); private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f); private static readonly Vector256 C_V_0_7653 = Vector256.Create(0.765366865f); - - private static readonly Vector256 C_V_InvSqrt2 = Vector256.Create(0.707107f); #endif #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore - private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f); - - /// - /// Original: - /// - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15 - /// - /// - /// Source - /// Destination - public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 c0 = s.V0L; - Vector4 c1 = s.V7L; - Vector4 t0 = c0 + c1; - Vector4 t7 = c0 - c1; - - c1 = s.V6L; - c0 = s.V1L; - Vector4 t1 = c0 + c1; - Vector4 t6 = c0 - c1; - - c1 = s.V5L; - c0 = s.V2L; - Vector4 t2 = c0 + c1; - Vector4 t5 = c0 - c1; - - c0 = s.V3L; - c1 = s.V4L; - Vector4 t3 = c0 + c1; - Vector4 t4 = c0 - c1; - - c0 = t0 + t3; - Vector4 
c3 = t0 - t3; - c1 = t1 + t2; - Vector4 c2 = t1 - t2; - - d.V0L = c0 + c1; - d.V4L = c0 - c1; - - float w0 = 0.541196f; - float w1 = 1.306563f; - - d.V2L = (w0 * c2) + (w1 * c3); - d.V6L = (w0 * c3) - (w1 * c2); - - w0 = 1.175876f; - w1 = 0.785695f; - c3 = (w0 * t4) + (w1 * t7); - c0 = (w0 * t7) - (w1 * t4); - - w0 = 1.387040f; - w1 = 0.275899f; - c2 = (w0 * t5) + (w1 * t6); - c1 = (w0 * t6) - (w1 * t5); - - d.V3L = c0 - c2; - d.V5L = c3 - c1; - - float invsqrt2 = 0.707107f; - c0 = (c0 + c2) * invsqrt2; - c3 = (c3 + c1) * invsqrt2; - - d.V1L = c0 + c3; - d.V7L = c0 - c3; - } - - /// - /// Original: - /// - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15 - /// - /// - /// Source - /// Destination - public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 c0 = s.V0R; - Vector4 c1 = s.V7R; - Vector4 t0 = c0 + c1; - Vector4 t7 = c0 - c1; - - c1 = s.V6R; - c0 = s.V1R; - Vector4 t1 = c0 + c1; - Vector4 t6 = c0 - c1; - - c1 = s.V5R; - c0 = s.V2R; - Vector4 t2 = c0 + c1; - Vector4 t5 = c0 - c1; - - c0 = s.V3R; - c1 = s.V4R; - Vector4 t3 = c0 + c1; - Vector4 t4 = c0 - c1; - - c0 = t0 + t3; - Vector4 c3 = t0 - t3; - c1 = t1 + t2; - Vector4 c2 = t1 - t2; - - d.V0R = c0 + c1; - d.V4R = c0 - c1; - - float w0 = 0.541196f; - float w1 = 1.306563f; - - d.V2R = (w0 * c2) + (w1 * c3); - d.V6R = (w0 * c3) - (w1 * c2); - - w0 = 1.175876f; - w1 = 0.785695f; - c3 = (w0 * t4) + (w1 * t7); - c0 = (w0 * t7) - (w1 * t4); - - w0 = 1.387040f; - w1 = 0.275899f; - c2 = (w0 * t5) + (w1 * t6); - c1 = (w0 * t6) - (w1 * t5); - - d.V3R = c0 - c2; - d.V5R = c3 - c1; - - c0 = (c0 + c2) * InvSqrt2; - c3 = (c3 + c1) * InvSqrt2; - - d.V1R = c0 + c3; - d.V7R = c0 - c3; - } - - /// - /// Combined operation of and - /// using AVX commands. 
- /// - /// Source - /// Destination - public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 t0 = Avx.Add(s.V0, s.V7); - Vector256 t7 = Avx.Subtract(s.V0, s.V7); - Vector256 t1 = Avx.Add(s.V1, s.V6); - Vector256 t6 = Avx.Subtract(s.V1, s.V6); - Vector256 t2 = Avx.Add(s.V2, s.V5); - Vector256 t5 = Avx.Subtract(s.V2, s.V5); - Vector256 t3 = Avx.Add(s.V3, s.V4); - Vector256 t4 = Avx.Subtract(s.V3, s.V4); - - Vector256 c0 = Avx.Add(t0, t3); - Vector256 c1 = Avx.Add(t1, t2); - - // 0 4 - d.V0 = Avx.Add(c0, c1); - d.V4 = Avx.Subtract(c0, c1); - - Vector256 c3 = Avx.Subtract(t0, t3); - Vector256 c2 = Avx.Subtract(t1, t2); - - // 2 6 - d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065); - d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411); - - c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856); - c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758); - - c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6); - c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870); - - // 3 5 - d.V3 = Avx.Subtract(c0, c2); - d.V5 = Avx.Subtract(c3, c1); - - c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2); - c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2); - - // 1 7 - d.V1 = Avx.Add(c0, c3); - d.V7 = Avx.Subtract(c0, c3); -#endif - } - - /// - /// Performs 8x8 matrix Forward Discrete Cosine Transform - /// - /// Source - /// Destination - public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - FDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - FDCT8x4_LeftPart(ref s, ref d); - FDCT8x4_RightPart(ref s, ref d); - } - } - - /// - /// Apply floating point FDCT from src 
into dest - /// - /// Source - /// Destination - /// Temporary block provided by the caller for optimization - public static void TransformFDCT( - ref Block8x8F src, - ref Block8x8F dest, - ref Block8x8F temp) - { - src.TransposeInto(ref temp); - FDCT8x8(ref temp, ref dest); - - dest.TransposeInto(ref temp); - FDCT8x8(ref temp, ref dest); - - dest.MultiplyInPlace(C_0_125); - } - - /// - /// Apply floating point FDCT inplace. - /// - /// Input matrix. - /// Matrix to store temporal results. - public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp) - { - matrix.TransposeInto(ref temp); - FDCT8x8(ref temp, ref matrix); - - matrix.TransposeInto(ref temp); - FDCT8x8(ref temp, ref matrix); - - matrix.MultiplyInPlace(C_0_125); - } - /// /// Performs 8x8 matrix Inverse Discrete Cosine Transform /// @@ -501,40 +255,148 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } /// - /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization). - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239 + /// Apply floating point IDCT inplace. + /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. /// - /// Source - /// Destination - /// Temporary block provided by the caller - public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp) + /// Input matrix. + /// Matrix to store temporary results. + public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) { - src.TransposeInto(ref temp); - - IDCT8x8(ref temp, ref dest); - dest.TransposeInto(ref temp); - IDCT8x8(ref temp, ref dest); + block.Transpose(); + IDCT8x8(ref block, ref temp); + temp.Transpose(); + IDCT8x8(ref temp, ref block); // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? 
- dest.MultiplyInPlace(C_0_125); + block.MultiplyInPlace(C_0_125); } /// - /// Apply floating point IDCT inplace. - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. + /// Apply 2D floating point FDCT inplace using scalar operations. /// - /// Input matrix. - /// Matrix to store temporal results. - public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp) + /// + /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. + /// + /// Input matrix. + private static void ForwardTransformScalar(ref Block8x8F block) { - block.TransposeInto(ref temp); + const int dctSize = 8; - IDCT8x8(ref temp, ref block); - block.TransposeInto(ref temp); - IDCT8x8(ref temp, ref block); + float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + float tmp10, tmp11, tmp12, tmp13; + float z1, z2, z3, z4, z5, z11, z13; - // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? 
- block.MultiplyInPlace(C_0_125); + // First pass - process rows + ref float dataRef = ref Unsafe.As(ref block); + for (int ctr = 7; ctr >= 0; ctr--) + { + tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7); + tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7); + tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6); + tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6); + tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5); + tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5); + tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4); + tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4); + + // Even part + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + Unsafe.Add(ref dataRef, 2) = tmp13 + z1; + Unsafe.Add(ref dataRef, 6) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = (0.541196100f * tmp10) + z5; + z4 = (1.306562965f * tmp12) + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + Unsafe.Add(ref dataRef, 5) = z13 + z2; + Unsafe.Add(ref dataRef, 3) = z13 - z2; + Unsafe.Add(ref dataRef, 1) = z11 + z4; + Unsafe.Add(ref dataRef, 7) = z11 - z4; + + dataRef = ref Unsafe.Add(ref dataRef, dctSize); + } + + // Second pass - process columns + dataRef = ref Unsafe.As(ref block); + for (int ctr = 7; ctr >= 0; ctr--) + { + tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7); + tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7); + tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6); + tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6); + tmp2 = Unsafe.Add(ref dataRef, 
dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5); + tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5); + tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4); + tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4); + + // Even part + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11; + Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1; + Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = (0.541196100f * tmp10) + z5; + z4 = (1.306562965f * tmp12) + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2; + Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2; + Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4; + Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4; + + dataRef = ref Unsafe.Add(ref dataRef, 1); + } + } + + /// + /// Apply 2D floating point FDCT inplace. + /// + /// Input matrix. 
+ public static void TransformFDCT(ref Block8x8F block) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported || Sse.IsSupported) + { + ForwardTransformSimd(ref block); + } + else +#endif + { + ForwardTransformScalar(ref block); + } } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index 066eb28469..878a67b500 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -10,10 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal static partial class ZigZag { +#pragma warning disable SA1309 // naming rules violation warnings /// /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics. /// - private const byte Z = 0xff; + private const byte _ = 0xff; +#pragma warning restore SA1309 /// /// Gets shuffle vectors for @@ -22,82 +24,82 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static ReadOnlySpan SseShuffleMasks => new byte[] { // 0_A - 0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z, + 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, // 0_B - Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, + _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, // 0_C - Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, // 1_A - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11, + _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, // 1_B - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, // 1_C - 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, + 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, // 1_D - Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, // 1_E - Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, // 2_B - 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + 8, 9, 
_, _, _, _, _, _, _, _, _, _, _, _, _, _, // 2_C - Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, // 2_D - Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, // 2_E - Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, + _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, // 2_F - Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, + _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, // 2_G - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, // 3_A - Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, // 3_B - Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, + _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, // 3_C - Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, + _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, // 3_D/4_E - 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, + 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, // 4_F - Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, + _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, // 4_G - Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, + _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, // 4_H - Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, // 5_B - Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _, // 5_C - Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, // 5_D - 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z, + 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, // 5_E - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, // 5_F - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, // 5_G - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, + _, _, _, _, _, _, _, _, _, _, 
_, _, _, _, 6, 7, // 6_D - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, // 6_E - Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, + _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, // 6_F - Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, + _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, // 6_G - Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _, // 6_H - 4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, // 7_F - Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _, // 7_G - 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, + 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, // 7_H - Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15 + _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 }; /// @@ -110,55 +112,55 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, // 01_AB - inner-lane - 0, 1, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, 6, 7, + 0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7, // 01_CD/23_GH - cross-lane - 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z, + 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, // 01_CD - inner-lane - Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, // 01_EF - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, // 
23_AB/45_CD/67_EF - cross-lane - 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z, + 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, // 23_AB - inner-lane - 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, 2, 3, 8, 9, Z, Z, Z, Z, + 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _, // 23_CD - inner-lane - Z, Z, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 12, 13, + _, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13, // 23_EF - inner-lane - Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 23_GH - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 45_AB - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, // 45_CD - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, // 45_EF - cross-lane - 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, Z, Z, Z, Z, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, + 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, // 45_EF - inner-lane - 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, + 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, // 
45_GH - inner-lane - Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, + _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, // 67_CD - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 67_EF - inner-lane - Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, // 67_GH - inner-lane - 8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, 6, 7, 12, 13, 14, 15 + 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15 }; /// diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 1d103cd1a0..8e8787475a 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -12,15 +12,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations private static readonly Block8x8F Source = Create8x8FloatData(); [Benchmark] - public void TransposeInto() - { - var dest = default(Block8x8F); - Source.TransposeInto(ref dest); - } + public void TransposeInto() => Source.Transpose(); private static Block8x8F Create8x8FloatData() { - var result = new float[64]; + float[] result = new float[64]; for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index fc642dcc79..89ef74d8b7 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ 
b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -164,52 +164,27 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } [Fact] - public void TransposeInto() + public void Transpose() { static void RunTest() { float[] expected = Create8x8FloatData(); ReferenceImplementations.Transpose8x8(expected); - var source = default(Block8x8F); - source.LoadFrom(Create8x8FloatData()); + var block8x8 = default(Block8x8F); + block8x8.LoadFrom(Create8x8FloatData()); - var dest = default(Block8x8F); - source.TransposeInto(ref dest); + block8x8.Transpose(); float[] actual = new float[64]; - dest.ScaledCopyTo(actual); + block8x8.ScaledCopyTo(actual); Assert.Equal(expected, actual); } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX); - } - - private class BufferHolder - { - public Block8x8F Buffer; - } - - [Fact] - public void TransposeInto_Benchmark() - { - var source = new BufferHolder(); - source.Buffer.LoadFrom(Create8x8FloatData()); - var dest = new BufferHolder(); - - this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark X {Times} ..."); - var sw = Stopwatch.StartNew(); - - for (int i = 0; i < Times; i++) - { - source.Buffer.TransposeInto(ref dest.Buffer); - } - - sw.Stop(); - this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms"); + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); } private static float[] Create8x8ColorCropTestData() @@ -281,16 +256,21 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed); + // Reference implementation quantizes given block via division Block8x8 expected = default; ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder); + // Actual current implementation quantizes given block via multiplication + // With quantization table reciprocal + 
for (int i = 0; i < Block8x8F.Size; i++) + { + quant[i] = 1f / quant[i]; + } + Block8x8 actual = default; Block8x8F.Quantize(ref source, ref actual, ref quant); - for (int i = 0; i < Block8x8.Size; i++) - { - Assert.Equal(expected[i], actual[i]); - } + this.CompareBlocks(expected, actual, 1); } [Fact] diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 34ca7f9eb9..55d208c5af 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -2,6 +2,9 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics.X86; #endif @@ -33,15 +36,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); - var source = Block8x8F.Load(sourceArray); + var srcBlock = Block8x8F.Load(sourceArray); - Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source); + Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock); var temp = default(Block8x8F); - var actual = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); - this.CompareBlocks(expected, actual, 1f); + this.CompareBlocks(expected, srcBlock, 1f); } [Theory] @@ -52,15 +54,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); - var source = Block8x8F.Load(sourceArray); + var srcBlock = Block8x8F.Load(sourceArray); - Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source); + Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock); var temp = default(Block8x8F); - var actual = 
default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); - this.CompareBlocks(expected, actual, 1f); + this.CompareBlocks(expected, srcBlock, 1f); } // Inverse transform @@ -167,8 +168,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg var srcBlock = default(Block8x8F); srcBlock.LoadFrom(src); - var destBlock = default(Block8x8F); - var expectedDest = new float[64]; var temp1 = new float[64]; var temp2 = default(Block8x8F); @@ -177,10 +176,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1); // testee - FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2); var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); + srcBlock.ScaledCopyTo(actualDest); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } @@ -198,95 +197,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } // Forward transform - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x4_LeftPart(int seed) - { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); - - // testee - FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x4_RightPart(int seed) - { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = 
default(Block8x8F); - - var expectedDest = new float[64]; - - // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - - // testee - FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x8_Avx(int seed) - { -#if SUPPORTS_RUNTIME_INTRINSICS - var skip = !Avx.IsSupported; -#else - var skip = true; -#endif - if (skip) - { - this.Output.WriteLine("No AVX present, skipping test!"); - return; - } - - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference, left part - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); - - // reference, right part - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - - // testee, whole 8x8 - FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - + // This test covers entire FDCT conversions chain + // This test checks all implementations: intrinsic and scalar fallback [Theory] [InlineData(1)] [InlineData(2)] @@ -297,37 +209,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int seed = FeatureTestRunner.Deserialize(serialized); Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); + var block = default(Block8x8F); + block.LoadFrom(src); - var expectedDest = new float[64]; - var temp1 = new float[64]; - var temp2 = 
default(Block8x8F); + float[] expectedDest = new float[64]; + float[] temp1 = new float[64]; // reference ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); // testee - FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2); + // Part of the FDCT calculations is fused into the quantization step + // We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen + FastFloatingPointDCT.TransformFDCT(ref block); + for (int i = 0; i < 64; i++) + { + block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i]; + } - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); + float[] actualDest = block.ToArray(); - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); + Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f)); } // 3 paths: // 1. AllowAll - call avx/fma implementation // 2. DisableFMA - call avx implementation without fma acceleration - // 3. DisableAvx - call fallback code of Vector4 implementation - // - // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result + // 3. DisableAvx - call sse implementation + // 4. DisableHWIntrinsic - call scalar fallback implementation FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX); + HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); } } }