From 408462a4ac358abd10ce8d1f2d542bd811915992 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 09:09:18 +0300 Subject: [PATCH] Added new IDCT implementation --- .../FastFloatingPointDCT.Intrinsic.cs | 237 ++++---- .../Jpeg/Components/FastFloatingPointDCT.cs | 525 ++++++++---------- 2 files changed, 346 insertions(+), 416 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index ab9462632f..94864005ec 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -2,9 +2,6 @@ // Licensed under the Apache License, Version 2.0. #if SUPPORTS_RUNTIME_INTRINSICS -using System.Diagnostics; -using System.Numerics; -using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -12,149 +9,147 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal static partial class FastFloatingPointDCT { -#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings +#pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); - private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f); - private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f); - private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f); - private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f); - private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f); - private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f); - private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f); - private static readonly Vector256 mm256_F_3_0727 = Vector256.Create(3.072711026f); - private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f); - private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f); - private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); + private static readonly Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f); + private static readonly Vector256 mm256_F_1_8477 = Vector256.Create(1.847759065f); + private static readonly Vector256 mm256_F_n1_0823 = Vector256.Create(-1.082392200f); + private static readonly Vector256 mm256_F_n2_6131 = Vector256.Create(-2.613125930f); #pragma warning restore SA1310, SA1311, IDE1006 /// /// Apply floating point FDCT inplace using simd operations. /// - /// Input matrix. - private static void ForwardTransform_Avx(ref Block8x8F block) + /// Input block. + private static void FDCT8x8_Avx(ref Block8x8F block) { DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); // First pass - process rows block.TransposeInplace(); - FDCT8x8_Avx(ref block); + FDCT8x8_1D_Avx(ref block); // Second pass - process columns block.TransposeInplace(); - FDCT8x8_Avx(ref block); + FDCT8x8_1D_Avx(ref block); + + // Applies 1D floating point FDCT inplace + static void FDCT8x8_1D_Avx(ref Block8x8F block) + { + Vector256 tmp0 = Avx.Add(block.V0, block.V7); + Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); + Vector256 tmp1 = Avx.Add(block.V1, block.V6); + Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); + Vector256 tmp2 = Avx.Add(block.V2, block.V5); + Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); + Vector256 tmp3 = Avx.Add(block.V3, block.V4); + Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); + + // Even part + Vector256 tmp10 = Avx.Add(tmp0, tmp3); + Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); + Vector256 tmp11 = Avx.Add(tmp1, tmp2); + Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); + + block.V0 = Avx.Add(tmp10, tmp11); + block.V4 = Avx.Subtract(tmp10, tmp11); + + Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); + block.V2 = Avx.Add(tmp13, z1); + block.V6 = Avx.Subtract(tmp13, z1); + + // Odd part + tmp10 = Avx.Add(tmp4, tmp5); + tmp11 = Avx.Add(tmp5, tmp6); + tmp12 = Avx.Add(tmp6, tmp7); + + Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); + Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10); + Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12); + Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); + + Vector256 z11 = Avx.Add(tmp7, z3); + Vector256 z13 = Avx.Subtract(tmp7, z3); + + block.V5 = Avx.Add(z13, z2); + block.V3 = Avx.Subtract(z13, z2); + block.V1 = Avx.Add(z11, z4); + block.V7 = Avx.Subtract(z11, z4); + } } /// - /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix. + /// Apply floating point IDCT inplace using simd operations. /// - /// - /// Requires Avx support. - /// - /// Input matrix. - public static void FDCT8x8_Avx(ref Block8x8F block) + /// Transposed input block. + private static void IDCT8x8_Avx(ref Block8x8F transposedBlock) { DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - Vector256 tmp0 = Avx.Add(block.V0, block.V7); - Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); - Vector256 tmp1 = Avx.Add(block.V1, block.V6); - Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); - Vector256 tmp2 = Avx.Add(block.V2, block.V5); - Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); - Vector256 tmp3 = Avx.Add(block.V3, block.V4); - Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); - - // Even part - Vector256 tmp10 = Avx.Add(tmp0, tmp3); - Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); - Vector256 tmp11 = Avx.Add(tmp1, tmp2); - Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); - - block.V0 = Avx.Add(tmp10, tmp11); - block.V4 = Avx.Subtract(tmp10, tmp11); - - Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); - block.V2 = Avx.Add(tmp13, z1); - block.V6 = Avx.Subtract(tmp13, z1); - - // Odd part - tmp10 = Avx.Add(tmp4, tmp5); - tmp11 = Avx.Add(tmp5, tmp6); - tmp12 = Avx.Add(tmp6, tmp7); - - Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); - Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10); - Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12); - Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); - - Vector256 z11 = Avx.Add(tmp7, z3); - Vector256 z13 = Avx.Subtract(tmp7, z3); - - block.V5 = Avx.Add(z13, z2); - block.V3 = Avx.Subtract(z13, z2); - block.V1 = Avx.Add(z11, z4); - block.V7 = Avx.Subtract(z11, z4); - } - - /// - /// Combined operation of and - /// using AVX commands. - /// - /// Source - /// Destination - public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) - { - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 my1 = s.V1; - Vector256 my7 = s.V7; - Vector256 mz0 = Avx.Add(my1, my7); - - Vector256 my3 = s.V3; - Vector256 mz2 = Avx.Add(my3, my7); - Vector256 my5 = s.V5; - Vector256 mz1 = Avx.Add(my3, my5); - Vector256 mz3 = Avx.Add(my1, my5); - - Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758); - - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901); - mz0 = Avx.Multiply(mz0, mm256_F_n0_8999); - mz1 = Avx.Multiply(mz1, mm256_F_n2_5629); - - Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2); - Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3); - Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2); - Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3); - - Vector256 my2 = s.V2; - Vector256 my6 = s.V6; - mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411); - Vector256 my0 = s.V0; - Vector256 my4 = s.V4; - mz0 = Avx.Add(my0, my4); - mz1 = Avx.Subtract(my0, my4); - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653); - - my0 = Avx.Add(mz0, mz3); - my3 = Avx.Subtract(mz0, mz3); - my1 = Avx.Add(mz1, mz2); - my2 = Avx.Subtract(mz1, mz2); - - d.V0 = Avx.Add(my0, mb0); - d.V7 = Avx.Subtract(my0, mb0); - d.V1 = Avx.Add(my1, mb1); - d.V6 = Avx.Subtract(my1, mb1); - d.V2 = Avx.Add(my2, mb2); - d.V5 = Avx.Subtract(my2, mb2); - d.V3 = Avx.Add(my3, mb3); - d.V4 = Avx.Subtract(my3, mb3); + // First pass - process columns + IDCT8x8_1D_Avx(ref transposedBlock); + + // Second pass - process rows + transposedBlock.TransposeInplace(); + IDCT8x8_1D_Avx(ref transposedBlock); + + // Applies 1D floating point FDCT inplace + static void IDCT8x8_1D_Avx(ref Block8x8F block) + { + // Even part + Vector256 tmp0 = block.V0; + Vector256 tmp1 = block.V2; + Vector256 tmp2 = block.V4; + Vector256 tmp3 = block.V6; + + Vector256 z5 = tmp0; + Vector256 tmp10 = Avx.Add(z5, tmp2); + Vector256 tmp11 = Avx.Subtract(z5, tmp2); + + Vector256 tmp13 = Avx.Add(tmp1, tmp3); + Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142); + + tmp0 = Avx.Add(tmp10, tmp13); + tmp3 = Avx.Subtract(tmp10, tmp13); + tmp1 = Avx.Add(tmp11, tmp12); + tmp2 = Avx.Subtract(tmp11, tmp12); + + // Odd part + Vector256 tmp4 = block.V1; + Vector256 tmp5 = block.V3; + Vector256 tmp6 = block.V5; + Vector256 tmp7 = block.V7; + + Vector256 z13 = Avx.Add(tmp6, tmp5); + Vector256 z10 = Avx.Subtract(tmp6, tmp5); + Vector256 z11 = Avx.Add(tmp4, tmp7); + Vector256 z12 = Avx.Subtract(tmp4, tmp7); + + tmp7 = Avx.Add(z11, z13); + tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142); + + z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477); + + tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823); + tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131); + + tmp6 = Avx.Subtract(tmp12, tmp7); + tmp5 = Avx.Subtract(tmp11, tmp6); + tmp4 = Avx.Subtract(tmp10, tmp5); + + block.V0 = Avx.Add(tmp0, tmp7); + block.V7 = Avx.Subtract(tmp0, tmp7); + block.V1 = Avx.Add(tmp1, tmp6); + block.V6 = Avx.Subtract(tmp1, tmp6); + block.V2 = Avx.Add(tmp2, tmp5); + block.V5 = Avx.Subtract(tmp2, tmp5); + block.V3 = Avx.Add(tmp3, tmp4); + block.V4 = Avx.Subtract(tmp3, tmp4); + } } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 6963c36369..a1c03e65c0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -15,102 +15,196 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// internal static partial class FastFloatingPointDCT { -#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore - private const float C_1_175876 = 1.175875602f; - private const float C_1_961571 = -1.961570560f; - private const float C_0_390181 = -0.390180644f; - private const float C_0_899976 = -0.899976223f; - private const float C_2_562915 = -2.562915447f; - private const float C_0_298631 = 0.298631336f; - private const float C_2_053120 = 2.053119869f; - private const float C_3_072711 = 3.072711026f; - private const float C_1_501321 = 1.501321110f; - private const float C_0_541196 = 0.541196100f; - private const float C_1_847759 = -1.847759065f; - private const float C_0_765367 = 0.765366865f; - - private const float C_0_125 = 0.1250f; - -#pragma warning disable SA1311, IDE1006 // naming rules violation warnings - private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f); - private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f); - private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f); - private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f); -#pragma warning restore SA1311, IDE1006 - -#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore +#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f); + private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f); + private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f); + private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f); + + private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f); + private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f); + private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f); + private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f); +#pragma warning restore SA1310, SA1311, IDE1006 /// - /// Gets reciprocal coefficients for jpeg quantization tables calculation. + /// Gets adjustment table for quantization tables. /// /// /// - /// Current FDCT implementation expects its results to be multiplied by - /// a reciprocal quantization table. To get 8x8 reciprocal block values in this - /// table must be divided by quantization table values scaled with quality settings. + /// Current IDCT and FDCT implementations are based on Arai, Agui, + /// and Nakajima's algorithm. Both DCT methods does not + /// produce finished DCT output, final step is fused into the + /// quantization step. Quantization and de-quantization coefficients + /// must be multiplied by these values. /// /// - /// These values were calculates with this formula: - /// - /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; - /// - /// Where: + /// Given values were generated by formula: /// + /// scalefactor[row] * scalefactor[col], where /// scalefactor[0] = 1 - /// - /// /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 /// - /// Values are also scaled by 8 so DCT code won't do extra division/multiplication. /// /// - internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] + private static readonly float[] AdjustmentCoefficients = new float[] { - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, - 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, - 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, - 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, - 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f, + 1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f, + 1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f, + 1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f, + 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f, + 0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f, + 0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f, + 0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f, }; /// - /// Adjusts given quantization table to be complient with FDCT implementation. + /// Adjusts given quantization table for usage with . + /// + /// Quantization table to adjust. + public static void AdjustToIDCT(ref Block8x8F quantTable) + { + for (int i = 0; i < Block8x8F.Size; i++) + { + quantTable[i] = quantTable[i] * AdjustmentCoefficients[i] * 0.125f; + } + + // Spectral macroblocks are transposed before quantization + // so we must transpose quantization table + quantTable.TransposeInplace(); + } + + /// + /// Adjusts given quantization table for usage with . + /// + /// Quantization table to adjust. + public static void AdjustToFDCT(ref Block8x8F quantTable) + { + for (int i = 0; i < Block8x8F.Size; i++) + { + quantTable[i] = 0.125f / (quantTable[i] * AdjustmentCoefficients[i]); + } + } + + /// + /// Apply 2D floating point IDCT inplace. /// /// - /// See docs for explanation. + /// Input block must be dequantized before this method with table + /// adjusted by . /// - /// Quantization table to adjust. - public static void AdjustToFDCT(ref Block8x8F quantizationtable) + /// Input block. + public static void TransformIDCT(ref Block8x8F block) { - for (int i = 0; i < Block8x8F.Size; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref block); + } + else +#endif { - quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i]; + IDCT_Vector4(ref block); } } /// - /// Apply 2D floating point FDCT inplace. + /// Apply 2D floating point IDCT inplace. /// - /// Input matrix. + /// + /// Input block must be quantized after this method with table adjusted + /// by . + /// + /// Input block. public static void TransformFDCT(ref Block8x8F block) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - ForwardTransform_Avx(ref block); + FDCT8x8_Avx(ref block); } else #endif if (Vector.IsHardwareAccelerated) { - ForwardTransform_Vector4(ref block); + FDCT_Vector4(ref block); } else { - ForwardTransform_Scalar(ref block); + FDCT_Scalar(ref block); + } + } + + /// + /// Apply floating point IDCT inplace using API. + /// + /// Input block. + private static void IDCT_Vector4(ref Block8x8F transposedBlock) + { + DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); + + // First pass - process columns + IDCT8x4_Vector4(ref transposedBlock.V0L); + IDCT8x4_Vector4(ref transposedBlock.V0R); + + // Second pass - process rows + transposedBlock.TransposeInplace(); + IDCT8x4_Vector4(ref transposedBlock.V0L); + IDCT8x4_Vector4(ref transposedBlock.V0R); + + // Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block + static void IDCT8x4_Vector4(ref Vector4 vecRef) + { + // Even part + Vector4 tmp0 = Unsafe.Add(ref vecRef, 0 * 2); + Vector4 tmp1 = Unsafe.Add(ref vecRef, 2 * 2); + Vector4 tmp2 = Unsafe.Add(ref vecRef, 4 * 2); + Vector4 tmp3 = Unsafe.Add(ref vecRef, 6 * 2); + + Vector4 z5 = tmp0; + Vector4 tmp10 = z5 + tmp2; + Vector4 tmp11 = z5 - tmp2; + + Vector4 tmp13 = tmp1 + tmp3; + Vector4 tmp12 = ((tmp1 - tmp3) * mm128_F_1_4142) - tmp13; + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd part + Vector4 tmp4 = Unsafe.Add(ref vecRef, 1 * 2); + Vector4 tmp5 = Unsafe.Add(ref vecRef, 3 * 2); + Vector4 tmp6 = Unsafe.Add(ref vecRef, 5 * 2); + Vector4 tmp7 = Unsafe.Add(ref vecRef, 7 * 2); + + Vector4 z13 = tmp6 + tmp5; + Vector4 z10 = tmp6 - tmp5; + Vector4 z11 = tmp4 + tmp7; + Vector4 z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; + tmp11 = (z11 - z13) * mm128_F_1_4142; + + z5 = (z10 + z12) * mm128_F_1_8477; + + tmp10 = (z12 * mm128_F_n1_0823) + z5; + tmp12 = (z10 * mm128_F_n2_6131) + z5; + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + Unsafe.Add(ref vecRef, 0 * 2) = tmp0 + tmp7; + Unsafe.Add(ref vecRef, 7 * 2) = tmp0 - tmp7; + Unsafe.Add(ref vecRef, 1 * 2) = tmp1 + tmp6; + Unsafe.Add(ref vecRef, 6 * 2) = tmp1 - tmp6; + Unsafe.Add(ref vecRef, 2 * 2) = tmp2 + tmp5; + Unsafe.Add(ref vecRef, 5 * 2) = tmp2 - tmp5; + Unsafe.Add(ref vecRef, 3 * 2) = tmp3 + tmp4; + Unsafe.Add(ref vecRef, 4 * 2) = tmp3 - tmp4; } } @@ -120,8 +214,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. /// - /// Input matrix. - private static void ForwardTransform_Scalar(ref Block8x8F block) + /// Input block. + private static void FDCT_Scalar(ref Block8x8F block) { const int dctSize = 8; @@ -130,17 +224,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components float z1, z2, z3, z4, z5, z11, z13; // First pass - process rows - ref float dataRef = ref Unsafe.As(ref block); + ref float blockRef = ref Unsafe.As(ref block); for (int ctr = 7; ctr >= 0; ctr--) { - tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7); - tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7); - tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6); - tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6); - tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5); - tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5); - tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4); - tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4); + tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7); + tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7); + tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6); + tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6); + tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5); + tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5); + tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4); + tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4); // Even part tmp10 = tmp0 + tmp3; @@ -148,12 +242,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11; - Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11; + Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11; z1 = (tmp12 + tmp13) * 0.707106781f; - Unsafe.Add(ref dataRef, 2) = tmp13 + z1; - Unsafe.Add(ref dataRef, 6) = tmp13 - z1; + Unsafe.Add(ref blockRef, 2) = tmp13 + z1; + Unsafe.Add(ref blockRef, 6) = tmp13 - z1; // Odd part tmp10 = tmp4 + tmp5; @@ -168,26 +262,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components z11 = tmp7 + z3; z13 = tmp7 - z3; - Unsafe.Add(ref dataRef, 5) = z13 + z2; - Unsafe.Add(ref dataRef, 3) = z13 - z2; - Unsafe.Add(ref dataRef, 1) = z11 + z4; - Unsafe.Add(ref dataRef, 7) = z11 - z4; + Unsafe.Add(ref blockRef, 5) = z13 + z2; + Unsafe.Add(ref blockRef, 3) = z13 - z2; + Unsafe.Add(ref blockRef, 1) = z11 + z4; + Unsafe.Add(ref blockRef, 7) = z11 - z4; - dataRef = ref Unsafe.Add(ref dataRef, dctSize); + blockRef = ref Unsafe.Add(ref blockRef, dctSize); } // Second pass - process columns - dataRef = ref Unsafe.As(ref block); + blockRef = ref Unsafe.As(ref block); for (int ctr = 7; ctr >= 0; ctr--) { - tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7); - tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7); - tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6); - tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6); - tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5); - tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5); - tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4); - tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4); + tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7); + tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7); + tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6); + tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6); + tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5); + tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5); + tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4); + tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4); // Even part tmp10 = tmp0 + tmp3; @@ -195,12 +289,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11; - Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11; + Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11; + Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11; z1 = (tmp12 + tmp13) * 0.707106781f; - Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1; - Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1; + Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1; + Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1; // Odd part tmp10 = tmp4 + tmp5; @@ -215,12 +309,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components z11 = tmp7 + z3; z13 = tmp7 - z3; - Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2; - Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2; - Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4; - Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4; + Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2; + Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2; + Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4; + Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4; - dataRef = ref Unsafe.Add(ref dataRef, 1); + blockRef = ref Unsafe.Add(ref blockRef, 1); } } @@ -230,11 +324,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// This implementation must be called only if hardware supports 4 /// floating point numbers vector. Otherwise explicit scalar - /// implementation is faster - /// because it does not rely on matrix transposition. + /// implementation is faster + /// because it does not rely on block transposition. /// - /// Input matrix. - private static void ForwardTransform_Vector4(ref Block8x8F block) + /// Input block. + public static void FDCT_Vector4(ref Block8x8F block) { DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); @@ -247,209 +341,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components block.TransposeInplace(); FDCT8x4_Vector4(ref block.V0L); FDCT8x4_Vector4(ref block.V0R); - } - /// - /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix. - /// - /// - /// Implemented using Vector4 API operations for either scalar or sse hardware implementation. - /// Must be called on both 8x4 matrix parts for the full FDCT transform. - /// - /// Input reference to the first - private static void FDCT8x4_Vector4(ref Vector4 blockRef) - { - Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14); - Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14); - Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12); - Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12); - Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10); - Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10); - Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8); - Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8); - - // Even part - Vector4 tmp10 = tmp0 + tmp3; - Vector4 tmp13 = tmp0 - tmp3; - Vector4 tmp11 = tmp1 + tmp2; - Vector4 tmp12 = tmp1 - tmp2; - - Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; - Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11; - - Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; - Unsafe.Add(ref blockRef, 4) = tmp13 + z1; - Unsafe.Add(ref blockRef, 12) = tmp13 - z1; - - // Odd part - tmp10 = tmp4 + tmp5; - tmp11 = tmp5 + tmp6; - tmp12 = tmp6 + tmp7; - - Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; - Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5; - Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; - Vector4 z3 = tmp11 * mm128_F_0_7071; - - Vector4 z11 = tmp7 + z3; - Vector4 z13 = tmp7 - z3; - - Unsafe.Add(ref blockRef, 10) = z13 + z2; - Unsafe.Add(ref blockRef, 6) = z13 - z2; - Unsafe.Add(ref blockRef, 2) = z11 + z4; - Unsafe.Add(ref blockRef, 14) = z11 - z4; - } + // Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block + static void FDCT8x4_Vector4(ref Vector4 vecRef) + { + Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14); + Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14); + Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12); + Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12); + Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10); + Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10); + Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8); + Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8); - /// - /// Apply floating point IDCT inplace. - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. - /// - /// Input matrix. - /// Matrix to store temporal results. - public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) - { - block.TransposeInplace(); - IDCT8x8(ref block, ref temp); - temp.TransposeInplace(); - IDCT8x8(ref temp, ref block); + // Even part + Vector4 tmp10 = tmp0 + tmp3; + Vector4 tmp13 = tmp0 - tmp3; + Vector4 tmp11 = tmp1 + tmp2; + Vector4 tmp12 = tmp1 - tmp2; - // TODO: This can be fused into quantization table step - block.MultiplyInPlace(C_0_125); - } + Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11; - /// - /// Performs 8x8 matrix Inverse Discrete Cosine Transform - /// - /// Source - /// Destination - private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - IDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - IDCT8x4_LeftPart(ref s, ref d); - IDCT8x4_RightPart(ref s, ref d); - } - } + Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; + Unsafe.Add(ref vecRef, 4) = tmp13 + z1; + Unsafe.Add(ref vecRef, 12) = tmp13 - z1; - /// - /// Do IDCT internal operations on the left part of the block. Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; - /// - /// Do IDCT internal operations on the right part of the block. - /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; + Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; + Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5; + Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; + Vector4 z3 = tmp11 * mm128_F_0_7071; + + Vector4 z11 = tmp7 + z3; + Vector4 z13 = tmp7 - z3; + + Unsafe.Add(ref vecRef, 10) = z13 + z2; + Unsafe.Add(ref vecRef, 6) = z13 - z2; + Unsafe.Add(ref vecRef, 2) = z11 + z4; + Unsafe.Add(ref vecRef, 14) = z11 - z4; + } } } }