From 408462a4ac358abd10ce8d1f2d542bd811915992 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 21 Nov 2021 09:09:18 +0300
Subject: [PATCH] Added new IDCT implementation

---
 .../FastFloatingPointDCT.Intrinsic.cs         | 237 ++++----
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 525 ++++++++----------
 2 files changed, 346 insertions(+), 416 deletions(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index ab9462632f..94864005ec 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -2,9 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 #if SUPPORTS_RUNTIME_INTRINSICS
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
@@ -12,149 +9,147 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     internal static partial class FastFloatingPointDCT
     {
-#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings
         private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
         private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
         private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
         private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
 
-        private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f);
-        private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
-        private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
-        private static readonly Vector256<float> mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
-        private static readonly Vector256<float> mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
-        private static readonly Vector256<float> mm256_F_0_2986 = Vector256.Create(0.298631336f);
-        private static readonly Vector256<float> mm256_F_2_0531 = Vector256.Create(2.053119869f);
-        private static readonly Vector256<float> mm256_F_3_0727 = Vector256.Create(3.072711026f);
-        private static readonly Vector256<float> mm256_F_1_5013 = Vector256.Create(1.501321110f);
-        private static readonly Vector256<float> mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
-        private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
+        private static readonly Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
+        private static readonly Vector256<float> mm256_F_1_8477 = Vector256.Create(1.847759065f);
+        private static readonly Vector256<float> mm256_F_n1_0823 = Vector256.Create(-1.082392200f);
+        private static readonly Vector256<float> mm256_F_n2_6131 = Vector256.Create(-2.613125930f);
 #pragma warning restore SA1310, SA1311, IDE1006
 
         /// <summary>
         /// Apply floating point FDCT inplace using simd operations.
         /// </summary>
-        /// <param name="block">Input matrix.</param>
-        private static void ForwardTransform_Avx(ref Block8x8F block)
+        /// <param name="block">Input block.</param>
+        private static void FDCT8x8_Avx(ref Block8x8F block)
         {
             DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
 
             // First pass - process rows
             block.TransposeInplace();
-            FDCT8x8_Avx(ref block);
+            FDCT8x8_1D_Avx(ref block);
 
             // Second pass - process columns
             block.TransposeInplace();
-            FDCT8x8_Avx(ref block);
+            FDCT8x8_1D_Avx(ref block);
+
+            // Applies 1D floating point FDCT inplace
+            static void FDCT8x8_1D_Avx(ref Block8x8F block)
+            {
+                Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
+                Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
+                Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
+                Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
+                Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
+                Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
+                Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
+                Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
+
+                // Even part
+                Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
+                Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
+                Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
+                Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
+
+                block.V0 = Avx.Add(tmp10, tmp11);
+                block.V4 = Avx.Subtract(tmp10, tmp11);
+
+                Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
+                block.V2 = Avx.Add(tmp13, z1);
+                block.V6 = Avx.Subtract(tmp13, z1);
+
+                // Odd part
+                tmp10 = Avx.Add(tmp4, tmp5);
+                tmp11 = Avx.Add(tmp5, tmp6);
+                tmp12 = Avx.Add(tmp6, tmp7);
+
+                Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
+                Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
+                Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
+                Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
+
+                Vector256<float> z11 = Avx.Add(tmp7, z3);
+                Vector256<float> z13 = Avx.Subtract(tmp7, z3);
+
+                block.V5 = Avx.Add(z13, z2);
+                block.V3 = Avx.Subtract(z13, z2);
+                block.V1 = Avx.Add(z11, z4);
+                block.V7 = Avx.Subtract(z11, z4);
+            }
         }
 
         /// <summary>
-        /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
+        /// Apply floating point IDCT inplace using simd operations.
         /// </summary>
-        /// <remarks>
-        /// Requires Avx support.
-        /// </remarks>
-        /// <param name="block">Input matrix.</param>
-        public static void FDCT8x8_Avx(ref Block8x8F block)
+        /// <param name="transposedBlock">Transposed input block.</param>
+        private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
         {
             DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
 
-            Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
-            Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
-            Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
-            Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
-            Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
-            Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
-            Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
-            Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
-
-            // Even part
-            Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
-            Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
-            Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
-            Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
-
-            block.V0 = Avx.Add(tmp10, tmp11);
-            block.V4 = Avx.Subtract(tmp10, tmp11);
-
-            Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
-            block.V2 = Avx.Add(tmp13, z1);
-            block.V6 = Avx.Subtract(tmp13, z1);
-
-            // Odd part
-            tmp10 = Avx.Add(tmp4, tmp5);
-            tmp11 = Avx.Add(tmp5, tmp6);
-            tmp12 = Avx.Add(tmp6, tmp7);
-
-            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
-            Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
-            Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
-            Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
-
-            Vector256<float> z11 = Avx.Add(tmp7, z3);
-            Vector256<float> z13 = Avx.Subtract(tmp7, z3);
-
-            block.V5 = Avx.Add(z13, z2);
-            block.V3 = Avx.Subtract(z13, z2);
-            block.V1 = Avx.Add(z11, z4);
-            block.V7 = Avx.Subtract(z11, z4);
-        }
-
-        /// <summary>
-        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
-        /// using AVX commands.
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
-        {
-            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
-            Vector256<float> my1 = s.V1;
-            Vector256<float> my7 = s.V7;
-            Vector256<float> mz0 = Avx.Add(my1, my7);
-
-            Vector256<float> my3 = s.V3;
-            Vector256<float> mz2 = Avx.Add(my3, my7);
-            Vector256<float> my5 = s.V5;
-            Vector256<float> mz1 = Avx.Add(my3, my5);
-            Vector256<float> mz3 = Avx.Add(my1, my5);
-
-            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);
-
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
-            mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
-            mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);
-
-            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
-            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
-            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
-            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);
-
-            Vector256<float> my2 = s.V2;
-            Vector256<float> my6 = s.V6;
-            mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
-            Vector256<float> my0 = s.V0;
-            Vector256<float> my4 = s.V4;
-            mz0 = Avx.Add(my0, my4);
-            mz1 = Avx.Subtract(my0, my4);
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);
-
-            my0 = Avx.Add(mz0, mz3);
-            my3 = Avx.Subtract(mz0, mz3);
-            my1 = Avx.Add(mz1, mz2);
-            my2 = Avx.Subtract(mz1, mz2);
-
-            d.V0 = Avx.Add(my0, mb0);
-            d.V7 = Avx.Subtract(my0, mb0);
-            d.V1 = Avx.Add(my1, mb1);
-            d.V6 = Avx.Subtract(my1, mb1);
-            d.V2 = Avx.Add(my2, mb2);
-            d.V5 = Avx.Subtract(my2, mb2);
-            d.V3 = Avx.Add(my3, mb3);
-            d.V4 = Avx.Subtract(my3, mb3);
+            // First pass - process columns
+            IDCT8x8_1D_Avx(ref transposedBlock);
+
+            // Second pass - process rows
+            transposedBlock.TransposeInplace();
+            IDCT8x8_1D_Avx(ref transposedBlock);
+
+            // Applies 1D floating point FDCT inplace
+            static void IDCT8x8_1D_Avx(ref Block8x8F block)
+            {
+                // Even part
+                Vector256<float> tmp0 = block.V0;
+                Vector256<float> tmp1 = block.V2;
+                Vector256<float> tmp2 = block.V4;
+                Vector256<float> tmp3 = block.V6;
+
+                Vector256<float> z5 = tmp0;
+                Vector256<float> tmp10 = Avx.Add(z5, tmp2);
+                Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);
+
+                Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
+                Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
+
+                tmp0 = Avx.Add(tmp10, tmp13);
+                tmp3 = Avx.Subtract(tmp10, tmp13);
+                tmp1 = Avx.Add(tmp11, tmp12);
+                tmp2 = Avx.Subtract(tmp11, tmp12);
+
+                // Odd part
+                Vector256<float> tmp4 = block.V1;
+                Vector256<float> tmp5 = block.V3;
+                Vector256<float> tmp6 = block.V5;
+                Vector256<float> tmp7 = block.V7;
+
+                Vector256<float> z13 = Avx.Add(tmp6, tmp5);
+                Vector256<float> z10 = Avx.Subtract(tmp6, tmp5);
+                Vector256<float> z11 = Avx.Add(tmp4, tmp7);
+                Vector256<float> z12 = Avx.Subtract(tmp4, tmp7);
+
+                tmp7 = Avx.Add(z11, z13);
+                tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
+
+                z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477);
+
+                tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823);
+                tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131);
+
+                tmp6 = Avx.Subtract(tmp12, tmp7);
+                tmp5 = Avx.Subtract(tmp11, tmp6);
+                tmp4 = Avx.Subtract(tmp10, tmp5);
+
+                block.V0 = Avx.Add(tmp0, tmp7);
+                block.V7 = Avx.Subtract(tmp0, tmp7);
+                block.V1 = Avx.Add(tmp1, tmp6);
+                block.V6 = Avx.Subtract(tmp1, tmp6);
+                block.V2 = Avx.Add(tmp2, tmp5);
+                block.V5 = Avx.Subtract(tmp2, tmp5);
+                block.V3 = Avx.Add(tmp3, tmp4);
+                block.V4 = Avx.Subtract(tmp3, tmp4);
+            }
         }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 6963c36369..a1c03e65c0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -15,102 +15,196 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     /// </summary>
     internal static partial class FastFloatingPointDCT
     {
-#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
-        private const float C_1_175876 = 1.175875602f;
-        private const float C_1_961571 = -1.961570560f;
-        private const float C_0_390181 = -0.390180644f;
-        private const float C_0_899976 = -0.899976223f;
-        private const float C_2_562915 = -2.562915447f;
-        private const float C_0_298631 = 0.298631336f;
-        private const float C_2_053120 = 2.053119869f;
-        private const float C_3_072711 = 3.072711026f;
-        private const float C_1_501321 = 1.501321110f;
-        private const float C_0_541196 = 0.541196100f;
-        private const float C_1_847759 = -1.847759065f;
-        private const float C_0_765367 = 0.765366865f;
-
-        private const float C_0_125 = 0.1250f;
-
-#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
-        private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
-        private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
-        private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
-        private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
-#pragma warning restore SA1311, IDE1006
-
-#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+        private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f);
+        private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f);
+        private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f);
+        private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f);
+
+        private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f);
+        private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f);
+        private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f);
+        private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f);
+#pragma warning restore SA1310, SA1311, IDE1006
 
         /// <summary>
-        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+        /// Gets adjustment table for quantization tables.
         /// </summary>
         /// <remarks>
         /// <para>
-        /// Current FDCT implementation expects its results to be multiplied by
-        /// a reciprocal quantization table. To get 8x8 reciprocal block values in this
-        /// table must be divided by quantization table values scaled with quality settings.
+        /// Current IDCT and FDCT implementations are based on  Arai, Agui,
+        /// and Nakajima's algorithm. Both DCT methods does not
+        /// produce finished DCT output, final step is fused into the
+        /// quantization step. Quantization and de-quantization coefficients
+        /// must be multiplied by these values.
         /// </para>
         /// <para>
-        /// These values were calculates with this formula:
-        /// <code>
-        /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
-        /// </code>
-        /// Where:
+        /// Given values were generated by formula:
         /// <code>
+        /// scalefactor[row] * scalefactor[col], where
         /// scalefactor[0] = 1
-        /// </code>
-        /// <code>
         /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
         /// </code>
-        /// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
         /// </para>
         /// </remarks>
-        internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
+        private static readonly float[] AdjustmentCoefficients = new float[]
         {
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
-            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
-            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
-            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
-            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+            1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+            1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f,
+            1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f,
+            1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f,
+            1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+            0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f,
+            0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f,
+            0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f,
         };
 
         /// <summary>
-        /// Adjusts given quantization table to be complient with FDCT implementation.
+        /// Adjusts given quantization table for usage with <see cref="TransformIDCT"/>.
+        /// </summary>
+        /// <param name="quantTable">Quantization table to adjust.</param>
+        public static void AdjustToIDCT(ref Block8x8F quantTable)
+        {
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                quantTable[i] = quantTable[i] * AdjustmentCoefficients[i] * 0.125f;
+            }
+
+            // Spectral macroblocks are transposed before quantization
+            // so we must transpose quantization table
+            quantTable.TransposeInplace();
+        }
+
+        /// <summary>
+        /// Adjusts given quantization table for usage with <see cref="TransformFDCT"/>.
+        /// </summary>
+        /// <param name="quantTable">Quantization table to adjust.</param>
+        public static void AdjustToFDCT(ref Block8x8F quantTable)
+        {
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                quantTable[i] = 0.125f / (quantTable[i] * AdjustmentCoefficients[i]);
+            }
+        }
+
+        /// <summary>
+        /// Apply 2D floating point IDCT inplace.
         /// </summary>
         /// <remarks>
-        /// See <see cref="DctReciprocalAdjustmentCoefficients"/> docs for explanation.
+        /// Input block must be dequantized before this method with table
+        /// adjusted by <see cref="AdjustToIDCT"/>.
         /// </remarks>
-        /// <param name="quantizationtable">Quantization table to adjust.</param>
-        public static void AdjustToFDCT(ref Block8x8F quantizationtable)
+        /// <param name="block">Input block.</param>
+        public static void TransformIDCT(ref Block8x8F block)
         {
-            for (int i = 0; i < Block8x8F.Size; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                IDCT8x8_Avx(ref block);
+            }
+            else
+#endif
             {
-                quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i];
+                IDCT_Vector4(ref block);
             }
         }
 
         /// <summary>
-        /// Apply 2D floating point FDCT inplace.
+        /// Apply 2D floating point IDCT inplace.
         /// </summary>
-        /// <param name="block">Input matrix.</param>
+        /// <remarks>
+        /// Input block must be quantized after this method with table adjusted
+        /// by <see cref="AdjustToFDCT"/>.
+        /// </remarks>
+        /// <param name="block">Input block.</param>
         public static void TransformFDCT(ref Block8x8F block)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx.IsSupported)
             {
-                ForwardTransform_Avx(ref block);
+                FDCT8x8_Avx(ref block);
             }
             else
 #endif
             if (Vector.IsHardwareAccelerated)
             {
-                ForwardTransform_Vector4(ref block);
+                FDCT_Vector4(ref block);
             }
             else
             {
-                ForwardTransform_Scalar(ref block);
+                FDCT_Scalar(ref block);
+            }
+        }
+
+        /// <summary>
+        /// Apply floating point IDCT inplace using <see cref="Vector4"/> API.
+        /// </summary>
+        /// <param name="transposedBlock">Input block.</param>
+        private static void IDCT_Vector4(ref Block8x8F transposedBlock)
+        {
+            DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
+
+            // First pass - process columns
+            IDCT8x4_Vector4(ref transposedBlock.V0L);
+            IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+            // Second pass - process rows
+            transposedBlock.TransposeInplace();
+            IDCT8x4_Vector4(ref transposedBlock.V0L);
+            IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+            // Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block
+            static void IDCT8x4_Vector4(ref Vector4 vecRef)
+            {
+                // Even part
+                Vector4 tmp0 = Unsafe.Add(ref vecRef, 0 * 2);
+                Vector4 tmp1 = Unsafe.Add(ref vecRef, 2 * 2);
+                Vector4 tmp2 = Unsafe.Add(ref vecRef, 4 * 2);
+                Vector4 tmp3 = Unsafe.Add(ref vecRef, 6 * 2);
+
+                Vector4 z5 = tmp0;
+                Vector4 tmp10 = z5 + tmp2;
+                Vector4 tmp11 = z5 - tmp2;
+
+                Vector4 tmp13 = tmp1 + tmp3;
+                Vector4 tmp12 = ((tmp1 - tmp3) * mm128_F_1_4142) - tmp13;
+
+                tmp0 = tmp10 + tmp13;
+                tmp3 = tmp10 - tmp13;
+                tmp1 = tmp11 + tmp12;
+                tmp2 = tmp11 - tmp12;
+
+                // Odd part
+                Vector4 tmp4 = Unsafe.Add(ref vecRef, 1 * 2);
+                Vector4 tmp5 = Unsafe.Add(ref vecRef, 3 * 2);
+                Vector4 tmp6 = Unsafe.Add(ref vecRef, 5 * 2);
+                Vector4 tmp7 = Unsafe.Add(ref vecRef, 7 * 2);
+
+                Vector4 z13 = tmp6 + tmp5;
+                Vector4 z10 = tmp6 - tmp5;
+                Vector4 z11 = tmp4 + tmp7;
+                Vector4 z12 = tmp4 - tmp7;
+
+                tmp7 = z11 + z13;
+                tmp11 = (z11 - z13) * mm128_F_1_4142;
+
+                z5 = (z10 + z12) * mm128_F_1_8477;
+
+                tmp10 = (z12 * mm128_F_n1_0823) + z5;
+                tmp12 = (z10 * mm128_F_n2_6131) + z5;
+
+                tmp6 = tmp12 - tmp7;
+                tmp5 = tmp11 - tmp6;
+                tmp4 = tmp10 - tmp5;
+
+                Unsafe.Add(ref vecRef, 0 * 2) = tmp0 + tmp7;
+                Unsafe.Add(ref vecRef, 7 * 2) = tmp0 - tmp7;
+                Unsafe.Add(ref vecRef, 1 * 2) = tmp1 + tmp6;
+                Unsafe.Add(ref vecRef, 6 * 2) = tmp1 - tmp6;
+                Unsafe.Add(ref vecRef, 2 * 2) = tmp2 + tmp5;
+                Unsafe.Add(ref vecRef, 5 * 2) = tmp2 - tmp5;
+                Unsafe.Add(ref vecRef, 3 * 2) = tmp3 + tmp4;
+                Unsafe.Add(ref vecRef, 4 * 2) = tmp3 - tmp4;
             }
         }
 
@@ -120,8 +214,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <remarks>
         /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
         /// </remarks>
-        /// <param name="block">Input matrix.</param>
-        private static void ForwardTransform_Scalar(ref Block8x8F block)
+        /// <param name="block">Input block.</param>
+        private static void FDCT_Scalar(ref Block8x8F block)
         {
             const int dctSize = 8;
 
@@ -130,17 +224,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             float z1, z2, z3, z4, z5, z11, z13;
 
             // First pass - process rows
-            ref float dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            ref float blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
             for (int ctr = 7; ctr >= 0; ctr--)
             {
-                tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
-                tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
-                tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
-                tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
-                tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
-                tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
-                tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
-                tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
+                tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7);
+                tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7);
+                tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6);
+                tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6);
+                tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5);
+                tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5);
+                tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4);
+                tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4);
 
                 // Even part
                 tmp10 = tmp0 + tmp3;
@@ -148,12 +242,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 tmp11 = tmp1 + tmp2;
                 tmp12 = tmp1 - tmp2;
 
-                Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
-                Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
+                Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
+                Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11;
 
                 z1 = (tmp12 + tmp13) * 0.707106781f;
-                Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
-                Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
+                Unsafe.Add(ref blockRef, 2) = tmp13 + z1;
+                Unsafe.Add(ref blockRef, 6) = tmp13 - z1;
 
                 // Odd part
                 tmp10 = tmp4 + tmp5;
@@ -168,26 +262,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 z11 = tmp7 + z3;
                 z13 = tmp7 - z3;
 
-                Unsafe.Add(ref dataRef, 5) = z13 + z2;
-                Unsafe.Add(ref dataRef, 3) = z13 - z2;
-                Unsafe.Add(ref dataRef, 1) = z11 + z4;
-                Unsafe.Add(ref dataRef, 7) = z11 - z4;
+                Unsafe.Add(ref blockRef, 5) = z13 + z2;
+                Unsafe.Add(ref blockRef, 3) = z13 - z2;
+                Unsafe.Add(ref blockRef, 1) = z11 + z4;
+                Unsafe.Add(ref blockRef, 7) = z11 - z4;
 
-                dataRef = ref Unsafe.Add(ref dataRef, dctSize);
+                blockRef = ref Unsafe.Add(ref blockRef, dctSize);
             }
 
             // Second pass - process columns
-            dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
             for (int ctr = 7; ctr >= 0; ctr--)
             {
-                tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
-                tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
-                tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
-                tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
-                tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
-                tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
-                tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
-                tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
+                tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7);
+                tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7);
+                tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6);
+                tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6);
+                tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5);
+                tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5);
+                tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4);
+                tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4);
 
                 // Even part
                 tmp10 = tmp0 + tmp3;
@@ -195,12 +289,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 tmp11 = tmp1 + tmp2;
                 tmp12 = tmp1 - tmp2;
 
-                Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
-                Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
+                Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11;
+                Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11;
 
                 z1 = (tmp12 + tmp13) * 0.707106781f;
-                Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
-                Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
+                Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1;
+                Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1;
 
                 // Odd part
                 tmp10 = tmp4 + tmp5;
@@ -215,12 +309,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 z11 = tmp7 + z3;
                 z13 = tmp7 - z3;
 
-                Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
-                Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
-                Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
-                Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
+                Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2;
+                Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2;
+                Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4;
+                Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4;
 
-                dataRef = ref Unsafe.Add(ref dataRef, 1);
+                blockRef = ref Unsafe.Add(ref blockRef, 1);
             }
         }
 
@@ -230,11 +324,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <remarks>
         /// This implementation must be called only if hardware supports 4
         /// floating point numbers vector. Otherwise explicit scalar
-        /// implementation <see cref="ForwardTransform_Scalar"/> is faster
-        /// because it does not rely on matrix transposition.
+        /// implementation <see cref="FDCT_Scalar"/> is faster
+        /// because it does not rely on block transposition.
         /// </remarks>
-        /// <param name="block">Input matrix.</param>
-        private static void ForwardTransform_Vector4(ref Block8x8F block)
+        /// <param name="block">Input block.</param>
+        public static void FDCT_Vector4(ref Block8x8F block)
         {
             DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
 
@@ -247,209 +341,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             block.TransposeInplace();
             FDCT8x4_Vector4(ref block.V0L);
             FDCT8x4_Vector4(ref block.V0R);
-        }
 
-        /// <summary>
-        /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
-        /// </summary>
-        /// <remarks>
-        /// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
-        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
-        /// </remarks>
-        /// <param name="blockRef">Input reference to the first </param>
-        private static void FDCT8x4_Vector4(ref Vector4 blockRef)
-        {
-            Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14);
-            Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
-            Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
-            Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
-            Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
-            Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
-            Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
-            Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
-
-            // Even part
-            Vector4 tmp10 = tmp0 + tmp3;
-            Vector4 tmp13 = tmp0 - tmp3;
-            Vector4 tmp11 = tmp1 + tmp2;
-            Vector4 tmp12 = tmp1 - tmp2;
-
-            Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
-            Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
-
-            Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
-            Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
-            Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
-
-            // Odd part
-            tmp10 = tmp4 + tmp5;
-            tmp11 = tmp5 + tmp6;
-            tmp12 = tmp6 + tmp7;
-
-            Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
-            Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
-            Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
-            Vector4 z3 = tmp11 * mm128_F_0_7071;
-
-            Vector4 z11 = tmp7 + z3;
-            Vector4 z13 = tmp7 - z3;
-
-            Unsafe.Add(ref blockRef, 10) = z13 + z2;
-            Unsafe.Add(ref blockRef, 6) = z13 - z2;
-            Unsafe.Add(ref blockRef, 2) = z11 + z4;
-            Unsafe.Add(ref blockRef, 14) = z11 - z4;
-        }
+            // Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block
+            static void FDCT8x4_Vector4(ref Vector4 vecRef)
+            {
+                Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14);
+                Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14);
+                Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12);
+                Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12);
+                Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10);
+                Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10);
+                Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8);
+                Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8);
 
-        /// <summary>
-        /// Apply floating point IDCT inplace.
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
-        /// </summary>
-        /// <param name="block">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
-        {
-            block.TransposeInplace();
-            IDCT8x8(ref block, ref temp);
-            temp.TransposeInplace();
-            IDCT8x8(ref temp, ref block);
+                // Even part
+                Vector4 tmp10 = tmp0 + tmp3;
+                Vector4 tmp13 = tmp0 - tmp3;
+                Vector4 tmp11 = tmp1 + tmp2;
+                Vector4 tmp12 = tmp1 - tmp2;
 
-            // TODO: This can be fused into quantization table step
-            block.MultiplyInPlace(C_0_125);
-        }
+                Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11;
+                Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11;
 
-        /// <summary>
-        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                IDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                IDCT8x4_LeftPart(ref s, ref d);
-                IDCT8x4_RightPart(ref s, ref d);
-            }
-        }
+                Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
+                Unsafe.Add(ref vecRef, 4) = tmp13 + z1;
+                Unsafe.Add(ref vecRef, 12) = tmp13 - z1;
 
-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
+                // Odd part
+                tmp10 = tmp4 + tmp5;
+                tmp11 = tmp5 + tmp6;
+                tmp12 = tmp6 + tmp7;
 
-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
+                Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
+                Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
+                Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
+                Vector4 z3 = tmp11 * mm128_F_0_7071;
+
+                Vector4 z11 = tmp7 + z3;
+                Vector4 z13 = tmp7 - z3;
+
+                Unsafe.Add(ref vecRef, 10) = z13 + z2;
+                Unsafe.Add(ref vecRef, 6) = z13 - z2;
+                Unsafe.Add(ref vecRef, 2) = z11 + z4;
+                Unsafe.Add(ref vecRef, 14) = z11 - z4;
+            }
         }
     }
 }