diff --git a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs
index 855dda2f4b..8d20f80304 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Runtime.CompilerServices;
 
+#pragma warning disable IDE0078
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     /// <summary>
@@ -14,10 +15,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     /// </remarks>
     internal static class ScaledFloatingPointDCT
     {
-#pragma warning disable SA1310 // naming rules violation warnings
-        private const float F32_0_541196100 = 0.541196100f;
-        private const float F32_0_765366865 = 0.765366865f;
-        private const float F32_1_847759065 = 1.847759065f;
+#pragma warning disable SA1310 // naming rules violation warnings
+        private const float FP32_0_541196100 = 0.541196100f;
+        private const float FP32_0_765366865 = 0.765366865f;
+        private const float FP32_1_847759065 = 1.847759065f;
+        private const float FP32_0_211164243 = 0.211164243f;
+        private const float FP32_1_451774981 = 1.451774981f;
+        private const float FP32_2_172734803 = 2.172734803f;
+        private const float FP32_1_061594337 = 1.061594337f;
+        private const float FP32_0_509795579 = 0.509795579f;
+        private const float FP32_0_601344887 = 0.601344887f;
+        private const float FP32_0_899976223 = 0.899976223f;
+        private const float FP32_2_562915447 = 2.562915447f;
+        private const float FP32_0_720959822 = 0.720959822f;
+        private const float FP32_0_850430095 = 0.850430095f;
+        private const float FP32_1_272758580 = 1.272758580f;
+        private const float FP32_3_624509785 = 3.624509785f;
 #pragma warning restore SA1310
 
         /// <summary>
@@ -39,76 +52,169 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             quantTable.TransposeInplace();
         }
 
-        public static float TransformIDCT_1x1(float dc, float dequantizer, float normalizationValue, float maxValue)
-            => MathF.Round(Numerics.Clamp((dc * dequantizer) + normalizationValue, 0, maxValue));
-
-        public static void TransformIDCT_2x2(ref Block8x8F block, ref Block8x8F dequantTable, float normalizationValue, float maxValue)
-        {
-            float tmp4 = block[0] * dequantTable[0];
-            float tmp5 = block[1] * dequantTable[1];
-            float tmp0 = tmp4 + tmp5;
-            float tmp2 = tmp4 - tmp5;
-
-            tmp4 = block[8] * dequantTable[8];
-            tmp5 = block[9] * dequantTable[9];
-            float tmp1 = tmp4 + tmp5;
-            float tmp3 = tmp4 - tmp5;
-
-            block[0] = MathF.Round(Numerics.Clamp(tmp0 + tmp1 + normalizationValue, 0, maxValue));
-            block[1] = MathF.Round(Numerics.Clamp(tmp0 - tmp1 + normalizationValue, 0, maxValue));
-            block[8] = MathF.Round(Numerics.Clamp(tmp2 + tmp3 + normalizationValue, 0, maxValue));
-            block[9] = MathF.Round(Numerics.Clamp(tmp2 - tmp3 + normalizationValue, 0, maxValue));
-        }
-
+        /// <summary>
+        /// Apply 2D floating point 'downscaling' IDCT inplace producing
+        /// 8x8 -> 4x4 result.
+        /// </summary>
+        /// <remarks>
+        /// Resulting matrix is stored in the top left 4x4 part of the
+        /// <paramref name="block"/>.
+        /// </remarks>
+        /// <param name="block">Input block.</param>
+        /// <param name="dequantTable">Dequantization table adjusted by <see cref="AdjustToIDCT(ref Block8x8F)"/>.</param>
+        /// <param name="normalizationValue">Output range normalization value, 1/2 of the <paramref name="maxValue"/>.</param>
+        /// <param name="maxValue">Maximum value of the output range.</param>
         public static void TransformIDCT_4x4(ref Block8x8F block, ref Block8x8F dequantTable, float normalizationValue, float maxValue)
         {
-            for (int ctr = 0; ctr < 4; ctr++)
+            for (int ctr = 0; ctr < 8; ctr++)
             {
+                // Skip row 4: the second pass never reads it
+                if (ctr == 4)
+                {
+                    continue;
+                }
+
                 // Even part
-                float tmp0 = block[ctr * 8] * dequantTable[ctr * 8];
-                float tmp2 = block[(ctr * 8) + 2] * dequantTable[(ctr * 8) + 2];
+                float tmp0 = block[(ctr * 8) + 0] * dequantTable[(ctr * 8) + 0] * 2;
+
+                float z2 = block[(ctr * 8) + 2] * dequantTable[(ctr * 8) + 2];
+                float z3 = block[(ctr * 8) + 6] * dequantTable[(ctr * 8) + 6];
+
+                float tmp2 = (z2 * FP32_1_847759065) + (z3 * -FP32_0_765366865);
 
                 float tmp10 = tmp0 + tmp2;
                 float tmp12 = tmp0 - tmp2;
 
                 // Odd part
-                float z2 = block[(ctr * 8) + 1] * dequantTable[(ctr * 8) + 1];
-                float z3 = block[(ctr * 8) + 3] * dequantTable[(ctr * 8) + 3];
-
-                float z1 = (z2 + z3) * F32_0_541196100;
-                tmp0 = z1 + (z2 * F32_0_765366865);
-                tmp2 = z1 - (z3 * F32_1_847759065);
-
-                /* Final output stage */
-                block[ctr + 4] = tmp10 + tmp0;
-                block[ctr + 28] = tmp10 - tmp0;
-                block[ctr + 12] = tmp12 + tmp2;
-                block[ctr + 20] = tmp12 - tmp2;
+                float z1 = block[(ctr * 8) + 7] * dequantTable[(ctr * 8) + 7];
+                z2 = block[(ctr * 8) + 5] * dequantTable[(ctr * 8) + 5];
+                z3 = block[(ctr * 8) + 3] * dequantTable[(ctr * 8) + 3];
+                float z4 = block[(ctr * 8) + 1] * dequantTable[(ctr * 8) + 1];
+
+                tmp0 = (z1 * -FP32_0_211164243) +
+                       (z2 * FP32_1_451774981) +
+                       (z3 * -FP32_2_172734803) +
+                       (z4 * FP32_1_061594337);
+
+                tmp2 = (z1 * -FP32_0_509795579) +
+                       (z2 * -FP32_0_601344887) +
+                       (z3 * FP32_0_899976223) +
+                       (z4 * FP32_2_562915447);
+
+                // temporary result is saved to +4 shifted indices (columns 4..7)
+                // because the final result is saved into the top left 4x4 region
+                // of the input block
+                block[(ctr * 8) + 0 + 4] = (tmp10 + tmp2) / 2;
+                block[(ctr * 8) + 3 + 4] = (tmp10 - tmp2) / 2;
+                block[(ctr * 8) + 1 + 4] = (tmp12 + tmp0) / 2;
+                block[(ctr * 8) + 2 + 4] = (tmp12 - tmp0) / 2;
             }
 
             for (int ctr = 0; ctr < 4; ctr++)
             {
                 // Even part
-                float tmp0 = block[(ctr * 8) + 0 + 4];
-                float tmp2 = block[(ctr * 8) + 2 + 4];
+                float tmp0 = block[ctr + (8 * 0) + 4] * 2;
+
+                float tmp2 = (block[ctr + (8 * 2) + 4] * FP32_1_847759065) + (block[ctr + (8 * 6) + 4] * -FP32_0_765366865);
 
                 float tmp10 = tmp0 + tmp2;
                 float tmp12 = tmp0 - tmp2;
 
                 // Odd part
-                float z2 = block[(ctr * 8) + 1 + 4];
-                float z3 = block[(ctr * 8) + 3 + 4];
-
-                float z1 = (z2 + z3) * F32_0_541196100;
-                tmp0 = z1 + (z2 * F32_0_765366865);
-                tmp2 = z1 - (z3 * F32_1_847759065);
-
-                /* Final output stage */
-                block[(ctr * 8) + 0] = MathF.Round(Numerics.Clamp(tmp10 + tmp0 + normalizationValue, 0, maxValue));
-                block[(ctr * 8) + 3] = MathF.Round(Numerics.Clamp(tmp10 - tmp0 + normalizationValue, 0, maxValue));
-                block[(ctr * 8) + 1] = MathF.Round(Numerics.Clamp(tmp12 + tmp2 + normalizationValue, 0, maxValue));
-                block[(ctr * 8) + 2] = MathF.Round(Numerics.Clamp(tmp12 - tmp2 + normalizationValue, 0, maxValue));
+                float z1 = block[ctr + (8 * 7) + 4];
+                float z2 = block[ctr + (8 * 5) + 4];
+                float z3 = block[ctr + (8 * 3) + 4];
+                float z4 = block[ctr + (8 * 1) + 4];
+
+                tmp0 = (z1 * -FP32_0_211164243) +
+                       (z2 * FP32_1_451774981) +
+                       (z3 * -FP32_2_172734803) +
+                       (z4 * FP32_1_061594337);
+
+                tmp2 = (z1 * -FP32_0_509795579) +
+                       (z2 * -FP32_0_601344887) +
+                       (z3 * FP32_0_899976223) +
+                       (z4 * FP32_2_562915447);
+
+                // Save results to the top left 4x4 subregion
+                block[(ctr * 8) + 0] = MathF.Round(Numerics.Clamp(((tmp10 + tmp2) / 2) + normalizationValue, 0, maxValue));
+                block[(ctr * 8) + 3] = MathF.Round(Numerics.Clamp(((tmp10 - tmp2) / 2) + normalizationValue, 0, maxValue));
+                block[(ctr * 8) + 1] = MathF.Round(Numerics.Clamp(((tmp12 + tmp0) / 2) + normalizationValue, 0, maxValue));
+                block[(ctr * 8) + 2] = MathF.Round(Numerics.Clamp(((tmp12 - tmp0) / 2) + normalizationValue, 0, maxValue));
             }
         }
+
+        /// <summary>
+        /// Apply 2D floating point 'downscaling' IDCT inplace producing
+        /// 8x8 -> 2x2 result.
+        /// </summary>
+        /// <remarks>
+        /// Resulting matrix is stored in the top left 2x2 part of the
+        /// <paramref name="block"/>.
+        /// </remarks>
+        /// <param name="block">Input block.</param>
+        /// <param name="dequantTable">Dequantization table adjusted by <see cref="AdjustToIDCT(ref Block8x8F)"/>.</param>
+        /// <param name="normalizationValue">Output range normalization value, 1/2 of the <paramref name="maxValue"/>.</param>
+        /// <param name="maxValue">Maximum value of the output range.</param>
+        public static void TransformIDCT_2x2(ref Block8x8F block, ref Block8x8F dequantTable, float normalizationValue, float maxValue)
+        {
+            for (int ctr = 0; ctr < 8; ctr++)
+            {
+                // Skip rows 2/4/6: the second pass never reads them
+                if (ctr == 2 || ctr == 4 || ctr == 6)
+                {
+                    continue;
+                }
+
+                // Even part
+                float tmp0;
+                float z1 = block[(ctr * 8) + 0] * dequantTable[(ctr * 8) + 0];
+                float tmp10 = z1 * 4;
+
+                // Odd part
+                z1 = block[(ctr * 8) + 7] * dequantTable[(ctr * 8) + 7];
+                tmp0 = z1 * -FP32_0_720959822;
+                z1 = block[(ctr * 8) + 5] * dequantTable[(ctr * 8) + 5];
+                tmp0 += z1 * FP32_0_850430095;
+                z1 = block[(ctr * 8) + 3] * dequantTable[(ctr * 8) + 3];
+                tmp0 += z1 * -FP32_1_272758580;
+                z1 = block[(ctr * 8) + 1] * dequantTable[(ctr * 8) + 1];
+                tmp0 += z1 * FP32_3_624509785;
+
+                // temporary result is saved to +2 shifted indices (columns 2..3)
+                // because the final result is saved into the top left 2x2 region
+                // of the input block
+                block[(ctr * 8) + 2] = (tmp10 + tmp0) / 4;
+                block[(ctr * 8) + 3] = (tmp10 - tmp0) / 4;
+            }
+
+            for (int ctr = 0; ctr < 2; ctr++)
+            {
+                // Even part
+                float tmp10 = block[ctr + (8 * 0) + 2] * 4;
+
+                // Odd part
+                float tmp0 = (block[ctr + (8 * 7) + 2] * -FP32_0_720959822) +
+                       (block[ctr + (8 * 5) + 2] * FP32_0_850430095) +
+                       (block[ctr + (8 * 3) + 2] * -FP32_1_272758580) +
+                       (block[ctr + (8 * 1) + 2] * FP32_3_624509785);
+
+                // Save results to the top left 2x2 subregion
+                block[(ctr * 8) + 0] = MathF.Round(Numerics.Clamp(((tmp10 + tmp0) / 4) + normalizationValue, 0, maxValue));
+                block[(ctr * 8) + 1] = MathF.Round(Numerics.Clamp(((tmp10 - tmp0) / 4) + normalizationValue, 0, maxValue));
+            }
+        }
+
+        /// <summary>
+        /// Apply 2D floating point 'downscaling' IDCT producing 8x8 -> 1x1 result.
+        /// </summary>
+        /// <param name="dc">Direct current term value from input block.</param>
+        /// <param name="dequantizer">Dequantization value.</param>
+        /// <param name="normalizationValue">Output range normalization value, 1/2 of the <paramref name="maxValue"/>.</param>
+        /// <param name="maxValue">Maximum value of the output range.</param>
+        /// <returns>The dequantized DC value offset by <paramref name="normalizationValue"/>, clamped to [0, <paramref name="maxValue"/>] and rounded.</returns>
+        public static float TransformIDCT_1x1(float dc, float dequantizer, float normalizationValue, float maxValue)
+            => MathF.Round(Numerics.Clamp((dc * dequantizer) + normalizationValue, 0, maxValue));
     }
 }
+#pragma warning restore IDE0078
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index c14d38b07d..6fdd497dc4 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -107,9 +107,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 this.CompareBlocks(expected, srcBlock, 1f);
             }
 
-            // Inverse transform
-            // This test covers entire IDCT conversion chain
-            // This test checks all hardware implementations
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
@@ -193,32 +190,34 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 srcBlock.TransposeInplace();
                 ScaledFloatingPointDCT.TransformIDCT_4x4(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue);
 
-                Block8x8F DEBUG_VARIABLE = Block8x8F.Load(expectedDest);
-
-                var comparer = new ApproximateFloatComparer(1f);
-
                 Span<float> expectedSpan = expectedDest.AsSpan();
                 Span<float> actualSpan = srcBlock.ToArray().AsSpan();
 
-                AssertEquality_4x4(expectedSpan, actualSpan, comparer);
-                AssertEquality_4x4(expectedSpan.Slice(2), actualSpan.Slice(8), comparer);
-                AssertEquality_4x4(expectedSpan.Slice(4), actualSpan.Slice(16), comparer);
-                AssertEquality_4x4(expectedSpan.Slice(6), actualSpan.Slice(24), comparer);
+                // resulting matrix is 4x4
+                for (int y = 0; y < 4; y++)
+                {
+                    for (int x = 0; x < 4; x++)
+                    {
+                        AssertScaledElementEquality(expectedSpan.Slice((y * 16) + (x * 2)), actualSpan.Slice((y * 8) + x));
+                    }
+                }
 
-                static void AssertEquality_4x4(Span<float> expected, Span<float> actual, ApproximateFloatComparer comparer)
+                static void AssertScaledElementEquality(Span<float> expected, Span<float> actual)
                 {
-                    float average_4x4 = 0f;
+                    float average2x2 = 0f;
                     for (int y = 0; y < 2; y++)
                     {
                         int y8 = y * 8;
                         for (int x = 0; x < 2; x++)
                         {
                             float clamped = Numerics.Clamp(expected[y8 + x] + NormalizationValue, 0, MaxOutputValue);
-                            average_4x4 += clamped;
+                            average2x2 += clamped;
                         }
                     }
 
-                    average_4x4 /= 4f;
+                    average2x2 = MathF.Round(average2x2 / 4f);
+
+                    Assert.Equal((int)average2x2, (int)actual[0]);
                 }
             }
 
@@ -253,25 +252,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 srcBlock.TransposeInplace();
                 ScaledFloatingPointDCT.TransformIDCT_2x2(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue);
 
-                Block8x8F DEBUG_VARIABLE = Block8x8F.Load(expectedDest);
-
-                var comparer = new ApproximateFloatComparer(0.1f);
+                Span<float> expectedSpan = expectedDest.AsSpan();
+                Span<float> actualSpan = srcBlock.ToArray().AsSpan();
 
-                // top-left
-                float topLeftExpected = (float)Math.Round(Numerics.Clamp(expectedDest[0] + NormalizationValue, 0, MaxOutputValue));
-                Assert.Equal(topLeftExpected, srcBlock[0], comparer);
+                // resulting matrix is 2x2
+                for (int y = 0; y < 2; y++)
+                {
+                    for (int x = 0; x < 2; x++)
+                    {
+                        AssertScaledElementEquality(expectedSpan.Slice((y * 32) + (x * 4)), actualSpan.Slice((y * 8) + x));
+                    }
+                }
 
-                // top-right
-                float topRightExpected = (float)Math.Round(Numerics.Clamp(expectedDest[7] + NormalizationValue, 0, MaxOutputValue));
-                Assert.Equal(topRightExpected, srcBlock[1], comparer);
+                static void AssertScaledElementEquality(Span<float> expected, Span<float> actual)
+                {
+                    float average4x4 = 0f;
+                    for (int y = 0; y < 4; y++)
+                    {
+                        int y8 = y * 8;
+                        for (int x = 0; x < 4; x++)
+                        {
+                            float clamped = Numerics.Clamp(expected[y8 + x] + NormalizationValue, 0, MaxOutputValue);
+                            average4x4 += clamped;
+                        }
+                    }
 
-                // bot-left
-                float botLeftExpected = (float)Math.Round(Numerics.Clamp(expectedDest[56] + NormalizationValue, 0, MaxOutputValue));
-                Assert.Equal(botLeftExpected, srcBlock[8], comparer);
+                    average4x4 = MathF.Round(average4x4 / 16f);
 
-                // bot-right
-                float botRightExpected = (float)Math.Round(Numerics.Clamp(expectedDest[63] + NormalizationValue, 0, MaxOutputValue));
-                Assert.Equal(botRightExpected, srcBlock[9], comparer);
+                    Assert.Equal((int)average4x4, (int)actual[0]);
+                }
             }
 
             [Theory]
@@ -302,21 +311,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 // testee
                 // IDCT implementation tranforms blocks after transposition
-                srcBlock.TransposeInplace();
+                // But the DC term lies on the main diagonal, which transposition leaves unchanged
                 float actual = ScaledFloatingPointDCT.TransformIDCT_1x1(
                     srcBlock[0],
                     dequantMatrix[0],
                     NormalizationValue,
                     MaxOutputValue);
 
-                float expected = (float)Math.Round(Numerics.Clamp(expectedDest[0] + NormalizationValue, 0, MaxOutputValue));
+                float expected = MathF.Round(Numerics.Clamp(expectedDest[0] + NormalizationValue, 0, MaxOutputValue));
 
-                Assert.Equal(actual, expected, new ApproximateFloatComparer(0.1f));
+                Assert.Equal((int)actual, (int)expected);
             }
 
-            // Forward transform
-            // This test covers entire FDCT conversion chain
-            // This test checks all hardware implementations
             [Theory]
             [InlineData(1)]
             [InlineData(2)]