diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index 085cd4a291..15f212b400 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         /// </summary>
         public Block8x8F SourceBlock;
 
-        /// <summary>
-        /// Temporal block to store intermediate computation results.
-        /// </summary>
-        public Block8x8F WorkspaceBlock;
-
         /// <summary>
         /// The quantization table as <see cref="Block8x8F"/>.
         /// </summary>
@@ -45,7 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             this.subSamplingDivisors = component.SubSamplingDivisors;
 
             this.SourceBlock = default;
-            this.WorkspaceBlock = default;
         }
 
         /// <summary>
@@ -71,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             // Dequantize:
             block.MultiplyInPlace(ref this.DequantiazationTable);
 
-            FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);
+            FastFloatingPointDCT.TransformIDCT(ref block);
 
             // To conform better to libjpeg we actually NEED TO loose precision here.
             // This is because they store blocks as Int16 between all the operations.
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 0a49d20cd4..3a6eb4f8bf 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -2,9 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics.X86;
-#endif
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
 using SixLabors.ImageSharp.Tests.TestUtilities;
@@ -17,6 +14,20 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
     [Trait("Format", "Jpg")]
     public static class DCTTests
     {
+        private const int MaxAllowedValue = short.MaxValue;
+        private const int MinAllowedValue = short.MinValue;
+
+        internal static Block8x8F CreateBlockFromScalar(float value)
+        {
+            Block8x8F result = default;
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                result[i] = value;
+            }
+
+            return result;
+        }
+
         public class FastFloatingPoint : JpegFixture
         {
             public FastFloatingPoint(ITestOutputHelper output)
@@ -24,130 +35,77 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             {
             }
 
-            // Reference tests
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
             [InlineData(3)]
             public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
             {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+                float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
 
                 var srcBlock = Block8x8F.Load(sourceArray);
 
+                // reference
                 Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock);
 
-                var temp = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
-
-                this.CompareBlocks(expected, srcBlock, 1f);
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            [InlineData(3)]
-            public void LLM_TransformIDCT_CompareToAccurate(int seed)
-            {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+                // testee
+                // Part of the IDCT calculations is fused into the quantization step
+                // We must multiply input block with adjusted no-quantization matrix
+                // before applying IDCT
+                Block8x8F dequantMatrix = CreateBlockFromScalar(1);
 
-                var srcBlock = Block8x8F.Load(sourceArray);
+                // Dequantization using unit matrix - no values are upscaled
+                // as quant matrix is all 1's
+                // This step is needed to apply adjusting multipliers to the input block
+                FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
+                srcBlock.MultiplyInPlace(ref dequantMatrix);
 
-                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
+                // IDCT implementation tranforms blocks after transposition
+                srcBlock.TransposeInplace();
 
-                var temp = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
+                // IDCT calculation
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock);
 
                 this.CompareBlocks(expected, srcBlock, 1f);
             }
 
-            // Inverse transform
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void IDCT8x4_LeftPart(int seed)
-            {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
-
-                // testee
-                FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
-            public void IDCT8x4_RightPart(int seed)
+            [InlineData(3)]
+            public void LLM_TransformIDCT_CompareToAccurate(int seed)
             {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
+                float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
 
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
+                var srcBlock = Block8x8F.Load(sourceArray);
 
                 // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
 
                 // testee
-                FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void IDCT8x8_Avx(int seed)
-            {
-#if SUPPORTS_RUNTIME_INTRINSICS
-                if (!Avx.IsSupported)
-                {
-                    this.Output.WriteLine("No AVX present, skipping test!");
-                    return;
-                }
-
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                Block8x8F srcBlock = default;
-                srcBlock.LoadFrom(src);
+                // Part of the IDCT calculations is fused into the quantization step
+                // We must multiply input block with adjusted no-quantization matrix
+                // before applying IDCT
+                Block8x8F dequantMatrix = CreateBlockFromScalar(1);
 
-                Block8x8F destBlock = default;
+                // Dequantization using unit matrix - no values are upscaled
+                // as quant matrix is all 1's
+                // This step is needed to apply adjusting multipliers to the input block
+                FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
+                srcBlock.MultiplyInPlace(ref dequantMatrix);
 
-                float[] expectedDest = new float[64];
+                // IDCT implementation tranforms blocks after transposition
+                srcBlock.TransposeInplace();
 
-                // reference, left part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
+                // IDCT calculation
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock);
 
-                // reference, right part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
-
-                // testee, whole 8x8
-                FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock);
-
-                float[] actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-#endif
+                this.CompareBlocks(expected, srcBlock, 1f);
             }
 
+            // Inverse transform
+            // This test covers entire IDCT conversion chain
+            // This test checks all hardware implementations
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
@@ -157,41 +115,53 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 {
                     int seed = FeatureTestRunner.Deserialize<int>(serialized);
 
-                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    Span<float> src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
                     var srcBlock = default(Block8x8F);
                     srcBlock.LoadFrom(src);
 
-                    var expectedDest = new float[64];
-                    var temp1 = new float[64];
-                    var temp2 = default(Block8x8F);
+                    float[] expectedDest = new float[64];
+                    float[] temp = new float[64];
 
                     // reference
-                    ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
+                    ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp);
 
                     // testee
-                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2);
+                    // Part of the IDCT calculations is fused into the quantization step
+                    // We must multiply input block with adjusted no-quantization matrix
+                    // before applying IDCT
+                    Block8x8F dequantMatrix = CreateBlockFromScalar(1);
+
+                    // Dequantization using unit matrix - no values are upscaled
+                    // as quant matrix is all 1's
+                    // This step is needed to apply adjusting multipliers to the input block
+                    FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
+                    srcBlock.MultiplyInPlace(ref dequantMatrix);
+
+                    // IDCT implementation tranforms blocks after transposition
+                    srcBlock.TransposeInplace();
 
-                    var actualDest = new float[64];
-                    srcBlock.ScaledCopyTo(actualDest);
+                    // IDCT calculation
+                    FastFloatingPointDCT.TransformIDCT(ref srcBlock);
+
+                    float[] actualDest = srcBlock.ToArray();
 
                     Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
                 }
 
-                // 3 paths:
+                // 4 paths:
                 // 1. AllowAll - call avx/fma implementation
-                // 2. DisableFMA - call avx implementation without fma acceleration
-                // 3. DisableAvx - call fallback code of Vector4 implementation
-                //
-                // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
+                // 2. DisableFMA - call avx without fma implementation
+                // 3. DisableAvx - call sse Vector4 implementation
+                // 4. DisableHWIntrinsic - call scalar fallback implementation
                 FeatureTestRunner.RunWithHwIntrinsicsFeature(
                     RunTest,
                     seed,
-                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
             }
 
             // Forward transform
-            // This test covers entire FDCT conversions chain
-            // This test checks all implementations: intrinsic and scalar fallback
+            // This test covers entire FDCT conversion chain
+            // This test checks all hardware implementations
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
@@ -201,7 +171,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 {
                     int seed = FeatureTestRunner.Deserialize<int>(serialized);
 
-                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    Span<float> src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
                     var block = default(Block8x8F);
                     block.LoadFrom(src);
 
@@ -212,23 +182,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
 
                     // testee
-                    // Part of the FDCT calculations is fused into the quantization step
-                    // We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen
                     FastFloatingPointDCT.TransformFDCT(ref block);
-                    for (int i = 0; i < 64; i++)
-                    {
-                        block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i];
-                    }
+
+                    // Part of the IDCT calculations is fused into the quantization step
+                    // We must multiply input block with adjusted no-quantization matrix
+                    // after applying FDCT
+                    Block8x8F quantMatrix = CreateBlockFromScalar(1);
+                    FastFloatingPointDCT.AdjustToFDCT(ref quantMatrix);
+                    block.MultiplyInPlace(ref quantMatrix);
 
                     float[] actualDest = block.ToArray();
 
                     Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
                 }
 
-                // 3 paths:
+                // 4 paths:
                 // 1. AllowAll - call avx/fma implementation
-                // 2. DisableFMA - call avx implementation without fma acceleration
-                // 3. DisableAvx - call sse implementation
+                // 2. DisableFMA - call avx without fma implementation
+                // 3. DisableAvx - call sse Vector4 implementation
                 // 4. DisableHWIntrinsic - call scalar fallback implementation
                 FeatureTestRunner.RunWithHwIntrinsicsFeature(
                     RunTest,