New FDCT method, reciprocal quantization

5 years ago · 2f143bf9d3
10 changed files with 599 additions and 652 deletions
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@ -3,6 +3,7 @@

 #if SUPPORTS_RUNTIME_INTRINSICS
 using System;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
@ -38,7 +39,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            0, 1, 4, 5, 2, 3, 6, 7
        };

-        private static unsafe void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
        {
            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

@ -53,8 +54,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components

                for (int i = 0; i < 8; i += 2)
                {
-                    Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
-                    Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+                    Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                    Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

                    Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
                    row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
@ -64,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            }
        }

-        private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
        {
            DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");

@ -75,13 +76,81 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components

            for (int i = 0; i < 16; i += 2)
            {
-                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
-                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

                Vector128<short> row = Sse2.PackSignedSaturate(left, right);
                Unsafe.Add(ref destBase, i / 2) = row;
            }
        }
+
+        private void TransposeAvx()
+        {
+            // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
+            Vector256<float> r0 = Avx.InsertVector128(
+                this.V0,
+                Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
+                1);
+
+            Vector256<float> r1 = Avx.InsertVector128(
+               this.V1,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
+               1);
+
+            Vector256<float> r2 = Avx.InsertVector128(
+               this.V2,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
+               1);
+
+            Vector256<float> r3 = Avx.InsertVector128(
+               this.V3,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
+               1);
+
+            Vector256<float> r4 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
+               1);
+
+            Vector256<float> r5 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
+               1);
+
+            Vector256<float> r6 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
+               1);
+
+            Vector256<float> r7 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
+               1);
+
+            Vector256<float> t0 = Avx.UnpackLow(r0, r1);
+            Vector256<float> t2 = Avx.UnpackLow(r2, r3);
+            Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
+            this.V0 = Avx.Blend(t0, v, 0xCC);
+            this.V1 = Avx.Blend(t2, v, 0x33);
+
+            Vector256<float> t4 = Avx.UnpackLow(r4, r5);
+            Vector256<float> t6 = Avx.UnpackLow(r6, r7);
+            v = Avx.Shuffle(t4, t6, 0x4E);
+            this.V4 = Avx.Blend(t4, v, 0xCC);
+            this.V5 = Avx.Blend(t6, v, 0x33);
+
+            Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
+            Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
+            v = Avx.Shuffle(t1, t3, 0x4E);
+            this.V2 = Avx.Blend(t1, v, 0xCC);
+            this.V3 = Avx.Blend(t3, v, 0x33);
+
+            Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
+            Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
+            v = Avx.Shuffle(t5, t7, 0x4E);
+            this.V6 = Avx.Blend(t5, v, 0xCC);
+            this.V7 = Avx.Blend(t7, v, 0x33);
+        }
    }
 }
 #endif
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@ -413,41 +413,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 #if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx2.IsSupported)
            {
-                DivideIntoInt16_Avx2(ref block, ref qt, ref dest);
+                MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
                ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest);
            }
            else if (Ssse3.IsSupported)
            {
-                DivideIntoInt16_Sse2(ref block, ref qt, ref dest);
+                MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
                ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest);
            }
            else
 #endif
            {
-                Divide(ref block, ref qt);
+                Multiply(ref block, ref qt);
                block.RoundInto(ref dest);
            }
        }

        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void Divide(ref Block8x8F a, ref Block8x8F b)
-        {
-            a.V0L /= b.V0L;
-            a.V0R /= b.V0R;
-            a.V1L /= b.V1L;
-            a.V1R /= b.V1R;
-            a.V2L /= b.V2L;
-            a.V2R /= b.V2R;
-            a.V3L /= b.V3L;
-            a.V3R /= b.V3R;
-            a.V4L /= b.V4L;
-            a.V4R /= b.V4R;
-            a.V5L /= b.V5L;
-            a.V5R /= b.V5R;
-            a.V6L /= b.V6L;
-            a.V6R /= b.V6R;
-            a.V7L /= b.V7L;
-            a.V7R /= b.V7R;
+        private static void Multiply(ref Block8x8F a, ref Block8x8F b)
+        {
+            a.V0L *= b.V0L;
+            a.V0R *= b.V0R;
+            a.V1L *= b.V1L;
+            a.V1R *= b.V1R;
+            a.V2L *= b.V2L;
+            a.V2R *= b.V2R;
+            a.V3L *= b.V3L;
+            a.V3R *= b.V3R;
+            a.V4L *= b.V4L;
+            a.V4R *= b.V4R;
+            a.V5L *= b.V5L;
+            a.V5R *= b.V5R;
+            a.V6L *= b.V6L;
+            a.V6R *= b.V6R;
+            a.V7L *= b.V7L;
+            a.V7R *= b.V7R;
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
@ -608,154 +608,45 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        }

        /// <summary>
-        /// Transpose the block into the destination block.
+        /// Transposes the block in place.
        /// </summary>
-        /// <param name="d">The destination block</param>
        [MethodImpl(InliningOptions.ShortMethod)]
-        public void TransposeInto(ref Block8x8F d)
+        public void Transpose()
        {
 #if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx.IsSupported)
            {
-                // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
-                Vector256<float> r0 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
-                   1);
-
-                Vector256<float> r1 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V1L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
-                   1);
-
-                Vector256<float> r2 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V2L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
-                   1);
-
-                Vector256<float> r3 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V3L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
-                   1);
-
-                Vector256<float> r4 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
-                   1);
-
-                Vector256<float> r5 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
-                   1);
-
-                Vector256<float> r6 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
-                   1);
-
-                Vector256<float> r7 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
-                   1);
-
-                Vector256<float> t0 = Avx.UnpackLow(r0, r1);
-                Vector256<float> t2 = Avx.UnpackLow(r2, r3);
-                Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
-                d.V0 = Avx.Blend(t0, v, 0xCC);
-                d.V1 = Avx.Blend(t2, v, 0x33);
-
-                Vector256<float> t4 = Avx.UnpackLow(r4, r5);
-                Vector256<float> t6 = Avx.UnpackLow(r6, r7);
-                v = Avx.Shuffle(t4, t6, 0x4E);
-                d.V4 = Avx.Blend(t4, v, 0xCC);
-                d.V5 = Avx.Blend(t6, v, 0x33);
-
-                Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
-                Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
-                v = Avx.Shuffle(t1, t3, 0x4E);
-                d.V2 = Avx.Blend(t1, v, 0xCC);
-                d.V3 = Avx.Blend(t3, v, 0x33);
-
-                Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
-                Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
-                v = Avx.Shuffle(t5, t7, 0x4E);
-                d.V6 = Avx.Blend(t5, v, 0xCC);
-                d.V7 = Avx.Blend(t7, v, 0x33);
+                this.TransposeAvx();
            }
            else
 #endif
            {
-                d.V0L.X = this.V0L.X;
-                d.V1L.X = this.V0L.Y;
-                d.V2L.X = this.V0L.Z;
-                d.V3L.X = this.V0L.W;
-                d.V4L.X = this.V0R.X;
-                d.V5L.X = this.V0R.Y;
-                d.V6L.X = this.V0R.Z;
-                d.V7L.X = this.V0R.W;
-
-                d.V0L.Y = this.V1L.X;
-                d.V1L.Y = this.V1L.Y;
-                d.V2L.Y = this.V1L.Z;
-                d.V3L.Y = this.V1L.W;
-                d.V4L.Y = this.V1R.X;
-                d.V5L.Y = this.V1R.Y;
-                d.V6L.Y = this.V1R.Z;
-                d.V7L.Y = this.V1R.W;
-
-                d.V0L.Z = this.V2L.X;
-                d.V1L.Z = this.V2L.Y;
-                d.V2L.Z = this.V2L.Z;
-                d.V3L.Z = this.V2L.W;
-                d.V4L.Z = this.V2R.X;
-                d.V5L.Z = this.V2R.Y;
-                d.V6L.Z = this.V2R.Z;
-                d.V7L.Z = this.V2R.W;
-
-                d.V0L.W = this.V3L.X;
-                d.V1L.W = this.V3L.Y;
-                d.V2L.W = this.V3L.Z;
-                d.V3L.W = this.V3L.W;
-                d.V4L.W = this.V3R.X;
-                d.V5L.W = this.V3R.Y;
-                d.V6L.W = this.V3R.Z;
-                d.V7L.W = this.V3R.W;
-
-                d.V0R.X = this.V4L.X;
-                d.V1R.X = this.V4L.Y;
-                d.V2R.X = this.V4L.Z;
-                d.V3R.X = this.V4L.W;
-                d.V4R.X = this.V4R.X;
-                d.V5R.X = this.V4R.Y;
-                d.V6R.X = this.V4R.Z;
-                d.V7R.X = this.V4R.W;
-
-                d.V0R.Y = this.V5L.X;
-                d.V1R.Y = this.V5L.Y;
-                d.V2R.Y = this.V5L.Z;
-                d.V3R.Y = this.V5L.W;
-                d.V4R.Y = this.V5R.X;
-                d.V5R.Y = this.V5R.Y;
-                d.V6R.Y = this.V5R.Z;
-                d.V7R.Y = this.V5R.W;
-
-                d.V0R.Z = this.V6L.X;
-                d.V1R.Z = this.V6L.Y;
-                d.V2R.Z = this.V6L.Z;
-                d.V3R.Z = this.V6L.W;
-                d.V4R.Z = this.V6R.X;
-                d.V5R.Z = this.V6R.Y;
-                d.V6R.Z = this.V6R.Z;
-                d.V7R.Z = this.V6R.W;
-
-                d.V0R.W = this.V7L.X;
-                d.V1R.W = this.V7L.Y;
-                d.V2R.W = this.V7L.Z;
-                d.V3R.W = this.V7L.W;
-                d.V4R.W = this.V7R.X;
-                d.V5R.W = this.V7R.Y;
-                d.V6R.W = this.V7R.Z;
-                d.V7R.W = this.V7R.W;
+                this.TransposeScalar();
+            }
+        }
+
+        /// <summary>
+        /// Scalar in-place transpose implementation for <see cref="Transpose"/>.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void TransposeScalar()
+        {
+            float tmp;
+            int horIndex, verIndex;
+
+            // The last row is skipped: by then only its diagonal element is left,
+            // and a diagonal element is never swapped with anything.
+            for (int i = 0; i < 7; i++)
+            {
+                // Start just right of the diagonal: elements left of it were already swapped by earlier rows
+                for (int j = i + 1; j < 8; j++)
+                {
+                    horIndex = (i * 8) + j;
+                    verIndex = (j * 8) + i;
+                    tmp = this[horIndex];
+                    this[horIndex] = this[verIndex];
+                    this[verIndex] = tmp;
+                }
            }
        }

--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            // Dequantize:
            block.MultiplyInPlace(ref this.DequantiazationTable);

-            FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock);
+            FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);

            // To conform better to libjpeg we actually NEED TO loose precision here.
            // This is because they store blocks as Int16 between all the operations.
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        /// </summary>
        private int bitCount;

-        private Block8x8F temporalBlock;
-        private Block8x8 temporalShortBlock;
+        private Block8x8 tempBlock;

        /// <summary>
        /// The output stream. All attempted writes after the first error become no-ops.
@ -130,6 +129,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        public void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
+            // Convert the quantization tables to reciprocal form for the FDCT method (overwrites the caller's tables in place)
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+                chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
+            }
+
            this.huffmanTables = HuffmanLut.TheHuffmanLut;

            // ReSharper disable once InconsistentNaming
@ -190,6 +196,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
+            // Convert the quantization tables to reciprocal form for the FDCT method (overwrites the caller's tables in place)
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+                chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
+            }
+
            this.huffmanTables = HuffmanLut.TheHuffmanLut;

            // ReSharper disable once InconsistentNaming
@ -256,6 +269,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
+            // Convert the quantization table to reciprocal form for the FDCT method (overwrites the caller's table in place)
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+            }
+
            this.huffmanTables = HuffmanLut.TheHuffmanLut;

            // ReSharper disable once InconsistentNaming
@ -301,6 +320,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        public void EncodeRgb<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
+            // Convert the quantization table to reciprocal form for the FDCT method (overwrites the caller's table in place)
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+            }
+
            this.huffmanTables = HuffmanLut.TheHuffmanLut;

            // ReSharper disable once InconsistentNaming
@ -365,14 +390,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            ref Block8x8F block,
            ref Block8x8F quant)
        {
-            ref Block8x8F refTemp = ref this.temporalBlock;
-            ref Block8x8 spectralBlock = ref this.temporalShortBlock;
+            ref Block8x8 spectralBlock = ref this.tempBlock;

            // Shifting level from 0..255 to -128..127
            block.AddInPlace(-128f);

            // Discrete cosine transform
-            FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp);
+            FastFloatingPointDCT.TransformFDCT(ref block);

            // Quantization
            Block8x8F.Quantize(ref block, ref spectralBlock, ref quant);
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@ -0,0 +1,210 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Collections.Generic;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Text;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    internal static partial class FastFloatingPointDCT
+    {
+        /// <summary>
+        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// Current FDCT implementation expects its results to be multiplied by
+        /// a reciprocal quantization table. Values in this table must be divided
+        /// by quantization table values scaled with quality settings.
+        /// </para>
+        /// <para>
+        /// These values were calculated with this formula:
+        /// <code>
+        /// value[row * 8 + col] = 1 / (scalefactor[row] * scalefactor[col] * 8);
+        /// </code>
+        /// Where:
+        /// <code>
+        /// scalefactor[0] = 1
+        /// </code>
+        /// <code>
+        /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+        /// </code>
+        /// Values are also scaled by 8 so DCT code won't do unnecessary division.
+        /// </para>
+        /// </remarks>
+        public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
+        {
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
+            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
+            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
+            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
+            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+        };
+
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+        private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
+        private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
+        private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
+        private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
+
+        private static readonly Vector128<float> mm128_F_0_7071 = Vector128.Create(0.707106781f);
+        private static readonly Vector128<float> mm128_F_0_3826 = Vector128.Create(0.382683433f);
+        private static readonly Vector128<float> mm128_F_0_5411 = Vector128.Create(0.541196100f);
+        private static readonly Vector128<float> mm128_F_1_3065 = Vector128.Create(1.306562965f);
+#pragma warning restore SA1310, SA1311, IDE1006
+
+        /// <summary>
+        /// Apply floating point FDCT inplace using simd operations.
+        /// </summary>
+        /// <param name="block">Input matrix.</param>
+        private static void ForwardTransformSimd(ref Block8x8F block)
+        {
+            DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation.");
+
+            // First pass - process rows
+            block.Transpose();
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_avx(ref block);
+            }
+            else if (Sse.IsSupported)
+            {
+                // Left part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
+
+                // Right part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
+            }
+
+            // Second pass - process columns
+            block.Transpose();
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_avx(ref block);
+            }
+            else if (Sse.IsSupported)
+            {
+                // Left part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
+
+                // Right part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
+            }
+        }
+
+        /// <summary>
+        /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix.
+        /// </summary>
+        /// <remarks>
+        /// Requires Sse support.
+        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
+        /// </remarks>
+        /// <param name="blockRef">Reference to the first vector of the 8x4 part; its eight rows are accessed at a stride of two vectors.</param>
+        public static void FDCT8x4_sse(ref Vector128<float> blockRef)
+        {
+            DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation.");
+
+            Vector128<float> tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
+            Vector128<float> tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
+            Vector128<float> tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
+            Vector128<float> tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
+            Vector128<float> tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
+            Vector128<float> tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
+            Vector128<float> tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
+            Vector128<float> tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
+
+            // Even part
+            Vector128<float> tmp10 = Sse.Add(tmp0, tmp3);
+            Vector128<float> tmp13 = Sse.Subtract(tmp0, tmp3);
+            Vector128<float> tmp11 = Sse.Add(tmp1, tmp2);
+            Vector128<float> tmp12 = Sse.Subtract(tmp1, tmp2);
+
+            Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11);
+            Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11);
+
+            Vector128<float> z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071);
+            Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1);
+            Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1);
+
+            // Odd part
+            tmp10 = Sse.Add(tmp4, tmp5);
+            tmp11 = Sse.Add(tmp5, tmp6);
+            tmp12 = Sse.Add(tmp6, tmp7);
+
+            Vector128<float> z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826);
+            Vector128<float> z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5);
+            Vector128<float> z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5);
+            Vector128<float> z3 = Sse.Multiply(tmp11, mm128_F_0_7071);
+
+            Vector128<float> z11 = Sse.Add(tmp7, z3);
+            Vector128<float> z13 = Sse.Subtract(tmp7, z3);
+
+            Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2);
+            Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2);
+            Unsafe.Add(ref blockRef, 2) = Sse.Add(z11, z4);
+            Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4);
+        }
+
+        /// <summary>
+        /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
+        /// </summary>
+        /// <remarks>
+        /// Requires Avx support.
+        /// </remarks>
+        /// <param name="block">Input matrix.</param>
+        public static void FDCT8x8_avx(ref Block8x8F block)
+        {
+            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
+
+            Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
+            Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
+            Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
+            Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
+            Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
+            Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
+            Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
+            Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
+
+            // Even part
+            Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
+            Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
+            Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
+            Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
+
+            block.V0 = Avx.Add(tmp10, tmp11);
+            block.V4 = Avx.Subtract(tmp10, tmp11);
+
+            Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
+            block.V2 = Avx.Add(tmp13, z1);
+            block.V6 = Avx.Subtract(tmp13, z1);
+
+            // Odd part
+            tmp10 = Avx.Add(tmp4, tmp5);
+            tmp11 = Avx.Add(tmp5, tmp6);
+            tmp12 = Avx.Add(tmp6, tmp7);
+
+            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
+            Vector256<float> z2 = Avx.Add(Avx.Multiply(mm256_F_0_5411, tmp10), z5);
+            Vector256<float> z4 = Avx.Add(Avx.Multiply(mm256_F_1_3065, tmp12), z5);
+            Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
+
+            Vector256<float> z11 = Avx.Add(tmp7, z3);
+            Vector256<float> z13 = Avx.Subtract(tmp7, z3);
+
+            block.V5 = Avx.Add(z13, z2);
+            block.V3 = Avx.Subtract(z13, z2);
+            block.V1 = Avx.Add(z11, z4);
+            block.V7 = Avx.Subtract(z11, z4);
+        }
+    }
+}
+#endif
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@ -46,11 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components

 #if SUPPORTS_RUNTIME_INTRINSICS
        private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
-        private static readonly Vector256<float> C_V_1_3065 = Vector256.Create(1.306563f);
        private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
-        private static readonly Vector256<float> C_V_0_7856 = Vector256.Create(0.785695f);
-        private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
-        private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);

        private static readonly Vector256<float> C_V_n1_9615 = Vector256.Create(-1.961570560f);
        private static readonly Vector256<float> C_V_n0_3901 = Vector256.Create(-0.390180644f);
@ -62,250 +58,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        private static readonly Vector256<float> C_V_1_5013 = Vector256.Create(1.501321110f);
        private static readonly Vector256<float> C_V_n1_8477 = Vector256.Create(-1.847759065f);
        private static readonly Vector256<float> C_V_0_7653 = Vector256.Create(0.765366865f);
-
-        private static readonly Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f);
 #endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
-        private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
-
-        /// <summary>
-        /// Original:
-        /// <see>
-        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
-        /// </see>
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 c0 = s.V0L;
-            Vector4 c1 = s.V7L;
-            Vector4 t0 = c0 + c1;
-            Vector4 t7 = c0 - c1;
-
-            c1 = s.V6L;
-            c0 = s.V1L;
-            Vector4 t1 = c0 + c1;
-            Vector4 t6 = c0 - c1;
-
-            c1 = s.V5L;
-            c0 = s.V2L;
-            Vector4 t2 = c0 + c1;
-            Vector4 t5 = c0 - c1;
-
-            c0 = s.V3L;
-            c1 = s.V4L;
-            Vector4 t3 = c0 + c1;
-            Vector4 t4 = c0 - c1;
-
-            c0 = t0 + t3;
-            Vector4 c3 = t0 - t3;
-            c1 = t1 + t2;
-            Vector4 c2 = t1 - t2;
-
-            d.V0L = c0 + c1;
-            d.V4L = c0 - c1;
-
-            float w0 = 0.541196f;
-            float w1 = 1.306563f;
-
-            d.V2L = (w0 * c2) + (w1 * c3);
-            d.V6L = (w0 * c3) - (w1 * c2);
-
-            w0 = 1.175876f;
-            w1 = 0.785695f;
-            c3 = (w0 * t4) + (w1 * t7);
-            c0 = (w0 * t7) - (w1 * t4);
-
-            w0 = 1.387040f;
-            w1 = 0.275899f;
-            c2 = (w0 * t5) + (w1 * t6);
-            c1 = (w0 * t6) - (w1 * t5);
-
-            d.V3L = c0 - c2;
-            d.V5L = c3 - c1;
-
-            float invsqrt2 = 0.707107f;
-            c0 = (c0 + c2) * invsqrt2;
-            c3 = (c3 + c1) * invsqrt2;
-
-            d.V1L = c0 + c3;
-            d.V7L = c0 - c3;
-        }
-
-        /// <summary>
-        /// Original:
-        /// <see>
-        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
-        /// </see>
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 c0 = s.V0R;
-            Vector4 c1 = s.V7R;
-            Vector4 t0 = c0 + c1;
-            Vector4 t7 = c0 - c1;
-
-            c1 = s.V6R;
-            c0 = s.V1R;
-            Vector4 t1 = c0 + c1;
-            Vector4 t6 = c0 - c1;
-
-            c1 = s.V5R;
-            c0 = s.V2R;
-            Vector4 t2 = c0 + c1;
-            Vector4 t5 = c0 - c1;
-
-            c0 = s.V3R;
-            c1 = s.V4R;
-            Vector4 t3 = c0 + c1;
-            Vector4 t4 = c0 - c1;
-
-            c0 = t0 + t3;
-            Vector4 c3 = t0 - t3;
-            c1 = t1 + t2;
-            Vector4 c2 = t1 - t2;
-
-            d.V0R = c0 + c1;
-            d.V4R = c0 - c1;
-
-            float w0 = 0.541196f;
-            float w1 = 1.306563f;
-
-            d.V2R = (w0 * c2) + (w1 * c3);
-            d.V6R = (w0 * c3) - (w1 * c2);
-
-            w0 = 1.175876f;
-            w1 = 0.785695f;
-            c3 = (w0 * t4) + (w1 * t7);
-            c0 = (w0 * t7) - (w1 * t4);
-
-            w0 = 1.387040f;
-            w1 = 0.275899f;
-            c2 = (w0 * t5) + (w1 * t6);
-            c1 = (w0 * t6) - (w1 * t5);
-
-            d.V3R = c0 - c2;
-            d.V5R = c3 - c1;
-
-            c0 = (c0 + c2) * InvSqrt2;
-            c3 = (c3 + c1) * InvSqrt2;
-
-            d.V1R = c0 + c3;
-            d.V7R = c0 - c3;
-        }
-
-        /// <summary>
-        /// Combined operation of <see cref="FDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="FDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
-        /// using AVX commands.
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
-            Vector256<float> t0 = Avx.Add(s.V0, s.V7);
-            Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
-            Vector256<float> t1 = Avx.Add(s.V1, s.V6);
-            Vector256<float> t6 = Avx.Subtract(s.V1, s.V6);
-            Vector256<float> t2 = Avx.Add(s.V2, s.V5);
-            Vector256<float> t5 = Avx.Subtract(s.V2, s.V5);
-            Vector256<float> t3 = Avx.Add(s.V3, s.V4);
-            Vector256<float> t4 = Avx.Subtract(s.V3, s.V4);
-
-            Vector256<float> c0 = Avx.Add(t0, t3);
-            Vector256<float> c1 = Avx.Add(t1, t2);
-
-            // 0 4
-            d.V0 = Avx.Add(c0, c1);
-            d.V4 = Avx.Subtract(c0, c1);
-
-            Vector256<float> c3 = Avx.Subtract(t0, t3);
-            Vector256<float> c2 = Avx.Subtract(t1, t2);
-
-            // 2 6
-            d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
-            d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411);
-
-            c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856);
-            c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758);
-
-            c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
-            c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870);
-
-            // 3 5
-            d.V3 = Avx.Subtract(c0, c2);
-            d.V5 = Avx.Subtract(c3, c1);
-
-            c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
-            c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
-
-            // 1 7
-            d.V1 = Avx.Add(c0, c3);
-            d.V7 = Avx.Subtract(c0, c3);
-#endif
-        }
-
-        /// <summary>
-        /// Performs 8x8 matrix Forward Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                FDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                FDCT8x4_LeftPart(ref s, ref d);
-                FDCT8x4_RightPart(ref s, ref d);
-            }
-        }
-
-        /// <summary>
-        /// Apply floating point FDCT from src into dest
-        /// </summary>
-        /// <param name="src">Source</param>
-        /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller for optimization</param>
-        public static void TransformFDCT(
-            ref Block8x8F src,
-            ref Block8x8F dest,
-            ref Block8x8F temp)
-        {
-            src.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref dest);
-
-            dest.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref dest);
-
-            dest.MultiplyInPlace(C_0_125);
-        }
-
-        /// <summary>
-        /// Apply floating point FDCT inplace.
-        /// </summary>
-        /// <param name="matrix">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp)
-        {
-            matrix.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref matrix);
-
-            matrix.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref matrix);
-
-            matrix.MultiplyInPlace(C_0_125);
-        }
-
        /// <summary>
        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
        /// </summary>
@ -501,40 +255,148 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        }

        /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
+        /// Apply floating point IDCT in place: two transpose + IDCT8x8 passes, after which the result in the input block is scaled by C_0_125.
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
        /// </summary>
-        /// <param name="src">Source</param>
-        /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
-        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
+        /// <param name="block">Input matrix; it is transposed and then overwritten with the transformed result.</param>
+        /// <param name="temp">Scratch matrix that receives the intermediate (first pass) result; its previous contents are overwritten.</param>
+        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
        {
-            src.TransposeInto(ref temp);
-
-            IDCT8x8(ref temp, ref dest);
-            dest.TransposeInto(ref temp);
-            IDCT8x8(ref temp, ref dest);
+            block.Transpose();
+            IDCT8x8(ref block, ref temp);
+            temp.Transpose();
+            IDCT8x8(ref temp, ref block);

            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
-            dest.MultiplyInPlace(C_0_125);
+            block.MultiplyInPlace(C_0_125);
        }

        /// <summary>
-        /// Apply floating point IDCT inplace.
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
+        /// Apply 2D floating point FDCT in place using scalar operations: a 1D pass over each of the 8 rows followed by a 1D pass over each of the 8 columns.
        /// </summary>
-        /// <param name="matrix">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp)
+        /// <remarks>
+        /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. NOTE(review): no final scaling is applied in this method; presumably the required scale factors are folded into quantization by the caller - confirm.
+        /// </remarks>
+        /// <param name="block">Input matrix; its values are overwritten with the transform output.</param>
+        private static void ForwardTransformScalar(ref Block8x8F block)
        {
-            block.TransposeInto(ref temp);
+            const int dctSize = 8;

-            IDCT8x8(ref temp, ref block);
-            block.TransposeInto(ref temp);
-            IDCT8x8(ref temp, ref block);
+            float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+            float tmp10, tmp11, tmp12, tmp13;
+            float z1, z2, z3, z4, z5, z11, z13;

-            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
-            block.MultiplyInPlace(C_0_125);
+            // First pass - process rows: dataRef reinterprets the block as floats and is advanced by dctSize after each of the 8 iterations
+            ref float dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            for (int ctr = 7; ctr >= 0; ctr--)
+            {
+                tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
+                tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
+                tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
+                tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
+                tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
+                tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
+                tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
+                tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
+
+                // Even part - built from the pairwise sums tmp0..tmp3; writes elements 0, 4, 2 and 6 of the current row
+                tmp10 = tmp0 + tmp3;
+                tmp13 = tmp0 - tmp3;
+                tmp11 = tmp1 + tmp2;
+                tmp12 = tmp1 - tmp2;
+
+                Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
+                Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
+
+                z1 = (tmp12 + tmp13) * 0.707106781f;
+                Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
+                Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
+
+                // Odd part - built from the pairwise differences tmp4..tmp7 (tmp10..tmp12 are reused as scratch); writes elements 5, 3, 1 and 7 of the current row
+                tmp10 = tmp4 + tmp5;
+                tmp11 = tmp5 + tmp6;
+                tmp12 = tmp6 + tmp7;
+
+                z5 = (tmp10 - tmp12) * 0.382683433f;
+                z2 = (0.541196100f * tmp10) + z5;
+                z4 = (1.306562965f * tmp12) + z5;
+                z3 = tmp11 * 0.707106781f;
+
+                z11 = tmp7 + z3;
+                z13 = tmp7 - z3;
+
+                Unsafe.Add(ref dataRef, 5) = z13 + z2;
+                Unsafe.Add(ref dataRef, 3) = z13 - z2;
+                Unsafe.Add(ref dataRef, 1) = z11 + z4;
+                Unsafe.Add(ref dataRef, 7) = z11 - z4;
+
+                dataRef = ref Unsafe.Add(ref dataRef, dctSize);
+            }
+
+            // Second pass - process columns: same arithmetic as the row pass, but elements are dctSize floats apart and dataRef advances by one float per iteration
+            dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            for (int ctr = 7; ctr >= 0; ctr--)
+            {
+                tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
+                tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
+                tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
+                tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
+                tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
+                tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
+                tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
+                tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
+
+                // Even part - writes elements 0, 4, 2 and 6 of the current column
+                tmp10 = tmp0 + tmp3;
+                tmp13 = tmp0 - tmp3;
+                tmp11 = tmp1 + tmp2;
+                tmp12 = tmp1 - tmp2;
+
+                Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
+                Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
+
+                z1 = (tmp12 + tmp13) * 0.707106781f;
+                Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
+                Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
+
+                // Odd part - writes elements 5, 3, 1 and 7 of the current column
+                tmp10 = tmp4 + tmp5;
+                tmp11 = tmp5 + tmp6;
+                tmp12 = tmp6 + tmp7;
+
+                z5 = (tmp10 - tmp12) * 0.382683433f;
+                z2 = (0.541196100f * tmp10) + z5;
+                z4 = (1.306562965f * tmp12) + z5;
+                z3 = tmp11 * 0.707106781f;
+
+                z11 = tmp7 + z3;
+                z13 = tmp7 - z3;
+
+                Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
+                Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
+                Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
+                Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
+
+                dataRef = ref Unsafe.Add(ref dataRef, 1);
+            }
+        }
+
+        /// <summary>
+        /// Apply 2D floating point FDCT in place, dispatching to the SIMD implementation when AVX or SSE is supported (and SUPPORTS_RUNTIME_INTRINSICS is defined) and to the scalar implementation otherwise.
+        /// </summary>
+        /// <param name="block">Matrix to transform; it is passed by reference to the selected implementation, which operates on it in place.</param>
+        public static void TransformFDCT(ref Block8x8F block)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported || Sse.IsSupported)
+            {
+                ForwardTransformSimd(ref block);
+            }
+            else
+#endif
+            {
+                ForwardTransformScalar(ref block);
+            }
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@ -10,10 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
    internal static partial class ZigZag
    {
+#pragma warning disable SA1309 // naming rules violation warnings
        /// <summary>
        /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
        /// </summary>
-        private const byte Z = 0xff;
+        private const byte _ = 0xff;
+#pragma warning restore SA1309

        /// <summary>
        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
@ -22,82 +24,82 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
        {
            // 0_A
-            0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z,
+            0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
            // 0_B
-            Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5,
+            _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
            // 0_C
-            Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,

            // 1_A
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11,
+            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
            // 1_B
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _,
            // 1_C
-            2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z,
+            2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
            // 1_D
-            Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
            // 1_E
-            Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,

            // 2_B
-            8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
            // 2_C
-            Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
            // 2_D
-            Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _,
            // 2_E
-            Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5,
+            _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
            // 2_F
-            Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z,
+            _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
            // 2_G
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _,

            // 3_A
-            Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
            // 3_B
-            Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z,
+            _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
            // 3_C
-            Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z,
+            _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
            // 3_D/4_E
-            6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9,
+            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,

            // 4_F
-            Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z,
+            _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
            // 4_G
-            Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z,
+            _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
            // 4_H
-            Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,

            // 5_B
-            Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
            // 5_C
-            Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
            // 5_D
-            10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z,
+            10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
            // 5_E
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _,
            // 5_F
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _,
            // 5_G
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7,
+            _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,

            // 6_D
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _,
            // 6_E
-            Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z,
+            _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
            // 6_F
-            Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13,
+            _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
            // 6_G
-            Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _,
            // 6_H
-            4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,

            // 7_F
-            Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _,
            // 7_G
-            10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z,
+            10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
            // 7_H
-            Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15
+            _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
        };

        /// <summary>
@ -110,55 +112,55 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,   0, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   6, 0, 0, 0,

                // 01_AB - inner-lane
-                0, 1, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   6, 7, 12, 13,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   4, 5, 6, 7,
+                0, 1, 2, 3,   8, 9, _, _,   10, 11, 4, 5,   6, 7, 12, 13,  _, _, _, _,   _, _, _, _,   _, _, 10, 11,   4, 5, 6, 7,

                // 01_CD/23_GH - cross-lane
-                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,
+                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   _, _, _, _,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   _, _, _, _,

                // 01_CD - inner-lane
-                Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   Z, Z, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, 0, 1,   _, _, _, _,   _, _, _, _,   2, 3, 8, 9,   _, _, 10, 11,   4, 5, _, _,   _, _, _, _,

                // 01_EF - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   0, 1, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   0, 1, _, _,   _, _, _, _,   _, _, _, _,

                // 23_AB/45_CD/67_EF - cross-lane
-                3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,
+                3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   _, _, _, _,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   _, _, _, _,

                // 23_AB - inner-lane
-                4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   2, 3, 8, 9,   Z, Z, Z, Z,
+                4, 5, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   6, 7, 0, 1,   2, 3, 8, 9,   _, _, _, _,

                // 23_CD - inner-lane
-                Z, Z, 6, 7,   12, 13, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 12, 13,
+                _, _, 6, 7,   12, 13, _, _,   _, _, _, _,   _, _, _, _,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   6, 7, 12, 13,

                // 23_EF - inner-lane
-                Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, 2, 3,   8, 9, _, _,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,

                // 23_GH - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, 0, 1,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,

                // 45_AB - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   10, 11, _, _,   _, _, _, _,   _, _, _, _,

                // 45_CD - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   Z, Z, 2, 3,   8, 9, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   6, 7, 0, 1,   _, _, 2, 3,   8, 9, _, _,   _, _, _, _,

                // 45_EF - cross-lane
-                1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   Z, Z, Z, Z,   2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,
+                1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   _, _, _, _,   2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,

                // 45_EF - inner-lane
-                2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,
+                2, 3, 8, 9,   _, _, _, _,   _, _, _, _,   10, 11, 4, 5,  _, _, _, _,   _, _, _, _,   _, _, 2, 3,   8, 9, _, _,

                // 45_GH - inner-lane
-                Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 6, 7,
+                _, _, _, _,   2, 3, 8, 9,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 6, 7,

                // 67_CD - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, 10, 11,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,

                // 67_EF - inner-lane
-                Z, Z, Z, Z,   Z, Z, 6, 7,   0, 1, Z, Z,   2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, 6, 7,   0, 1, _, _,   2, 3, 8, 9,   _, _, _, _,   _, _, _, _,   10, 11, _, _,   _, _, _, _,

                // 67_GH - inner-lane
-                8, 9, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, 6, 7,   12, 13, 14, 15
+                8, 9, 10, 11,   4, 5, _, _,   _, _, _, _,   _, _, _, _,   2, 3, 8, 9,   10, 11, 4, 5,   _, _, 6, 7,   12, 13, 14, 15
        };

        /// <summary>
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@ -12,15 +12,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
        private static readonly Block8x8F Source = Create8x8FloatData();

        [Benchmark]
-        public void TransposeInto()
-        {
-            var dest = default(Block8x8F);
-            Source.TransposeInto(ref dest);
-        }
+        public void TransposeInto() => Source.Transpose();

        private static Block8x8F Create8x8FloatData()
        {
-            var result = new float[64];
+            float[] result = new float[64];
            for (int i = 0; i < 8; i++)
            {
                for (int j = 0; j < 8; j++)
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@ -164,52 +164,27 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
        }

        [Fact]
-        public void TransposeInto()
+        public void Transpose()
        {
            static void RunTest()
            {
                float[] expected = Create8x8FloatData();
                ReferenceImplementations.Transpose8x8(expected);

-                var source = default(Block8x8F);
-                source.LoadFrom(Create8x8FloatData());
+                var block8x8 = default(Block8x8F);
+                block8x8.LoadFrom(Create8x8FloatData());

-                var dest = default(Block8x8F);
-                source.TransposeInto(ref dest);
+                block8x8.Transpose();

                float[] actual = new float[64];
-                dest.ScaledCopyTo(actual);
+                block8x8.ScaledCopyTo(actual);

                Assert.Equal(expected, actual);
            }

            FeatureTestRunner.RunWithHwIntrinsicsFeature(
                RunTest,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX);
-        }
-
-        private class BufferHolder
-        {
-            public Block8x8F Buffer;
-        }
-
-        [Fact]
-        public void TransposeInto_Benchmark()
-        {
-            var source = new BufferHolder();
-            source.Buffer.LoadFrom(Create8x8FloatData());
-            var dest = new BufferHolder();
-
-            this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark X {Times} ...");
-            var sw = Stopwatch.StartNew();
-
-            for (int i = 0; i < Times; i++)
-            {
-                source.Buffer.TransposeInto(ref dest.Buffer);
-            }
-
-            sw.Stop();
-            this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms");
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
        }

        private static float[] Create8x8ColorCropTestData()
@ -281,16 +256,21 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
            Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed);

+            // Reference implementation quantizes given block via division
            Block8x8 expected = default;
            ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);

+            // The current implementation quantizes the block via multiplication,
+            // so the quantization table is first replaced with its reciprocal
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                quant[i] = 1f / quant[i];
+            }
+
            Block8x8 actual = default;
            Block8x8F.Quantize(ref source, ref actual, ref quant);

-            for (int i = 0; i < Block8x8.Size; i++)
-            {
-                Assert.Equal(expected[i], actual[i]);
-            }
+            this.CompareBlocks(expected, actual, 1);
        }

        [Fact]
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@ -2,6 +2,9 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
 #endif
@ -33,15 +36,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            {
                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);

-                var source = Block8x8F.Load(sourceArray);
+                var srcBlock = Block8x8F.Load(sourceArray);

-                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source);
+                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock);

                var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);

-                this.CompareBlocks(expected, actual, 1f);
+                this.CompareBlocks(expected, srcBlock, 1f);
            }

            [Theory]
@ -52,15 +54,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            {
                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);

-                var source = Block8x8F.Load(sourceArray);
+                var srcBlock = Block8x8F.Load(sourceArray);

-                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source);
+                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);

                var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);

-                this.CompareBlocks(expected, actual, 1f);
+                this.CompareBlocks(expected, srcBlock, 1f);
            }

            // Inverse transform
@ -167,8 +168,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                    var srcBlock = default(Block8x8F);
                    srcBlock.LoadFrom(src);

-                    var destBlock = default(Block8x8F);
-
                    var expectedDest = new float[64];
                    var temp1 = new float[64];
                    var temp2 = default(Block8x8F);
@ -177,10 +176,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                    ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);

                    // testee
-                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2);
+                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2);

                    var actualDest = new float[64];
-                    destBlock.ScaledCopyTo(actualDest);
+                    srcBlock.ScaledCopyTo(actualDest);

                    Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
                }
@ -198,95 +197,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            }

            // Forward transform
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void FDCT8x4_LeftPart(int seed)
-            {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
-
-                // testee
-                FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void FDCT8x4_RightPart(int seed)
-            {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
-
-                // testee
-                FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void FDCT8x8_Avx(int seed)
-            {
-#if SUPPORTS_RUNTIME_INTRINSICS
-                var skip = !Avx.IsSupported;
-#else
-                var skip = true;
-#endif
-                if (skip)
-                {
-                    this.Output.WriteLine("No AVX present, skipping test!");
-                    return;
-                }
-
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference, left part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
-
-                // reference, right part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
-
-                // testee, whole 8x8
-                FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
+            // This test covers the entire FDCT conversion chain
+            // and checks all implementations: intrinsic and scalar fallback
            [Theory]
            [InlineData(1)]
            [InlineData(2)]
@ -297,37 +209,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                    int seed = FeatureTestRunner.Deserialize<int>(serialized);

                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                    var srcBlock = default(Block8x8F);
-                    srcBlock.LoadFrom(src);
-
-                    var destBlock = default(Block8x8F);
+                    var block = default(Block8x8F);
+                    block.LoadFrom(src);

-                    var expectedDest = new float[64];
-                    var temp1 = new float[64];
-                    var temp2 = default(Block8x8F);
+                    float[] expectedDest = new float[64];
+                    float[] temp1 = new float[64];

                    // reference
                    ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);

                    // testee
-                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2);
+                    // Part of the FDCT calculations is fused into the quantization step
+                    // We must multiply the transformed block by the reciprocal values from FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients
+                    FastFloatingPointDCT.TransformFDCT(ref block);
+                    for (int i = 0; i < 64; i++)
+                    {
+                        block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i];
+                    }

-                    var actualDest = new float[64];
-                    destBlock.ScaledCopyTo(actualDest);
+                    float[] actualDest = block.ToArray();

-                    Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+                    Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
                }

                // 3 paths:
                // 1. AllowAll - call avx/fma implementation
                // 2. DisableFMA - call avx implementation without fma acceleration
-                // 3. DisableAvx - call fallback code of Vector4 implementation
-                //
-                // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
+                // 3. DisableAvx - call sse implementation
+                // 4. DisableHWIntrinsic - call scalar fallback implementation
                FeatureTestRunner.RunWithHwIntrinsicsFeature(
                    RunTest,
                    seed,
-                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
            }
        }
    }