mirror of https://github.com/SixLabors/ImageSharp
committed by
GitHub
35 changed files with 1977 additions and 1551 deletions
@ -0,0 +1,149 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
    internal partial struct Block8x8F
    {
        /// <summary>
        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
        /// </summary>
        public const int RowCount = 8;

        // AVX-sized views over the block storage: each Vector256<float> holds one
        // 8-float row, placed back to back via explicit field offsets (32 bytes per
        // row) so the same memory can also be addressed through the struct's other
        // overlapping fields declared in another part of this partial struct.
        [FieldOffset(0)]
        public Vector256<float> V0;
        [FieldOffset(32)]
        public Vector256<float> V1;
        [FieldOffset(64)]
        public Vector256<float> V2;
        [FieldOffset(96)]
        public Vector256<float> V3;
        [FieldOffset(128)]
        public Vector256<float> V4;
        [FieldOffset(160)]
        public Vector256<float> V5;
        [FieldOffset(192)]
        public Vector256<float> V6;
        [FieldOffset(224)]
        public Vector256<float> V7;

        // Permutation applied after Avx2.PackSignedSaturate, which packs its two
        // sources per 128-bit lane; this mask restores linear element order.
        private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

        /// <summary>
        /// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise,
        /// converts the products to <see cref="short"/> with signed saturation and
        /// stores them into <paramref name="dest"/>, using AVX2 instructions.
        /// </summary>
        /// <param name="a">Left multiplicand block.</param>
        /// <param name="b">Right multiplicand block.</param>
        /// <param name="dest">Destination block of 16-bit coefficients.</param>
        private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
        {
            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

            ref Vector256<float> aBase = ref a.V0;
            ref Vector256<float> bBase = ref b.V0;

            ref Vector256<short> destRef = ref dest.V01;

            // Each iteration packs two float rows (2 x 8 floats) into one
            // Vector256<short> destination row pair.
            for (nint i = 0; i < 8; i += 2)
            {
                Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
                Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

                // PackSignedSaturate works per 128-bit lane; the variable permute
                // below undoes the resulting lane interleaving.
                Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
                row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();

                Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row;
            }
        }

        /// <summary>
        /// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise,
        /// converts the products to <see cref="short"/> with signed saturation and
        /// stores them into <paramref name="dest"/>, using SSE2 instructions.
        /// </summary>
        /// <param name="a">Left multiplicand block.</param>
        /// <param name="b">Right multiplicand block.</param>
        /// <param name="dest">Destination block of 16-bit coefficients.</param>
        private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
        {
            DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");

            // Reinterpret each 64-float block as a sequence of 16 Vector128<float>
            // (4 floats each); the destination as 8 Vector128<short>.
            ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
            ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);

            ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);

            // Pack two 4-float products into one Vector128<short> per iteration.
            // Unlike the AVX2 path, 128-bit packing keeps elements in order, so no
            // post-shuffle is needed.
            for (int i = 0; i < 16; i += 2)
            {
                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

                Vector128<short> row = Sse2.PackSignedSaturate(left, right);
                Unsafe.Add(ref destBase, (IntPtr)((uint)i / 2)) = row;
            }
        }

        /// <summary>
        /// Transposes the 8x8 matrix in place using AVX instructions.
        /// </summary>
        private void TransposeInplace_Avx()
        {
            // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
            // NOTE(review): V0R/V4L etc. are presumably the Vector4 right/left
            // halves of the corresponding rows, declared in another part of this
            // partial struct — confirm. The InsertVector128 calls gather pairs of
            // half-rows into full 256-bit registers before the unpack/shuffle/blend
            // steps below.
            Vector256<float> r0 = Avx.InsertVector128(
                this.V0,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
                1);

            Vector256<float> r1 = Avx.InsertVector128(
                this.V1,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
                1);

            Vector256<float> r2 = Avx.InsertVector128(
                this.V2,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
                1);

            Vector256<float> r3 = Avx.InsertVector128(
                this.V3,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
                1);

            Vector256<float> r4 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
                1);

            Vector256<float> r5 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
                1);

            Vector256<float> r6 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
                1);

            Vector256<float> r7 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
                1);

            // 4x4 sub-block transpose via unpack + shuffle + blend, two output
            // rows at a time.
            Vector256<float> t0 = Avx.UnpackLow(r0, r1);
            Vector256<float> t2 = Avx.UnpackLow(r2, r3);
            Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
            this.V0 = Avx.Blend(t0, v, 0xCC);
            this.V1 = Avx.Blend(t2, v, 0x33);

            Vector256<float> t4 = Avx.UnpackLow(r4, r5);
            Vector256<float> t6 = Avx.UnpackLow(r6, r7);
            v = Avx.Shuffle(t4, t6, 0x4E);
            this.V4 = Avx.Blend(t4, v, 0xCC);
            this.V5 = Avx.Blend(t6, v, 0x33);

            Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
            Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
            v = Avx.Shuffle(t1, t3, 0x4E);
            this.V2 = Avx.Blend(t1, v, 0xCC);
            this.V3 = Avx.Blend(t3, v, 0x33);

            Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
            Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
            v = Avx.Shuffle(t5, t7, 0x4E);
            this.V6 = Avx.Blend(t5, v, 0xCC);
            this.V7 = Avx.Blend(t7, v, 0x33);
        }
    }
}
|||
#endif
|
|||
@ -0,0 +1,161 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Diagnostics; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
    internal static partial class FastFloatingPointDCT
    {
#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
        // Broadcast scaling constants for the FDCT/IDCT kernels below.
        // NOTE(review): the values appear to match the AAN (Arai/Agui/Nakajima)
        // floating point DCT factorization as used by libjpeg's jfdctflt/jidctflt
        // — confirm before renaming or altering.
        private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
        private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
        private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
        private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);

        private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f);
        private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
        private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
        private static readonly Vector256<float> mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
        private static readonly Vector256<float> mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
        private static readonly Vector256<float> mm256_F_0_2986 = Vector256.Create(0.298631336f);
        private static readonly Vector256<float> mm256_F_2_0531 = Vector256.Create(2.053119869f);
        private static readonly Vector256<float> mm256_F_3_0727 = Vector256.Create(3.072711026f);
        private static readonly Vector256<float> mm256_F_1_5013 = Vector256.Create(1.501321110f);
        private static readonly Vector256<float> mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
        private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
#pragma warning restore SA1310, SA1311, IDE1006

        /// <summary>
        /// Apply floating point FDCT inplace using simd operations.
        /// </summary>
        /// <remarks>
        /// The 2D transform is separable: each pass runs the 1D FDCT on all 8 rows
        /// at once (one Vector256 lane per column), with a transpose before each
        /// pass so both dimensions get processed.
        /// </remarks>
        /// <param name="block">Input matrix.</param>
        private static void ForwardTransform_Avx(ref Block8x8F block)
        {
            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

            // First pass - process rows
            block.TransposeInplace();
            FDCT8x8_Avx(ref block);

            // Second pass - process columns
            block.TransposeInplace();
            FDCT8x8_Avx(ref block);
        }

        /// <summary>
        /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
        /// </summary>
        /// <remarks>
        /// Requires Avx support.
        /// </remarks>
        /// <param name="block">Input matrix.</param>
        public static void FDCT8x8_Avx(ref Block8x8F block)
        {
            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

            // Butterfly stage: sums and differences of mirrored row pairs.
            Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
            Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
            Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
            Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
            Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
            Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
            Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
            Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);

            // Even part
            Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
            Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
            Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
            Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);

            block.V0 = Avx.Add(tmp10, tmp11);
            block.V4 = Avx.Subtract(tmp10, tmp11);

            Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
            block.V2 = Avx.Add(tmp13, z1);
            block.V6 = Avx.Subtract(tmp13, z1);

            // Odd part
            tmp10 = Avx.Add(tmp4, tmp5);
            tmp11 = Avx.Add(tmp5, tmp6);
            tmp12 = Avx.Add(tmp6, tmp7);

            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
            Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
            Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
            Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);

            Vector256<float> z11 = Avx.Add(tmp7, z3);
            Vector256<float> z13 = Avx.Subtract(tmp7, z3);

            block.V5 = Avx.Add(z13, z2);
            block.V3 = Avx.Subtract(z13, z2);
            block.V1 = Avx.Add(z11, z4);
            block.V7 = Avx.Subtract(z11, z4);
        }

        /// <summary>
        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
        /// using AVX commands.
        /// </summary>
        /// <param name="s">Source</param>
        /// <param name="d">Destination</param>
        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
        {
            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");

            // Odd part: rows 1, 3, 5, 7.
            Vector256<float> my1 = s.V1;
            Vector256<float> my7 = s.V7;
            Vector256<float> mz0 = Avx.Add(my1, my7);

            Vector256<float> my3 = s.V3;
            Vector256<float> mz2 = Avx.Add(my3, my7);
            Vector256<float> my5 = s.V5;
            Vector256<float> mz1 = Avx.Add(my3, my5);
            Vector256<float> mz3 = Avx.Add(my1, my5);

            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);

            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
            mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
            mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);

            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);

            // Even part: rows 0, 2, 4, 6.
            Vector256<float> my2 = s.V2;
            Vector256<float> my6 = s.V6;
            mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
            Vector256<float> my0 = s.V0;
            Vector256<float> my4 = s.V4;
            mz0 = Avx.Add(my0, my4);
            mz1 = Avx.Subtract(my0, my4);
            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);

            my0 = Avx.Add(mz0, mz3);
            my3 = Avx.Subtract(mz0, mz3);
            my1 = Avx.Add(mz1, mz2);
            my2 = Avx.Subtract(mz1, mz2);

            // Final butterfly combining even and odd results into the destination.
            d.V0 = Avx.Add(my0, mb0);
            d.V7 = Avx.Subtract(my0, mb0);
            d.V1 = Avx.Add(my1, mb1);
            d.V6 = Avx.Subtract(my1, mb1);
            d.V2 = Avx.Add(my2, mb2);
            d.V5 = Avx.Subtract(my2, mb2);
            d.V3 = Avx.Add(my3, mb3);
            d.V4 = Avx.Subtract(my3, mb3);
        }
    }
}
|||
#endif
|
|||
@ -0,0 +1,300 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
    internal static partial class ZigZag
    {
#pragma warning disable SA1309 // naming rules violation warnings
        /// <summary>
        /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
        /// </summary>
        private const byte _ = 0xff;
#pragma warning restore SA1309

        /// <summary>
        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
        /// zig zag implementation.
        /// </summary>
        private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
        {
            // row0
            0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
            _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,

            // row1
            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
            2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
            _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,

            // row2
            _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
            _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,

            // row3
            _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
            _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
            _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,

            // row4
            _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
            _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
            _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,

            // row5
            _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
            10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,

            // row6
            _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
            _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
            4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,

            // row7
            10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
            _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
        };

        /// <summary>
        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
        /// zig zag implementation.
        /// </summary>
        private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
        {
            // 01_AB/01_EF/23_CD - cross-lane
            0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,

            // 01_AB - inner-lane
            0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,

            // 01_CD/23_GH - cross-lane
            0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,

            // 01_CD - inner-lane
            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,

            // 01_EF - inner-lane
            _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,

            // 23_AB/45_CD/67_EF - cross-lane
            3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,

            // 23_AB - inner-lane
            4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,

            // 23_CD - inner-lane
            _, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,

            // 23_EF - inner-lane
            _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,

            // 23_GH - inner-lane
            _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,

            // 45_AB - inner-lane
            _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,

            // 45_CD - inner-lane
            _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,

            // 45_EF - cross-lane
            1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,

            // 45_EF - inner-lane
            2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,

            // 45_GH - inner-lane
            _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,

            // 67_CD - inner-lane
            _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,

            // 67_EF - inner-lane
            _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,

            // 67_GH - inner-lane
            8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
        };

        /// <summary>
        /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
        /// </summary>
        /// <param name="block">Input matrix.</param>
        public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
        {
            DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");

            fixed (byte* maskPtr = SseShuffleMasks)
            {
                Vector128<byte> rowA = block.V0.AsByte();
                Vector128<byte> rowB = block.V1.AsByte();
                Vector128<byte> rowC = block.V2.AsByte();
                Vector128<byte> rowD = block.V3.AsByte();
                Vector128<byte> rowE = block.V4.AsByte();
                Vector128<byte> rowF = block.V5.AsByte();
                Vector128<byte> rowG = block.V6.AsByte();
                Vector128<byte> rowH = block.V7.AsByte();

                // Each output row is built by shuffling the contributing source
                // rows with masks that zero non-contributing slots (via the `_`
                // sentinel) and OR-ing the results; stray single coefficients are
                // patched in with Extract/Insert.

                // row0 - A0 A1 B0 C0 B1 A2 A3 B2
                Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
                Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
                Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
                Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
                row0 = Sse2.Or(row0, rowC0);

                // row1 - C1 D0 E0 D1 C2 B3 A4 A5
                Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
                Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
                Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
                Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
                row1 = Sse2.Or(row1, rowD1);
                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();

                // row2
                Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
                Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
                Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();

                // row3
                // Fixed: dropped the redundant second .AsInt16() cast present in
                // the original on the next two lines.
                Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16();
                Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16();
                Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
                Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
                row3 = Sse2.Or(row3, rowC3);
                Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
                Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
                row3 = Sse2.Or(row3, rowD3);

                // row4 (reuses the row3 D mask for row E)
                Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
                Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
                Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
                Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
                row4 = Sse2.Or(row4, rowG4);
                Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
                row4 = Sse2.Or(row4, rowH4);

                // row5
                Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
                Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
                Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();

                // row6
                Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
                Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
                Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
                Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
                row6 = Sse2.Or(row6, rowH6);
                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();

                // row7
                Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
                Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
                Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
                row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();

                block.V0 = row0;
                block.V1 = row1;
                block.V2 = row2;
                block.V3 = row3;
                block.V4 = row4;
                block.V5 = row5;
                block.V6 = row6;
                block.V7 = row7;
            }
        }

        /// <summary>
        /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
        /// </summary>
        /// <param name="block">Input matrix.</param>
        public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
        {
            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

            fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
            {
                Vector256<byte> rowsAB = block.V01.AsByte();
                Vector256<byte> rowsCD = block.V23.AsByte();
                Vector256<byte> rowsEF = block.V45.AsByte();
                Vector256<byte> rowsGH = block.V67.AsByte();

                // Pattern per output row pair: PermuteVar8x32 moves 32-bit chunks
                // across 128-bit lanes, Shuffle reorders bytes within lanes, and
                // the per-source results are OR-ed together.
                // Cleanup vs. original: redundant .AsByte() casts removed — both
                // Avx.LoadVector256(byte*) and Avx2.Shuffle(byte, byte) already
                // produce Vector256<byte>.

                // rows 0 1
                Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32)));

                Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)));

                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32)));

                Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);

                // rows 2 3
                Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32)));

                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32)));

                Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32)));

                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)));

                Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));

                // rows 4 5
                Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)));
                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)));

                Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)));

                Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)));

                Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));

                // rows 6 7
                Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)));

                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)));

                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)));

                Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);

                block.V01 = row01.AsInt16();
                block.V23 = row23.AsInt16();
                block.V45 = row45.AsInt16();
                block.V67 = row67.AsInt16();
            }
        }
    }
}
|||
#endif
|
|||
@ -0,0 +1,50 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
    /// <summary>
    /// Benchmarks <see cref="Block8x8F"/> quantization under the scalar, SSE and
    /// AVX hardware-intrinsics job configurations.
    /// </summary>
    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
    public class Block8x8F_Quantize
    {
        private Block8x8F block = FilledWith(1);
        private Block8x8F quant = FilledWith(1);
        private Block8x8 result = default;

        [Benchmark]
        public short Quantize()
        {
            Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant);

            // Return one coefficient so the call cannot be eliminated as dead code.
            return this.result[0];
        }

        // Builds a block whose 64 coefficients all equal the given value.
        private static Block8x8F FilledWith(float value)
        {
            Block8x8F filled = default;
            int index = 0;
            while (index < 64)
            {
                filled[index] = value;
                index++;
            }

            return filled;
        }
    }
}
|||
|
|||
/* |
|||
BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) |
|||
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores |
|||
.NET SDK=6.0.100-preview.3.21202.5 |
|||
[Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
|
|||
| Method | Job | Mean | Error | StdDev | Ratio | |
|||
|--------- |-----------------|---------:|---------:|---------:|------:| |
|||
| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns | 1.00 | |
|||
| Quantize | SSE | 24.11 ns | 0.298 ns | 0.279 ns | 0.33 | |
|||
| Quantize | AVX | 15.90 ns | 0.074 ns | 0.065 ns | 0.22 | |
|||
*/ |
|||
Loading…
Reference in new issue