New zig-zag implementation

5 years ago · 6c5cf28ecd
17 changed files with 627 additions and 375 deletions
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@ -12,7 +12,7 @@ using System.Text;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
    /// <summary>
-    /// 8x8 coefficients matrix of <see cref="short"/> type.
+    /// 8x8 matrix of <see cref="short"/> coefficients.
    /// </summary>
    // ReSharper disable once InconsistentNaming
    [StructLayout(LayoutKind.Explicit)]
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@ -0,0 +1,87 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    internal partial struct Block8x8F
+    {
+        /// <summary>
+        /// The number of rows in a <see cref="Block8x8F"/>, each holding 8 scalar coefficients.
+        /// </summary>
+        public const int RowCount = 8;
+
+        // 256-bit row views of the block: one vector per row of 8 floats, laid out
+        // back to back at 32-byte steps via explicit field offsets.
+        [FieldOffset(0)]
+        public Vector256<float> V0;
+        [FieldOffset(32)]
+        public Vector256<float> V1;
+        [FieldOffset(64)]
+        public Vector256<float> V2;
+        [FieldOffset(96)]
+        public Vector256<float> V3;
+        [FieldOffset(128)]
+        public Vector256<float> V4;
+        [FieldOffset(160)]
+        public Vector256<float> V5;
+        [FieldOffset(192)]
+        public Vector256<float> V6;
+        [FieldOffset(224)]
+        public Vector256<float> V7;
+
+        /// <summary>
+        /// Divides <paramref name="a"/> by <paramref name="b"/> element by element, converts the
+        /// quotients to integers and stores them in <paramref name="dest"/> as 16-bit values
+        /// packed with signed saturation. Requires Avx2.
+        /// </summary>
+        /// <param name="a">The dividend block.</param>
+        /// <param name="b">The divisor block.</param>
+        /// <param name="dest">The block receiving the 64 resulting 16-bit values.</param>
+        private static void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+
+            // Built from constants instead of being loaded from a 'new int[]' backed span:
+            // an int[] initializer exposed as ReadOnlySpan<int> is not guaranteed to be
+            // embedded as static data and may allocate a fresh array on every call.
+            Vector256<int> crossLaneShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
+
+            ref Vector256<float> aBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref a);
+            ref Vector256<float> bBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref b);
+
+            ref Vector256<short> destBase = ref Unsafe.As<Block8x8, Vector256<short>>(ref dest);
+
+            // Two rows of 8 floats are consumed per iteration and produce one vector of 16 shorts.
+            for (int i = 0; i < RowCount; i += 2)
+            {
+                Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+
+                // The 256-bit pack works on each 128-bit lane separately, which leaves the halves
+                // of row0 and row1 interleaved; the permute puts all of row0 before all of row1.
+                Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
+                row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
+
+                Unsafe.Add(ref destBase, i / 2) = row;
+            }
+        }
+
+        /// <summary>
+        /// Divides <paramref name="a"/> by <paramref name="b"/> element by element, converts the
+        /// quotients to integers and stores them in <paramref name="dest"/> as 16-bit values
+        /// packed with signed saturation. Requires Sse2.
+        /// </summary>
+        /// <param name="a">The dividend block.</param>
+        /// <param name="b">The divisor block.</param>
+        /// <param name="dest">The block receiving the 64 resulting 16-bit values.</param>
+        private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
+
+            ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
+            ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
+
+            ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
+
+            // The block is walked as 16 vectors of 4 floats; each pair of them
+            // (left and right half of a row) yields one vector of 8 shorts.
+            for (int i = 0; i < 16; i += 2)
+            {
+                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+
+                Vector128<short> row = Sse2.PackSignedSaturate(left, right);
+                Unsafe.Add(ref destBase, i / 2) = row;
+            }
+        }
+    }
+}
+#endif
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

 using System.Numerics;
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@ -16,7 +16,7 @@ using System.Text;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
    /// <summary>
-    /// 8x8 coefficients matrix of <see cref="float"/> type.
+    /// 8x8 matrix of <see cref="float"/> coefficients.
    /// </summary>
    [StructLayout(LayoutKind.Explicit)]
    internal partial struct Block8x8F : IEquatable<Block8x8F>
@ -66,30 +66,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        public Vector4 V7L;
        [FieldOffset(240)]
        public Vector4 V7R;
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-        /// <summary>
-        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
-        /// </summary>
-        public const int RowCount = 8;
-
-        [FieldOffset(0)]
-        public Vector256<float> V0;
-        [FieldOffset(32)]
-        public Vector256<float> V1;
-        [FieldOffset(64)]
-        public Vector256<float> V2;
-        [FieldOffset(96)]
-        public Vector256<float> V3;
-        [FieldOffset(128)]
-        public Vector256<float> V4;
-        [FieldOffset(160)]
-        public Vector256<float> V5;
-        [FieldOffset(192)]
-        public Vector256<float> V6;
-        [FieldOffset(224)]
-        public Vector256<float> V7;
-#endif
 #pragma warning restore SA1600 // ElementsMustBeDocumented

        /// <summary>
@ -188,13 +164,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            return result;
        }

-        /// <summary>
-        /// Fill the block with defaults (zeroes).
-        /// </summary>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public void Clear()
-            => this = default; // The cheapest way to do this in C#:
-
        /// <summary>
        /// Load raw 32bit floating point data from source.
        /// </summary>
@ -302,7 +271,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components

        public float[] ToArray()
        {
-            var result = new float[Size];
+            float[] result = new float[Size];
            this.ScaledCopyTo(result);
            return result;
        }
@ -434,102 +403,37 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        }

        /// <summary>
-        /// Quantize the block.
-        /// </summary>
-        /// <param name="blockPtr">The block pointer.</param>
-        /// <param name="qtPtr">The qt pointer.</param>
-        /// <param name="unzigPtr">Unzig pointer</param>
-        public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr)
-        {
-            float* b = (float*)blockPtr;
-            float* qtp = (float*)qtPtr;
-            for (int qtIndex = 0; qtIndex < Size; qtIndex++)
-            {
-                byte blockIndex = unzigPtr[qtIndex];
-                float* unzigPos = b + blockIndex;
-
-                float val = *unzigPos;
-                val *= qtp[qtIndex];
-                *unzigPos = val;
-            }
-        }
-
-        /// <summary>
-        /// Quantize 'block' into 'dest' using the 'qt' quantization table:
-        /// Unzig the elements of block into dest, while dividing them by elements of qt and "pre-rounding" the values.
-        /// To finish the rounding it's enough to (int)-cast these values.
+        /// Quantize input block, apply zig-zag ordering and store result as 16bit integers.
        /// </summary>
-        /// <param name="block">Source block</param>
-        /// <param name="dest">Destination block</param>
-        /// <param name="qt">The quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
-        public static unsafe void Quantize(
-            ref Block8x8F block,
-            ref Block8x8F dest,
-            ref Block8x8F qt,
-            ref ZigZag unZig)
+        /// <param name="block">Source block.</param>
+        /// <param name="dest">Destination block.</param>
+        /// <param name="qt">The quantization table.</param>
+        public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt)
        {
-            for (int zig = 0; zig < Size; zig++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
            {
-                dest[zig] = block[unZig[zig]];
+                DivideIntoInt16_Avx2(ref block, ref qt, ref dest);
+                ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest);
            }
-
-            DivideRoundAll(ref dest, ref qt);
-        }
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
+            else if (Ssse3.IsSupported)
            {
-                var vnegOne = Vector256.Create(-1f);
-                var vadd = Vector256.Create(.5F);
-                var vone = Vector256.Create(1f);
-
-                for (int i = 0; i < RowCount; i++)
-                {
-                    ref Vector256<float> aRow = ref Unsafe.Add(ref a.V0, i);
-                    ref Vector256<float> bRow = ref Unsafe.Add(ref b.V0, i);
-                    Vector256<float> voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd);
-                    aRow = Avx.Add(Avx.Divide(aRow, bRow), voff);
-                }
+                DivideIntoInt16_Sse2(ref block, ref qt, ref dest);
+                ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest);
            }
            else
 #endif
            {
-                a.V0L = DivideRound(a.V0L, b.V0L);
-                a.V0R = DivideRound(a.V0R, b.V0R);
-                a.V1L = DivideRound(a.V1L, b.V1L);
-                a.V1R = DivideRound(a.V1R, b.V1R);
-                a.V2L = DivideRound(a.V2L, b.V2L);
-                a.V2R = DivideRound(a.V2R, b.V2R);
-                a.V3L = DivideRound(a.V3L, b.V3L);
-                a.V3R = DivideRound(a.V3R, b.V3R);
-                a.V4L = DivideRound(a.V4L, b.V4L);
-                a.V4R = DivideRound(a.V4R, b.V4R);
-                a.V5L = DivideRound(a.V5L, b.V5L);
-                a.V5R = DivideRound(a.V5R, b.V5R);
-                a.V6L = DivideRound(a.V6L, b.V6L);
-                a.V6R = DivideRound(a.V6R, b.V6R);
-                a.V7L = DivideRound(a.V7L, b.V7L);
-                a.V7R = DivideRound(a.V7R, b.V7R);
+                for (int i = 0; i < Size; i++)
+                {
+                    // TODO: find a way to index block & qt matrices with natural order indices for performance?
+                    int zig = ZigZag.ZigZagOrder[i];
+                    float divRes = block[zig] / qt[zig];
+                    dest[i] = (short)(divRes + (divRes > 0 ? 0.5f : -0.5f));
+                }
            }
        }

-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor)
-        {
-            var neg = new Vector4(-1);
-            var add = new Vector4(.5F);
-
-            // sign(dividend) = max(min(dividend, 1), -1)
-            Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One);
-
-            // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend)
-            return (dividend / divisor) + (sign * add);
-        }
-
        public void RoundInto(ref Block8x8 dest)
        {
            for (int i = 0; i < Size; i++)
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
@ -54,9 +54,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        /// </summary>
        private readonly HuffmanTable[] acHuffmanTables;

-        // The unzig data.
-        private ZigZag dctZigZag;
-
        private HuffmanScanBuffer scanBuffer;

        private readonly SpectralConverter spectralConverter;
@ -74,7 +71,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            SpectralConverter converter,
            CancellationToken cancellationToken)
        {
-            this.dctZigZag = ZigZag.CreateUnzigTable();
            this.stream = stream;
            this.spectralConverter = converter;
            this.cancellationToken = cancellationToken;
@ -477,7 +473,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        {
            ref short blockDataRef = ref Unsafe.As<Block8x8, short>(ref block);
            ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-            ref ZigZag zigzag = ref this.dctZigZag;

            // DC
            int t = buffer.DecodeHuffman(ref dcTable);
@ -502,7 +497,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                {
                    i += r;
                    s = buffer.Receive(s);
-                    Unsafe.Add(ref blockDataRef, zigzag[i++]) = (short)s;
+                    Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s;
                }
                else
                {
@ -556,7 +551,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                }

                ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-                ref ZigZag zigzag = ref this.dctZigZag;
                int start = this.SpectralStart;
                int end = this.SpectralEnd;
                int low = this.SuccessiveLow;
@ -572,7 +566,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                    if (s != 0)
                    {
                        s = buffer.Receive(s);
-                        Unsafe.Add(ref blockDataRef, zigzag[i]) = (short)(s << low);
+                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low);
                    }
                    else
                    {
@ -602,7 +596,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        {
            // Refinement scan for these AC coefficients
            ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-            ref ZigZag zigzag = ref this.dctZigZag;
            int start = this.SpectralStart;
            int end = this.SpectralEnd;

@ -649,7 +642,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

                    do
                    {
-                        ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+                        ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
                        if (coef != 0)
                        {
                            buffer.CheckBits();
@ -675,7 +668,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

                    if ((s != 0) && (k < 64))
                    {
-                        Unsafe.Add(ref blockDataRef, zigzag[k]) = (short)s;
+                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s;
                    }
                }
            }
@ -684,7 +677,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            {
                for (; k <= end; k++)
                {
-                    ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+                    ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);

                    if (coef != 0)
                    {
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
@ -22,7 +22,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        IJpegComponent[] Components { get; }

        /// <summary>
-        /// Gets the quantization tables, in zigzag order.
+        /// Gets the quantization tables, in natural order.
        /// </summary>
        Block8x8F[] QuantizationTables { get; }
    }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@ -46,7 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        public JpegBlockPostProcessor(IRawJpegData decoder, IJpegComponent component)
        {
            int qtIndex = component.QuantizationTableIndex;
-            this.DequantiazationTable = ZigZag.CreateDequantizationTable(ref decoder.QuantizationTables[qtIndex]);
+            this.DequantiazationTable = decoder.QuantizationTables[qtIndex];
            this.subSamplingDivisors = component.SubSamplingDivisors;

            this.SourceBlock = default;
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@ -96,6 +96,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder

        private Block8x8F temporalBlock1;
        private Block8x8F temporalBlock2;
+        private Block8x8 temporalShortBlock;

        /// <summary>
        /// The output stream. All attempted writes after the first error become no-ops.
@ -132,8 +133,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        {
            this.huffmanTables = HuffmanLut.TheHuffmanLut;

-            var unzig = ZigZag.CreateUnzigTable();
-
            // ReSharper disable once InconsistentNaming
            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;

@ -156,22 +155,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                        QuantIndex.Luminance,
                        prevDCY,
                        ref pixelConverter.Y,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);

                    prevDCCb = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCb,
                        ref pixelConverter.Cb,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);

                    prevDCCr = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCr,
                        ref pixelConverter.Cr,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);

                    if (this.IsFlushNeeded)
                    {
@ -197,8 +193,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        {
            this.huffmanTables = HuffmanLut.TheHuffmanLut;

-            var unzig = ZigZag.CreateUnzigTable();
-
            // ReSharper disable once InconsistentNaming
            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
@ -222,30 +216,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                            QuantIndex.Luminance,
                            prevDCY,
                            ref pixelConverter.YLeft,
-                            ref luminanceQuantTable,
-                            ref unzig);
+                            ref luminanceQuantTable);

                        prevDCY = this.WriteBlock(
                            QuantIndex.Luminance,
                            prevDCY,
                            ref pixelConverter.YRight,
-                            ref luminanceQuantTable,
-                            ref unzig);
+                            ref luminanceQuantTable);
                    }

                    prevDCCb = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCb,
                        ref pixelConverter.Cb,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);

                    prevDCCr = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCr,
                        ref pixelConverter.Cr,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);

                    if (this.IsFlushNeeded)
                    {
@ -269,8 +259,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        {
            this.huffmanTables = HuffmanLut.TheHuffmanLut;

-            var unzig = ZigZag.CreateUnzigTable();
-
            // ReSharper disable once InconsistentNaming
            int prevDCY = 0;

@ -292,8 +280,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                        QuantIndex.Luminance,
                        prevDCY,
                        ref pixelConverter.Y,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);

                    if (this.IsFlushNeeded)
                    {
@ -320,28 +307,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            QuantIndex index,
            int prevDC,
            ref Block8x8F src,
-            ref Block8x8F quant,
-            ref ZigZag unZig)
+            ref Block8x8F quant)
        {
            ref Block8x8F refTemp1 = ref this.temporalBlock1;
            ref Block8x8F refTemp2 = ref this.temporalBlock2;
+            ref Block8x8 spectralBlock = ref this.temporalShortBlock;

            FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);

-            Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig);
+            Block8x8F.Quantize(ref refTemp1, ref spectralBlock, ref quant);

            // Emit the DC delta.
-            int dc = (int)refTemp2[0];
+            int dc = spectralBlock[0];
            this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC);

            // Emit the AC components.
            int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;

            int runLength = 0;
-            int lastValuableIndex = refTemp2.GetLastNonZeroIndex();
+            int lastValuableIndex = spectralBlock.GetLastNonZeroIndex();
            for (int zig = 1; zig <= lastValuableIndex; zig++)
            {
-                int ac = (int)refTemp2[zig];
+                int ac = spectralBlock[zig];

                if (ac == 0)
                {
--- a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
@ -39,53 +39,59 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        public const int QualityEstimationConfidenceUpperThreshold = 98;

        /// <summary>
-        /// Gets the unscaled luminance quantization table in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from ITU section K.1 after converting from natural to
-        /// zig-zag order.
+        /// Gets the unscaled luminance quantization table in natural order.
        /// </summary>
+        /// <remarks>
+        /// The values are derived from ITU section K.1.
+        /// </remarks>
        // The C# compiler emits this as a compile-time constant embedded in the PE file.
        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        public static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
+        public static ReadOnlySpan<byte> LuminanceTable => new byte[]
        {
-            16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
-            40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
-            57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
-            109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
-            100, 120, 92, 101, 103, 99,
+            16, 11, 10, 16,  24,  40,  51,  61,
+            12, 12, 14, 19,  26,  58,  60,  55,
+            14, 13, 16, 24,  40,  57,  69,  56,
+            14, 17, 22, 29,  51,  87,  80,  62,
+            18, 22, 37, 56,  68, 109, 103,  77,
+            24, 35, 55, 64,  81, 104, 113,  92,
+            49, 64, 78, 87, 103, 121, 120, 101,
+            72, 92, 95, 98, 112, 100, 103,  99,
        };

        /// <summary>
-        /// Gets the unscaled chrominance quantization table in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from ITU section K.1 after converting from natural to
-        /// zig-zag order.
+        /// Gets the unscaled chrominance quantization table in natural order.
        /// </summary>
+        /// <remarks>
+        /// The values are derived from ITU section K.1.
+        /// </remarks>
        // The C# compiler emits this as a compile-time constant embedded in the PE file.
        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        public static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
+        public static ReadOnlySpan<byte> ChrominanceTable => new byte[]
        {
-            17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
-            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+            17, 18, 24, 47, 99, 99, 99, 99,
+            18, 21, 26, 66, 99, 99, 99, 99,
+            24, 26, 56, 99, 99, 99, 99, 99,
+            47, 66, 99, 99, 99, 99, 99, 99,
+            99, 99, 99, 99, 99, 99, 99, 99,
+            99, 99, 99, 99, 99, 99, 99, 99,
+            99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99,
        };

        /// Ported from JPEGsnoop:
        /// https://github.com/ImpulseAdventure/JPEGsnoop/blob/9732ee0961f100eb69bbff4a0c47438d5997abee/source/JfifDecode.cpp#L4570-L4694
        /// <summary>
-        /// Estimates jpeg quality based on quantization table in zig-zag order.
+        /// Estimates jpeg quality based on standard quantization table.
        /// </summary>
        /// <remarks>
-        /// This technically can be used with any given table but internal decoder code uses ITU spec tables:
-        /// <see cref="UnscaledQuant_Luminance"/> and <see cref="UnscaledQuant_Chrominance"/>.
+        /// Technically, this can be used with any given table but internal decoder code uses ITU spec tables:
+        /// <see cref="LuminanceTable"/> and <see cref="ChrominanceTable"/>.
        /// </remarks>
        /// <param name="table">Input quantization table.</param>
-        /// <param name="target">Quantization to estimate against.</param>
-        /// <returns>Estimated quality</returns>
+        /// <param name="target">Natural order quantization table to estimate against.</param>
+        /// <returns>Estimated quality.</returns>
        public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan<byte> target)
        {
            // This method can be SIMD'ified if standard table is injected as Block8x8F.
@ -106,11 +112,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            int quality;
            for (int i = 0; i < Block8x8F.Size; i++)
            {
-                float coeff = table[i];
-                int coeffInteger = (int)coeff;
+                int coeff = (int)table[i];

                // Coefficients are actually int16 casted to float numbers so there's no truncating error.
-                if (coeffInteger != 0)
+                if (coeff != 0)
                {
                    comparePercent = 100.0 * (table[i] / target[i]);
                }
@ -152,7 +157,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        /// <returns>Estimated quality</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable)
-            => EstimateQuality(ref luminanceTable, UnscaledQuant_Luminance);
+            => EstimateQuality(ref luminanceTable, LuminanceTable);

        /// <summary>
        /// Estimates jpeg quality based on quantization table in zig-zag order.
@ -161,7 +166,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        /// <returns>Estimated quality</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int EstimateChrominanceQuality(ref Block8x8F chrominanceTable)
-            => EstimateQuality(ref chrominanceTable, UnscaledQuant_Chrominance);
+            => EstimateQuality(ref chrominanceTable, ChrominanceTable);

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static int QualityToScale(int quality)
@ -185,10 +190,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static Block8x8F ScaleLuminanceTable(int quality)
-            => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Luminance);
+            => ScaleQuantizationTable(scale: QualityToScale(quality), LuminanceTable);

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static Block8x8F ScaleChrominanceTable(int quality)
-            => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Chrominance);
+            => ScaleQuantizationTable(scale: QualityToScale(quality), ChrominanceTable);
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@ -0,0 +1,404 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    internal static partial class ZigZag
+    {
+        /// <summary>
+        /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
+        /// </summary>
+        /// <remarks>
+        /// The byte shuffles used in this file (<c>Ssse3.Shuffle</c> / <c>Avx2.Shuffle</c>) write zero to a
+        /// destination byte whose control byte has its most significant bit set, which 0xff does.
+        /// NOTE(review): the cross-lane permutation masks also use this value as filler for unused
+        /// 32-bit indices; presumably only the low index bits matter there and the selected data
+        /// is discarded by the following byte shuffle - confirm against the intrinsic documentation.
+        /// </remarks>
+        private const byte Z = 0xff;
+
+        /// <summary>
+        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
+        /// zig zag implementation.
+        /// </summary>
+        /// <remarks>
+        /// The table is a flat sequence of 35 masks of 16 bytes each; mask number <c>n</c> starts at
+        /// byte offset <c>n * 16</c>, so valid mask numbers are 0..34. A mask labelled <c>R_X</c>
+        /// picks, from input row <c>X</c> (A..H = rows 0..7), the bytes that belong to output row
+        /// <c>R</c>; every other position holds <see cref="Z"/>. The <c>3_D/4_E</c> mask is shared
+        /// by two shuffles (row D for output row 3 and row E for output row 4).
+        /// </remarks>
+        private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
+        {
+            // 0_A
+            0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z,
+            // 0_B
+            Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5,
+            // 0_C
+            Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z,
+
+            // 1_A
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11,
+            // 1_B
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z,
+            // 1_C
+            2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z,
+            // 1_D
+            Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 1_E
+            Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+
+            // 2_B
+            8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 2_C
+            Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 2_D
+            Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 2_E
+            Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5,
+            // 2_F
+            Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z,
+            // 2_G
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z,
+
+            // 3_A
+            Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z,
+            // 3_B
+            Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z,
+            // 3_C
+            Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z,
+            // 3_D/4_E
+            6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9,
+
+            // 4_F
+            Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z,
+            // 4_G
+            Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z,
+            // 4_H
+            Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z,
+
+            // 5_B
+            Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 5_C
+            Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 5_D
+            10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z,
+            // 5_E
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z,
+            // 5_F
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z,
+            // 5_G
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7,
+
+            // 6_D
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z,
+            // 6_E
+            Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z,
+            // 6_F
+            Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13,
+            // 6_G
+            Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 6_H
+            4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+
+            // 7_F
+            Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z,
+            // 7_G
+            10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z,
+            // 7_H (mask number 34 - the last entry of the table)
+            Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15
+        };
+
+        /// <summary>
+        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx"/>
+        /// zig zag implementation.
+        /// </summary>
+        /// <remarks>
+        /// The table is a flat sequence of 18 masks of 32 bytes each; mask number <c>n</c> starts at
+        /// byte offset <c>n * 32</c>. Masks labelled "cross-lane" are reinterpreted by the consumer
+        /// as eight 32-bit indices and passed to <c>Avx2.PermuteVar8x32</c>; masks labelled
+        /// "inner-lane" are byte controls passed to <c>Avx2.Shuffle</c>, where <see cref="Z"/>
+        /// marks bytes that must come out as zero. A label such as <c>23_CD</c> reads as
+        /// "output rows 2-3, taken from input rows C-D".
+        /// </remarks>
+        private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
+        {
+                // 01_AB/01_EF/23_CD - cross-lane
+                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,   0, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   6, 0, 0, 0,
+
+                // 01_AB - inner-lane
+                0, 1, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   6, 7, 12, 13,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   4, 5, 6, 7,
+
+                // 01_CD/23_GH - cross-lane
+                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,
+
+                // 01_CD - inner-lane
+                Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   Z, Z, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,
+
+                // 01_EF - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   0, 1, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 23_AB/45_CD/67_EF - cross-lane
+                3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,
+
+                // 23_AB - inner-lane
+                4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   2, 3, 8, 9,   Z, Z, Z, Z,
+
+                // 23_CD - inner-lane
+                Z, Z, 6, 7,   12, 13, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 12, 13,
+
+                // 23_EF - inner-lane
+                Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 23_GH - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 45_AB - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 45_CD - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   Z, Z, 2, 3,   8, 9, Z, Z,   Z, Z, Z, Z,
+
+                // 45_EF/67_GH - cross-lane
+                1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   Z, Z, Z, Z,   2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,
+
+                // 45_EF - inner-lane
+                2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,
+
+                // 45_GH - inner-lane
+                Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 6, 7,
+
+                // 67_CD - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 67_EF - inner-lane
+                Z, Z, Z, Z,   Z, Z, 6, 7,   0, 1, Z, Z,   2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,
+
+                // 67_GH - inner-lane
+                8, 9, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, 6, 7,   12, 13, 14, 15
+        };
+
+        /// <summary>
+        /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
+        /// </summary>
+        /// <remarks>
+        /// <para>Requires Ssse3 support.</para>
+        /// <para>
+        /// Every output row is assembled by shuffling each contributing input row with its own
+        /// 16-byte mask from <see cref="SseShuffleMasks"/> and OR-ing the partial results together.
+        /// Positions a mask does not select are zeroed by the shuffle, so the OR merely merges them.
+        /// </para>
+        /// </remarks>
+        /// <param name="source">Input matrix.</param>
+        /// <param name="dest">Matrix to store the result. Can be a reference to input matrix.</param>
+        public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 source, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
+
+            fixed (byte* maskPtr = SseShuffleMasks)
+            {
+                // All eight input rows are read before anything is written to dest,
+                // which is what allows dest to alias source.
+                Vector128<byte> A = source.V0.AsByte();
+                Vector128<byte> B = source.V1.AsByte();
+                Vector128<byte> C = source.V2.AsByte();
+                Vector128<byte> D = source.V3.AsByte();
+                Vector128<byte> E = source.V4.AsByte();
+                Vector128<byte> F = source.V5.AsByte();
+                Vector128<byte> G = source.V6.AsByte();
+                Vector128<byte> H = source.V7.AsByte();
+
+                // row0 - A0 A1 B0 C0 B1 A2 A3 B2
+                Vector128<short> row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
+                Vector128<short> row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
+                Vector128<short> row0 = Sse2.Or(row0_A, row0_B);
+                Vector128<short> row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
+                row0 = Sse2.Or(row0, row0_C);
+
+                // row1 - C1 D0 E0 D1 C2 B3 A4 A5
+                Vector128<short> row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
+                Vector128<short> row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
+                Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
+                Vector128<short> row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1_C);
+                Vector128<short> row1_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1_D);
+                Vector128<short> row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1_E);
+
+                // row2 - B4 C3 D2 E1 F0 G0 F1 E2
+                Vector128<short> row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
+                Vector128<short> row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
+                Vector128<short> row2 = Sse2.Or(row2_B, row2_C);
+                Vector128<short> row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_D);
+                Vector128<short> row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_E);
+                Vector128<short> row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_F);
+                Vector128<short> row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_G);
+
+                // row3 - D3 C4 B5 A6 A7 B6 C5 D4
+                Vector128<short> A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16();
+                Vector128<short> B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16();
+                Vector128<short> row3 = Sse2.Or(A_3, B_3);
+                Vector128<short> C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
+                row3 = Sse2.Or(row3, C_3);
+
+                // The same mask places D3/D4 into row3 and E3/E4 into row4, so it is loaded once.
+                Vector128<byte> D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
+                Vector128<short> D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16();
+                row3 = Sse2.Or(row3, D_3);
+
+                // row4 - E3 F2 G1 H0 H1 G2 F3 E4
+                Vector128<short> E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16();
+                Vector128<short> F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
+                Vector128<short> row4 = Sse2.Or(E_4, F_4);
+                Vector128<short> G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, G_4);
+                Vector128<short> H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, H_4);
+
+                // row5 - D5 C6 B7 C7 D6 E5 F4 G3
+                Vector128<short> B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
+                Vector128<short> C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
+                Vector128<short> row5 = Sse2.Or(B_5, C_5);
+                Vector128<short> D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, D_5);
+                Vector128<short> E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, E_5);
+                Vector128<short> F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, F_5);
+                Vector128<short> G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, G_5);
+
+                // row6 - H2 H3 G4 F5 E6 D7 E7 F6
+                Vector128<short> D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
+                Vector128<short> E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
+                Vector128<short> row6 = Sse2.Or(D_6, E_6);
+                Vector128<short> F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, F_6);
+                Vector128<short> G_6 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, G_6);
+                Vector128<short> H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, H_6);
+
+                // row7 - G5 H4 H5 G6 F7 G7 H6 H7
+                Vector128<short> F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
+                Vector128<short> G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
+                Vector128<short> row7 = Sse2.Or(F_7, G_7);
+
+                // 7_H is the last of the 35 masks, i.e. index 34; index 35 would read past the table.
+                Vector128<short> H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (34 * 16))).AsInt16();
+                row7 = Sse2.Or(row7, H_7);
+
+                dest.V0 = row0;
+                dest.V1 = row1;
+                dest.V2 = row2;
+                dest.V3 = row3;
+                dest.V4 = row4;
+                dest.V5 = row5;
+                dest.V6 = row6;
+                dest.V7 = row7;
+            }
+        }
+
+        /// <summary>
+        /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
+        /// </summary>
+        /// <remarks>
+        /// <para>Requires Avx2 support.</para>
+        /// <para>
+        /// Input rows are processed in pairs (AB, CD, EF, GH), one pair per 256-bit vector. Each pair
+        /// is first permuted across lanes in 32-bit units so that every needed element sits in the
+        /// lane that will hold it, then byte-shuffled inside each lane into its final position;
+        /// the partial results of all pairs are OR-ed into one output row pair.
+        /// </para>
+        /// </remarks>
+        /// <param name="source">Input matrix.</param>
+        /// <param name="dest">Matrix to store the result. Can be a reference to input matrix.</param>
+        public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 source, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+
+            fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
+            {
+                // 18 loads (shuffle/permutation masks)
+                // 10 cross-lane shuffles (permutations)
+                // 14 shuffles
+                // 10 bitwise or's
+                // 4 stores
+
+                // Notation used below: letters A..H are input rows 0..7, the digit is the column,
+                // "(P Q)" is one 32-bit element holding two coefficients, "|" separates the two
+                // 128-bit lanes and "X" marks a value that is unused or zeroed.
+                // The whole input is read here, before any write to dest, so dest may alias source.
+
+                // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7
+                // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7
+                // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7
+                // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7
+                Vector256<byte> AB = source.V01.AsByte();
+                Vector256<byte> CD = source.V23.AsByte();
+                Vector256<byte> EF = source.V45.AsByte();
+                Vector256<byte> GH = source.V67.AsByte();
+
+                // row01 - A0  A1  B0  C0  B1  A2  A3  B2 | C1  D0  E0  D1  C2  B3  A4  A5
+                Vector256<int> AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
+
+                // row01_AB after permute - (A0 A1) (A2 A3) (B0 B1) (B2 B3) | (X  X)  (A4 A5) (B2 B3) (X  X)
+                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
+                // row01_AB after shuffle - (A0 A1) (B0  X) (B1 A2) (A3 B2) | (X  X)  (X  X)  (X  B3) (A4 A5)
+                row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
+
+                Vector256<int> CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
+
+                // row01_CD after permute - (C0 C1) (X X) (X X) (X X) | (C0 C1) (C2 C3) (D0 D1) (X X)
+                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
+                // row01_CD after shuffle - (X  X)  (X C0) (X X) (X X) | (C1 D0) (X  D1)  (C2 X)  (X X)
+                row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
+
+                // This permutation result is reused for rows 2-3 further down.
+                // row0123_EF after permute - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X)  (X X) (X X)
+                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
+                // row01_EF after shuffle - (X X) (X X) (X X) (X X) | (X  X)  (E0 X) (X X) (X X)
+                Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
+
+                Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
+
+
+                // row23 - B4  C3  D2  E1  F0  G0  F1  E2 | D3  C4  B5  A6  A7  B6  C5  D4
+
+                Vector256<int> AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
+
+                // This permutation result is reused for rows 4-5 further down.
+                // row2345_AB after permute - (A6 A7) (B4 B5) (B6 B7) (X X) | (A6 A7) (B4 B5) (B6 B7) (X X)
+                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
+                // row23_AB after shuffle - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X)
+                Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
+
+                // row23_CD after permute - (X X) (C2 C3) (X X) (D2 D3) | (X X) (C4 C5) (D2 D3) (D4 D5)
+                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
+                // row23_CD after shuffle - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4)
+                row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
+
+                // row23_EF after shuffle - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
+
+                // This permutation result is reused for rows 4-5 further down.
+                // row2345_GH after permute - (G0 G1) (G2 G3) (H0 H1) (X X) | (X X) (G2 G3) (X X) (X X)
+                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
+                // row23_GH after shuffle - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
+
+                Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
+
+
+                // row45 - E3  F2  G1  H0  H1  G2  F3  E4 | D5  C6  B7  C7  D6  E5  F4  G3
+
+                // row45_AB after shuffle - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X)
+                Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
+
+                // This permutation result is reused for rows 6-7 further down.
+                // row4567_CD after permute - (X X) (X X) (D6 D7) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X)
+                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
+                // row45_CD after shuffle - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X)
+                Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
+
+                Vector256<int> EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
+
+                // row45_EF after permute - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (X X) (F4 F5) (X X)
+                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
+                // row45_EF after shuffle - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X)
+                row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
+
+                // row45_GH after shuffle - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3)
+                Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
+
+                Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
+
+
+                // row67 - H2  H3  G4  F5  E6  D7  E7  F6 | G5  H4  H5  G6  F7  G7  H6  H7
+
+                // row67_CD after shuffle - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
+
+                // row67_EF after permute - (E6 E7) (F4 F5) (F6 F7) (X X) | (X X) (X X) (F6 F7) (X X)
+                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
+                // row67_EF after shuffle - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X)
+                row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
+
+                // row67_GH after permute - (X X) (G4 G5) (H2 H3) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7)
+                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
+                // row67_GH after shuffle - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7)
+                row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
+
+                Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
+
+                dest.V01 = row01.AsInt16();
+                dest.V23 = row23.AsInt16();
+                dest.V45 = row45.AsInt16();
+                dest.V67 = row67.AsInt16();
+            }
+        }
+    }
+}
+#endif
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
@ -4,19 +4,17 @@
 using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
-    /// <summary>
-    /// Holds the Jpeg UnZig array in a value/stack type.
-    /// Unzig maps from the zigzag ordering to the natural ordering. For example,
-    /// unzig[3] is the column and row of the fourth element in zigzag order. The
-    /// value is 16, which means first column (16%8 == 0) and third row (16/8 == 2).
-    /// </summary>
-    [StructLayout(LayoutKind.Sequential)]
-    internal unsafe struct ZigZag
+    internal static partial class ZigZag
    {
        /// <summary>
+        /// Gets span of zig-zag ordering indices.
+        /// </summary>
+        /// <remarks>
        /// When reading corrupted data, the Huffman decoders could attempt
        /// to reference an entry beyond the end of this array (if the decoded
        /// zero run length reaches past the end of the block).  To prevent
@ -25,20 +23,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        /// to be stored in location 63 of the block, not somewhere random.
        /// The worst case would be a run-length of 15, which means we need 16
        /// fake entries.
-        /// </summary>
-        private const int Size = 64 + 16;
-
-        /// <summary>
-        /// Copy of <see cref="Unzig"/> in a value type
-        /// </summary>
-        public fixed byte Data[Size];
-
-        /// <summary>
-        /// Gets the unzigs map, which maps from the zigzag ordering to the natural ordering.
-        /// For example, unzig[3] is the column and row of the fourth element in zigzag order.
-        /// The value is 16, which means first column (16%8 == 0) and third row (16/8 == 2).
-        /// </summary>
-        private static ReadOnlySpan<byte> Unzig => new byte[]
+        /// </remarks>
+        public static ReadOnlySpan<byte> ZigZagOrder => new byte[]
        {
            0,  1,  8, 16,  9,  2,  3, 10,
            17, 24, 32, 25, 18, 11,  4,  5,
@ -48,53 +34,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            29, 22, 15, 23, 30, 37, 44, 51,
            58, 59, 52, 45, 38, 31, 39, 46,
            53, 60, 61, 54, 47, 55, 62, 63,
-            63, 63, 63, 63, 63, 63, 63, 63, // Extra entries for safety in decoder
+
+            // Extra entries for safety in decoder
+            63, 63, 63, 63, 63, 63, 63, 63,
            63, 63, 63, 63, 63, 63, 63, 63
        };
-
-        /// <summary>
-        /// Returns the value at the given index
-        /// </summary>
-        /// <param name="idx">The index</param>
-        /// <returns>The <see cref="byte"/></returns>
-        public byte this[int idx]
-        {
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            get
-            {
-                ref byte self = ref Unsafe.As<ZigZag, byte>(ref this);
-                return Unsafe.Add(ref self, idx);
-            }
-        }
-
-        /// <summary>
-        /// Creates and fills an instance of <see cref="ZigZag"/> with Jpeg unzig indices
-        /// </summary>
-        /// <returns>The new instance</returns>
-        public static ZigZag CreateUnzigTable()
-        {
-            ZigZag result = default;
-            ref byte sourceRef = ref MemoryMarshal.GetReference(Unzig);
-            ref byte destinationRef = ref Unsafe.AsRef<byte>(result.Data);
-
-            Unzig.CopyTo(new Span<byte>(result.Data, Size));
-
-            return result;
-        }
-
-        /// <summary>
-        /// Apply Zigging to the given quantization table, so it will be sufficient to multiply blocks for dequantizing them.
-        /// </summary>
-        public static Block8x8F CreateDequantizationTable(ref Block8x8F qt)
-        {
-            Block8x8F result = default;
-
-            for (int i = 0; i < Block8x8F.Size; i++)
-            {
-                result[Unzig[i]] = qt[i];
-            }
-
-            return result;
-        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
@ -740,9 +740,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                        stream.Read(this.temp, 0, 64);
                        remaining -= 64;

+                        // Parsing quantization table & saving it in natural order
                        for (int j = 0; j < 64; j++)
                        {
-                            table[j] = this.temp[j];
+                            table[ZigZag.ZigZagOrder[j]] = this.temp[j];
                        }

                        break;
@ -760,9 +761,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                        stream.Read(this.temp, 0, 128);
                        remaining -= 128;

+                        // Parsing quantization table & saving it in natural order
                        for (int j = 0; j < 64; j++)
                        {
-                            table[j] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1];
+                            table[ZigZag.ZigZagOrder[j]] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1];
                        }

                        break;
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@ -151,7 +151,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
            dqt[offset++] = (byte)i;
            for (int j = 0; j < Block8x8F.Size; j++)
            {
-                dqt[offset++] = (byte)quant[j];
+                dqt[offset++] = (byte)quant[ZigZag.ZigZagOrder[j]];
            }
        }

@ -635,11 +635,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
        /// Initializes quntization tables.
        /// </summary>
        /// <remarks>
+        /// <para>
+        /// Zig-zag ordering is NOT applied to the resulting tables.
+        /// </para>
+        /// <para>
        /// We take quality values in a hierarchical order:
        /// 1. Check if encoder has set quality
-        /// 2. Check if metadata has special table for encoding
-        /// 3. Check if metadata has set quality
-        /// 4. Take default quality value - 75
+        /// 2. Check if metadata has set quality
+        /// 3. Take default quality value - 75
+        /// </para>
        /// </remarks>
        /// <param name="componentCount">Color components count.</param>
        /// <param name="metadata">Jpeg metadata instance.</param>
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@ -272,32 +272,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            this.CompareBlocks(expected, actual, 0);
        }

+        // TODO: intrinsic tests
        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public unsafe void Quantize(int seed)
+        [InlineData(1, 2)]
+        [InlineData(2, 1)]
+        public void Quantize(int srcSeed, int qtSeed)
        {
-            var block = default(Block8x8F);
-            block.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed));
-
-            var qt = default(Block8x8F);
-            qt.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed));
-
-            var unzig = ZigZag.CreateUnzigTable();
+            Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
+            Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed);

-            int* expectedResults = stackalloc int[Block8x8F.Size];
-            ReferenceImplementations.QuantizeRational(&block, expectedResults, &qt, unzig.Data);
+            Block8x8 expected = default;
+            ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);

-            var actualResults = default(Block8x8F);
+            Block8x8 actual = default;
+            Block8x8F.Quantize(ref source, ref actual, ref quant);

-            Block8x8F.Quantize(ref block, ref actualResults, ref qt, ref unzig);
-
-            for (int i = 0; i < Block8x8F.Size; i++)
+            for (int i = 0; i < Block8x8.Size; i++)
            {
-                int expected = expectedResults[i];
-                int actual = (int)actualResults[i];
-
-                Assert.Equal(expected, actual);
+                Assert.Equal(expected[i], actual[i]);
            }
        }

@ -368,48 +360,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX);
        }

-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        [InlineData(3)]
-        public unsafe void DequantizeBlock(int seed)
-        {
-            Block8x8F original = CreateRandomFloatBlock(-500, 500, seed);
-            Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42);
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            Block8x8F expected = original;
-            Block8x8F actual = original;
-
-            ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data);
-            Block8x8F.DequantizeBlock(&actual, &qt, unzig.Data);
-
-            this.CompareBlocks(expected, actual, 0);
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        [InlineData(3)]
-        public unsafe void ZigZag_CreateDequantizationTable_MultiplicationShouldQuantize(int seed)
-        {
-            Block8x8F original = CreateRandomFloatBlock(-500, 500, seed);
-            Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42);
-
-            var unzig = ZigZag.CreateUnzigTable();
-            Block8x8F zigQt = ZigZag.CreateDequantizationTable(ref qt);
-
-            Block8x8F expected = original;
-            Block8x8F actual = original;
-
-            ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data);
-
-            actual.MultiplyInPlace(ref zigQt);
-
-            this.CompareBlocks(expected, actual, 0);
-        }
-
        [Fact]
        public void AddToAllInPlace()
        {
--- a/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs
@ -21,7 +21,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                Block8x8F table = JpegQuantization.ScaleLuminanceTable(quality);
                int estimatedQuality = JpegQuantization.EstimateLuminanceQuality(ref table);

-                Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate luminance quality for standard table at quality level {quality}");
+                Assert.True(
+                    quality.Equals(estimatedQuality),
+                    $"Failed to estimate luminance quality for standard table at quality level {quality}");
            }
        }

@ -35,7 +37,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                Block8x8F table = JpegQuantization.ScaleChrominanceTable(quality);
                int estimatedQuality = JpegQuantization.EstimateChrominanceQuality(ref table);

-                Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate chrominance quality for standard table at quality level {quality}");
+                Assert.True(
+                    quality.Equals(estimatedQuality),
+                    $"Failed to estimate chrominance quality for standard table at quality level {quality}");
            }
        }
    }
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
@ -15,18 +15,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
    /// </summary>
    internal static partial class ReferenceImplementations
    {
-        public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr)
+        public static void DequantizeBlock(ref Block8x8F block, ref Block8x8F qt, ReadOnlySpan<byte> zigzag)
        {
-            float* b = (float*)blockPtr;
-            float* qtp = (float*)qtPtr;
-            for (int qtIndex = 0; qtIndex < Block8x8F.Size; qtIndex++)
+            for (int i = 0; i < Block8x8F.Size; i++)
            {
-                byte i = unzigPtr[qtIndex];
-                float* unzigPos = b + i;
-
-                float val = *unzigPos;
-                val *= qtp[qtIndex];
-                *unzigPos = val;
+                int zig = zigzag[i];
+                block[zig] *= qt[i];
            }
        }

@ -101,42 +95,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils

        /// <summary>
        /// Reference implementation to test <see cref="Block8x8F.Quantize"/>.
-        /// Rounding is done used an integer-based algorithm defined in <see cref="RationalRound(int,int)"/>.
        /// </summary>
-        /// <param name="src">The input block</param>
-        /// <param name="dest">The destination block of integers</param>
-        /// <param name="qt">The quantization table</param>
-        /// <param name="unzigPtr">Pointer to <see cref="ZigZag.Data"/> </param>
-        public static unsafe void QuantizeRational(Block8x8F* src, int* dest, Block8x8F* qt, byte* unzigPtr)
+        /// <param name="src">The input block.</param>
+        /// <param name="dest">The destination block of 16-bit integers.</param>
+        /// <param name="qt">The quantization table.</param>
+        /// <param name="zigzag">Span containing the zig-zag index sequence.</param>
+        public static void Quantize(ref Block8x8F src, ref Block8x8 dest, ref Block8x8F qt, ReadOnlySpan<byte> zigzag)
        {
-            float* s = (float*)src;
-            float* q = (float*)qt;
-
-            for (int zig = 0; zig < Block8x8F.Size; zig++)
+            for (int i = 0; i < Block8x8F.Size; i++)
            {
-                int a = (int)s[unzigPtr[zig]];
-                int b = (int)q[zig];
-
-                int val = RationalRound(a, b);
-                dest[zig] = val;
+                int zig = zigzag[i];
+                dest[i] = (short)Math.Round(src[zig] / qt[zig], MidpointRounding.AwayFromZero);
            }
        }
-
-        /// <summary>
-        /// Rounds a rational number defined as dividend/divisor into an integer.
-        /// </summary>
-        /// <param name="dividend">The dividend.</param>
-        /// <param name="divisor">The divisor.</param>
-        /// <returns>The rounded value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int RationalRound(int dividend, int divisor)
-        {
-            if (dividend >= 0)
-            {
-                return (dividend + (divisor >> 1)) / divisor;
-            }
-
-            return -((-dividend + (divisor >> 1)) / divisor);
-        }
    }
 }
--- a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
@ -13,8 +13,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
        public void ZigZagCanHandleAllPossibleCoefficients()
        {
            // Mimic the behaviour of the huffman scan decoder using all possible byte values
-            var block = new short[64];
-            var zigzag = ZigZag.CreateUnzigTable();
+            short[] block = new short[64];

            for (int h = 0; h < 255; h++)
            {
@ -27,7 +26,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                    if (s != 0)
                    {
                        i += r;
-                        block[zigzag[i++]] = (short)s;
+                        block[ZigZag.ZigZagOrder[i++]] = (short)s;
                    }
                    else
                    {