From e83cb95cb377df22d51ec0f95cb4ebf1ce122d27 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 17 Aug 2021 12:24:37 +0300
Subject: [PATCH 01/56] Moved stuff bytes injection to outer method

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 46 +++++++++++--------
 1 file changed, 27 insertions(+), 19 deletions(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 331da275c..778d6ccd8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -35,6 +35,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
 
+        private readonly byte[] streamWriteBuffer = new byte[EmitBufferSizeInBytes * 2];
+
+        private const int BytesPerCodingUnit = 256 * 3;
+
         /// <summary>
         /// Number of filled bytes in <see cref="emitBuffer"/> buffer
         /// </summary>
@@ -116,6 +120,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref pixelConverter.Cr,
                         ref chrominanceQuantTable,
                         ref unzig);
+
+                    if (this.emitLen + BytesPerCodingUnit > EmitBufferSizeInBytes)
+                    {
+                        this.WriteToStream();
+                    }
                 }
             }
 
@@ -326,28 +335,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     byte b = (byte)(bits >> 24);
                     this.emitBuffer[this.emitLen++] = b;
 
-                    // Adding stuff byte
-                    // This is because by JPEG standard scan data can contain JPEG markers (indicated by the 0xFF byte, followed by a non-zero byte)
-                    // Considering this every 0xFF byte must be followed by 0x00 padding byte to signal that this is not a marker
-                    if (b == byte.MaxValue)
-                    {
-                        this.emitBuffer[this.emitLen++] = byte.MinValue;
-                    }
-
                     bits <<= 8;
                     count -= 8;
                 }
-
-                // This can emit 4 times of:
-                // 1 byte guaranteed
-                // 1 extra byte.MinValue byte if previous one was byte.MaxValue
-                // Thus writing (1 + 1) * 4 = 8 bytes max
-                // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write
-                if (this.emitLen > EmitBufferSizeInBytes - 8)
-                {
-                    this.target.Write(this.emitBuffer, 0, this.emitLen);
-                    this.emitLen = 0;
-                }
             }
 
             this.accumulatedBits = bits;
@@ -520,5 +510,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 return index;
             }
         }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void WriteToStream()
+        {
+            int writeIdx = 0;
+            for (int i = 0; i < this.emitLen; i++)
+            {
+                byte value = this.emitBuffer[i];
+                this.streamWriteBuffer[writeIdx++] = value;
+                if (value == 0xff)
+                {
+                    this.streamWriteBuffer[writeIdx++] = 0x00;
+                }
+            }
+
+            this.target.Write(this.streamWriteBuffer, 0, writeIdx);
+            this.emitLen = 0;
+        }
     }
 }

From 739f5206404715ada29c62f5881cbdfb044f1232 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 17 Aug 2021 13:27:37 +0300
Subject: [PATCH 02/56] Optimized byte emission, output images are corrupted due
 to invalid msb-lsb order

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 55 +++++++++----------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 778d6ccd8..10eda9c5a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -1,8 +1,10 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
 using System.IO;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -33,7 +35,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// A buffer for reducing the number of stream writes when emitting Huffman tables.
         /// </summary>
-        private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
+        private readonly uint[] emitBuffer = new uint[EmitBufferSizeInBytes / 4];
 
         private readonly byte[] streamWriteBuffer = new byte[EmitBufferSizeInBytes * 2];
 
@@ -47,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Emmited bits 'micro buffer' before being transfered to the <see cref="emitBuffer"/>.
         /// </summary>
-        private int accumulatedBits;
+        private uint accumulatedBits;
 
         /// <summary>
         /// Number of jagged bits stored in <see cref="accumulatedBits"/>
@@ -121,7 +123,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref chrominanceQuantTable,
                         ref unzig);
 
-                    if (this.emitLen + BytesPerCodingUnit > EmitBufferSizeInBytes)
+                    if (this.emitLen + (BytesPerCodingUnit / 4) > EmitBufferSizeInBytes / 4)
                     {
                         this.WriteToStream();
                     }
@@ -320,27 +322,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="bits">The packed bits.</param>
         /// <param name="count">The number of bits</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void Emit(int bits, int count)
+        private void Emit(uint bits, int count)
         {
+            uint correctedBits = bits << (32 - count);
+
+            this.accumulatedBits |= correctedBits >> this.bitCount;
+
             count += this.bitCount;
-            bits <<= 32 - count;
-            bits |= this.accumulatedBits;
 
-            // Only write if more than 8 bits.
-            if (count >= 8)
+            if (count >= 32)
             {
-                // Track length
-                while (count >= 8)
-                {
-                    byte b = (byte)(bits >> 24);
-                    this.emitBuffer[this.emitLen++] = b;
+                this.emitBuffer[this.emitLen++] = this.accumulatedBits;
+                this.accumulatedBits = correctedBits << (32 - this.bitCount);
 
-                    bits <<= 8;
-                    count -= 8;
-                }
+                count -= 32;
             }
 
-            this.accumulatedBits = bits;
             this.bitCount = count;
         }
 
@@ -353,7 +350,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private void EmitHuff(int[] table, int value)
         {
             int x = table[value];
-            this.Emit(x >> 8, x & 0xff);
+            this.Emit((uint)x >> 8, x & 0xff);
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -372,7 +369,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.EmitHuff(table, bt);
             if (bt > 0)
             {
-                this.Emit(b & ((1 << bt) - 1), bt);
+                this.Emit((uint)(b & ((1 << bt) - 1)), bt);
             }
         }
 
@@ -396,7 +393,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int bt = GetHuffmanEncodingLength((uint)a);
 
             this.EmitHuff(table, (runLength << 4) | bt);
-            this.Emit(b & ((1 << bt) - 1), bt);
+            this.Emit((uint)(b & ((1 << bt) - 1)), bt);
         }
 
         /// <summary>
@@ -406,12 +403,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private void FlushInternalBuffer()
         {
             // pad last byte with 1's
-            int padBitsCount = 8 - (this.bitCount % 8);
-            if (padBitsCount != 0)
-            {
-                this.Emit((1 << padBitsCount) - 1, padBitsCount);
-                this.target.Write(this.emitBuffer, 0, this.emitLen);
-            }
+            //int padBitsCount = 8 - (this.bitCount % 8);
+            //if (padBitsCount != 0)
+            //{
+            //    this.Emit((1 << padBitsCount) - 1, padBitsCount);
+            //    this.target.Write(this.emitBuffer, 0, this.emitLen);
+            //}
         }
 
         /// <summary>
@@ -514,10 +511,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(InliningOptions.ShortMethod)]
         private void WriteToStream()
         {
+            Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
+
             int writeIdx = 0;
-            for (int i = 0; i < this.emitLen; i++)
+            for (int i = 0; i < this.emitLen * 4; i++)
             {
-                byte value = this.emitBuffer[i];
+                byte value = emitBytes[i];
                 this.streamWriteBuffer[writeIdx++] = value;
                 if (value == 0xff)
                 {

From 8a08259e09bfc92fb4b925834807cd2b712f730b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 18 Aug 2021 11:47:52 +0300
Subject: [PATCH 03/56] Fixed byte flush order, fixed last byte padding

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 45 +++++++++++++++----
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 10eda9c5a..42a683539 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -41,10 +41,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
         private const int BytesPerCodingUnit = 256 * 3;
 
-        /// <summary>
-        /// Number of filled bytes in <see cref="emitBuffer"/> buffer
-        /// </summary>
-        private int emitLen = 0;
+        private int emitWriteIndex = (EmitBufferSizeInBytes / 4);
 
         /// <summary>
         /// Emmited bits 'micro buffer' before being transfered to the <see cref="emitBuffer"/>.
@@ -123,14 +120,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref chrominanceQuantTable,
                         ref unzig);
 
-                    if (this.emitLen + (BytesPerCodingUnit / 4) > EmitBufferSizeInBytes / 4)
+                    if (this.emitWriteIndex < this.emitBuffer.Length / 2)
                     {
                         this.WriteToStream();
                     }
                 }
             }
 
-            this.FlushInternalBuffer();
+            this.EmitFinalBits();
         }
 
         /// <summary>
@@ -311,6 +308,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             return dc;
         }
 
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void EmitFinalBits()
+        {
+            // Bytes count we want to write to the output stream
+            int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
+
+            // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits
+            uint packedBytes = (this.accumulatedBits | (uint.MaxValue >> this.bitCount)) >> ((4 - valuableBytesCount) * 8);
+
+            // 2x size due to possible stuff bytes, max out to 8
+            Span<byte> tempBuffer = stackalloc byte[valuableBytesCount * 2];
+
+            // Write bytes to temporal buffer
+            int writeCount = 0;
+            for (int i = 0; i < valuableBytesCount; i++)
+            {
+                byte value = (byte)(packedBytes >> (i * 8));
+                tempBuffer[writeCount++] = value;
+                if (value == 0xff)
+                {
+                    tempBuffer[writeCount++] = 0;
+                }
+            }
+
+            // Write temporal buffer to the output stream
+            this.target.Write(tempBuffer, 0, writeCount);
+        }
+
         /// <summary>
         /// Emits the least significant count of bits to the stream write buffer.
         /// The precondition is bits
@@ -332,7 +357,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             if (count >= 32)
             {
-                this.emitBuffer[this.emitLen++] = this.accumulatedBits;
+                this.emitBuffer[--this.emitWriteIndex] = this.accumulatedBits;
                 this.accumulatedBits = correctedBits << (32 - this.bitCount);
 
                 count -= 32;
@@ -514,7 +539,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
 
             int writeIdx = 0;
-            for (int i = 0; i < this.emitLen * 4; i++)
+            int start = emitBytes.Length - 1;
+            int end = (this.emitWriteIndex * 4) - 1;
+            for (int i = start; i > end; i--)
             {
                 byte value = emitBytes[i];
                 this.streamWriteBuffer[writeIdx++] = value;
@@ -525,7 +552,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
 
             this.target.Write(this.streamWriteBuffer, 0, writeIdx);
-            this.emitLen = 0;
+            this.emitWriteIndex = this.emitBuffer.Length;
         }
     }
 }

From 4c14c57d09aa9d115cf881cafef5f70ba99c035c Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 18 Aug 2021 15:11:15 +0300
Subject: [PATCH 04/56] Greatly reduced operations per emit call

---
 .../Jpeg/Components/Encoder/HuffmanLut.cs     |  3 ++-
 .../Components/Encoder/HuffmanScanEncoder.cs  | 23 ++++++++-----------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
index ec77bf87d..f563e74e0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
@@ -4,6 +4,7 @@
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     /// <summary>
+    /// TODO: THIS IS NO LONGER TRUE, INTERNAL REPRESENTATION WAS CHANGED AND THIS DOC SHOULD BE CHANGED TOO!!!
     /// A compiled look-up table representation of a huffmanSpec.
     /// Each value maps to a int32 of which the 24 most significant bits hold the
     /// codeword in bits and the 8 least significant bits hold the codeword size.
@@ -54,7 +55,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 int len = i + 1;
                 for (int j = 0; j < spec.Count[i]; j++)
                 {
-                    this.Values[spec.Values[k]] = len | (code << 8);
+                    this.Values[spec.Values[k]] = len | (code << (32 - len));
                     code++;
                     k++;
                 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 42a683539..fba814882 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -349,16 +349,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(InliningOptions.ShortMethod)]
         private void Emit(uint bits, int count)
         {
-            uint correctedBits = bits << (32 - count);
-
-            this.accumulatedBits |= correctedBits >> this.bitCount;
+            this.accumulatedBits |= bits >> this.bitCount;
 
             count += this.bitCount;
 
             if (count >= 32)
             {
                 this.emitBuffer[--this.emitWriteIndex] = this.accumulatedBits;
-                this.accumulatedBits = correctedBits << (32 - this.bitCount);
+                this.accumulatedBits = bits << (32 - this.bitCount);
 
                 count -= 32;
             }
@@ -375,7 +373,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private void EmitHuff(int[] table, int value)
         {
             int x = table[value];
-            this.Emit((uint)x >> 8, x & 0xff);
+            this.Emit((uint)x & 0xffff_ff00u, x & 0xff);
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -389,13 +387,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 b = value - 1;
             }
 
-            int bt = GetHuffmanEncodingLength((uint)a);
+            int valueLen = GetHuffmanEncodingLength((uint)a);
 
-            this.EmitHuff(table, bt);
-            if (bt > 0)
-            {
-                this.Emit((uint)(b & ((1 << bt) - 1)), bt);
-            }
+            this.EmitHuff(table, valueLen);
+            this.Emit((uint)b << (32 - valueLen), valueLen);
         }
 
         /// <summary>
@@ -415,10 +410,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 b = value - 1;
             }
 
-            int bt = GetHuffmanEncodingLength((uint)a);
+            int valueLen = GetHuffmanEncodingLength((uint)a);
 
-            this.EmitHuff(table, (runLength << 4) | bt);
-            this.Emit((uint)(b & ((1 << bt) - 1)), bt);
+            this.EmitHuff(table, (runLength << 4) | valueLen);
+            this.Emit((uint)b << (32 - valueLen), valueLen);
         }
 
         /// <summary>

From c39a20326b991ed767204c61f88c15915fe24a27 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 20 Aug 2021 12:47:33 +0300
Subject: [PATCH 05/56] Merged huffman prefix & value Emit() calls

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 31 +++++++------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index fba814882..8289a4b3c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -269,7 +269,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             // Emit the DC delta.
             int dc = (int)refTemp2[0];
-            this.EmitDirectCurrentTerm(this.huffmanTables[2 * (int)index].Values, dc - prevDC);
+            this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC);
 
             // Emit the AC components.
             int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;
@@ -376,23 +376,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.Emit((uint)x & 0xffff_ff00u, x & 0xff);
         }
 
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitDirectCurrentTerm(int[] table, int value)
-        {
-            int a = value;
-            int b = value;
-            if (a < 0)
-            {
-                a = -value;
-                b = value - 1;
-            }
-
-            int valueLen = GetHuffmanEncodingLength((uint)a);
-
-            this.EmitHuff(table, valueLen);
-            this.Emit((uint)b << (32 - valueLen), valueLen);
-        }
-
         /// <summary>
         /// Emits a run of runLength copies of value encoded with the given Huffman encoder.
         /// </summary>
@@ -412,8 +395,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             int valueLen = GetHuffmanEncodingLength((uint)a);
 
-            this.EmitHuff(table, (runLength << 4) | valueLen);
-            this.Emit((uint)b << (32 - valueLen), valueLen);
+            // Huffman prefix code
+            int huffPackage = table[(runLength << 4) | valueLen];
+            int prefixLen = huffPackage & 0xff;
+            uint prefix = (uint)huffPackage & 0xffff_0000u;
+
+            // Actual encoded value
+            uint encodedValue = (uint)b << (32 - valueLen);
+
+            // Doing two binary shifts to get rid of leading 1's in negative value case
+            this.Emit(prefix | (encodedValue >> prefixLen), prefixLen + valueLen);
         }
 
         /// <summary>

From 93044e4de00e5cad9fa776692223634742064411 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 23 Aug 2021 10:59:59 +0300
Subject: [PATCH 06/56] Sandbox code & results

---
 .../Program.cs                                | 93 +++++++++++++++++--
 1 file changed, 85 insertions(+), 8 deletions(-)

diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
index e6e82b981..d4656f8be 100644
--- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
+++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
@@ -2,6 +2,9 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Diagnostics;
+using System.IO;
+using SixLabors.ImageSharp.Formats.Jpeg;
 using SixLabors.ImageSharp.Tests.Formats.Jpg;
 using SixLabors.ImageSharp.Tests.PixelFormats.PixelOperations;
 using SixLabors.ImageSharp.Tests.ProfilingBenchmarks;
@@ -31,14 +34,88 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
         /// </param>
         public static void Main(string[] args)
         {
-            LoadResizeSaveParallelMemoryStress.Run();
-            // RunJpegEncoderProfilingTests();
-            // RunJpegColorProfilingTests();
-            // RunDecodeJpegProfilingTests();
-            // RunToVector4ProfilingTest();
-            // RunResizeProfilingTest();
-
-            // Console.ReadLine();
+            /* Master */
+            // Elapsed: 5431ms across 200 iterations
+            // Average: 27,155ms
+
+            /* Inserting stuff bytes later */
+            // Elapsed: 5300ms across 200 iterations
+            // Average: 26,5ms
+
+            /* Flush if check */
+            // Elapsed: 5209ms across 200 iterations
+            // Average: 26,045ms
+
+            /* [INVALID] int32 flush - invalid flush order */
+            // Elapsed: 4784ms across 200 iterations
+            // Average: 23,92ms
+
+            /* int32 flush - correct flush order */
+            // Elapsed: 5049ms across 200 iterations
+            // Average: 25,245ms
+
+            /* int32 flush - identical file output */
+            // Elapsed: 4800ms across 200 iterations
+            // Average: 24.00ms
+
+            /* int32 flush - optimized huffman storage & reduced instructions per Emit() */
+            // Elapsed: 4680ms across 200 iterations
+            // Average: 23,4ms
+
+            /* int32 flush - merged prefix & value Emit() call */
+            // Elapsed: 4644ms across 200 iterations
+            // Average: 23,22ms
+
+            BenchmarkEncoder("uniform_size", 200, 100);
+
+            //ReEncodeImage("uniform_size", 100);
+
+            Console.WriteLine("Done.");
+        }
+
+        const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg";
+
+        private static void BenchmarkEncoder(string fileName, int iterations, int quality)
+        {
+            string loadPath = String.Format(pathTemplate, fileName);
+
+            using var saveStream = new MemoryStream();
+
+            var decoder = new JpegDecoder { IgnoreMetadata = true };
+            using Image img = decoder.Decode(Configuration.Default, new FileStream(loadPath, FileMode.Open));
+
+            var encoder = new JpegEncoder()
+            {
+                Quality = quality,
+                ColorType = JpegColorType.YCbCr,
+                Subsample = JpegSubsample.Ratio444
+            };
+
+            Stopwatch sw = new Stopwatch();
+            sw.Start();
+            for (int i = 0; i < iterations; i++)
+            {
+                img.SaveAsJpeg(saveStream, encoder);
+                saveStream.Position = 0;
+            }
+            sw.Stop();
+
+            Console.WriteLine($"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n// Average: {(double)sw.ElapsedMilliseconds / iterations}ms");
+        }
+
+        private static void ReEncodeImage(string fileName, int quality)
+        {
+            string loadPath = String.Format(pathTemplate, fileName);
+            using Image img = Image.Load(loadPath);
+
+            string savePath = String.Format(pathTemplate, $"testSave_{fileName}");
+            var encoder = new JpegEncoder()
+            {
+                Quality = quality,
+                ColorType = JpegColorType.YCbCr,
+                Subsample = JpegSubsample.Ratio444
+            };
+            img.SaveAsJpeg(savePath, encoder);
         }
 
         private static void RunJpegEncoderProfilingTests()

From cc45eed3a1eeace81581eaba6a22f878d2bcc08d Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 23 Aug 2021 11:02:41 +0300
Subject: [PATCH 07/56] Fixed last valuable index logic

---
 .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 8289a4b3c..d8ea6bb0e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -482,7 +482,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
                 for (int i = 7; i >= 0; i--)
                 {
-                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
+                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
 
                     // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros
                     if (areEqual != equalityMask)

From 937a8689ba3bf5dfdd41061c82c26f2fb652442d Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 23 Aug 2021 11:30:11 +0300
Subject: [PATCH 08/56] Optimized last valuable index calculation via lzcnt intrinsic

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 38 +++++++++----------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index d8ea6bb0e..373475f6b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -3,6 +3,7 @@
 
 using System;
 using System.IO;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -441,7 +442,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // Lzcnt would return 32 for input value of 0 - no need to check that with branching
             // Fallback code if Lzcnt is not supported still use if-check
             // But most modern CPUs support this instruction so this should not be a problem
-            return 32 - System.Numerics.BitOperations.LeadingZeroCount(value);
+            return 32 - BitOperations.LeadingZeroCount(value);
 #else
             // Ideally:
             // if 0 - return 0 in this case
@@ -458,13 +459,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         /// <summary>
-        /// Returns index of the last non-zero element in given mcu block.
-        /// If all values of the mcu block are zero, this method might return different results depending on the runtime and hardware support.
-        /// This is jpeg mcu specific code, mcu[0] stores a dc value which will be encoded outside of the loop.
-        /// This method is guaranteed to return either -1 or 0 if all elements are zero.
+        /// Returns index of the last non-zero element in given matrix.
         /// </summary>
         /// <remarks>
-        /// This is an internal operation supposed to be used only in <see cref="HuffmanScanEncoder"/> class for jpeg encoding.
+        /// Returns 0 for all-zero matrix by convention.
         /// </remarks>
         /// <param name="mcu">Mcu block.</param>
         /// <returns>Index of the last non-zero element.</returns>
@@ -484,24 +482,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 {
                     int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
 
-                    // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros
                     if (areEqual != equalityMask)
                     {
-                        // last index in the stride, we go from the end to the start of the stride
-                        int startIndex = i * 8;
-                        int index = startIndex + 7;
-                        ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref mcu);
-                        while (index >= startIndex && (int)Unsafe.Add(ref elemRef, index) == 0)
-                        {
-                            index--;
-                        }
-
-                        // this implementation will return -1 if all ac components are zero and dc are zero
-                        return index;
+                        // Each 4 bits represents comparison operation for each 4-byte element in input vectors
+                        // LSB represents first element in the stride
+                        // MSB represents last element in the stride
+                        // lzcnt operation would calculate number of zero numbers at the end
+
+                        // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements
+                        // So we need to invert it
+                        int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
+
+                        // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4
+                        // to get the exact number of zero elements in the stride
+                        int strideRelativeIndex = 7 - (lzcnt / 4);
+                        return (i * 8) + strideRelativeIndex;
                     }
                 }
 
-                return -1;
+                return 0;
             }
             else
 #endif
@@ -514,7 +513,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     index--;
                 }
 
-                // this implementation will return 0 if all ac components and dc are zero
                 return index;
             }
         }

From f9b36e794dfca1079ae517fa58af70e7b1d01e15 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 23 Aug 2021 11:30:47 +0300
Subject: [PATCH 09/56] Sandbox code & results

---
 tests/ImageSharp.Tests.ProfilingSandbox/Program.cs | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
index d4656f8be..bdba1bef6 100644
--- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
+++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
@@ -66,6 +66,15 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             // Elapsed: 4644ms across 200 iterations
             // Average: 23,22ms
 
+
+            /* Fixed last valuable index calculation */
+            // Elapsed: 4606ms across 200 iterations
+            // Average: 23,03ms
+
+            /* Intrinsic last valuable index */
+            // Elapsed: 4519ms across 200 iterations
+            // Average: 22,595ms
+
             BenchmarkEncoder("uniform_size", 200, 100);
 
             //ReEncodeImage("uniform_size", 100);

From 787ffa57eeee862755d039c0ca672f8b1ef86aac Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 23 Aug 2021 17:04:57 +0300
Subject: [PATCH 10/56] Removed unused methods & constructor, fixed warnings

---
 .../Formats/Jpeg/Components/Block8x8.cs       | 77 ++++++-------------
 .../Formats/Jpg/Block8x8FTests.cs             |  4 +-
 .../Formats/Jpg/Block8x8Tests.cs              | 38 +++------
 .../Jpg/Utils/LibJpegTools.ComponentData.cs   |  2 +-
 4 files changed, 34 insertions(+), 87 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index bc6036903..d61a3c6fd 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -28,17 +28,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// </summary>
         private fixed short data[Size];
 
-        /// <summary>
-        /// Initializes a new instance of the <see cref="Block8x8"/> struct.
-        /// </summary>
-        /// <param name="coefficients">A <see cref="Span{T}"/> of coefficients</param>
-        public Block8x8(Span<short> coefficients)
-        {
-            ref byte selfRef = ref Unsafe.As<Block8x8, byte>(ref this);
-            ref byte sourceRef = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(coefficients));
-            Unsafe.CopyBlock(ref selfRef, ref sourceRef, Size * sizeof(short));
-        }
-
         /// <summary>
         /// Gets or sets a <see cref="short"/> value at the given index
         /// </summary>
@@ -75,15 +64,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             set => this[(y * 8) + x] = value;
         }
 
-        public static bool operator ==(Block8x8 left, Block8x8 right)
-        {
-            return left.Equals(right);
-        }
+        public static bool operator ==(Block8x8 left, Block8x8 right) => left.Equals(right);
 
-        public static bool operator !=(Block8x8 left, Block8x8 right)
-        {
-            return !left.Equals(right);
-        }
+        public static bool operator !=(Block8x8 left, Block8x8 right) => !left.Equals(right);
 
         /// <summary>
         /// Multiply all elements by a given <see cref="int"/>
@@ -149,34 +132,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             return result;
         }
 
-        /// <summary>
-        /// Pointer-based "Indexer" (getter part)
-        /// </summary>
-        /// <param name="blockPtr">Block pointer</param>
-        /// <param name="idx">Index</param>
-        /// <returns>The scaleVec value at the specified index</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static short GetScalarAt(Block8x8* blockPtr, int idx)
-        {
-            GuardBlockIndex(idx);
-
-            short* fp = blockPtr->data;
-            return fp[idx];
-        }
-
-        /// <summary>
-        /// Pointer-based "Indexer" (setter part)
-        /// </summary>
-        /// <param name="blockPtr">Block pointer</param>
-        /// <param name="idx">Index</param>
-        /// <param name="value">Value</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void SetScalarAt(Block8x8* blockPtr, int idx, short value)
+        public static Block8x8 Load(Span<short> data)
         {
-            GuardBlockIndex(idx);
-
-            short* fp = blockPtr->data;
-            fp[idx] = value;
+            Block8x8 result = default;
+            result.LoadFrom(data);
+            return result;
         }
 
         /// <summary>
@@ -194,7 +154,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// </summary>
         public short[] ToArray()
         {
-            var result = new short[Size];
+            short[] result = new short[Size];
             this.CopyTo(result);
             return result;
         }
@@ -220,6 +180,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
+        /// <summary>
+        /// Load raw 16bit integers from source.
+        /// </summary>
+        /// <param name="source">Source</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public void LoadFrom(Span<short> source)
+        {
+            ref byte s = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(source));
+            ref byte d = ref Unsafe.As<Block8x8, byte>(ref this);
+
+            Unsafe.CopyBlock(ref d, ref s, Size * sizeof(short));
+        }
+
         /// <summary>
         /// Cast and copy <see cref="Size"/> <see cref="int"/>-s from the beginning of 'source' span.
         /// </summary>
@@ -271,16 +244,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         }
 
         /// <inheritdoc />
-        public override bool Equals(object obj)
-        {
-            return obj is Block8x8 other && this.Equals(other);
-        }
+        public override bool Equals(object obj) => obj is Block8x8 other && this.Equals(other);
 
         /// <inheritdoc />
-        public override int GetHashCode()
-        {
-            return (this[0] * 31) + this[1];
-        }
+        public override int GetHashCode() => (this[0] * 31) + this[1];
 
         /// <summary>
         /// Calculate the total sum of absolute differences of elements in 'a' and 'b'.
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index c68b0ffa8..42fdd603e 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -462,7 +462,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             short[] data = Create8x8ShortData();
 
-            var source = new Block8x8(data);
+            var source = Block8x8.Load(data);
 
             Block8x8F dest = default;
             dest.LoadFromInt16Scalar(ref source);
@@ -483,7 +483,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             short[] data = Create8x8ShortData();
 
-            var source = new Block8x8(data);
+            var source = Block8x8.Load(data);
 
             Block8x8F dest = default;
             dest.LoadFromInt16ExtendedAvx2(ref source);
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
index 9195f0915..afe71ad04 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@@ -22,7 +22,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         {
             short[] data = Create8x8ShortData();
 
-            var block = new Block8x8(data);
+            var block = Block8x8.Load(data);
 
             for (int i = 0; i < Block8x8.Size; i++)
             {
@@ -43,32 +43,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             Assert.Equal(42, block[42]);
         }
 
-        [Fact]
-        public unsafe void Indexer_GetScalarAt_SetScalarAt()
-        {
-            int sum;
-            var block = default(Block8x8);
-
-            for (int i = 0; i < Block8x8.Size; i++)
-            {
-                Block8x8.SetScalarAt(&block, i, (short)i);
-            }
-
-            sum = 0;
-            for (int i = 0; i < Block8x8.Size; i++)
-            {
-                sum += Block8x8.GetScalarAt(&block, i);
-            }
-
-            Assert.Equal(sum, 64 * 63 / 2);
-        }
-
         [Fact]
         public void AsFloatBlock()
         {
             short[] data = Create8x8ShortData();
 
-            var source = new Block8x8(data);
+            var source = Block8x8.Load(data);
 
             Block8x8F dest = source.AsFloatBlock();
 
@@ -82,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         public void ToArray()
         {
             short[] data = Create8x8ShortData();
-            var block = new Block8x8(data);
+            var block = Block8x8.Load(data);
 
             short[] result = block.ToArray();
 
@@ -93,8 +73,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         public void Equality_WhenTrue()
         {
             short[] data = Create8x8ShortData();
-            var block1 = new Block8x8(data);
-            var block2 = new Block8x8(data);
+            var block1 = Block8x8.Load(data);
+            var block2 = Block8x8.Load(data);
 
             block1[0] = 42;
             block2[0] = 42;
@@ -107,8 +87,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         public void Equality_WhenFalse()
         {
             short[] data = Create8x8ShortData();
-            var block1 = new Block8x8(data);
-            var block2 = new Block8x8(data);
+            var block1 = Block8x8.Load(data);
+            var block2 = Block8x8.Load(data);
 
             block1[0] = 42;
             block2[0] = 666;
@@ -131,8 +111,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         public void TotalDifference()
         {
             short[] data = Create8x8ShortData();
-            var block1 = new Block8x8(data);
-            var block2 = new Block8x8(data);
+            var block1 = Block8x8.Load(data);
+            var block2 = Block8x8.Load(data);
 
             block2[10] += 7;
             block2[63] += 8;
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
index edb8d457b..560238edb 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
@@ -53,7 +53,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
             {
                 this.MinVal = Math.Min(this.MinVal, data.Min());
                 this.MaxVal = Math.Max(this.MaxVal, data.Max());
-                this.SpectralBlocks[x, y] = new Block8x8(data);
+                this.SpectralBlocks[x, y] = Block8x8.Load(data);
             }
 
             public void LoadSpectralStride(Buffer2D<Block8x8> data, int strideIndex)

From a75d6e6e7d28747f361a90e5a06421ee8d22173b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 24 Aug 2021 14:34:55 +0300
Subject: [PATCH 11/56] Added sse/avx vector fields to the Block8x8, small QOL
 fixes

---
 .../Formats/Jpeg/Components/Block8x8.cs       | 57 ++++++++++++++-----
 .../Formats/Jpeg/Components/Block8x8F.cs      | 13 +----
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index d61a3c6fd..79b26a042 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -2,17 +2,18 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
 using System.Text;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     /// <summary>
-    /// Represents a Jpeg block with <see cref="short"/> coefficients.
+    /// 8x8 coefficients matrix of <see cref="short"/> type.
     /// </summary>
     // ReSharper disable once InconsistentNaming
+    [StructLayout(LayoutKind.Explicit)]
     internal unsafe struct Block8x8 : IEquatable<Block8x8>
     {
         /// <summary>
@@ -20,13 +21,44 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// </summary>
         public const int Size = 64;
 
+#pragma warning disable IDE0051 // Remove unused private member
         /// <summary>
-        /// A fixed size buffer holding the values.
-        /// See: <see>
-        ///         <cref>https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/unsafe-code-pointers/fixed-size-buffers</cref>
-        ///     </see>
+        /// A placeholder buffer so the actual struct occupies exactly 64 * 2 bytes.
         /// </summary>
+        /// <remarks>
+        /// This is not used directly in the code.
+        /// </remarks>
+        [FieldOffset(0)]
         private fixed short data[Size];
+#pragma warning restore IDE0051
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [FieldOffset(0)]
+        public Vector128<short> V0;
+        [FieldOffset(16)]
+        public Vector128<short> V1;
+        [FieldOffset(32)]
+        public Vector128<short> V2;
+        [FieldOffset(48)]
+        public Vector128<short> V3;
+        [FieldOffset(64)]
+        public Vector128<short> V4;
+        [FieldOffset(80)]
+        public Vector128<short> V5;
+        [FieldOffset(96)]
+        public Vector128<short> V6;
+        [FieldOffset(112)]
+        public Vector128<short> V7;
+
+        [FieldOffset(0)]
+        public Vector256<short> V01;
+        [FieldOffset(32)]
+        public Vector256<short> V23;
+        [FieldOffset(64)]
+        public Vector256<short> V45;
+        [FieldOffset(96)]
+        public Vector256<short> V67;
+#endif
 
         /// <summary>
         /// Gets or sets a <see cref="short"/> value at the given index
@@ -38,7 +70,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get
             {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
+
                 ref short selfRef = ref Unsafe.As<Block8x8, short>(ref this);
                 return Unsafe.Add(ref selfRef, idx);
             }
@@ -46,7 +79,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             set
             {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
+
                 ref short selfRef = ref Unsafe.As<Block8x8, short>(ref this);
                 Unsafe.Add(ref selfRef, idx) = value;
             }
@@ -204,13 +238,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
-        [Conditional("DEBUG")]
-        private static void GuardBlockIndex(int idx)
-        {
-            DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
-            DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
-        }
-
         /// <inheritdoc />
         public override string ToString()
         {
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index d55dfced7..a11b807bb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -16,7 +16,7 @@ using System.Text;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     /// <summary>
-    /// Represents a Jpeg block with <see cref="float"/> coefficients.
+    /// 8x8 coefficients matrix of <see cref="float"/> type.
     /// </summary>
     [StructLayout(LayoutKind.Explicit)]
     internal partial struct Block8x8F : IEquatable<Block8x8F>
@@ -102,7 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get
             {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
                 ref float selfRef = ref Unsafe.As<Block8x8F, float>(ref this);
                 return Unsafe.Add(ref selfRef, idx);
             }
@@ -110,7 +110,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             set
             {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
                 ref float selfRef = ref Unsafe.As<Block8x8F, float>(ref this);
                 Unsafe.Add(ref selfRef, idx) = value;
             }
@@ -672,13 +672,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             return row.FastRound();
         }
 
-        [Conditional("DEBUG")]
-        private static void GuardBlockIndex(int idx)
-        {
-            DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
-            DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
-        }
-
         /// <summary>
         /// Transpose the block into the destination block.
         /// </summary>

From 2bccda8c03ec44261a563b33f1716ee8dda4ec9c Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 24 Aug 2021 15:29:08 +0300
Subject: [PATCH 12/56] 8x8 matrices small fixes

---
 .../Formats/Jpeg/Components/Block8x8.cs       | 60 ++++++++++++++++++
 .../Formats/Jpeg/Components/Block8x8F.cs      | 55 +++++++++++++++++
 .../Components/Encoder/HuffmanScanEncoder.cs  | 61 +------------------
 .../Formats/Jpg/HuffmanScanEncoderTests.cs    | 10 +--
 4 files changed, 121 insertions(+), 65 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index 79b26a042..adfabc13c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -2,9 +2,11 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using System.Text;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
@@ -276,6 +278,64 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <inheritdoc />
         public override int GetHashCode() => (this[0] * 31) + this[1];
 
+        /// <summary>
+        /// Returns index of the last non-zero element in given matrix.
+        /// </summary>
+        /// <remarks>
+        /// Returns 0 for all-zero matrix by convention.
+        /// </remarks>
+        /// <returns>Index of the last non-zero element.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public int GetLastValuableElementIndex()
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
+            {
+                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+
+                Vector256<int> zero8 = Vector256<int>.Zero;
+
+                ref Vector256<short> mcuStride = ref Unsafe.As<Block8x8, Vector256<short>>(ref this);
+
+                for (int i = 7; i >= 0; i--)
+                {
+                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i).AsInt32(), zero8).AsByte());
+
+                    if (areEqual != equalityMask)
+                    {
+                        // Each 2 bits represents comparison operation for each 2-byte element in input vectors
+                        // LSB represents first element in the stride
+                        // MSB represents last element in the stride
+                        // lzcnt operation would calculate number of zero numbers at the end
+
+                        // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements
+                        // So we need to invert it
+                        int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
+
+                        // As input number is represented by 2 bits in the mask, we need to divide lzcnt result by 2
+                        // to get the exact number of zero elements in the stride
+                        int strideRelativeIndex = 7 - (lzcnt / 2);
+                        return (i * 8) + strideRelativeIndex;
+                    }
+                }
+
+                return 0;
+            }
+            else
+#endif
+            {
+                int index = Size - 1;
+                ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
+
+                while (index > 0 && Unsafe.Add(ref elemRef, index) == 0)
+                {
+                    index--;
+                }
+
+                return index;
+            }
+        }
+
         /// <summary>
         /// Calculate the total sum of absolute differences of elements in 'a' and 'b'.
         /// </summary>
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index a11b807bb..b0d7b0876 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -864,5 +864,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 return true;
             }
         }
+
+        /// <summary>
+        /// Returns index of the last non-zero element in this matrix.
+        /// </summary>
+        /// <returns>Index of the last non-zero element. Returns -1 if all elements are equal to zero.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public int GetLastValuableElementIndex()
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
+            {
+                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+
+                Vector256<int> zero8 = Vector256<int>.Zero;
+
+                ref Vector256<float> mcuStride = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
+
+                for (int i = 7; i >= 0; i--)
+                {
+                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
+
+                    if (areEqual != equalityMask)
+                    {
+                        // Each 4 bits represents comparison operation for each 4-byte element in input vectors
+                        // LSB represents first element in the stride
+                        // MSB represents last element in the stride
+                        // lzcnt operation would calculate number of zero numbers at the end
+
+                        // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements
+                        // So we need to invert it
+                        int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
+
+                        // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4
+                        // to get the exact number of zero elements in the stride
+                        int strideRelativeIndex = 7 - (lzcnt / 4);
+                        return (i * 8) + strideRelativeIndex;
+                    }
+                }
+
+                return -1;
+            }
+            else
+#endif
+            {
+                int index = Size - 1;
+                ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this);
+
+                while (index >= 0 && (int)Unsafe.Add(ref elemRef, index) == 0)
+                {
+                    index--;
+                }
+
+                return index;
+            }
+        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 373475f6b..134b4e1cc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -276,7 +276,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;
 
             int runLength = 0;
-            int lastValuableIndex = GetLastValuableElementIndex(ref refTemp2);
+            int lastValuableIndex = refTemp2.GetLastValuableElementIndex();
             for (int zig = 1; zig <= lastValuableIndex; zig++)
             {
                 int ac = (int)refTemp2[zig];
@@ -458,65 +458,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
         }
 
-        /// <summary>
-        /// Returns index of the last non-zero element in given matrix.
-        /// </summary>
-        /// <remarks>
-        /// Returns 0 for all-zero matrix by convention.
-        /// </remarks>
-        /// <param name="mcu">Mcu block.</param>
-        /// <returns>Index of the last non-zero element.</returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        internal static int GetLastValuableElementIndex(ref Block8x8F mcu)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
-            {
-                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
-
-                Vector256<int> zero8 = Vector256<int>.Zero;
-
-                ref Vector256<float> mcuStride = ref mcu.V0;
-
-                for (int i = 7; i >= 0; i--)
-                {
-                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
-
-                    if (areEqual != equalityMask)
-                    {
-                        // Each 4 bits represents comparison operation for each 4-byte element in input vectors
-                        // LSB represents first element in the stride
-                        // MSB represents last element in the stride
-                        // lzcnt operation would calculate number of zero numbers at the end
-
-                        // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements
-                        // So we need to invert it
-                        int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
-
-                        // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4
-                        // to get the exact number of zero elements in the stride
-                        int strideRelativeIndex = 7 - (lzcnt / 4);
-                        return (i * 8) + strideRelativeIndex;
-                    }
-                }
-
-                return 0;
-            }
-            else
-#endif
-            {
-                int index = Block8x8F.Size - 1;
-                ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref mcu);
-
-                while (index > 0 && (int)Unsafe.Add(ref elemRef, index) == 0)
-                {
-                    index--;
-                }
-
-                return index;
-            }
-        }
-
         [MethodImpl(InliningOptions.ShortMethod)]
         private void WriteToStream()
         {
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
index b953e80b8..f75b0a0b8 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
@@ -95,7 +95,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 int expectedLessThan = 1;
 
-                int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data);
+                int actual = data.GetLastValuableElementIndex();
 
                 Assert.True(actual < expectedLessThan);
             }
@@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 int expected = Block8x8F.Size - 1;
 
-                int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data);
+                int actual = data.GetLastValuableElementIndex();
 
                 Assert.Equal(expected, actual);
             }
@@ -147,7 +147,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = setIndex;
 
-                    int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data);
+                    int actual = data.GetLastValuableElementIndex();
 
                     Assert.Equal(expected, actual);
                 }
@@ -182,7 +182,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = lastIndex;
 
-                    int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data);
+                    int actual = data.GetLastValuableElementIndex();
 
                     Assert.Equal(expected, actual);
                 }
@@ -226,7 +226,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = lastIndex2;
 
-                    int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data);
+                    int actual = data.GetLastValuableElementIndex();
 
                     Assert.Equal(expected, actual);
                 }

From 8098e8ef684ab43ef3bf9ff8ffb1dd0ef71ec25f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 24 Aug 2021 21:32:13 +0300
Subject: [PATCH 13/56] Fixed last stream flush

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 35 ++++++-------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 134b4e1cc..08fe486a9 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -6,10 +6,6 @@ using System.IO;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-#endif
 using System.Threading;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
@@ -316,25 +312,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
 
             // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits
-            uint packedBytes = (this.accumulatedBits | (uint.MaxValue >> this.bitCount)) >> ((4 - valuableBytesCount) * 8);
+            uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount);
 
-            // 2x size due to possible stuff bytes, max out to 8
-            Span<byte> tempBuffer = stackalloc byte[valuableBytesCount * 2];
+            int writeIndex = this.emitWriteIndex;
+            this.emitBuffer[writeIndex - 1] = packedBytes;
 
-            // Write bytes to temporal buffer
-            int writeCount = 0;
-            for (int i = 0; i < valuableBytesCount; i++)
-            {
-                byte value = (byte)(packedBytes >> (i * 8));
-                tempBuffer[writeCount++] = value;
-                if (value == 0xff)
-                {
-                    tempBuffer[writeCount++] = 0;
-                }
-            }
-
-            // Write temporal buffer to the output stream
-            this.target.Write(tempBuffer, 0, writeCount);
+            this.WriteToStream((writeIndex * 4) - valuableBytesCount);
         }
 
         /// <summary>
@@ -459,14 +442,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void WriteToStream()
+        private void WriteToStream() => this.WriteToStream(this.emitWriteIndex * 4);
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void WriteToStream(int endIndex)
         {
             Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
 
             int writeIdx = 0;
-            int start = emitBytes.Length - 1;
-            int end = (this.emitWriteIndex * 4) - 1;
-            for (int i = start; i > end; i--)
+            int startIndex = emitBytes.Length - 1;
+            for (int i = startIndex; i >= endIndex; i--)
             {
                 byte value = emitBytes[i];
                 this.streamWriteBuffer[writeIdx++] = value;

From e5fec9784451a24fd36efc49d04f5637811019e1 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 25 Aug 2021 01:50:59 +0300
Subject: [PATCH 14/56] Fixed lvi

---
 .../Formats/Jpeg/Components/Block8x8.cs       | 23 +++++++++----------
 .../Formats/Jpeg/Components/Block8x8F.cs      |  6 +++--
 .../Components/Encoder/HuffmanScanEncoder.cs  |  2 +-
 .../Formats/Jpg/HuffmanScanEncoderTests.cs    | 20 ++++++++--------
 4 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index adfabc13c..3e5277c06 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -281,25 +281,24 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <summary>
         /// Returns index of the last non-zero element in given matrix.
         /// </summary>
-        /// <remarks>
-        /// Returns 0 for all-zero matrix by convention.
-        /// </remarks>
-        /// <returns>Index of the last non-zero element.</returns>
+        /// <returns>
+        /// Index of the last non-zero element. Returns -1 if all elements are equal to zero.
+        /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public int GetLastValuableElementIndex()
+        public int GetLastNonZeroIndex()
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx2.IsSupported)
             {
                 const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
 
-                Vector256<int> zero8 = Vector256<int>.Zero;
+                Vector256<short> zero16 = Vector256<short>.Zero;
 
                 ref Vector256<short> mcuStride = ref Unsafe.As<Block8x8, Vector256<short>>(ref this);
 
-                for (int i = 7; i >= 0; i--)
+                for (int i = 3; i >= 0; i--)
                 {
-                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i).AsInt32(), zero8).AsByte());
+                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i), zero16).AsByte());
 
                     if (areEqual != equalityMask)
                     {
@@ -314,12 +313,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
                         // As input number is represented by 2 bits in the mask, we need to divide lzcnt result by 2
                         // to get the exact number of zero elements in the stride
-                        int strideRelativeIndex = 7 - (lzcnt / 2);
-                        return (i * 8) + strideRelativeIndex;
+                        int strideRelativeIndex = 15 - (lzcnt / 2);
+                        return (i * 16) + strideRelativeIndex;
                     }
                 }
 
-                return 0;
+                return -1;
             }
             else
 #endif
@@ -327,7 +326,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 int index = Size - 1;
                 ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
 
-                while (index > 0 && Unsafe.Add(ref elemRef, index) == 0)
+                while (index >= 0 && Unsafe.Add(ref elemRef, index) == 0)
                 {
                     index--;
                 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index b0d7b0876..8479cdc97 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -868,9 +868,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <summary>
         /// Returns index of the last non-zero element in this matrix.
         /// </summary>
-        /// <returns>Index of the last non-zero element. Returns -1 if all elements are equal to zero.</returns>
+        /// <returns>
+        /// Index of the last non-zero element. Returns -1 if all elements are equal to zero.
+        /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public int GetLastValuableElementIndex()
+        public int GetLastNonZeroIndex()
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx2.IsSupported)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 08fe486a9..fc1146544 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -272,7 +272,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;
 
             int runLength = 0;
-            int lastValuableIndex = refTemp2.GetLastValuableElementIndex();
+            int lastValuableIndex = refTemp2.GetLastNonZeroIndex();
             for (int zig = 1; zig <= lastValuableIndex; zig++)
             {
                 int ac = (int)refTemp2[zig];
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
index f75b0a0b8..a3aa957ee 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
@@ -87,7 +87,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         }
 
         [Fact]
-        public void GetLastValuableElementIndex_AllZero()
+        public void GetLastNonZeroIndex_AllZero()
         {
             static void RunTest()
             {
@@ -95,7 +95,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 int expectedLessThan = 1;
 
-                int actual = data.GetLastValuableElementIndex();
+                int actual = data.GetLastNonZeroIndex();
 
                 Assert.True(actual < expectedLessThan);
             }
@@ -106,7 +106,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         }
 
         [Fact]
-        public void GetLastValuableElementIndex_AllNonZero()
+        public void GetLastNonZeroIndex_AllNonZero()
         {
             static void RunTest()
             {
@@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 int expected = Block8x8F.Size - 1;
 
-                int actual = data.GetLastValuableElementIndex();
+                int actual = data.GetLastNonZeroIndex();
 
                 Assert.Equal(expected, actual);
             }
@@ -131,7 +131,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         [Theory]
         [InlineData(1)]
         [InlineData(2)]
-        public void GetLastValuableElementIndex_RandomFilledSingle(int seed)
+        public void GetLastNonZeroIndex_RandomFilledSingle(int seed)
         {
             static void RunTest(string seedSerialized)
             {
@@ -147,7 +147,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = setIndex;
 
-                    int actual = data.GetLastValuableElementIndex();
+                    int actual = data.GetLastNonZeroIndex();
 
                     Assert.Equal(expected, actual);
                 }
@@ -162,7 +162,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         [Theory]
         [InlineData(1)]
         [InlineData(2)]
-        public void GetLastValuableElementIndex_RandomFilledPartially(int seed)
+        public void GetLastNonZeroIndex_RandomFilledPartially(int seed)
         {
             static void RunTest(string seedSerialized)
             {
@@ -182,7 +182,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = lastIndex;
 
-                    int actual = data.GetLastValuableElementIndex();
+                    int actual = data.GetLastNonZeroIndex();
 
                     Assert.Equal(expected, actual);
                 }
@@ -197,7 +197,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         [Theory]
         [InlineData(1)]
         [InlineData(2)]
-        public void GetLastValuableElementIndex_RandomFilledFragmented(int seed)
+        public void GetLastNonZeroIndex_RandomFilledFragmented(int seed)
         {
             static void RunTest(string seedSerialized)
             {
@@ -226,7 +226,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = lastIndex2;
 
-                    int actual = data.GetLastValuableElementIndex();
+                    int actual = data.GetLastNonZeroIndex();
 
                     Assert.Equal(expected, actual);
                 }

From 81349f2358e6f1b19764928599e2ba8df796aa7f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 25 Aug 2021 17:04:45 +0300
Subject: [PATCH 15/56] Docs, fixes, added support for other subsamples/color
 types

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 123 +++++++++++++-----
 .../Formats/Jpeg/JpegEncoderCore.cs           |   7 +-
 2 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index fc1146544..a6334e2da 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -14,6 +14,51 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     internal class HuffmanScanEncoder
     {
+        /// <summary>
+        /// Maximum number of bytes encoded jpeg 8x8 block can occupy.
+        /// It's highly unlikely for block to occupy this much space - it's a theoretical limit.
+        /// </summary>
+        /// <remarks>
+        /// Where 16 is maximum huffman code binary length according to itu
+        /// specs. 10 is maximum value binary length, value comes from discrete
+        /// cosine transform with value range: [-1024..1023]. Block stores
+        /// 8x8 = 64 values thus multiplication by 64. Then divided by 8 to get
+        /// the number of bytes. This value is then multiplied by
+        /// <see cref="MaxBytesPerBlockMultiplier"/> for performance reasons.
+        /// </remarks>
+        private const int MaxBytesPerBlock = (16 + 10) * 64 / 8 * MaxBytesPerBlockMultiplier;
+
+        /// <summary>
+        /// Multiplier used within cache buffers size calculation.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// Theoretically, <see cref="MaxBytesPerBlock"/> bytes buffer can fit
+        /// exactly one minimal coding unit. In reality, coding blocks occupy much
+        /// less space than the theoretical maximum - this can be exploited.
+        /// If temporal buffer size is multiplied by at least 2, second half of
+        /// the resulting buffer will be used as an overflow 'guard' if next
+        /// block would occupy maximum number of bytes. While first half may fit
+        /// many blocks before needing to flush.
+        /// </para>
+        /// <para>
+        /// This is subject to change. This can be equal to 1 but recommended
+        /// value is 2 or even greater - further benchmarking needed.
+        /// </para>
+        /// </remarks>
+        private const int MaxBytesPerBlockMultiplier = 2;
+
+        /// <summary>
+        /// <see cref="streamWriteBuffer"/> size multiplier.
+        /// </summary>
+        /// <remarks>
+        /// Jpeg specification requires inserting 'stuff' bytes after each
+        /// 0xff byte value. Worst case scenario is when all bytes are 0xff.
+        /// While it's highly unlikely (if not impossible) to get such
+        /// combination, it's theoretically possible so buffer size must be guarded.
+        /// </remarks>
+        private const int OutputBufferLengthMultiplier = 2;
+
         /// <summary>
         /// Compiled huffman tree to encode given values.
         /// </summary>
@@ -21,24 +66,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private HuffmanLut[] huffmanTables;
 
         /// <summary>
-        /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count).
+        /// Buffer for temporary storage of huffman rle encoding bit data.
         /// </summary>
         /// <remarks>
-        /// This is subject to change, 1024 seems to be the best value in terms of performance.
-        /// <see cref="Emit(int, int)"/> expects it to be at least 8 (see comments in method body).
+        /// Encoding bits are assembled to 4 byte unsigned integers and then copied to this buffer.
+        /// This process does NOT include inserting stuff bytes.
         /// </remarks>
-        private const int EmitBufferSizeInBytes = 1024;
+        private readonly uint[] emitBuffer;
 
         /// <summary>
-        /// A buffer for reducing the number of stream writes when emitting Huffman tables.
+        /// Buffer for temporary storage which is then written to the output stream.
         /// </summary>
-        private readonly uint[] emitBuffer = new uint[EmitBufferSizeInBytes / 4];
-
-        private readonly byte[] streamWriteBuffer = new byte[EmitBufferSizeInBytes * 2];
-
-        private const int BytesPerCodingUnit = 256 * 3;
+        /// <remarks>
+        /// Encoding bits from <see cref="emitBuffer"/> are copied to this byte buffer including stuff bytes.
+        /// </remarks>
+        private readonly byte[] streamWriteBuffer;
 
-        private int emitWriteIndex = (EmitBufferSizeInBytes / 4);
+        private int emitWriteIndex;
 
         /// <summary>
         /// Emmited bits 'micro buffer' before being transfered to the <see cref="emitBuffer"/>.
@@ -58,11 +102,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private readonly Stream target;
 
-        public HuffmanScanEncoder(Stream outputStream)
+        public HuffmanScanEncoder(int componentCount, Stream outputStream)
         {
+            int emitBufferByteLength = MaxBytesPerBlock * componentCount;
+            this.emitBuffer = new uint[emitBufferByteLength / sizeof(uint)];
+            this.emitWriteIndex = this.emitBuffer.Length;
+
+            this.streamWriteBuffer = new byte[emitBufferByteLength * OutputBufferLengthMultiplier];
+
             this.target = outputStream;
         }
 
+        private bool IsFlushNeeded
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => this.emitWriteIndex < this.emitBuffer.Length / 2;
+        }
+
         /// <summary>
         /// Encodes the image with no subsampling.
         /// </summary>
@@ -117,14 +173,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref chrominanceQuantTable,
                         ref unzig);
 
-                    if (this.emitWriteIndex < this.emitBuffer.Length / 2)
+                    if (this.IsFlushNeeded)
                     {
-                        this.WriteToStream();
+                        this.FlushToStream();
                     }
                 }
             }
 
-            this.EmitFinalBits();
+            this.FlushRemainingBytes();
         }
 
         /// <summary>
@@ -190,10 +246,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref pixelConverter.Cr,
                         ref chrominanceQuantTable,
                         ref unzig);
+
+                    if (this.IsFlushNeeded)
+                    {
+                        this.FlushToStream();
+                    }
                 }
             }
 
-            this.FlushInternalBuffer();
+            this.FlushRemainingBytes();
         }
 
         /// <summary>
@@ -233,10 +294,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref pixelConverter.Y,
                         ref luminanceQuantTable,
                         ref unzig);
+
+                    if (this.IsFlushNeeded)
+                    {
+                        this.FlushToStream();
+                    }
                 }
             }
 
-            this.FlushInternalBuffer();
+            this.FlushRemainingBytes();
         }
 
         /// <summary>
@@ -306,7 +372,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitFinalBits()
+        private void FlushRemainingBytes()
         {
             // Bytes count we want to write to the output stream
             int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
@@ -317,7 +383,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int writeIndex = this.emitWriteIndex;
             this.emitBuffer[writeIndex - 1] = packedBytes;
 
-            this.WriteToStream((writeIndex * 4) - valuableBytesCount);
+            this.FlushToStream((writeIndex * 4) - valuableBytesCount);
         }
 
         /// <summary>
@@ -391,21 +457,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.Emit(prefix | (encodedValue >> prefixLen), prefixLen + valueLen);
         }
 
-        /// <summary>
-        /// Writes remaining bytes from internal buffer to the target stream.
-        /// </summary>
-        /// <remarks>Pads last byte with 1's if necessary</remarks>
-        private void FlushInternalBuffer()
-        {
-            // pad last byte with 1's
-            //int padBitsCount = 8 - (this.bitCount % 8);
-            //if (padBitsCount != 0)
-            //{
-            //    this.Emit((1 << padBitsCount) - 1, padBitsCount);
-            //    this.target.Write(this.emitBuffer, 0, this.emitLen);
-            //}
-        }
-
         /// <summary>
         /// Calculates how many minimum bits needed to store given value for Huffman jpeg encoding.
         /// </summary>
@@ -442,10 +493,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void WriteToStream() => this.WriteToStream(this.emitWriteIndex * 4);
+        private void FlushToStream() => this.FlushToStream(this.emitWriteIndex * 4);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void WriteToStream(int endIndex)
+        private void FlushToStream(int endIndex)
         {
             Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
 
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 88d96f554..8c6726e65 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -114,11 +114,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.WriteStartOfScan(image, componentCount, cancellationToken);
 
             // Write the scan compressed data.
-            var scanEncoder = new HuffmanScanEncoder(stream);
             if (this.colorType == JpegColorType.Luminance)
             {
                 // luminance quantization table only
-                scanEncoder.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
+                new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
             }
             else
             {
@@ -126,10 +125,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                 switch (this.subsample)
                 {
                     case JpegSubsample.Ratio444:
-                        scanEncoder.Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                        new HuffmanScanEncoder(3, stream).Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
                         break;
                     case JpegSubsample.Ratio420:
-                        scanEncoder.Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                        new HuffmanScanEncoder(6, stream).Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
                         break;
                 }
             }

From 6c5cf28ecdb35b1a286b9ece0106975a35030589 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 26 Aug 2021 13:36:50 +0300
Subject: [PATCH 16/56] New zig-zag implementation

---
 .../Formats/Jpeg/Components/Block8x8.cs       |   2 +-
 .../Jpeg/Components/Block8x8F.Intrinsic.cs    |  87 ++++
 .../Jpeg/Components/Block8x8F.ScaledCopyTo.cs |   2 +-
 .../Formats/Jpeg/Components/Block8x8F.cs      | 138 +-----
 .../Components/Decoder/HuffmanScanDecoder.cs  |  17 +-
 .../Jpeg/Components/Decoder/IRawJpegData.cs   |   2 +-
 .../Decoder/JpegBlockPostProcessor.cs         |   2 +-
 .../Components/Encoder/HuffmanScanEncoder.cs  |  43 +-
 .../Formats/Jpeg/Components/Quantization.cs   |  67 +--
 .../Jpeg/Components/ZigZag.Intrinsic.cs       | 404 ++++++++++++++++++
 .../Formats/Jpeg/Components/ZigZag.cs         |  79 +---
 .../Formats/Jpeg/JpegDecoderCore.cs           |   6 +-
 .../Formats/Jpeg/JpegEncoderCore.cs           |  12 +-
 .../Formats/Jpg/Block8x8FTests.cs             |  74 +---
 .../Formats/Jpg/QuantizationTests.cs          |   8 +-
 .../Jpg/Utils/ReferenceImplementations.cs     |  54 +--
 .../Formats/Jpg/ZigZagTests.cs                |   5 +-
 17 files changed, 627 insertions(+), 375 deletions(-)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index 3e5277c06..c76eb942f 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -12,7 +12,7 @@ using System.Text;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     /// <summary>
-    /// 8x8 coefficients matrix of <see cref="short"/> type.
+    /// 8x8 matrix of <see cref="short"/> coefficients.
     /// </summary>
     // ReSharper disable once InconsistentNaming
     [StructLayout(LayoutKind.Explicit)]
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
new file mode 100644
index 000000000..073580d40
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -0,0 +1,87 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    internal partial struct Block8x8F
+    {
+        /// <summary>
+        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
+        /// </summary>
+        public const int RowCount = 8;
+
+        [FieldOffset(0)]
+        public Vector256<float> V0;
+        [FieldOffset(32)]
+        public Vector256<float> V1;
+        [FieldOffset(64)]
+        public Vector256<float> V2;
+        [FieldOffset(96)]
+        public Vector256<float> V3;
+        [FieldOffset(128)]
+        public Vector256<float> V4;
+        [FieldOffset(160)]
+        public Vector256<float> V5;
+        [FieldOffset(192)]
+        public Vector256<float> V6;
+        [FieldOffset(224)]
+        public Vector256<float> V7;
+
+        private static ReadOnlySpan<int> DivideIntoInt16_Avx2_ShuffleMask => new int[] {
+            0, 1, 4, 5, 2, 3, 6, 7
+        };
+
+        private static unsafe void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+
+            fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask)
+            {
+                Vector256<int> crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32();
+
+                ref Vector256<float> aBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref a);
+                ref Vector256<float> bBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref b);
+
+                ref Vector256<short> destBase = ref Unsafe.As<Block8x8, Vector256<short>>(ref dest);
+
+                for (int i = 0; i < 8; i += 2)
+                {
+                    Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                    Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+
+                    Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
+                    row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
+
+                    Unsafe.Add(ref destBase, i / 2) = row;
+                }
+            }
+        }
+
+        private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
+
+            ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
+            ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
+
+            ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
+
+            for (int i = 0; i < 16; i += 2)
+            {
+                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+
+                Vector128<short> row = Sse2.PackSignedSaturate(left, right);
+                Unsafe.Add(ref destBase, i / 2) = row;
+            }
+        }
+    }
+}
+#endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
index 23cf4ce4a..498fe4d03 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 using System.Numerics;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 8479cdc97..79a35e2cd 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -16,7 +16,7 @@ using System.Text;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     /// <summary>
-    /// 8x8 coefficients matrix of <see cref="float"/> type.
+    /// 8x8 matrix of <see cref="float"/> coefficients.
     /// </summary>
     [StructLayout(LayoutKind.Explicit)]
     internal partial struct Block8x8F : IEquatable<Block8x8F>
@@ -66,30 +66,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         public Vector4 V7L;
         [FieldOffset(240)]
         public Vector4 V7R;
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-        /// <summary>
-        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
-        /// </summary>
-        public const int RowCount = 8;
-
-        [FieldOffset(0)]
-        public Vector256<float> V0;
-        [FieldOffset(32)]
-        public Vector256<float> V1;
-        [FieldOffset(64)]
-        public Vector256<float> V2;
-        [FieldOffset(96)]
-        public Vector256<float> V3;
-        [FieldOffset(128)]
-        public Vector256<float> V4;
-        [FieldOffset(160)]
-        public Vector256<float> V5;
-        [FieldOffset(192)]
-        public Vector256<float> V6;
-        [FieldOffset(224)]
-        public Vector256<float> V7;
-#endif
 #pragma warning restore SA1600 // ElementsMustBeDocumented
 
         /// <summary>
@@ -188,13 +164,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             return result;
         }
 
-        /// <summary>
-        /// Fill the block with defaults (zeroes).
-        /// </summary>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public void Clear()
-            => this = default; // The cheapest way to do this in C#:
-
         /// <summary>
         /// Load raw 32bit floating point data from source.
         /// </summary>
@@ -302,7 +271,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
         public float[] ToArray()
         {
-            var result = new float[Size];
+            float[] result = new float[Size];
             this.ScaledCopyTo(result);
             return result;
         }
@@ -434,102 +403,37 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         }
 
         /// <summary>
-        /// Quantize the block.
-        /// </summary>
-        /// <param name="blockPtr">The block pointer.</param>
-        /// <param name="qtPtr">The qt pointer.</param>
-        /// <param name="unzigPtr">Unzig pointer</param>
-        public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr)
-        {
-            float* b = (float*)blockPtr;
-            float* qtp = (float*)qtPtr;
-            for (int qtIndex = 0; qtIndex < Size; qtIndex++)
-            {
-                byte blockIndex = unzigPtr[qtIndex];
-                float* unzigPos = b + blockIndex;
-
-                float val = *unzigPos;
-                val *= qtp[qtIndex];
-                *unzigPos = val;
-            }
-        }
-
-        /// <summary>
-        /// Quantize 'block' into 'dest' using the 'qt' quantization table:
-        /// Unzig the elements of block into dest, while dividing them by elements of qt and "pre-rounding" the values.
-        /// To finish the rounding it's enough to (int)-cast these values.
+        /// Quantizes the input block, applies zig-zag ordering and stores the result as 16-bit integers.
         /// </summary>
-        /// <param name="block">Source block</param>
-        /// <param name="dest">Destination block</param>
-        /// <param name="qt">The quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
-        public static unsafe void Quantize(
-            ref Block8x8F block,
-            ref Block8x8F dest,
-            ref Block8x8F qt,
-            ref ZigZag unZig)
+        /// <param name="block">Source block.</param>
+        /// <param name="dest">Destination block.</param>
+        /// <param name="qt">The quantization table.</param>
+        public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt)
         {
-            for (int zig = 0; zig < Size; zig++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
             {
-                dest[zig] = block[unZig[zig]];
+                DivideIntoInt16_Avx2(ref block, ref qt, ref dest);
+                ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest);
             }
-
-            DivideRoundAll(ref dest, ref qt);
-        }
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
+            else if (Ssse3.IsSupported)
             {
-                var vnegOne = Vector256.Create(-1f);
-                var vadd = Vector256.Create(.5F);
-                var vone = Vector256.Create(1f);
-
-                for (int i = 0; i < RowCount; i++)
-                {
-                    ref Vector256<float> aRow = ref Unsafe.Add(ref a.V0, i);
-                    ref Vector256<float> bRow = ref Unsafe.Add(ref b.V0, i);
-                    Vector256<float> voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd);
-                    aRow = Avx.Add(Avx.Divide(aRow, bRow), voff);
-                }
+                DivideIntoInt16_Sse2(ref block, ref qt, ref dest);
+                ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest);
             }
             else
 #endif
             {
-                a.V0L = DivideRound(a.V0L, b.V0L);
-                a.V0R = DivideRound(a.V0R, b.V0R);
-                a.V1L = DivideRound(a.V1L, b.V1L);
-                a.V1R = DivideRound(a.V1R, b.V1R);
-                a.V2L = DivideRound(a.V2L, b.V2L);
-                a.V2R = DivideRound(a.V2R, b.V2R);
-                a.V3L = DivideRound(a.V3L, b.V3L);
-                a.V3R = DivideRound(a.V3R, b.V3R);
-                a.V4L = DivideRound(a.V4L, b.V4L);
-                a.V4R = DivideRound(a.V4R, b.V4R);
-                a.V5L = DivideRound(a.V5L, b.V5L);
-                a.V5R = DivideRound(a.V5R, b.V5R);
-                a.V6L = DivideRound(a.V6L, b.V6L);
-                a.V6R = DivideRound(a.V6R, b.V6R);
-                a.V7L = DivideRound(a.V7L, b.V7L);
-                a.V7R = DivideRound(a.V7R, b.V7R);
+                for (int i = 0; i < Size; i++)
+                {
+                    // TODO: find a way to index block & qt matrices with natural order indices for performance?
+                    int zig = ZigZag.ZigZagOrder[i];
+                    float divRes = block[zig] / qt[zig];
+                    dest[i] = (short)(divRes + (divRes > 0 ? 0.5f : -0.5f));
+                }
             }
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor)
-        {
-            var neg = new Vector4(-1);
-            var add = new Vector4(.5F);
-
-            // sign(dividend) = max(min(dividend, 1), -1)
-            Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One);
-
-            // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend)
-            return (dividend / divisor) + (sign * add);
-        }
-
         public void RoundInto(ref Block8x8 dest)
         {
             for (int i = 0; i < Size; i++)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
index 70a446512..bbc4e40af 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
@@ -54,9 +54,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         /// </summary>
         private readonly HuffmanTable[] acHuffmanTables;
 
-        // The unzig data.
-        private ZigZag dctZigZag;
-
         private HuffmanScanBuffer scanBuffer;
 
         private readonly SpectralConverter spectralConverter;
@@ -74,7 +71,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             SpectralConverter converter,
             CancellationToken cancellationToken)
         {
-            this.dctZigZag = ZigZag.CreateUnzigTable();
             this.stream = stream;
             this.spectralConverter = converter;
             this.cancellationToken = cancellationToken;
@@ -477,7 +473,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         {
             ref short blockDataRef = ref Unsafe.As<Block8x8, short>(ref block);
             ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-            ref ZigZag zigzag = ref this.dctZigZag;
 
             // DC
             int t = buffer.DecodeHuffman(ref dcTable);
@@ -502,7 +497,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                 {
                     i += r;
                     s = buffer.Receive(s);
-                    Unsafe.Add(ref blockDataRef, zigzag[i++]) = (short)s;
+                    Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s;
                 }
                 else
                 {
@@ -556,7 +551,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                 }
 
                 ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-                ref ZigZag zigzag = ref this.dctZigZag;
                 int start = this.SpectralStart;
                 int end = this.SpectralEnd;
                 int low = this.SuccessiveLow;
@@ -572,7 +566,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                     if (s != 0)
                     {
                         s = buffer.Receive(s);
-                        Unsafe.Add(ref blockDataRef, zigzag[i]) = (short)(s << low);
+                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low);
                     }
                     else
                     {
@@ -602,7 +596,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         {
             // Refinement scan for these AC coefficients
             ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-            ref ZigZag zigzag = ref this.dctZigZag;
             int start = this.SpectralStart;
             int end = this.SpectralEnd;
 
@@ -649,7 +642,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
 
                     do
                     {
-                        ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+                        ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
                         if (coef != 0)
                         {
                             buffer.CheckBits();
@@ -675,7 +668,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
 
                     if ((s != 0) && (k < 64))
                     {
-                        Unsafe.Add(ref blockDataRef, zigzag[k]) = (short)s;
+                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s;
                     }
                 }
             }
@@ -684,7 +677,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             {
                 for (; k <= end; k++)
                 {
-                    ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+                    ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
 
                     if (coef != 0)
                     {
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
index 391dac784..0b80acc5d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
@@ -22,7 +22,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         IJpegComponent[] Components { get; }
 
         /// <summary>
-        /// Gets the quantization tables, in zigzag order.
+        /// Gets the quantization tables, in natural order.
         /// </summary>
         Block8x8F[] QuantizationTables { get; }
     }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index 7cfbaddcc..00169d082 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -46,7 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         public JpegBlockPostProcessor(IRawJpegData decoder, IJpegComponent component)
         {
             int qtIndex = component.QuantizationTableIndex;
-            this.DequantiazationTable = ZigZag.CreateDequantizationTable(ref decoder.QuantizationTables[qtIndex]);
+            this.DequantiazationTable = decoder.QuantizationTables[qtIndex];
             this.subSamplingDivisors = component.SubSamplingDivisors;
 
             this.SourceBlock = default;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index a6334e2da..8b61b66c9 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -96,6 +96,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
         private Block8x8F temporalBlock1;
         private Block8x8F temporalBlock2;
+        private Block8x8 temporalShortBlock;
 
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
@@ -132,8 +133,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
-            var unzig = ZigZag.CreateUnzigTable();
-
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
 
@@ -156,22 +155,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCY,
                         ref pixelConverter.Y,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);
 
                     prevDCCb = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCb,
                         ref pixelConverter.Cb,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);
 
                     prevDCCr = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCr,
                         ref pixelConverter.Cr,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);
 
                     if (this.IsFlushNeeded)
                     {
@@ -197,8 +193,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
-            var unzig = ZigZag.CreateUnzigTable();
-
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
             ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
@@ -222,30 +216,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                             QuantIndex.Luminance,
                             prevDCY,
                             ref pixelConverter.YLeft,
-                            ref luminanceQuantTable,
-                            ref unzig);
+                            ref luminanceQuantTable);
 
                         prevDCY = this.WriteBlock(
                             QuantIndex.Luminance,
                             prevDCY,
                             ref pixelConverter.YRight,
-                            ref luminanceQuantTable,
-                            ref unzig);
+                            ref luminanceQuantTable);
                     }
 
                     prevDCCb = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCb,
                         ref pixelConverter.Cb,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);
 
                     prevDCCr = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCr,
                         ref pixelConverter.Cr,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);
 
                     if (this.IsFlushNeeded)
                     {
@@ -269,8 +259,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
-            var unzig = ZigZag.CreateUnzigTable();
-
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0;
 
@@ -292,8 +280,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCY,
                         ref pixelConverter.Y,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);
 
                     if (this.IsFlushNeeded)
                     {
@@ -320,28 +307,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             QuantIndex index,
             int prevDC,
             ref Block8x8F src,
-            ref Block8x8F quant,
-            ref ZigZag unZig)
+            ref Block8x8F quant)
         {
             ref Block8x8F refTemp1 = ref this.temporalBlock1;
             ref Block8x8F refTemp2 = ref this.temporalBlock2;
+            ref Block8x8 spectralBlock = ref this.temporalShortBlock;
 
             FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
 
-            Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig);
+            Block8x8F.Quantize(ref refTemp1, ref spectralBlock, ref quant);
 
             // Emit the DC delta.
-            int dc = (int)refTemp2[0];
+            int dc = spectralBlock[0];
             this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC);
 
             // Emit the AC components.
             int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;
 
             int runLength = 0;
-            int lastValuableIndex = refTemp2.GetLastNonZeroIndex();
+            int lastValuableIndex = spectralBlock.GetLastNonZeroIndex();
             for (int zig = 1; zig <= lastValuableIndex; zig++)
             {
-                int ac = (int)refTemp2[zig];
+                int ac = spectralBlock[zig];
 
                 if (ac == 0)
                 {
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
index 2ff56c63b..eab5e6a08 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
@@ -39,53 +39,59 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         public const int QualityEstimationConfidenceUpperThreshold = 98;
 
         /// <summary>
-        /// Gets the unscaled luminance quantization table in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from ITU section K.1 after converting from natural to
-        /// zig-zag order.
+        /// Gets the unscaled luminance quantization table in natural (row-major) order.
         /// </summary>
+        /// <remarks>
+        /// The values are derived from ITU section K.1.
+        /// </remarks>
         // The C# compiler emits this as a compile-time constant embedded in the PE file.
         // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
         // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        public static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
+        public static ReadOnlySpan<byte> LuminanceTable => new byte[]
         {
-            16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
-            40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
-            57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
-            109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
-            100, 120, 92, 101, 103, 99,
+            16, 11, 10, 16,  24,  40,  51,  61,
+            12, 12, 14, 19,  26,  58,  60,  55,
+            14, 13, 16, 24,  40,  57,  69,  56,
+            14, 17, 22, 29,  51,  87,  80,  62,
+            18, 22, 37, 56,  68, 109, 103,  77,
+            24, 35, 55, 64,  81, 104, 113,  92,
+            49, 64, 78, 87, 103, 121, 120, 101,
+            72, 92, 95, 98, 112, 100, 103,  99,
         };
 
         /// <summary>
-        /// Gets the unscaled chrominance quantization table in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from ITU section K.1 after converting from natural to
-        /// zig-zag order.
+        /// Gets the unscaled chrominance quantization table in natural (row-major) order.
         /// </summary>
+        /// <remarks>
+        /// The values are derived from ITU section K.1.
+        /// </remarks>
         // The C# compiler emits this as a compile-time constant embedded in the PE file.
         // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
         // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        public static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
+        public static ReadOnlySpan<byte> ChrominanceTable => new byte[]
         {
-            17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
-            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+            17, 18, 24, 47, 99, 99, 99, 99,
+            18, 21, 26, 66, 99, 99, 99, 99,
+            24, 26, 56, 99, 99, 99, 99, 99,
+            47, 66, 99, 99, 99, 99, 99, 99,
+            99, 99, 99, 99, 99, 99, 99, 99,
+            99, 99, 99, 99, 99, 99, 99, 99,
+            99, 99, 99, 99, 99, 99, 99, 99,
             99, 99, 99, 99, 99, 99, 99, 99,
         };
 
         /// Ported from JPEGsnoop:
         /// https://github.com/ImpulseAdventure/JPEGsnoop/blob/9732ee0961f100eb69bbff4a0c47438d5997abee/source/JfifDecode.cpp#L4570-L4694
         /// <summary>
-        /// Estimates jpeg quality based on quantization table in zig-zag order.
+        /// Estimates JPEG quality by comparing a quantization table against a reference table.
         /// </summary>
         /// <remarks>
-        /// This technically can be used with any given table but internal decoder code uses ITU spec tables:
-        /// <see cref="UnscaledQuant_Luminance"/> and <see cref="UnscaledQuant_Chrominance"/>.
+        /// Technically, this can be used with any given table but internal decoder code uses ITU spec tables:
+        /// <see cref="LuminanceTable"/> and <see cref="ChrominanceTable"/>.
         /// </remarks>
         /// <param name="table">Input quantization table.</param>
-        /// <param name="target">Quantization to estimate against.</param>
-        /// <returns>Estimated quality</returns>
+        /// <param name="target">Natural order quantization table to estimate against.</param>
+        /// <returns>Estimated quality.</returns>
         public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan<byte> target)
         {
             // This method can be SIMD'ified if standard table is injected as Block8x8F.
@@ -106,11 +112,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             int quality;
             for (int i = 0; i < Block8x8F.Size; i++)
             {
-                float coeff = table[i];
-                int coeffInteger = (int)coeff;
+                int coeff = (int)table[i];
 
                 // Coefficients are actually int16 casted to float numbers so there's no truncating error.
-                if (coeffInteger != 0)
+                if (coeff != 0)
                 {
                     comparePercent = 100.0 * (table[i] / target[i]);
                 }
@@ -152,7 +157,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <returns>Estimated quality</returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable)
-            => EstimateQuality(ref luminanceTable, UnscaledQuant_Luminance);
+            => EstimateQuality(ref luminanceTable, LuminanceTable);
 
         /// <summary>
         /// Estimates jpeg quality based on quantization table in zig-zag order.
@@ -161,7 +166,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <returns>Estimated quality</returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int EstimateChrominanceQuality(ref Block8x8F chrominanceTable)
-            => EstimateQuality(ref chrominanceTable, UnscaledQuant_Chrominance);
+            => EstimateQuality(ref chrominanceTable, ChrominanceTable);
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int QualityToScale(int quality)
@@ -185,10 +190,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Block8x8F ScaleLuminanceTable(int quality)
-            => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Luminance);
+            => ScaleQuantizationTable(scale: QualityToScale(quality), LuminanceTable);
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Block8x8F ScaleChrominanceTable(int quality)
-            => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Chrominance);
+            => ScaleQuantizationTable(scale: QualityToScale(quality), ChrominanceTable);
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
new file mode 100644
index 000000000..066eb2846
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -0,0 +1,404 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    internal static partial class ZigZag
+    {
+        /// <summary>
+        /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
+        /// </summary>
+        private const byte Z = 0xff;
+
+        /// <summary>
+        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
+        /// zig zag implementation.
+        /// </summary>
+        private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
+        {
+            // 0_A
+            0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z,
+            // 0_B
+            Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5,
+            // 0_C
+            Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z,
+
+            // 1_A
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11,
+            // 1_B
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z,
+            // 1_C
+            2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z,
+            // 1_D
+            Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 1_E
+            Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+
+            // 2_B
+            8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 2_C
+            Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 2_D
+            Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 2_E
+            Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5,
+            // 2_F
+            Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z,
+            // 2_G
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z,
+
+            // 3_A
+            Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z,
+            // 3_B
+            Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z,
+            // 3_C
+            Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z,
+            // 3_D/4_E
+            6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9,
+
+            // 4_F
+            Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z,
+            // 4_G
+            Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z,
+            // 4_H
+            Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z,
+
+            // 5_B
+            Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 5_C
+            Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 5_D
+            10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z,
+            // 5_E
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z,
+            // 5_F
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z,
+            // 5_G
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7,
+
+            // 6_D
+            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z,
+            // 6_E
+            Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z,
+            // 6_F
+            Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13,
+            // 6_G
+            Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            // 6_H
+            4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+
+            // 7_F
+            Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z,
+            // 7_G
+            10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z,
+            // 7_H
+            Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15
+        };
+
+        /// <summary>
+        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx"/>
+        /// zig zag implementation.
+        /// </summary>
+        private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
+        {
+                // 01_AB/01_EF/23_CD - cross-lane
+                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,   0, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   6, 0, 0, 0,
+
+                // 01_AB - inner-lane
+                0, 1, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   6, 7, 12, 13,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   4, 5, 6, 7,
+
+                // 01_CD/23_GH - cross-lane
+                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,
+
+                // 01_CD - inner-lane
+                Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   Z, Z, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,
+
+                // 01_EF - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   0, 1, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 23_AB/45_CD/67_EF - cross-lane
+                3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,
+
+                // 23_AB - inner-lane
+                4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   2, 3, 8, 9,   Z, Z, Z, Z,
+
+                // 23_CD - inner-lane
+                Z, Z, 6, 7,   12, 13, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 12, 13,
+
+                // 23_EF - inner-lane
+                Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 23_GH - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 45_AB - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 45_CD - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   Z, Z, 2, 3,   8, 9, Z, Z,   Z, Z, Z, Z,
+
+                // 45_EF - cross-lane
+                1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   Z, Z, Z, Z,   2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,
+
+                // 45_EF - inner-lane
+                2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,
+
+                // 45_GH - inner-lane
+                Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 6, 7,
+
+                // 67_CD - inner-lane
+                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+
+                // 67_EF - inner-lane
+                Z, Z, Z, Z,   Z, Z, 6, 7,   0, 1, Z, Z,   2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,
+
+                // 67_GH - inner-lane
+                8, 9, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, 6, 7,   12, 13, 14, 15
+        };
+
+        /// <summary>
+        /// Applies zig-zag ordering to the given 8x8 matrix using SSE CPU intrinsics.
+        /// </summary>
+        /// <remarks>
+        /// Requires Ssse3 support.
+        /// </remarks>
+        /// <param name="source">Input matrix.</param>
+        /// <param name="dest">Matrix to store the result. Can be a reference to input matrix.</param>
+        public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 source, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
+
+            fixed (byte* maskPtr = SseShuffleMasks)
+            {
+                Vector128<byte> A = source.V0.AsByte();
+                Vector128<byte> B = source.V1.AsByte();
+                Vector128<byte> C = source.V2.AsByte();
+                Vector128<byte> D = source.V3.AsByte();
+                Vector128<byte> E = source.V4.AsByte();
+                Vector128<byte> F = source.V5.AsByte();
+                Vector128<byte> G = source.V6.AsByte();
+                Vector128<byte> H = source.V7.AsByte();
+
+                // row0
+                Vector128<short> row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
+                Vector128<short> row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
+                Vector128<short> row0 = Sse2.Or(row0_A, row0_B);
+                Vector128<short> row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
+                row0 = Sse2.Or(row0, row0_C);
+
+                // row1
+                Vector128<short> row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
+                Vector128<short> row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
+                Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
+                Vector128<short> row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1_C);
+                Vector128<short> row1_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1_D);
+                Vector128<short> row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1_E);
+
+                // row2
+                Vector128<short> row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
+                Vector128<short> row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
+                Vector128<short> row2 = Sse2.Or(row2_B, row2_C);
+                Vector128<short> row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_D);
+                Vector128<short> row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_E);
+                Vector128<short> row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_F);
+                Vector128<short> row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2_G);
+
+                // row3
+                Vector128<short> A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
+                Vector128<short> B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
+                Vector128<short> row3 = Sse2.Or(A_3, B_3);
+                Vector128<short> C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
+                row3 = Sse2.Or(row3, C_3);
+                Vector128<byte> D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
+                Vector128<short> D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16();
+                row3 = Sse2.Or(row3, D_3);
+
+                // row4
+                Vector128<short> E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16();
+                Vector128<short> F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
+                Vector128<short> row4 = Sse2.Or(E_4, F_4);
+                Vector128<short> G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, G_4);
+                Vector128<short> H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, H_4);
+
+                // row5
+                Vector128<short> B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
+                Vector128<short> C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
+                Vector128<short> row5 = Sse2.Or(B_5, C_5);
+                Vector128<short> D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, D_5);
+                Vector128<short> E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, E_5);
+                Vector128<short> F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, F_5);
+                Vector128<short> G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, G_5);
+
+                // row6
+                Vector128<short> D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
+                Vector128<short> E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
+                Vector128<short> row6 = Sse2.Or(D_6, E_6);
+                Vector128<short> F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, F_6);
+                Vector128<short> G_6 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, G_6);
+                Vector128<short> H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, H_6);
+
+                // row7
+                Vector128<short> F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
+                Vector128<short> G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
+                Vector128<short> row7 = Sse2.Or(F_7, G_7);
+                Vector128<short> H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16();
+                row7 = Sse2.Or(row7, H_7);
+
+                dest.V0 = row0;
+                dest.V1 = row1;
+                dest.V2 = row2;
+                dest.V3 = row3;
+                dest.V4 = row4;
+                dest.V5 = row5;
+                dest.V6 = row6;
+                dest.V7 = row7;
+            }
+        }
+
+        /// <summary>
+        /// Applies zig-zag ordering to the given 8x8 matrix using AVX cpu intrinsics.
+        /// </summary>
+        /// <remarks>
+        /// Requires Avx2 support, which is only asserted through DebugGuard.
+        /// </remarks>
+        /// <param name="source">Input matrix.</param>
+        /// <param name="dest">Matrix to store the result. Can be a reference to input matrix: all source vectors are read before the first store.</param>
+        public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 source, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+
+            fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
+            {
+                // 18 loads (all of them are 32-byte shuffle masks)
+                // 10 cross-lane shuffles (permutations)
+                // 14 shuffles
+                // 10 bitwise or's
+                // 4 stores
+
+                // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7
+                // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7
+                // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7
+                // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7
+                Vector256<byte> AB = source.V01.AsByte();
+                Vector256<byte> CD = source.V23.AsByte();
+                Vector256<byte> EF = source.V45.AsByte();
+                Vector256<byte> GH = source.V67.AsByte();
+
+                // row01 - A0  A1  B0  C0  B1  A2  A3  B2 | C1  D0  E0  D1  C2  B3  A4  A5
+                Vector256<int> AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
+
+                // row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X  X)  (X  X)
+                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
+                // row01_AB - (A0 A1) (B0  X) (B1 A2) (A3 B2) | (X  X)  (X  X)  (X  B3) (A4 A5)
+                row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
+
+                Vector256<int> CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
+
+                // row01_CD - (C0 C1) (X X)  (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X)
+                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
+                // row01_CD - (X  X)  (X C0) (X X) (X X) | (C1 D0) (X  D1)  (C2 X)  (X X)
+                row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
+
+                // row0123_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X)  (X X) (X X)
+                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
+                // row01_EF - (X X) (X X) (X X) (X X) | (X  X)  (E0 X) (X X) (X X)
+                Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
+
+                Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
+
+
+                // row23 - B4  C3  D2  E1  F0  G0  F1  E2 | D3  C4  B5  A6  A7  B6  C5  D4
+
+                Vector256<int> AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
+
+                // row2345_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X)
+                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
+                // row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X)
+                Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
+
+                // row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X)
+                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
+                // row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4)
+                row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
+
+                // row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
+
+                // row2345_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X)
+                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
+                // row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
+
+                Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
+
+
+                // row45 - E3  F2  G1  H0  H1  G2  F3  E4 | D5  C6  B7  C7  D6  E5  F4  G3
+
+                // row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X)
+                Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
+
+                // row4567_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X)
+                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
+                // row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X)
+                Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
+
+                Vector256<int> EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
+
+                // row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X)
+                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
+                // row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X)
+                row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
+
+                // row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3)
+                Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
+
+                Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
+
+
+                // row67 - H2  H3  G4  F5  E6  D7  E7  F6 | G5  H4  H5  G6  F7  G7  H6  H7
+
+                // row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
+
+                // row67_EF - (E6 E7) (F4 F5) (F6 F7) (X X) | (F6 F7) (X X) (X X) (X X)
+                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
+                // row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X)
+                row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
+
+                // row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7)
+                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
+                // row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7)
+                row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
+
+                Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
+
+                dest.V01 = row01.AsInt16();
+                dest.V23 = row23.AsInt16();
+                dest.V45 = row45.AsInt16();
+                dest.V67 = row67.AsInt16();
+            }
+        }
+    }
+}
+#endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
index 737652d4e..c2b0fc5d0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
@@ -4,19 +4,17 @@
 using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
-    /// <summary>
-    /// Holds the Jpeg UnZig array in a value/stack type.
-    /// Unzig maps from the zigzag ordering to the natural ordering. For example,
-    /// unzig[3] is the column and row of the fourth element in zigzag order. The
-    /// value is 16, which means first column (16%8 == 0) and third row (16/8 == 2).
-    /// </summary>
-    [StructLayout(LayoutKind.Sequential)]
-    internal unsafe struct ZigZag
+    internal static partial class ZigZag
     {
         /// <summary>
+        /// Gets span of zig-zag ordering indices.
+        /// </summary>
+        /// <remarks>
         /// When reading corrupted data, the Huffman decoders could attempt
         /// to reference an entry beyond the end of this array (if the decoded
         /// zero run length reaches past the end of the block).  To prevent
@@ -25,20 +23,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// to be stored in location 63 of the block, not somewhere random.
         /// The worst case would be a run-length of 15, which means we need 16
         /// fake entries.
-        /// </summary>
-        private const int Size = 64 + 16;
-
-        /// <summary>
-        /// Copy of <see cref="Unzig"/> in a value type
-        /// </summary>
-        public fixed byte Data[Size];
-
-        /// <summary>
-        /// Gets the unzigs map, which maps from the zigzag ordering to the natural ordering.
-        /// For example, unzig[3] is the column and row of the fourth element in zigzag order.
-        /// The value is 16, which means first column (16%8 == 0) and third row (16/8 == 2).
-        /// </summary>
-        private static ReadOnlySpan<byte> Unzig => new byte[]
+        /// </remarks>
+        public static ReadOnlySpan<byte> ZigZagOrder => new byte[]
         {
             0,  1,  8, 16,  9,  2,  3, 10,
             17, 24, 32, 25, 18, 11,  4,  5,
@@ -48,53 +34,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             29, 22, 15, 23, 30, 37, 44, 51,
             58, 59, 52, 45, 38, 31, 39, 46,
             53, 60, 61, 54, 47, 55, 62, 63,
-            63, 63, 63, 63, 63, 63, 63, 63, // Extra entries for safety in decoder
+
+            // Extra entries for safety in decoder
+            63, 63, 63, 63, 63, 63, 63, 63,
             63, 63, 63, 63, 63, 63, 63, 63
         };
-
-        /// <summary>
-        /// Returns the value at the given index
-        /// </summary>
-        /// <param name="idx">The index</param>
-        /// <returns>The <see cref="byte"/></returns>
-        public byte this[int idx]
-        {
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            get
-            {
-                ref byte self = ref Unsafe.As<ZigZag, byte>(ref this);
-                return Unsafe.Add(ref self, idx);
-            }
-        }
-
-        /// <summary>
-        /// Creates and fills an instance of <see cref="ZigZag"/> with Jpeg unzig indices
-        /// </summary>
-        /// <returns>The new instance</returns>
-        public static ZigZag CreateUnzigTable()
-        {
-            ZigZag result = default;
-            ref byte sourceRef = ref MemoryMarshal.GetReference(Unzig);
-            ref byte destinationRef = ref Unsafe.AsRef<byte>(result.Data);
-
-            Unzig.CopyTo(new Span<byte>(result.Data, Size));
-
-            return result;
-        }
-
-        /// <summary>
-        /// Apply Zigging to the given quantization table, so it will be sufficient to multiply blocks for dequantizing them.
-        /// </summary>
-        public static Block8x8F CreateDequantizationTable(ref Block8x8F qt)
-        {
-            Block8x8F result = default;
-
-            for (int i = 0; i < Block8x8F.Size; i++)
-            {
-                result[Unzig[i]] = qt[i];
-            }
-
-            return result;
-        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
index e94b07faa..477054264 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
@@ -740,9 +740,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                         stream.Read(this.temp, 0, 64);
                         remaining -= 64;
 
+                        // Parsing quantization table & saving it in natural order
                         for (int j = 0; j < 64; j++)
                         {
-                            table[j] = this.temp[j];
+                            table[ZigZag.ZigZagOrder[j]] = this.temp[j];
                         }
 
                         break;
@@ -760,9 +761,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                         stream.Read(this.temp, 0, 128);
                         remaining -= 128;
 
+                        // Parsing quantization table & saving it in natural order
                         for (int j = 0; j < 64; j++)
                         {
-                            table[j] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1];
+                            table[ZigZag.ZigZagOrder[j]] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1];
                         }
 
                         break;
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 8c6726e65..85a2c6846 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -151,7 +151,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             dqt[offset++] = (byte)i;
             for (int j = 0; j < Block8x8F.Size; j++)
             {
-                dqt[offset++] = (byte)quant[j];
+                dqt[offset++] = (byte)quant[ZigZag.ZigZagOrder[j]];
             }
         }
 
@@ -635,11 +635,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// Initializes quntization tables.
         /// </summary>
         /// <remarks>
+        /// <para>
+        /// Zig-zag ordering is NOT applied to the resulting tables.
+        /// </para>
+        /// <para>
         /// We take quality values in a hierarchical order:
         /// 1. Check if encoder has set quality
-        /// 2. Check if metadata has special table for encoding
-        /// 3. Check if metadata has set quality
-        /// 4. Take default quality value - 75
+        /// 2. Check if metadata has set quality
+        /// 3. Take default quality value - 75
+        /// </para>
         /// </remarks>
         /// <param name="componentCount">Color components count.</param>
         /// <param name="metadata">Jpeg metadata instance.</param>
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index 42fdd603e..fc642dcc7 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -272,32 +272,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             this.CompareBlocks(expected, actual, 0);
         }
 
+        // TODO: intrinsic tests
         [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public unsafe void Quantize(int seed)
+        [InlineData(1, 2)]
+        [InlineData(2, 1)]
+        public void Quantize(int srcSeed, int qtSeed)
         {
-            var block = default(Block8x8F);
-            block.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed));
-
-            var qt = default(Block8x8F);
-            qt.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed));
-
-            var unzig = ZigZag.CreateUnzigTable();
+            Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
+            Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed);
 
-            int* expectedResults = stackalloc int[Block8x8F.Size];
-            ReferenceImplementations.QuantizeRational(&block, expectedResults, &qt, unzig.Data);
+            Block8x8 expected = default;
+            ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
 
-            var actualResults = default(Block8x8F);
+            Block8x8 actual = default;
+            Block8x8F.Quantize(ref source, ref actual, ref quant);
 
-            Block8x8F.Quantize(ref block, ref actualResults, ref qt, ref unzig);
-
-            for (int i = 0; i < Block8x8F.Size; i++)
+            for (int i = 0; i < Block8x8.Size; i++)
             {
-                int expected = expectedResults[i];
-                int actual = (int)actualResults[i];
-
-                Assert.Equal(expected, actual);
+                Assert.Equal(expected[i], actual[i]);
             }
         }
 
@@ -368,48 +360,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX);
         }
 
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        [InlineData(3)]
-        public unsafe void DequantizeBlock(int seed)
-        {
-            Block8x8F original = CreateRandomFloatBlock(-500, 500, seed);
-            Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42);
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            Block8x8F expected = original;
-            Block8x8F actual = original;
-
-            ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data);
-            Block8x8F.DequantizeBlock(&actual, &qt, unzig.Data);
-
-            this.CompareBlocks(expected, actual, 0);
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        [InlineData(3)]
-        public unsafe void ZigZag_CreateDequantizationTable_MultiplicationShouldQuantize(int seed)
-        {
-            Block8x8F original = CreateRandomFloatBlock(-500, 500, seed);
-            Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42);
-
-            var unzig = ZigZag.CreateUnzigTable();
-            Block8x8F zigQt = ZigZag.CreateDequantizationTable(ref qt);
-
-            Block8x8F expected = original;
-            Block8x8F actual = original;
-
-            ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data);
-
-            actual.MultiplyInPlace(ref zigQt);
-
-            this.CompareBlocks(expected, actual, 0);
-        }
-
         [Fact]
         public void AddToAllInPlace()
         {
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs
index 03f7020c0..4505ef538 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs
@@ -21,7 +21,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 Block8x8F table = JpegQuantization.ScaleLuminanceTable(quality);
                 int estimatedQuality = JpegQuantization.EstimateLuminanceQuality(ref table);
 
-                Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate luminance quality for standard table at quality level {quality}");
+                Assert.True(
+                    quality.Equals(estimatedQuality),
+                    $"Failed to estimate luminance quality for standard table at quality level {quality}");
             }
         }
 
@@ -35,7 +37,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 Block8x8F table = JpegQuantization.ScaleChrominanceTable(quality);
                 int estimatedQuality = JpegQuantization.EstimateChrominanceQuality(ref table);
 
-                Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate chrominance quality for standard table at quality level {quality}");
+                Assert.True(
+                    quality.Equals(estimatedQuality),
+                    $"Failed to estimate chrominance quality for standard table at quality level {quality}");
             }
         }
     }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
index 2c673f30e..aa98a7379 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
@@ -15,18 +15,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
     /// </summary>
     internal static partial class ReferenceImplementations
     {
-        public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr)
+        public static void DequantizeBlock(ref Block8x8F block, ref Block8x8F qt, ReadOnlySpan<byte> zigzag)
         {
-            float* b = (float*)blockPtr;
-            float* qtp = (float*)qtPtr;
-            for (int qtIndex = 0; qtIndex < Block8x8F.Size; qtIndex++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
-                byte i = unzigPtr[qtIndex];
-                float* unzigPos = b + i;
-
-                float val = *unzigPos;
-                val *= qtp[qtIndex];
-                *unzigPos = val;
+                int zig = zigzag[i];
+                block[zig] *= qt[i];
             }
         }
 
@@ -101,42 +95,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
 
         /// <summary>
         /// Reference implementation to test <see cref="Block8x8F.Quantize"/>.
-        /// Rounding is done used an integer-based algorithm defined in <see cref="RationalRound(int,int)"/>.
         /// </summary>
-        /// <param name="src">The input block</param>
-        /// <param name="dest">The destination block of integers</param>
-        /// <param name="qt">The quantization table</param>
-        /// <param name="unzigPtr">Pointer to <see cref="ZigZag.Data"/> </param>
-        public static unsafe void QuantizeRational(Block8x8F* src, int* dest, Block8x8F* qt, byte* unzigPtr)
+        /// <param name="src">The input block.</param>
+        /// <param name="dest">The destination block of 16bit integers.</param>
+        /// <param name="qt">The quantization table.</param>
+        /// <param name="zigzag">Zig-Zag index sequence span.</param>
+        public static void Quantize(ref Block8x8F src, ref Block8x8 dest, ref Block8x8F qt, ReadOnlySpan<byte> zigzag)
         {
-            float* s = (float*)src;
-            float* q = (float*)qt;
-
-            for (int zig = 0; zig < Block8x8F.Size; zig++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
-                int a = (int)s[unzigPtr[zig]];
-                int b = (int)q[zig];
-
-                int val = RationalRound(a, b);
-                dest[zig] = val;
+                int zig = zigzag[i];
+                dest[i] = (short)Math.Round(src[zig] / qt[zig], MidpointRounding.AwayFromZero);
             }
         }
-
-        /// <summary>
-        /// Rounds a rational number defined as dividend/divisor into an integer.
-        /// </summary>
-        /// <param name="dividend">The dividend.</param>
-        /// <param name="divisor">The divisor.</param>
-        /// <returns>The rounded value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int RationalRound(int dividend, int divisor)
-        {
-            if (dividend >= 0)
-            {
-                return (dividend + (divisor >> 1)) / divisor;
-            }
-
-            return -((-dividend + (divisor >> 1)) / divisor);
-        }
     }
 }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
index e03cf9958..39046438a 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
@@ -13,8 +13,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         public void ZigZagCanHandleAllPossibleCoefficients()
         {
             // Mimic the behaviour of the huffman scan decoder using all possible byte values
-            var block = new short[64];
-            var zigzag = ZigZag.CreateUnzigTable();
+            short[] block = new short[64];
 
             for (int h = 0; h < 255; h++)
             {
@@ -27,7 +26,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     if (s != 0)
                     {
                         i += r;
-                        block[zigzag[i++]] = (short)s;
+                        block[ZigZag.ZigZagOrder[i++]] = (short)s;
                     }
                     else
                     {

From a220b3d5b894724fb7722efae95cd75c83609edc Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 28 Aug 2021 19:29:30 +0300
Subject: [PATCH 17/56] Removed obsolete code, tests cleanup

---
 .../Formats/Jpeg/Components/Block8x8F.cs      |  57 -------
 .../Formats/Jpeg/Components/ZigZag.cs         |   4 -
 .../Formats/Jpg/Block8x8Tests.cs              | 155 +++++++++++++++++-
 .../Formats/Jpg/HuffmanScanEncoderTests.cs    | 152 -----------------
 4 files changed, 154 insertions(+), 214 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 79a35e2cd..b29c13e6e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -768,62 +768,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 return true;
             }
         }
-
-        /// <summary>
-        /// Returns index of the last non-zero element in this matrix.
-        /// </summary>
-        /// <returns>
-        /// Index of the last non-zero element. Returns -1 if all elements are equal to zero.
-        /// </returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public int GetLastNonZeroIndex()
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
-            {
-                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
-
-                Vector256<int> zero8 = Vector256<int>.Zero;
-
-                ref Vector256<float> mcuStride = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
-
-                for (int i = 7; i >= 0; i--)
-                {
-                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
-
-                    if (areEqual != equalityMask)
-                    {
-                        // Each 4 bits represents comparison operation for each 4-byte element in input vectors
-                        // LSB represents first element in the stride
-                        // MSB represents last element in the stride
-                        // lzcnt operation would calculate number of zero numbers at the end
-
-                        // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements
-                        // So we need to invert it
-                        int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
-
-                        // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4
-                        // to get the exact number of zero elements in the stride
-                        int strideRelativeIndex = 7 - (lzcnt / 4);
-                        return (i * 8) + strideRelativeIndex;
-                    }
-                }
-
-                return -1;
-            }
-            else
-#endif
-            {
-                int index = Size - 1;
-                ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this);
-
-                while (index >= 0 && (int)Unsafe.Add(ref elemRef, index) == 0)
-                {
-                    index--;
-                }
-
-                return index;
-            }
-        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
index c2b0fc5d0..e519a8a1d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
@@ -2,10 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
index afe71ad04..6d73181cb 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@@ -1,9 +1,10 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
-
+using SixLabors.ImageSharp.Tests.TestUtilities;
 using Xunit;
 using Xunit.Abstractions;
 
@@ -121,5 +122,157 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             Assert.Equal(15, d);
         }
+
+        [Fact]
+        public void GetLastNonZeroIndex_AllZero()
+        {
+            static void RunTest()
+            {
+                Block8x8 data = default;
+
+                int expected = -1;
+
+                int actual = data.GetLastNonZeroIndex();
+
+                Assert.Equal(expected, actual);
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
+        }
+
+        [Fact]
+        public void GetLastNonZeroIndex_AllNonZero()
+        {
+            static void RunTest()
+            {
+                Block8x8 data = default;
+                for (int i = 0; i < Block8x8.Size; i++)
+                {
+                    data[i] = 10;
+                }
+
+                int expected = Block8x8.Size - 1;
+
+                int actual = data.GetLastNonZeroIndex();
+
+                Assert.Equal(expected, actual);
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
+        }
+
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void GetLastNonZeroIndex_RandomFilledSingle(int seed)
+        {
+            static void RunTest(string seedSerialized)
+            {
+                int seed = FeatureTestRunner.Deserialize<int>(seedSerialized);
+                var rng = new Random(seed);
+
+                for (int i = 0; i < 1000; i++)
+                {
+                    Block8x8 data = default;
+
+                    int setIndex = rng.Next(1, Block8x8.Size);
+                    data[setIndex] = (short)rng.Next(-2000, 2000);
+
+                    int expected = setIndex;
+
+                    int actual = data.GetLastNonZeroIndex();
+
+                    Assert.Equal(expected, actual);
+                }
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                seed,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
+        }
+
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void GetLastNonZeroIndex_RandomFilledPartially(int seed)
+        {
+            static void RunTest(string seedSerialized)
+            {
+                int seed = FeatureTestRunner.Deserialize<int>(seedSerialized);
+                var rng = new Random(seed);
+
+                for (int i = 0; i < 1000; i++)
+                {
+                    Block8x8 data = default;
+
+                    int lastIndex = rng.Next(1, Block8x8.Size);
+                    short fillValue = (short)rng.Next(-2000, 2000);
+                    for (int dataIndex = 0; dataIndex <= lastIndex; dataIndex++)
+                    {
+                        data[dataIndex] = fillValue;
+                    }
+
+                    int expected = lastIndex;
+
+                    int actual = data.GetLastNonZeroIndex();
+
+                    Assert.Equal(expected, actual);
+                }
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                seed,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
+        }
+
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void GetLastNonZeroIndex_RandomFilledFragmented(int seed)
+        {
+            static void RunTest(string seedSerialized)
+            {
+                int seed = FeatureTestRunner.Deserialize<int>(seedSerialized);
+                var rng = new Random(seed);
+
+                for (int i = 0; i < 1000; i++)
+                {
+                    Block8x8 data = default;
+
+                    short fillValue = (short)rng.Next(-2000, 2000);
+
+                    // first filled chunk
+                    int lastIndex1 = rng.Next(1, Block8x8F.Size / 2);
+                    for (int dataIndex = 0; dataIndex <= lastIndex1; dataIndex++)
+                    {
+                        data[dataIndex] = fillValue;
+                    }
+
+                    // second filled chunk, there might be a spot with zero(s) between first and second chunk
+                    int lastIndex2 = rng.Next(lastIndex1 + 1, Block8x8F.Size);
+                    for (int dataIndex = 0; dataIndex <= lastIndex2; dataIndex++)
+                    {
+                        data[dataIndex] = fillValue;
+                    }
+
+                    int expected = lastIndex2;
+
+                    int actual = data.GetLastNonZeroIndex();
+
+                    Assert.Equal(expected, actual);
+                }
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                seed,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
+        }
     }
 }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
index a3aa957ee..42f2fa0d5 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs
@@ -85,157 +85,5 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 Assert.Equal(expected, actual);
             }
         }
-
-        [Fact]
-        public void GetLastNonZeroIndex_AllZero()
-        {
-            static void RunTest()
-            {
-                Block8x8F data = default;
-
-                int expectedLessThan = 1;
-
-                int actual = data.GetLastNonZeroIndex();
-
-                Assert.True(actual < expectedLessThan);
-            }
-
-            FeatureTestRunner.RunWithHwIntrinsicsFeature(
-                RunTest,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
-        }
-
-        [Fact]
-        public void GetLastNonZeroIndex_AllNonZero()
-        {
-            static void RunTest()
-            {
-                Block8x8F data = default;
-                for (int i = 0; i < Block8x8F.Size; i++)
-                {
-                    data[i] = 10;
-                }
-
-                int expected = Block8x8F.Size - 1;
-
-                int actual = data.GetLastNonZeroIndex();
-
-                Assert.Equal(expected, actual);
-            }
-
-            FeatureTestRunner.RunWithHwIntrinsicsFeature(
-                RunTest,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public void GetLastNonZeroIndex_RandomFilledSingle(int seed)
-        {
-            static void RunTest(string seedSerialized)
-            {
-                int seed = FeatureTestRunner.Deserialize<int>(seedSerialized);
-                var rng = new Random(seed);
-
-                for (int i = 0; i < 1000; i++)
-                {
-                    Block8x8F data = default;
-
-                    int setIndex = rng.Next(1, Block8x8F.Size);
-                    data[setIndex] = rng.Next();
-
-                    int expected = setIndex;
-
-                    int actual = data.GetLastNonZeroIndex();
-
-                    Assert.Equal(expected, actual);
-                }
-            }
-
-            FeatureTestRunner.RunWithHwIntrinsicsFeature(
-                RunTest,
-                seed,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public void GetLastNonZeroIndex_RandomFilledPartially(int seed)
-        {
-            static void RunTest(string seedSerialized)
-            {
-                int seed = FeatureTestRunner.Deserialize<int>(seedSerialized);
-                var rng = new Random(seed);
-
-                for (int i = 0; i < 1000; i++)
-                {
-                    Block8x8F data = default;
-
-                    int lastIndex = rng.Next(1, Block8x8F.Size);
-                    int fillValue = rng.Next();
-                    for (int dataIndex = 0; dataIndex <= lastIndex; dataIndex++)
-                    {
-                        data[dataIndex] = fillValue;
-                    }
-
-                    int expected = lastIndex;
-
-                    int actual = data.GetLastNonZeroIndex();
-
-                    Assert.Equal(expected, actual);
-                }
-            }
-
-            FeatureTestRunner.RunWithHwIntrinsicsFeature(
-                RunTest,
-                seed,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public void GetLastNonZeroIndex_RandomFilledFragmented(int seed)
-        {
-            static void RunTest(string seedSerialized)
-            {
-                int seed = FeatureTestRunner.Deserialize<int>(seedSerialized);
-                var rng = new Random(seed);
-
-                for (int i = 0; i < 1000; i++)
-                {
-                    Block8x8F data = default;
-
-                    int fillValue = rng.Next();
-
-                    // first filled chunk
-                    int lastIndex1 = rng.Next(1, Block8x8F.Size / 2);
-                    for (int dataIndex = 0; dataIndex <= lastIndex1; dataIndex++)
-                    {
-                        data[dataIndex] = fillValue;
-                    }
-
-                    // second filled chunk, there might be a spot with zero(s) between first and second chunk
-                    int lastIndex2 = rng.Next(lastIndex1 + 1, Block8x8F.Size);
-                    for (int dataIndex = 0; dataIndex <= lastIndex2; dataIndex++)
-                    {
-                        data[dataIndex] = fillValue;
-                    }
-
-                    int expected = lastIndex2;
-
-                    int actual = data.GetLastNonZeroIndex();
-
-                    Assert.Equal(expected, actual);
-                }
-            }
-
-            FeatureTestRunner.RunWithHwIntrinsicsFeature(
-                RunTest,
-                seed,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
-        }
     }
 }

From cc99da35bf20804ae57000e15bb75b4c330a8679 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 29 Aug 2021 05:35:58 +0300
Subject: [PATCH 18/56] Added DCT in place

---
 .../Decoder/JpegBlockPostProcessor.cs         | 24 ++++------
 .../Components/Encoder/HuffmanScanEncoder.cs  | 22 +++++-----
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 44 +++++++++++++++----
 .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs  |  2 +-
 4 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index 00169d082..cf5fdd2df 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -19,14 +19,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         public Block8x8F SourceBlock;
 
         /// <summary>
-        /// Temporal block 1 to store intermediate and/or final computation results.
+        /// Temporal block to store intermediate computation results.
         /// </summary>
-        public Block8x8F WorkspaceBlock1;
-
-        /// <summary>
-        /// Temporal block 2 to store intermediate and/or final computation results.
-        /// </summary>
-        public Block8x8F WorkspaceBlock2;
+        public Block8x8F WorkspaceBlock;
 
         /// <summary>
         /// The quantization table as <see cref="Block8x8F"/>.
@@ -50,8 +45,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             this.subSamplingDivisors = component.SubSamplingDivisors;
 
             this.SourceBlock = default;
-            this.WorkspaceBlock1 = default;
-            this.WorkspaceBlock2 = default;
+            this.WorkspaceBlock = default;
         }
 
         /// <summary>
@@ -71,20 +65,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             int destAreaStride,
             float maximumValue)
         {
-            ref Block8x8F b = ref this.SourceBlock;
-            b.LoadFrom(ref sourceBlock);
+            ref Block8x8F block = ref this.SourceBlock;
+            block.LoadFrom(ref sourceBlock);
 
             // Dequantize:
-            b.MultiplyInPlace(ref this.DequantiazationTable);
+            block.MultiplyInPlace(ref this.DequantiazationTable);
 
-            FastFloatingPointDCT.TransformIDCT(ref b, ref this.WorkspaceBlock1, ref this.WorkspaceBlock2);
+            FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock);
 
             // To conform better to libjpeg we actually NEED TO loose precision here.
             // This is because they store blocks as Int16 between all the operations.
             // To be "more accurate", we need to emulate this by rounding!
-            this.WorkspaceBlock1.NormalizeColorsAndRoundInPlace(maximumValue);
+            block.NormalizeColorsAndRoundInPlace(maximumValue);
 
-            this.WorkspaceBlock1.ScaledCopyTo(
+            block.ScaledCopyTo(
                 ref destAreaOrigin,
                 destAreaStride,
                 this.subSamplingDivisors.Width,
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 8b61b66c9..4f5ffb3f8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private int bitCount;
 
-        private Block8x8F temporalBlock1;
-        private Block8x8F temporalBlock2;
+        private Block8x8F temporalBlock;
         private Block8x8 temporalShortBlock;
 
         /// <summary>
@@ -299,23 +298,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <param name="index">The quantization table index.</param>
         /// <param name="prevDC">The previous DC value.</param>
-        /// <param name="src">Source block</param>
-        /// <param name="quant">Quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
+        /// <param name="block">Source block.</param>
+        /// <param name="quant">Quantization table.</param>
         /// <returns>The <see cref="int"/>.</returns>
         private int WriteBlock(
             QuantIndex index,
             int prevDC,
-            ref Block8x8F src,
+            ref Block8x8F block,
             ref Block8x8F quant)
         {
-            ref Block8x8F refTemp1 = ref this.temporalBlock1;
-            ref Block8x8F refTemp2 = ref this.temporalBlock2;
+            ref Block8x8F refTemp = ref this.temporalBlock;
             ref Block8x8 spectralBlock = ref this.temporalShortBlock;
 
-            FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
+            // Shifting level from 0..255 to -128..127
+            block.AddInPlace(-128f);
 
-            Block8x8F.Quantize(ref refTemp1, ref spectralBlock, ref quant);
+            // Discrete cosine transform
+            FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp);
+
+            // Quantization
+            Block8x8F.Quantize(ref block, ref spectralBlock, ref quant);
 
             // Emit the DC delta.
             int dc = spectralBlock[0];
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 0f569b5da..dd46a83e3 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -276,28 +276,36 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <param name="src">Source</param>
         /// <param name="dest">Destination</param>
         /// <param name="temp">Temporary block provided by the caller for optimization</param>
-        /// <param name="offsetSourceByNeg128">If true, a constant -128.0 offset is applied for all values before FDCT </param>
         public static void TransformFDCT(
             ref Block8x8F src,
             ref Block8x8F dest,
-            ref Block8x8F temp,
-            bool offsetSourceByNeg128 = true)
+            ref Block8x8F temp)
         {
             src.TransposeInto(ref temp);
-            if (offsetSourceByNeg128)
-            {
-                temp.AddInPlace(-128F);
-            }
-
             FDCT8x8(ref temp, ref dest);
 
             dest.TransposeInto(ref temp);
-
             FDCT8x8(ref temp, ref dest);
 
             dest.MultiplyInPlace(C_0_125);
         }
 
+        /// <summary>
+        /// Apply floating point FDCT inplace.
+        /// </summary>
+        /// <param name="matrix">Input matrix.</param>
+        /// <param name="temp">Matrix to store temporal results.</param>
+        public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp)
+        {
+            matrix.TransposeInto(ref temp);
+            FDCT8x8(ref temp, ref matrix);
+
+            matrix.TransposeInto(ref temp);
+            FDCT8x8(ref temp, ref matrix);
+
+            matrix.MultiplyInPlace(C_0_125);
+        }
+
         /// <summary>
         /// Performs 8x8 matrix Inverse Discrete Cosine Transform
         /// </summary>
@@ -510,5 +518,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
             dest.MultiplyInPlace(C_0_125);
         }
+
+        /// <summary>
+        /// Apply floating point IDCT inplace.
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
+        /// </summary>
+        /// <param name="matrix">Input matrix.</param>
+        /// <param name="temp">Matrix to store temporal results.</param>
+        public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp)
+        {
+            block.TransposeInto(ref temp);
+
+            IDCT8x8(ref temp, ref block);
+            block.TransposeInto(ref temp);
+            IDCT8x8(ref temp, ref block);
+
+            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
+            block.MultiplyInPlace(C_0_125);
+        }
     }
 }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index d49a6498c..34ca7f9eb 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -310,7 +310,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
 
                     // testee
-                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
+                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2);
 
                     var actualDest = new float[64];
                     destBlock.ScaledCopyTo(actualDest);

From 839da83f17b55e97fe96720e754eb4a60d2cd302 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 31 Aug 2021 19:19:00 +0300
Subject: [PATCH 19/56] Update sandbox

---
 .../Program.cs                                | 78 +++++++------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
index bdba1bef6..ef41294bc 100644
--- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
+++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
@@ -34,70 +34,46 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
         /// </param>
         public static void Main(string[] args)
         {
-            /* Master */
-            // Elapsed: 5431ms across 200 iterations
-            // Average: 27,155ms
-
-            /* Inserting stuff bytes later */
-            // Elapsed: 5300ms across 200 iterations
-            // Average: 26,5ms
-
-            /* Flush if check */
-            // Elapsed: 5209ms across 200 iterations
-            // Average: 26,045ms
-
-            /* [INVALID] int32 flush - invalid flush order */
-            // Elapsed: 4784ms across 200 iterations
-            // Average: 23,92ms
-
-            /* int32 flush - correct flush order */
-            // Elapsed: 5049ms across 200 iterations
-            // Average: 25,245ms
-
-            /* int32 flush - identical file output */
-            // Elapsed: 4800ms across 200 iterations
-            // Average: 24.00ms
-
-            /* int32 flush - optimized huffman storage & reduced instructions per Emit() */
-            // Elapsed: 4680ms across 200 iterations
-            // Average: 23,4ms
-
-            /* int32 flush - merged prefix & value Emit() call */
-            // Elapsed: 4644ms across 200 iterations
-            // Average: 23,22ms
-
-
-            /* Fixed last valuable index calculation */
-            // Elapsed: 4606ms across 200 iterations
-            // Average: 23,03ms
-
-            /* Intrinsic last valuable index */
-            // Elapsed: 4519ms across 200 iterations
-            // Average: 22,595ms
-
-            BenchmarkEncoder("uniform_size", 200, 100);
-
-            //ReEncodeImage("uniform_size", 100);
+            BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444);
+            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio444);
+            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio444);
+            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio444);
+
+            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio420);
+            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio420);
+            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio420);
+            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio420);
+
+            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance, JpegSubsample.Ratio444);
+            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance, JpegSubsample.Ratio444);
+            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance, JpegSubsample.Ratio444);
+            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance, JpegSubsample.Ratio444);
+
+            //ReEncodeImage("snow_main", 100);
+            //ReEncodeImage("snow_main", 90);
+            //ReEncodeImage("snow_main", 75);
+            //ReEncodeImage("snow_main", 50);
 
             Console.WriteLine("Done.");
         }
 
         const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg";
 
-        private static void BenchmarkEncoder(string fileName, int iterations, int quality)
+        private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color, JpegSubsample subsample)
         {
             string loadPath = String.Format(pathTemplate, fileName);
 
+            using var inputStream = new FileStream(loadPath, FileMode.Open);
             using var saveStream = new MemoryStream();
 
             var decoder = new JpegDecoder { IgnoreMetadata = true };
-            using Image img = decoder.Decode(Configuration.Default, new FileStream(loadPath, FileMode.Open));
+            using Image img = decoder.Decode(Configuration.Default, inputStream);
 
             var encoder = new JpegEncoder()
             {
                 Quality = quality,
-                ColorType = JpegColorType.YCbCr,
-                Subsample = JpegSubsample.Ratio444
+                ColorType = color,
+                Subsample = subsample
             };
 
             Stopwatch sw = new Stopwatch();
@@ -109,7 +85,9 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             }
             sw.Stop();
 
-            Console.WriteLine($"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n// Average: {(double)sw.ElapsedMilliseconds / iterations}ms");
+            Console.WriteLine($"// Encoding q={quality} | color={color} | sub={subsample}\n" +
+                $"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n" +
+                $"// Average: {(double)sw.ElapsedMilliseconds / iterations}ms");
         }
 
         private static void ReEncodeImage(string fileName, int quality)
@@ -117,7 +95,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             string loadPath = String.Format(pathTemplate, fileName);
             using Image img = Image.Load(loadPath);
 
-            string savePath = String.Format(pathTemplate, $"testSave_{fileName}");
+            string savePath = String.Format(pathTemplate, $"q{quality}_test_{fileName}");
             var encoder = new JpegEncoder()
             {
                 Quality = quality,

From e3d328053b9e1f426acc9f14c79be55cff8dda8c Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 6 Sep 2021 07:42:51 +0300
Subject: [PATCH 20/56] Update shared-infrastructure submodule

---
 shared-infrastructure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared-infrastructure b/shared-infrastructure
index 9b94ebc4b..f48ab8291 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 9b94ebc4be9b7a8d7620c257e6ee485455973332
+Subproject commit f48ab829167c42c69242ed0d303683232fbfccd1

From 81204d3fcb481d7da9427dc6b9d6cbac65d3880a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 6 Sep 2021 08:10:36 +0300
Subject: [PATCH 21/56] Fixed switch for color type

---
 .../Formats/Jpeg/JpegEncoderCore.cs           | 40 ++++++++-----------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 1a911ecb0..6ff887667 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -131,29 +131,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.WriteStartOfScan(componentCount, componentIds);
 
             // Write the scan compressed data.
-            if (this.colorType == JpegColorType.Luminance)
-            {
-                // luminance quantization table only
-                new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
-            }
-            else
-            {
-                // luminance and chrominance quantization tables.
-                switch (this.colorType)
-                {
-                    case JpegColorType.YCbCrRatio444:
-                        new HuffmanScanEncoder(3, stream).Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
-                        break;
-                    case JpegColorType.YCbCrRatio420:
-                        new HuffmanScanEncoder(6, stream).Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
-                        break;
-                    case JpegColorType.Luminance:
-                        new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
-                        break;
-                    case JpegColorType.Rgb:
-                        new HuffmanScanEncoder(3, stream).EncodeRgb(image, ref luminanceQuantTable, cancellationToken);
-                        break;
-                }
+            switch (this.colorType)
+            {
+                case JpegColorType.YCbCrRatio444:
+                    new HuffmanScanEncoder(3, stream).Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                    break;
+                case JpegColorType.YCbCrRatio420:
+                    new HuffmanScanEncoder(6, stream).Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                    break;
+                case JpegColorType.Luminance:
+                    new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
+                    break;
+                case JpegColorType.Rgb:
+                    new HuffmanScanEncoder(3, stream).EncodeRgb(image, ref luminanceQuantTable, cancellationToken);
+                    break;
+                default:
+                    // all other non-supported color types are checked at the start of this method
+                    break;
             }
 
             // Write the End Of Image marker.

From 7a21a889446027cd81ee84dbd29cca74cb9a3642 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 6 Sep 2021 08:55:22 +0300
Subject: [PATCH 22/56] Fixed failing tests

---
 .../Components/Encoder/HuffmanScanEncoder.cs   | 18 +++++++++---------
 .../Formats/Jpg/Block8x8Tests.cs               | 18 ++++++++++--------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 8e799e98b..db0bc32ae 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -303,8 +303,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
-            var unzig = ZigZag.CreateUnzigTable();
-
             // ReSharper disable once InconsistentNaming
             int prevDCR = 0, prevDCG = 0, prevDCB = 0;
 
@@ -327,26 +325,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCR,
                         ref pixelConverter.R,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);
 
                     prevDCG = this.WriteBlock(
                         QuantIndex.Luminance,
                         prevDCG,
                         ref pixelConverter.G,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);
 
                     prevDCB = this.WriteBlock(
                         QuantIndex.Luminance,
                         prevDCB,
                         ref pixelConverter.B,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);
+
+                    if (this.IsFlushNeeded)
+                    {
+                        this.FlushToStream();
+                    }
                 }
             }
 
-            this.FlushInternalBuffer();
+            this.FlushRemainingBytes();
         }
 
         /// <summary>
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
index 6d73181cb..69375ae1b 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@@ -248,24 +248,26 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     short fillValue = (short)rng.Next(-2000, 2000);
 
                     // first filled chunk
-                    int lastIndex1 = rng.Next(1, Block8x8F.Size / 2);
-                    for (int dataIndex = 0; dataIndex <= lastIndex1; dataIndex++)
+                    int firstChunkStart = rng.Next(0, Block8x8.Size / 2);
+                    int firstChunkEnd = rng.Next(firstChunkStart, Block8x8.Size / 2);
+                    for (int dataIdx = firstChunkStart; dataIdx <= firstChunkEnd; dataIdx++)
                     {
-                        data[dataIndex] = fillValue;
+                        data[dataIdx] = fillValue;
                     }
 
                     // second filled chunk, there might be a spot with zero(s) between first and second chunk
-                    int lastIndex2 = rng.Next(lastIndex1 + 1, Block8x8F.Size);
-                    for (int dataIndex = 0; dataIndex <= lastIndex2; dataIndex++)
+                    int secondChunkStart = rng.Next(firstChunkEnd, Block8x8.Size);
+                    int secondChunkEnd = rng.Next(secondChunkStart, Block8x8.Size);
+                    for (int dataIdx = secondChunkStart; dataIdx <= secondChunkEnd; dataIdx++)
                     {
-                        data[dataIndex] = fillValue;
+                        data[dataIdx] = fillValue;
                     }
 
-                    int expected = lastIndex2;
+                    int expected = secondChunkEnd;
 
                     int actual = data.GetLastNonZeroIndex();
 
-                    Assert.Equal(expected, actual);
+                    Assert.True(expected == actual, $"Expected: {expected}\nActual: {actual}\nInput matrix: {data}");
                 }
             }
 

From 4d5886680fd6a5e4651024846b0dd177b276816f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 6 Sep 2021 08:55:27 +0300
Subject: [PATCH 23/56] Fixed sandbox

---
 .../Program.cs                                | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
index ef41294bc..471251c2e 100644
--- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
+++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
@@ -34,7 +34,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
         /// </param>
         public static void Main(string[] args)
         {
-            BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444);
+            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444);
             //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio444);
             //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio444);
             //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio444);
@@ -49,17 +49,17 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance, JpegSubsample.Ratio444);
             //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance, JpegSubsample.Ratio444);
 
-            //ReEncodeImage("snow_main", 100);
-            //ReEncodeImage("snow_main", 90);
-            //ReEncodeImage("snow_main", 75);
-            //ReEncodeImage("snow_main", 50);
+            ReEncodeImage("snow_main", 100);
+            ReEncodeImage("snow_main", 90);
+            ReEncodeImage("snow_main", 75);
+            ReEncodeImage("snow_main", 50);
 
             Console.WriteLine("Done.");
         }
 
         const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg";
 
-        private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color, JpegSubsample subsample)
+        private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color)
         {
             string loadPath = String.Format(pathTemplate, fileName);
 
@@ -72,8 +72,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             var encoder = new JpegEncoder()
             {
                 Quality = quality,
-                ColorType = color,
-                Subsample = subsample
+                ColorType = color
             };
 
             Stopwatch sw = new Stopwatch();
@@ -85,7 +84,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             }
             sw.Stop();
 
-            Console.WriteLine($"// Encoding q={quality} | color={color} | sub={subsample}\n" +
+            Console.WriteLine($"// Encoding q={quality} | color={color}\n" +
                 $"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n" +
                 $"// Average: {(double)sw.ElapsedMilliseconds / iterations}ms");
         }
@@ -99,8 +98,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             var encoder = new JpegEncoder()
             {
                 Quality = quality,
-                ColorType = JpegColorType.YCbCr,
-                Subsample = JpegSubsample.Ratio444
+                ColorType = JpegColorType.Rgb
             };
             img.SaveAsJpeg(savePath, encoder);
         }

From 0b55bed262d75aac144e295bf83b98d1cb3ae142 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 7 Sep 2021 04:12:56 +0300
Subject: [PATCH 24/56] Slightly improved tiff decoding with jpeg data, removed
 unnecessary GC pressure

---
 .../Jpeg/Components/Decoder/SpectralConverter.cs      |  2 +-
 .../Compression/Decompressors/JpegTiffCompression.cs  | 11 +++++------
 .../Decompressors/RgbJpegSpectralConverter.cs         |  2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
index 23bb01409..e975b11fb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
@@ -39,6 +39,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         /// <param name="frame">The jpeg frame with the color space to convert to.</param>
         /// <param name="jpegData">The raw JPEG data.</param>
         /// <returns>The color converter.</returns>
-        public virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision);
+        protected virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision);
     }
 }
diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs
index bd1c496b4..e764c014d 100644
--- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs
@@ -65,22 +65,21 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors
                 scanDecoder.ResetInterval = 0;
                 jpegDecoder.ParseStream(stream, scanDecoder, CancellationToken.None);
 
-                using var image = new Image<Rgb24>(this.configuration, spectralConverter.PixelBuffer, new ImageMetadata());
-                CopyImageBytesToBuffer(buffer, image);
+                CopyImageBytesToBuffer(buffer, spectralConverter.PixelBuffer);
             }
             else
             {
                 using var image = Image.Load<Rgb24>(stream);
-                CopyImageBytesToBuffer(buffer, image);
+                CopyImageBytesToBuffer(buffer, image.Frames.RootFrame.PixelBuffer);
             }
         }
 
-        private static void CopyImageBytesToBuffer(Span<byte> buffer, Image<Rgb24> image)
+        private static void CopyImageBytesToBuffer(Span<byte> buffer, Buffer2D<Rgb24> pixelBuffer)
         {
             int offset = 0;
-            for (int y = 0; y < image.Height; y++)
+            for (int y = 0; y < pixelBuffer.Height; y++)
             {
-                Span<Rgb24> pixelRowSpan = image.GetPixelRowSpan(y);
+                Span<Rgb24> pixelRowSpan = pixelBuffer.GetRowSpan(y);
                 Span<byte> rgbBytes = MemoryMarshal.AsBytes(pixelRowSpan);
                 rgbBytes.CopyTo(buffer.Slice(offset));
                 offset += rgbBytes.Length;
diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs
index 45be3dd03..aefec7fa3 100644
--- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs
@@ -28,6 +28,6 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors
         }
 
         /// <inheritdoc/>
-        public override JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(JpegColorSpace.RGB, frame.Precision);
+        protected override JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(JpegColorSpace.RGB, frame.Precision);
     }
 }

From 17ca003babe826074ee503432cc73b4ac2b872fa Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 7 Sep 2021 05:57:35 +0300
Subject: [PATCH 25/56] Fixed sandbox

---
 .../Program.cs                                | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
index 471251c2e..7f1817e5d 100644
--- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
+++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
@@ -34,25 +34,25 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
         /// </param>
         public static void Main(string[] args)
         {
-            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444);
-            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio444);
-            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio444);
-            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio444);
-
-            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio420);
-            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio420);
-            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio420);
-            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio420);
-
-            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance, JpegSubsample.Ratio444);
-            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance, JpegSubsample.Ratio444);
-            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance, JpegSubsample.Ratio444);
-            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance, JpegSubsample.Ratio444);
-
-            ReEncodeImage("snow_main", 100);
-            ReEncodeImage("snow_main", 90);
-            ReEncodeImage("snow_main", 75);
-            ReEncodeImage("snow_main", 50);
+            BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio444);
+            BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio444);
+            BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio444);
+            BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCrRatio444);
+
+            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio420);
+            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio420);
+            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio420);
+            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCrRatio420);
+
+            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance);
+            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance);
+            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance);
+            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance);
+
+            //ReEncodeImage("snow_main", 100);
+            //ReEncodeImage("snow_main", 90);
+            //ReEncodeImage("snow_main", 75);
+            //ReEncodeImage("snow_main", 50);
 
             Console.WriteLine("Done.");
         }
@@ -98,7 +98,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
             var encoder = new JpegEncoder()
             {
                 Quality = quality,
-                ColorType = JpegColorType.Rgb
+                ColorType = JpegColorType.YCbCrRatio444
             };
             img.SaveAsJpeg(savePath, encoder);
         }

From ea09d59e083e1f8365a6df7eabc01479ab9037e4 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 7 Sep 2021 07:15:04 +0300
Subject: [PATCH 26/56] Rolled back to original implementation for rounding via
 scalar code

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 45 ++++++++++++++++---
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 6606acdd6..2656f07ca 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -424,16 +424,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             else
 #endif
             {
-                for (int i = 0; i < Size; i++)
-                {
-                    // TODO: find a way to index block & qt matrices with natural order indices for performance?
-                    int zig = ZigZag.ZigZagOrder[i];
-                    float divRes = block[zig] / qt[zig];
-                    dest[i] = (short)(divRes + (divRes > 0 ? 0.5f : -0.5f));
-                }
+                Divide(ref block, ref qt);
+                block.RoundInto(ref dest);
             }
         }
 
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void Divide(ref Block8x8F a, ref Block8x8F b)
+        {
+            a.V0L /= b.V0L;
+            a.V0R /= b.V0R;
+            a.V1L /= b.V1L;
+            a.V1R /= b.V1R;
+            a.V2L /= b.V2L;
+            a.V2R /= b.V2R;
+            a.V3L /= b.V3L;
+            a.V3R /= b.V3R;
+            a.V4L /= b.V4L;
+            a.V4R /= b.V4R;
+            a.V5L /= b.V5L;
+            a.V5R /= b.V5R;
+            a.V6L /= b.V6L;
+            a.V6R /= b.V6R;
+            a.V7L /= b.V7L;
+            a.V7R /= b.V7R;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor)
+        {
+            var neg = new Vector4(-1);
+            var add = new Vector4(.5F);
+
+            // sign(dividend) = max(min(dividend, 1), -1)
+            Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One);
+
+            // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend)
+            // TODO: This is wrong but I have no idea how to fix it without if-else operator
+            // sign here is a value in range [-1..1], it can be equal to -0.2 for example which is wrong
+            return (dividend / divisor) + (sign * add);
+        }
+
         public void RoundInto(ref Block8x8 dest)
         {
             for (int i = 0; i < Size; i++)

From 2f143bf9d39703f37030823c45c5e200ebc46a12 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 9 Sep 2021 21:26:18 +0300
Subject: [PATCH 27/56] New FDCT method, reciprocal quantization

---
 .../Jpeg/Components/Block8x8F.Intrinsic.cs    |  81 +++-
 .../Formats/Jpeg/Components/Block8x8F.cs      | 209 +++------
 .../Decoder/JpegBlockPostProcessor.cs         |   2 +-
 .../Components/Encoder/HuffmanScanEncoder.cs  |  34 +-
 .../FastFloatingPointDCT.Intrinsic.cs         | 210 +++++++++
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 400 ++++++------------
 .../Jpeg/Components/ZigZag.Intrinsic.cs       | 108 ++---
 .../BlockOperations/Block8x8F_Transpose.cs    |   8 +-
 .../Formats/Jpg/Block8x8FTests.cs             |  50 +--
 .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs  | 149 ++-----
 10 files changed, 599 insertions(+), 652 deletions(-)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
index 073580d40..83227ff07 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -3,6 +3,7 @@
 
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
@@ -38,7 +39,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             0, 1, 4, 5, 2, 3, 6, 7
         };
 
-        private static unsafe void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
 
@@ -53,8 +54,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
                 for (int i = 0; i < 8; i += 2)
                 {
-                    Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
-                    Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+                    Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                    Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
 
                     Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
                     row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
@@ -64,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
-        private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
         {
             DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
 
@@ -75,13 +76,81 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             for (int i = 0; i < 16; i += 2)
             {
-                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
-                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
 
                 Vector128<short> row = Sse2.PackSignedSaturate(left, right);
                 Unsafe.Add(ref destBase, i / 2) = row;
             }
         }
+
+        private void TransposeAvx()
+        {
+            // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
+            Vector256<float> r0 = Avx.InsertVector128(
+                this.V0,
+                Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
+                1);
+
+            Vector256<float> r1 = Avx.InsertVector128(
+               this.V1,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
+               1);
+
+            Vector256<float> r2 = Avx.InsertVector128(
+               this.V2,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
+               1);
+
+            Vector256<float> r3 = Avx.InsertVector128(
+               this.V3,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
+               1);
+
+            Vector256<float> r4 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
+               1);
+
+            Vector256<float> r5 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
+               1);
+
+            Vector256<float> r6 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
+               1);
+
+            Vector256<float> r7 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
+               1);
+
+            Vector256<float> t0 = Avx.UnpackLow(r0, r1);
+            Vector256<float> t2 = Avx.UnpackLow(r2, r3);
+            Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
+            this.V0 = Avx.Blend(t0, v, 0xCC);
+            this.V1 = Avx.Blend(t2, v, 0x33);
+
+            Vector256<float> t4 = Avx.UnpackLow(r4, r5);
+            Vector256<float> t6 = Avx.UnpackLow(r6, r7);
+            v = Avx.Shuffle(t4, t6, 0x4E);
+            this.V4 = Avx.Blend(t4, v, 0xCC);
+            this.V5 = Avx.Blend(t6, v, 0x33);
+
+            Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
+            Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
+            v = Avx.Shuffle(t1, t3, 0x4E);
+            this.V2 = Avx.Blend(t1, v, 0xCC);
+            this.V3 = Avx.Blend(t3, v, 0x33);
+
+            Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
+            Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
+            v = Avx.Shuffle(t5, t7, 0x4E);
+            this.V6 = Avx.Blend(t5, v, 0xCC);
+            this.V7 = Avx.Blend(t7, v, 0x33);
+        }
     }
 }
 #endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 2656f07ca..0b7873585 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -413,41 +413,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx2.IsSupported)
             {
-                DivideIntoInt16_Avx2(ref block, ref qt, ref dest);
+                MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
                 ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest);
             }
             else if (Ssse3.IsSupported)
             {
-                DivideIntoInt16_Sse2(ref block, ref qt, ref dest);
+                MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
                 ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest);
             }
             else
 #endif
             {
-                Divide(ref block, ref qt);
+                Multiply(ref block, ref qt);
                 block.RoundInto(ref dest);
             }
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private static void Divide(ref Block8x8F a, ref Block8x8F b)
-        {
-            a.V0L /= b.V0L;
-            a.V0R /= b.V0R;
-            a.V1L /= b.V1L;
-            a.V1R /= b.V1R;
-            a.V2L /= b.V2L;
-            a.V2R /= b.V2R;
-            a.V3L /= b.V3L;
-            a.V3R /= b.V3R;
-            a.V4L /= b.V4L;
-            a.V4R /= b.V4R;
-            a.V5L /= b.V5L;
-            a.V5R /= b.V5R;
-            a.V6L /= b.V6L;
-            a.V6R /= b.V6R;
-            a.V7L /= b.V7L;
-            a.V7R /= b.V7R;
+        private static void Multiply(ref Block8x8F a, ref Block8x8F b)
+        {
+            a.V0L *= b.V0L;
+            a.V0R *= b.V0R;
+            a.V1L *= b.V1L;
+            a.V1R *= b.V1R;
+            a.V2L *= b.V2L;
+            a.V2R *= b.V2R;
+            a.V3L *= b.V3L;
+            a.V3R *= b.V3R;
+            a.V4L *= b.V4L;
+            a.V4R *= b.V4R;
+            a.V5L *= b.V5L;
+            a.V5R *= b.V5R;
+            a.V6L *= b.V6L;
+            a.V6R *= b.V6R;
+            a.V7L *= b.V7L;
+            a.V7R *= b.V7R;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -608,154 +608,45 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         }
 
         /// <summary>
-        /// Transpose the block into the destination block.
+        /// Transpose the block inplace.
         /// </summary>
-        /// <param name="d">The destination block</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void TransposeInto(ref Block8x8F d)
+        public void Transpose()
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx.IsSupported)
             {
-                // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
-                Vector256<float> r0 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
-                   1);
-
-                Vector256<float> r1 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V1L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
-                   1);
-
-                Vector256<float> r2 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V2L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
-                   1);
-
-                Vector256<float> r3 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V3L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
-                   1);
-
-                Vector256<float> r4 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
-                   1);
-
-                Vector256<float> r5 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
-                   1);
-
-                Vector256<float> r6 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
-                   1);
-
-                Vector256<float> r7 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
-                   1);
-
-                Vector256<float> t0 = Avx.UnpackLow(r0, r1);
-                Vector256<float> t2 = Avx.UnpackLow(r2, r3);
-                Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
-                d.V0 = Avx.Blend(t0, v, 0xCC);
-                d.V1 = Avx.Blend(t2, v, 0x33);
-
-                Vector256<float> t4 = Avx.UnpackLow(r4, r5);
-                Vector256<float> t6 = Avx.UnpackLow(r6, r7);
-                v = Avx.Shuffle(t4, t6, 0x4E);
-                d.V4 = Avx.Blend(t4, v, 0xCC);
-                d.V5 = Avx.Blend(t6, v, 0x33);
-
-                Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
-                Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
-                v = Avx.Shuffle(t1, t3, 0x4E);
-                d.V2 = Avx.Blend(t1, v, 0xCC);
-                d.V3 = Avx.Blend(t3, v, 0x33);
-
-                Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
-                Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
-                v = Avx.Shuffle(t5, t7, 0x4E);
-                d.V6 = Avx.Blend(t5, v, 0xCC);
-                d.V7 = Avx.Blend(t7, v, 0x33);
+                this.TransposeAvx();
             }
             else
 #endif
             {
-                d.V0L.X = this.V0L.X;
-                d.V1L.X = this.V0L.Y;
-                d.V2L.X = this.V0L.Z;
-                d.V3L.X = this.V0L.W;
-                d.V4L.X = this.V0R.X;
-                d.V5L.X = this.V0R.Y;
-                d.V6L.X = this.V0R.Z;
-                d.V7L.X = this.V0R.W;
-
-                d.V0L.Y = this.V1L.X;
-                d.V1L.Y = this.V1L.Y;
-                d.V2L.Y = this.V1L.Z;
-                d.V3L.Y = this.V1L.W;
-                d.V4L.Y = this.V1R.X;
-                d.V5L.Y = this.V1R.Y;
-                d.V6L.Y = this.V1R.Z;
-                d.V7L.Y = this.V1R.W;
-
-                d.V0L.Z = this.V2L.X;
-                d.V1L.Z = this.V2L.Y;
-                d.V2L.Z = this.V2L.Z;
-                d.V3L.Z = this.V2L.W;
-                d.V4L.Z = this.V2R.X;
-                d.V5L.Z = this.V2R.Y;
-                d.V6L.Z = this.V2R.Z;
-                d.V7L.Z = this.V2R.W;
-
-                d.V0L.W = this.V3L.X;
-                d.V1L.W = this.V3L.Y;
-                d.V2L.W = this.V3L.Z;
-                d.V3L.W = this.V3L.W;
-                d.V4L.W = this.V3R.X;
-                d.V5L.W = this.V3R.Y;
-                d.V6L.W = this.V3R.Z;
-                d.V7L.W = this.V3R.W;
-
-                d.V0R.X = this.V4L.X;
-                d.V1R.X = this.V4L.Y;
-                d.V2R.X = this.V4L.Z;
-                d.V3R.X = this.V4L.W;
-                d.V4R.X = this.V4R.X;
-                d.V5R.X = this.V4R.Y;
-                d.V6R.X = this.V4R.Z;
-                d.V7R.X = this.V4R.W;
-
-                d.V0R.Y = this.V5L.X;
-                d.V1R.Y = this.V5L.Y;
-                d.V2R.Y = this.V5L.Z;
-                d.V3R.Y = this.V5L.W;
-                d.V4R.Y = this.V5R.X;
-                d.V5R.Y = this.V5R.Y;
-                d.V6R.Y = this.V5R.Z;
-                d.V7R.Y = this.V5R.W;
-
-                d.V0R.Z = this.V6L.X;
-                d.V1R.Z = this.V6L.Y;
-                d.V2R.Z = this.V6L.Z;
-                d.V3R.Z = this.V6L.W;
-                d.V4R.Z = this.V6R.X;
-                d.V5R.Z = this.V6R.Y;
-                d.V6R.Z = this.V6R.Z;
-                d.V7R.Z = this.V6R.W;
-
-                d.V0R.W = this.V7L.X;
-                d.V1R.W = this.V7L.Y;
-                d.V2R.W = this.V7L.Z;
-                d.V3R.W = this.V7L.W;
-                d.V4R.W = this.V7R.X;
-                d.V5R.W = this.V7R.Y;
-                d.V6R.W = this.V7R.Z;
-                d.V7R.W = this.V7R.W;
+                this.TransposeScalar();
+            }
+        }
+
+        /// <summary>
+        /// Scalar inplace transpose implementation for <see cref="Transpose"/>
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void TransposeScalar()
+        {
+            float tmp;
+            int horIndex, verIndex;
+
+            // We don't care about the last row as it consists of a single element
+            // Which won't be swapped with anything
+            for (int i = 0; i < 7; i++)
+            {
+                // Start just past the diagonal: the diagonal element stays in place and earlier ones are already swapped
+                for (int j = i + 1; j < 8; j++)
+                {
+                    horIndex = (i * 8) + j;
+                    verIndex = (j * 8) + i;
+                    tmp = this[horIndex];
+                    this[horIndex] = this[verIndex];
+                    this[verIndex] = tmp;
+                }
             }
         }
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index cf5fdd2df..085cd4a29 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             // Dequantize:
             block.MultiplyInPlace(ref this.DequantiazationTable);
 
-            FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock);
+            FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);
 
             // To conform better to libjpeg we actually NEED TO loose precision here.
             // This is because they store blocks as Int16 between all the operations.
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index db0bc32ae..da4723e21 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private int bitCount;
 
-        private Block8x8F temporalBlock;
-        private Block8x8 temporalShortBlock;
+        private Block8x8 tempBlock;
 
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
@@ -130,6 +129,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
+            // Calculate reciprocal quantization tables for FDCT method
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+                chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
+            }
+
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
             // ReSharper disable once InconsistentNaming
@@ -190,6 +196,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
+            // Calculate reciprocal quantization tables for FDCT method
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+                chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
+            }
+
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
             // ReSharper disable once InconsistentNaming
@@ -256,6 +269,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
+            // Calculate reciprocal quantization tables for FDCT method
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+            }
+
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
             // ReSharper disable once InconsistentNaming
@@ -301,6 +320,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void EncodeRgb<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
+            // Calculate reciprocal quantization tables for FDCT method
+            for (int i = 0; i < 64; i++)
+            {
+                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
+            }
+
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
             // ReSharper disable once InconsistentNaming
@@ -365,14 +390,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             ref Block8x8F block,
             ref Block8x8F quant)
         {
-            ref Block8x8F refTemp = ref this.temporalBlock;
-            ref Block8x8 spectralBlock = ref this.temporalShortBlock;
+            ref Block8x8 spectralBlock = ref this.tempBlock;
 
             // Shifting level from 0..255 to -128..127
             block.AddInPlace(-128f);
 
             // Discrete cosine transform
-            FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp);
+            FastFloatingPointDCT.TransformFDCT(ref block);
 
             // Quantization
             Block8x8F.Quantize(ref block, ref spectralBlock, ref quant);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
new file mode 100644
index 000000000..eb60445d3
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -0,0 +1,210 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Collections.Generic;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Text;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    internal static partial class FastFloatingPointDCT
+    {
+        /// <summary>
+        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// Current FDCT implementation expects its results to be multiplied by
+        /// a reciprocal quantization table. Values in this table must be divided
+        /// by quantization table values scaled with quality settings.
+        /// </para>
+        /// <para>
+        /// These values were calculated with this formula:
+        /// <code>
+        /// value[row * 8 + col] = 1 / (scalefactor[row] * scalefactor[col] * 8);
+        /// </code>
+        /// Where:
+        /// <code>
+        /// scalefactor[0] = 1
+        /// </code>
+        /// <code>
+        /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+        /// </code>
+        /// Values are also scaled by 8 so DCT code won't do unnecessary division.
+        /// </para>
+        /// </remarks>
+        public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
+        {
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
+            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
+            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
+            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
+            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+        };
+
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+        private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
+        private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
+        private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
+        private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
+
+        private static readonly Vector128<float> mm128_F_0_7071 = Vector128.Create(0.707106781f);
+        private static readonly Vector128<float> mm128_F_0_3826 = Vector128.Create(0.382683433f);
+        private static readonly Vector128<float> mm128_F_0_5411 = Vector128.Create(0.541196100f);
+        private static readonly Vector128<float> mm128_F_1_3065 = Vector128.Create(1.306562965f);
+#pragma warning restore SA1310, SA1311, IDE1006
+
+        /// <summary>
+        /// Apply floating point FDCT inplace using simd operations.
+        /// </summary>
+        /// <param name="block">Input matrix.</param>
+        private static void ForwardTransformSimd(ref Block8x8F block)
+        {
+            DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation.");
+
+            // First pass - process rows
+            block.Transpose();
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_avx(ref block);
+            }
+            else if (Sse.IsSupported)
+            {
+                // Left part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
+
+                // Right part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
+            }
+
+            // Second pass - process columns
+            block.Transpose();
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_avx(ref block);
+            }
+            else if (Sse.IsSupported)
+            {
+                // Left part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
+
+                // Right part
+                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
+            }
+        }
+
+        /// <summary>
+        /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix.
+        /// </summary>
+        /// <remarks>
+        /// Requires Sse support.
+        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
+        /// </remarks>
+        /// <param name="blockRef">Input reference to the first vector of the 8x4 matrix part.</param>
+        public static void FDCT8x4_sse(ref Vector128<float> blockRef)
+        {
+            DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation.");
+
+            Vector128<float> tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
+            Vector128<float> tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
+            Vector128<float> tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
+            Vector128<float> tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
+            Vector128<float> tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
+            Vector128<float> tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
+            Vector128<float> tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
+            Vector128<float> tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
+
+            // Even part
+            Vector128<float> tmp10 = Sse.Add(tmp0, tmp3);
+            Vector128<float> tmp13 = Sse.Subtract(tmp0, tmp3);
+            Vector128<float> tmp11 = Sse.Add(tmp1, tmp2);
+            Vector128<float> tmp12 = Sse.Subtract(tmp1, tmp2);
+
+            Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11);
+            Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11);
+
+            Vector128<float> z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071);
+            Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1);
+            Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1);
+
+            // Odd part
+            tmp10 = Sse.Add(tmp4, tmp5);
+            tmp11 = Sse.Add(tmp5, tmp6);
+            tmp12 = Sse.Add(tmp6, tmp7);
+
+            Vector128<float> z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826);
+            Vector128<float> z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5);
+            Vector128<float> z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5);
+            Vector128<float> z3 = Sse.Multiply(tmp11, mm128_F_0_7071);
+
+            Vector128<float> z11 = Sse.Add(tmp7, z3);
+            Vector128<float> z13 = Sse.Subtract(tmp7, z3);
+
+            Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2);
+            Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2);
+            Unsafe.Add(ref blockRef, 2) = Sse.Add(z11, z4);
+            Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4);
+        }
+
+        /// <summary>
+        /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
+        /// </summary>
+        /// <remarks>
+        /// Requires Avx support.
+        /// </remarks>
+        /// <param name="block">Input matrix.</param>
+        public static void FDCT8x8_avx(ref Block8x8F block)
+        {
+            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
+
+            Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
+            Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
+            Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
+            Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
+            Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
+            Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
+            Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
+            Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
+
+            // Even part
+            Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
+            Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
+            Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
+            Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
+
+            block.V0 = Avx.Add(tmp10, tmp11);
+            block.V4 = Avx.Subtract(tmp10, tmp11);
+
+            Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
+            block.V2 = Avx.Add(tmp13, z1);
+            block.V6 = Avx.Subtract(tmp13, z1);
+
+            // Odd part
+            tmp10 = Avx.Add(tmp4, tmp5);
+            tmp11 = Avx.Add(tmp5, tmp6);
+            tmp12 = Avx.Add(tmp6, tmp7);
+
+            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
+            Vector256<float> z2 = Avx.Add(Avx.Multiply(mm256_F_0_5411, tmp10), z5);
+            Vector256<float> z4 = Avx.Add(Avx.Multiply(mm256_F_1_3065, tmp12), z5);
+            Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
+
+            Vector256<float> z11 = Avx.Add(tmp7, z3);
+            Vector256<float> z13 = Avx.Subtract(tmp7, z3);
+
+            block.V5 = Avx.Add(z13, z2);
+            block.V3 = Avx.Subtract(z13, z2);
+            block.V1 = Avx.Add(z11, z4);
+            block.V7 = Avx.Subtract(z11, z4);
+        }
+    }
+}
+#endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index dd46a83e3..a554e8577 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -46,11 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
 #if SUPPORTS_RUNTIME_INTRINSICS
         private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
-        private static readonly Vector256<float> C_V_1_3065 = Vector256.Create(1.306563f);
         private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
-        private static readonly Vector256<float> C_V_0_7856 = Vector256.Create(0.785695f);
-        private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
-        private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);
 
         private static readonly Vector256<float> C_V_n1_9615 = Vector256.Create(-1.961570560f);
         private static readonly Vector256<float> C_V_n0_3901 = Vector256.Create(-0.390180644f);
@@ -62,250 +58,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private static readonly Vector256<float> C_V_1_5013 = Vector256.Create(1.501321110f);
         private static readonly Vector256<float> C_V_n1_8477 = Vector256.Create(-1.847759065f);
         private static readonly Vector256<float> C_V_0_7653 = Vector256.Create(0.765366865f);
-
-        private static readonly Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f);
 #endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
-        private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
-
-        /// <summary>
-        /// Original:
-        /// <see>
-        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
-        /// </see>
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 c0 = s.V0L;
-            Vector4 c1 = s.V7L;
-            Vector4 t0 = c0 + c1;
-            Vector4 t7 = c0 - c1;
-
-            c1 = s.V6L;
-            c0 = s.V1L;
-            Vector4 t1 = c0 + c1;
-            Vector4 t6 = c0 - c1;
-
-            c1 = s.V5L;
-            c0 = s.V2L;
-            Vector4 t2 = c0 + c1;
-            Vector4 t5 = c0 - c1;
-
-            c0 = s.V3L;
-            c1 = s.V4L;
-            Vector4 t3 = c0 + c1;
-            Vector4 t4 = c0 - c1;
-
-            c0 = t0 + t3;
-            Vector4 c3 = t0 - t3;
-            c1 = t1 + t2;
-            Vector4 c2 = t1 - t2;
-
-            d.V0L = c0 + c1;
-            d.V4L = c0 - c1;
-
-            float w0 = 0.541196f;
-            float w1 = 1.306563f;
-
-            d.V2L = (w0 * c2) + (w1 * c3);
-            d.V6L = (w0 * c3) - (w1 * c2);
-
-            w0 = 1.175876f;
-            w1 = 0.785695f;
-            c3 = (w0 * t4) + (w1 * t7);
-            c0 = (w0 * t7) - (w1 * t4);
-
-            w0 = 1.387040f;
-            w1 = 0.275899f;
-            c2 = (w0 * t5) + (w1 * t6);
-            c1 = (w0 * t6) - (w1 * t5);
-
-            d.V3L = c0 - c2;
-            d.V5L = c3 - c1;
-
-            float invsqrt2 = 0.707107f;
-            c0 = (c0 + c2) * invsqrt2;
-            c3 = (c3 + c1) * invsqrt2;
-
-            d.V1L = c0 + c3;
-            d.V7L = c0 - c3;
-        }
-
-        /// <summary>
-        /// Original:
-        /// <see>
-        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
-        /// </see>
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 c0 = s.V0R;
-            Vector4 c1 = s.V7R;
-            Vector4 t0 = c0 + c1;
-            Vector4 t7 = c0 - c1;
-
-            c1 = s.V6R;
-            c0 = s.V1R;
-            Vector4 t1 = c0 + c1;
-            Vector4 t6 = c0 - c1;
-
-            c1 = s.V5R;
-            c0 = s.V2R;
-            Vector4 t2 = c0 + c1;
-            Vector4 t5 = c0 - c1;
-
-            c0 = s.V3R;
-            c1 = s.V4R;
-            Vector4 t3 = c0 + c1;
-            Vector4 t4 = c0 - c1;
-
-            c0 = t0 + t3;
-            Vector4 c3 = t0 - t3;
-            c1 = t1 + t2;
-            Vector4 c2 = t1 - t2;
-
-            d.V0R = c0 + c1;
-            d.V4R = c0 - c1;
-
-            float w0 = 0.541196f;
-            float w1 = 1.306563f;
-
-            d.V2R = (w0 * c2) + (w1 * c3);
-            d.V6R = (w0 * c3) - (w1 * c2);
-
-            w0 = 1.175876f;
-            w1 = 0.785695f;
-            c3 = (w0 * t4) + (w1 * t7);
-            c0 = (w0 * t7) - (w1 * t4);
-
-            w0 = 1.387040f;
-            w1 = 0.275899f;
-            c2 = (w0 * t5) + (w1 * t6);
-            c1 = (w0 * t6) - (w1 * t5);
-
-            d.V3R = c0 - c2;
-            d.V5R = c3 - c1;
-
-            c0 = (c0 + c2) * InvSqrt2;
-            c3 = (c3 + c1) * InvSqrt2;
-
-            d.V1R = c0 + c3;
-            d.V7R = c0 - c3;
-        }
-
-        /// <summary>
-        /// Combined operation of <see cref="FDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="FDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
-        /// using AVX commands.
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
-            Vector256<float> t0 = Avx.Add(s.V0, s.V7);
-            Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
-            Vector256<float> t1 = Avx.Add(s.V1, s.V6);
-            Vector256<float> t6 = Avx.Subtract(s.V1, s.V6);
-            Vector256<float> t2 = Avx.Add(s.V2, s.V5);
-            Vector256<float> t5 = Avx.Subtract(s.V2, s.V5);
-            Vector256<float> t3 = Avx.Add(s.V3, s.V4);
-            Vector256<float> t4 = Avx.Subtract(s.V3, s.V4);
-
-            Vector256<float> c0 = Avx.Add(t0, t3);
-            Vector256<float> c1 = Avx.Add(t1, t2);
-
-            // 0 4
-            d.V0 = Avx.Add(c0, c1);
-            d.V4 = Avx.Subtract(c0, c1);
-
-            Vector256<float> c3 = Avx.Subtract(t0, t3);
-            Vector256<float> c2 = Avx.Subtract(t1, t2);
-
-            // 2 6
-            d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
-            d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411);
-
-            c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856);
-            c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758);
-
-            c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
-            c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870);
-
-            // 3 5
-            d.V3 = Avx.Subtract(c0, c2);
-            d.V5 = Avx.Subtract(c3, c1);
-
-            c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
-            c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
-
-            // 1 7
-            d.V1 = Avx.Add(c0, c3);
-            d.V7 = Avx.Subtract(c0, c3);
-#endif
-        }
-
-        /// <summary>
-        /// Performs 8x8 matrix Forward Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                FDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                FDCT8x4_LeftPart(ref s, ref d);
-                FDCT8x4_RightPart(ref s, ref d);
-            }
-        }
-
-        /// <summary>
-        /// Apply floating point FDCT from src into dest
-        /// </summary>
-        /// <param name="src">Source</param>
-        /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller for optimization</param>
-        public static void TransformFDCT(
-            ref Block8x8F src,
-            ref Block8x8F dest,
-            ref Block8x8F temp)
-        {
-            src.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref dest);
-
-            dest.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref dest);
-
-            dest.MultiplyInPlace(C_0_125);
-        }
-
-        /// <summary>
-        /// Apply floating point FDCT inplace.
-        /// </summary>
-        /// <param name="matrix">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp)
-        {
-            matrix.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref matrix);
-
-            matrix.TransposeInto(ref temp);
-            FDCT8x8(ref temp, ref matrix);
-
-            matrix.MultiplyInPlace(C_0_125);
-        }
-
         /// <summary>
         /// Performs 8x8 matrix Inverse Discrete Cosine Transform
         /// </summary>
@@ -501,40 +255,148 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         }
 
         /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
+        /// Apply floating point IDCT inplace.
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
         /// </summary>
-        /// <param name="src">Source</param>
-        /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
-        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
+        /// <param name="block">Input matrix; overwritten with the transform result.</param>
+        /// <param name="temp">Matrix to store temporary results.</param>
+        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
         {
-            src.TransposeInto(ref temp);
-
-            IDCT8x8(ref temp, ref dest);
-            dest.TransposeInto(ref temp);
-            IDCT8x8(ref temp, ref dest);
+            block.Transpose();
+            IDCT8x8(ref block, ref temp);
+            temp.Transpose();
+            IDCT8x8(ref temp, ref block);
 
             // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
-            dest.MultiplyInPlace(C_0_125);
+            block.MultiplyInPlace(C_0_125);
         }
 
         /// <summary>
-        /// Apply floating point IDCT inplace.
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
+        /// Apply 2D floating point FDCT inplace using scalar operations.
         /// </summary>
-        /// <param name="matrix">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp)
+        /// <remarks>
+        /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
+        /// </remarks>
+        /// <param name="block">Input matrix.</param>
+        private static void ForwardTransformScalar(ref Block8x8F block)
         {
-            block.TransposeInto(ref temp);
+            const int dctSize = 8;
 
-            IDCT8x8(ref temp, ref block);
-            block.TransposeInto(ref temp);
-            IDCT8x8(ref temp, ref block);
+            float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+            float tmp10, tmp11, tmp12, tmp13;
+            float z1, z2, z3, z4, z5, z11, z13;
 
-            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
-            block.MultiplyInPlace(C_0_125);
+            // First pass - process rows
+            ref float dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            for (int ctr = 7; ctr >= 0; ctr--)
+            {
+                tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
+                tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
+                tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
+                tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
+                tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
+                tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
+                tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
+                tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
+
+                // Even part
+                tmp10 = tmp0 + tmp3;
+                tmp13 = tmp0 - tmp3;
+                tmp11 = tmp1 + tmp2;
+                tmp12 = tmp1 - tmp2;
+
+                Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
+                Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
+
+                z1 = (tmp12 + tmp13) * 0.707106781f; // cos(pi/4)
+                Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
+                Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
+
+                // Odd part
+                tmp10 = tmp4 + tmp5;
+                tmp11 = tmp5 + tmp6;
+                tmp12 = tmp6 + tmp7;
+
+                z5 = (tmp10 - tmp12) * 0.382683433f; // cos(3*pi/8)
+                z2 = (0.541196100f * tmp10) + z5; // cos(pi/8) - cos(3*pi/8)
+                z4 = (1.306562965f * tmp12) + z5; // cos(pi/8) + cos(3*pi/8)
+                z3 = tmp11 * 0.707106781f; // cos(pi/4)
+
+                z11 = tmp7 + z3;
+                z13 = tmp7 - z3;
+
+                Unsafe.Add(ref dataRef, 5) = z13 + z2;
+                Unsafe.Add(ref dataRef, 3) = z13 - z2;
+                Unsafe.Add(ref dataRef, 1) = z11 + z4;
+                Unsafe.Add(ref dataRef, 7) = z11 - z4;
+
+                dataRef = ref Unsafe.Add(ref dataRef, dctSize); // advance to the next row
+            }
+
+            // Second pass - process columns
+            dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            for (int ctr = 7; ctr >= 0; ctr--)
+            {
+                tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
+                tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
+                tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
+                tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
+                tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
+                tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
+                tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
+                tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
+
+                // Even part
+                tmp10 = tmp0 + tmp3;
+                tmp13 = tmp0 - tmp3;
+                tmp11 = tmp1 + tmp2;
+                tmp12 = tmp1 - tmp2;
+
+                Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
+                Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
+
+                z1 = (tmp12 + tmp13) * 0.707106781f; // cos(pi/4)
+                Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
+                Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
+
+                // Odd part
+                tmp10 = tmp4 + tmp5;
+                tmp11 = tmp5 + tmp6;
+                tmp12 = tmp6 + tmp7;
+
+                z5 = (tmp10 - tmp12) * 0.382683433f; // cos(3*pi/8)
+                z2 = (0.541196100f * tmp10) + z5; // cos(pi/8) - cos(3*pi/8)
+                z4 = (1.306562965f * tmp12) + z5; // cos(pi/8) + cos(3*pi/8)
+                z3 = tmp11 * 0.707106781f; // cos(pi/4)
+
+                z11 = tmp7 + z3;
+                z13 = tmp7 - z3;
+
+                Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
+                Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
+                Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
+                Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
+
+                dataRef = ref Unsafe.Add(ref dataRef, 1); // advance to the next column
+            }
+        }
+
+        /// <summary>
+        /// Apply 2D floating point FDCT inplace; the result still needs scaling by <see cref="DctReciprocalAdjustmentCoefficients"/>.
+        /// </summary>
+        /// <param name="block">Input matrix; overwritten with the transform result.</param>
+        public static void TransformFDCT(ref Block8x8F block)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported || Sse.IsSupported)
+            {
+                ForwardTransformSimd(ref block);
+            }
+            else
+#endif
+            {
+                ForwardTransformScalar(ref block);
+            }
         }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index 066eb2846..878a67b50 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -10,10 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     internal static partial class ZigZag
     {
+#pragma warning disable SA1309 // naming rules violation warnings
         /// <summary>
         /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
         /// </summary>
-        private const byte Z = 0xff;
+        private const byte _ = 0xff;
+#pragma warning restore SA1309
 
         /// <summary>
         /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
@@ -22,82 +24,82 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
         {
             // 0_A
-            0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z,
+            0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
             // 0_B
-            Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5,
+            _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
             // 0_C
-            Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
 
             // 1_A
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11,
+            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
             // 1_B
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _,
             // 1_C
-            2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z,
+            2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
             // 1_D
-            Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
             // 1_E
-            Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
 
             // 2_B
-            8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
             // 2_C
-            Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
             // 2_D
-            Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _,
             // 2_E
-            Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5,
+            _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
             // 2_F
-            Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z,
+            _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
             // 2_G
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _,
 
             // 3_A
-            Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
             // 3_B
-            Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z,
+            _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
             // 3_C
-            Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z,
+            _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
             // 3_D/4_E
-            6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9,
+            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
 
             // 4_F
-            Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z,
+            _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
             // 4_G
-            Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z,
+            _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
             // 4_H
-            Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
 
             // 5_B
-            Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
             // 5_C
-            Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
             // 5_D
-            10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z,
+            10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
             // 5_E
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _,
             // 5_F
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _,
             // 5_G
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7,
+            _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
 
             // 6_D
-            Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _,
             // 6_E
-            Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z,
+            _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
             // 6_F
-            Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13,
+            _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
             // 6_G
-            Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _,
             // 6_H
-            4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+            4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
 
             // 7_F
-            Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z,
+            _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _,
             // 7_G
-            10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z,
+            10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
             // 7_H
-            Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15
+            _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
         };
 
         /// <summary>
@@ -110,55 +112,55 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,   0, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   6, 0, 0, 0,
 
                 // 01_AB - inner-lane
-                0, 1, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   6, 7, 12, 13,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   4, 5, 6, 7,
+                0, 1, 2, 3,   8, 9, _, _,   10, 11, 4, 5,   6, 7, 12, 13,  _, _, _, _,   _, _, _, _,   _, _, 10, 11,   4, 5, 6, 7,
 
                 // 01_CD/23_GH - cross-lane
-                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   Z, Z, Z, Z,
+                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   _, _, _, _,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   _, _, _, _,
 
                 // 01_CD - inner-lane
-                Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   Z, Z, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, 0, 1,   _, _, _, _,   _, _, _, _,   2, 3, 8, 9,   _, _, 10, 11,   4, 5, _, _,   _, _, _, _,
 
                 // 01_EF - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   0, 1, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   0, 1, _, _,   _, _, _, _,   _, _, _, _,
 
                 // 23_AB/45_CD/67_EF - cross-lane
-                3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   Z, Z, Z, Z,
+                3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   _, _, _, _,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   _, _, _, _,
 
                 // 23_AB - inner-lane
-                4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   2, 3, 8, 9,   Z, Z, Z, Z,
+                4, 5, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   6, 7, 0, 1,   2, 3, 8, 9,   _, _, _, _,
 
                 // 23_CD - inner-lane
-                Z, Z, 6, 7,   12, 13, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 12, 13,
+                _, _, 6, 7,   12, 13, _, _,   _, _, _, _,   _, _, _, _,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   6, 7, 12, 13,
 
                 // 23_EF - inner-lane
-                Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, 2, 3,   8, 9, _, _,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
 
                 // 23_GH - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 0, 1,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, 0, 1,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
 
                 // 45_AB - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   10, 11, _, _,   _, _, _, _,   _, _, _, _,
 
                 // 45_CD - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   6, 7, 0, 1,   Z, Z, 2, 3,   8, 9, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   6, 7, 0, 1,   _, _, 2, 3,   8, 9, _, _,   _, _, _, _,
 
                 // 45_EF - cross-lane
-                1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   Z, Z, Z, Z,   2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,
+                1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   _, _, _, _,   2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,
 
                 // 45_EF - inner-lane
-                2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, 4, 5,  Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 2, 3,   8, 9, Z, Z,
+                2, 3, 8, 9,   _, _, _, _,   _, _, _, _,   10, 11, 4, 5,  _, _, _, _,   _, _, _, _,   _, _, 2, 3,   8, 9, _, _,
 
                 // 45_GH - inner-lane
-                Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 6, 7,
+                _, _, _, _,   2, 3, 8, 9,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 6, 7,
 
                 // 67_CD - inner-lane
-                Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, 10, 11,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, _, _,   _, _, 10, 11,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
 
                 // 67_EF - inner-lane
-                Z, Z, Z, Z,   Z, Z, 6, 7,   0, 1, Z, Z,   2, 3, 8, 9,   Z, Z, Z, Z,   Z, Z, Z, Z,   10, 11, Z, Z,   Z, Z, Z, Z,
+                _, _, _, _,   _, _, 6, 7,   0, 1, _, _,   2, 3, 8, 9,   _, _, _, _,   _, _, _, _,   10, 11, _, _,   _, _, _, _,
 
                 // 67_GH - inner-lane
-                8, 9, 10, 11,   4, 5, Z, Z,   Z, Z, Z, Z,   Z, Z, Z, Z,   2, 3, 8, 9,   10, 11, 4, 5,   Z, Z, 6, 7,   12, 13, 14, 15
+                8, 9, 10, 11,   4, 5, _, _,   _, _, _, _,   _, _, _, _,   2, 3, 8, 9,   10, 11, 4, 5,   _, _, 6, 7,   12, 13, 14, 15
         };
 
         /// <summary>
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
index 1d103cd1a..8e8787475 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@@ -12,15 +12,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
         private static readonly Block8x8F Source = Create8x8FloatData();
 
         [Benchmark]
-        public void TransposeInto()
-        {
-            var dest = default(Block8x8F);
-            Source.TransposeInto(ref dest);
-        }
+        public void TransposeInto() => Source.Transpose();
 
         private static Block8x8F Create8x8FloatData()
         {
-            var result = new float[64];
+            float[] result = new float[64];
             for (int i = 0; i < 8; i++)
             {
                 for (int j = 0; j < 8; j++)
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index fc642dcc7..89ef74d8b 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -164,52 +164,27 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         }
 
         [Fact]
-        public void TransposeInto()
+        public void Transpose()
         {
             static void RunTest()
             {
                 float[] expected = Create8x8FloatData();
                 ReferenceImplementations.Transpose8x8(expected);
 
-                var source = default(Block8x8F);
-                source.LoadFrom(Create8x8FloatData());
+                var block8x8 = default(Block8x8F);
+                block8x8.LoadFrom(Create8x8FloatData());
 
-                var dest = default(Block8x8F);
-                source.TransposeInto(ref dest);
+                block8x8.Transpose();
 
                 float[] actual = new float[64];
-                dest.ScaledCopyTo(actual);
+                block8x8.ScaledCopyTo(actual);
 
                 Assert.Equal(expected, actual);
             }
 
             FeatureTestRunner.RunWithHwIntrinsicsFeature(
                 RunTest,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX);
-        }
-
-        private class BufferHolder
-        {
-            public Block8x8F Buffer;
-        }
-
-        [Fact]
-        public void TransposeInto_Benchmark()
-        {
-            var source = new BufferHolder();
-            source.Buffer.LoadFrom(Create8x8FloatData());
-            var dest = new BufferHolder();
-
-            this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark X {Times} ...");
-            var sw = Stopwatch.StartNew();
-
-            for (int i = 0; i < Times; i++)
-            {
-                source.Buffer.TransposeInto(ref dest.Buffer);
-            }
-
-            sw.Stop();
-            this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms");
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
         }
 
         private static float[] Create8x8ColorCropTestData()
@@ -281,16 +256,21 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
             Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed);
 
+            // Reference implementation quantizes given block via division
             Block8x8 expected = default;
             ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
 
+            // Actual current implementation quantizes given block via multiplication
+            // with the reciprocal of the quantization table
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                quant[i] = 1f / quant[i];
+            }
+
             Block8x8 actual = default;
             Block8x8F.Quantize(ref source, ref actual, ref quant);
 
-            for (int i = 0; i < Block8x8.Size; i++)
-            {
-                Assert.Equal(expected[i], actual[i]);
-            }
+            this.CompareBlocks(expected, actual, 1);
         }
 
         [Fact]
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 34ca7f9eb..55d208c5a 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -2,6 +2,9 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
 #endif
@@ -33,15 +36,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             {
                 float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
 
-                var source = Block8x8F.Load(sourceArray);
+                var srcBlock = Block8x8F.Load(sourceArray);
 
-                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source);
+                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock);
 
                 var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
 
-                this.CompareBlocks(expected, actual, 1f);
+                this.CompareBlocks(expected, srcBlock, 1f);
             }
 
             [Theory]
@@ -52,15 +54,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             {
                 float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
 
-                var source = Block8x8F.Load(sourceArray);
+                var srcBlock = Block8x8F.Load(sourceArray);
 
-                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source);
+                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
 
                 var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
 
-                this.CompareBlocks(expected, actual, 1f);
+                this.CompareBlocks(expected, srcBlock, 1f);
             }
 
             // Inverse transform
@@ -167,8 +168,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     var srcBlock = default(Block8x8F);
                     srcBlock.LoadFrom(src);
 
-                    var destBlock = default(Block8x8F);
-
                     var expectedDest = new float[64];
                     var temp1 = new float[64];
                     var temp2 = default(Block8x8F);
@@ -177,10 +176,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
 
                     // testee
-                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2);
+                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2);
 
                     var actualDest = new float[64];
-                    destBlock.ScaledCopyTo(actualDest);
+                    srcBlock.ScaledCopyTo(actualDest);
 
                     Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
                 }
@@ -198,95 +197,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             }
 
             // Forward transform
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void FDCT8x4_LeftPart(int seed)
-            {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
-
-                // testee
-                FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void FDCT8x4_RightPart(int seed)
-            {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
-
-                // testee
-                FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void FDCT8x8_Avx(int seed)
-            {
-#if SUPPORTS_RUNTIME_INTRINSICS
-                var skip = !Avx.IsSupported;
-#else
-                var skip = true;
-#endif
-                if (skip)
-                {
-                    this.Output.WriteLine("No AVX present, skipping test!");
-                    return;
-                }
-
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference, left part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
-
-                // reference, right part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
-
-                // testee, whole 8x8
-                FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
+            // This test covers the entire FDCT conversion chain
+            // and checks all implementations: intrinsic and scalar fallback
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
@@ -297,37 +209,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     int seed = FeatureTestRunner.Deserialize<int>(serialized);
 
                     Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                    var srcBlock = default(Block8x8F);
-                    srcBlock.LoadFrom(src);
-
-                    var destBlock = default(Block8x8F);
+                    var block = default(Block8x8F);
+                    block.LoadFrom(src);
 
-                    var expectedDest = new float[64];
-                    var temp1 = new float[64];
-                    var temp2 = default(Block8x8F);
+                    float[] expectedDest = new float[64];
+                    float[] temp1 = new float[64];
 
                     // reference
                     ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
 
                     // testee
-                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2);
+                    // Part of the FDCT calculation is fused into the quantization step, so
+                    // we must multiply the transformed block by the reciprocal values from FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients
+                    FastFloatingPointDCT.TransformFDCT(ref block);
+                    for (int i = 0; i < 64; i++)
+                    {
+                        block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i];
+                    }
 
-                    var actualDest = new float[64];
-                    destBlock.ScaledCopyTo(actualDest);
+                    float[] actualDest = block.ToArray();
 
-                    Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+                    Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
                 }
 
                 // 3 paths:
                 // 1. AllowAll - call avx/fma implementation
                 // 2. DisableFMA - call avx implementation without fma acceleration
-                // 3. DisableAvx - call fallback code of Vector4 implementation
-                //
-                // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
+                // 3. DisableAvx - call sse implementation
+                // 4. DisableHWIntrinsic - call scalar fallback implementation
                 FeatureTestRunner.RunWithHwIntrinsicsFeature(
                     RunTest,
                     seed,
-                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
             }
         }
     }

From fb038aaf3c6af75ecedecee38ab11dedc2655881 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 10 Sep 2021 06:42:03 +0300
Subject: [PATCH 28/56] Tidied up DCT code

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 109 ++++----
 .../FastFloatingPointDCT.Intrinsic.cs         | 230 +++++++++++++++-
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 247 ++----------------
 3 files changed, 284 insertions(+), 302 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 0b7873585..a25c572ae 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -450,21 +450,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             a.V7R *= b.V7R;
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor)
-        {
-            var neg = new Vector4(-1);
-            var add = new Vector4(.5F);
-
-            // sign(dividend) = max(min(dividend, 1), -1)
-            Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One);
-
-            // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend)
-            // TODO: This is wrong but I have no idea how to fix it without if-else operator
-            // sign here is a value in range [-1..1], it can be equal to -0.2 for example which is wrong
-            return (dividend / divisor) + (sign * add);
-        }
-
         public void RoundInto(ref Block8x8 dest)
         {
             for (int i = 0; i < Size; i++)
@@ -562,6 +547,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             Unsafe.Add(ref dRef, 7) = bottom;
         }
 
+        /// <summary>
+        /// Checks whether every element of the 8x8 block, truncated to an integer, equals a single scalar value.
+        /// </summary>
+        /// <param name="value">Value to compare to.</param>
+        public bool EqualsToScalar(int value)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
+            {
+                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+
+                var targetVector = Vector256.Create(value);
+                ref Vector256<float> blockStride = ref this.V0;
+
+                for (int i = 0; i < RowCount; i++)
+                {
+                    Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector);
+                    if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
+            }
+#endif
+            {
+                ref float scalars = ref Unsafe.As<Block8x8F, float>(ref this);
+
+                for (int i = 0; i < Size; i++)
+                {
+                    if ((int)Unsafe.Add(ref scalars, i) != value)
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
+            }
+        }
+
         /// <inheritdoc />
         public bool Equals(Block8x8F other)
             => this.V0L == other.V0L
@@ -598,15 +624,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             return sb.ToString();
         }
 
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static Vector<float> NormalizeAndRound(Vector<float> row, Vector<float> off, Vector<float> max)
-        {
-            row += off;
-            row = Vector.Max(row, Vector<float>.Zero);
-            row = Vector.Min(row, max);
-            return row.FastRound();
-        }
-
         /// <summary>
         /// Transpose the block inplace.
         /// </summary>
@@ -650,45 +667,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
-        /// <summary>
-        /// Compares entire 8x8 block to a single scalar value.
-        /// </summary>
-        /// <param name="value">Value to compare to.</param>
-        public bool EqualsToScalar(int value)
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static Vector<float> NormalizeAndRound(Vector<float> row, Vector<float> off, Vector<float> max)
         {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
-            {
-                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
-
-                var targetVector = Vector256.Create(value);
-                ref Vector256<float> blockStride = ref this.V0;
-
-                for (int i = 0; i < RowCount; i++)
-                {
-                    Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector);
-                    if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
-                    {
-                        return false;
-                    }
-                }
-
-                return true;
-            }
-#endif
-            {
-                ref float scalars = ref Unsafe.As<Block8x8F, float>(ref this);
-
-                for (int i = 0; i < Size; i++)
-                {
-                    if ((int)Unsafe.Add(ref scalars, i) != value)
-                    {
-                        return false;
-                    }
-                }
-
-                return true;
-            }
+            row += off;
+            row = Vector.Max(row, Vector<float>.Zero);
+            row = Vector.Min(row, max);
+            return row.FastRound();
         }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index eb60445d3..acc83e279 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -14,6 +14,30 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
     internal static partial class FastFloatingPointDCT
     {
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+        private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
+        private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
+        private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
+        private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
+
+        private static readonly Vector128<float> mm128_F_0_7071 = Vector128.Create(0.707106781f);
+        private static readonly Vector128<float> mm128_F_0_3826 = Vector128.Create(0.382683433f);
+        private static readonly Vector128<float> mm128_F_0_5411 = Vector128.Create(0.541196100f);
+        private static readonly Vector128<float> mm128_F_1_3065 = Vector128.Create(1.306562965f);
+
+        private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f);
+        private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
+        private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
+        private static readonly Vector256<float> mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
+        private static readonly Vector256<float> mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
+        private static readonly Vector256<float> mm256_F_0_2986 = Vector256.Create(0.298631336f);
+        private static readonly Vector256<float> mm256_F_2_0531 = Vector256.Create(2.053119869f);
+        private static readonly Vector256<float> mm256_F_3_0727 = Vector256.Create(3.072711026f);
+        private static readonly Vector256<float> mm256_F_1_5013 = Vector256.Create(1.501321110f);
+        private static readonly Vector256<float> mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
+        private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
+#pragma warning restore SA1310, SA1311, IDE1006
+
         /// <summary>
         /// Gets reciprocal coefficients for jpeg quantization tables calculation.
         /// </summary>
@@ -50,18 +74,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
         };
 
-#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
-        private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
-        private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
-        private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
-        private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
-
-        private static readonly Vector128<float> mm128_F_0_7071 = Vector128.Create(0.707106781f);
-        private static readonly Vector128<float> mm128_F_0_3826 = Vector128.Create(0.382683433f);
-        private static readonly Vector128<float> mm128_F_0_5411 = Vector128.Create(0.541196100f);
-        private static readonly Vector128<float> mm128_F_1_3065 = Vector128.Create(1.306562965f);
-#pragma warning restore SA1310, SA1311, IDE1006
-
         /// <summary>
         /// Apply floating point FDCT inplace using simd operations.
         /// </summary>
@@ -205,6 +217,200 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             block.V1 = Avx.Add(z11, z4);
             block.V7 = Avx.Subtract(z11, z4);
         }
+
+        /// <summary>
+        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                IDCT8x8_Avx(ref s, ref d);
+            }
+            else
+#endif
+            {
+                IDCT8x4_LeftPart(ref s, ref d);
+                IDCT8x4_RightPart(ref s, ref d);
+            }
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the left part of the block. Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">Destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1L;
+            Vector4 my7 = s.V7L;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3L;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5L;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2L;
+            Vector4 my6 = s.V6L;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0L;
+            Vector4 my4 = s.V4L;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0L = my0 + mb0;
+            d.V7L = my0 - mb0;
+            d.V1L = my1 + mb1;
+            d.V6L = my1 - mb1;
+            d.V2L = my2 + mb2;
+            d.V5L = my2 - mb2;
+            d.V3L = my3 + mb3;
+            d.V4L = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the right part of the block.
+        /// Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">The destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1R;
+            Vector4 my7 = s.V7R;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3R;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5R;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2R;
+            Vector4 my6 = s.V6R;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0R;
+            Vector4 my4 = s.V4R;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0R = my0 + mb0;
+            d.V7R = my0 - mb0;
+            d.V1R = my1 + mb1;
+            d.V6R = my1 - mb1;
+            d.V2R = my2 + mb2;
+            d.V5R = my2 - mb2;
+            d.V3R = my3 + mb3;
+            d.V4R = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
+        /// using AVX commands.
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
+            Vector256<float> my1 = s.V1;
+            Vector256<float> my7 = s.V7;
+            Vector256<float> mz0 = Avx.Add(my1, my7);
+
+            Vector256<float> my3 = s.V3;
+            Vector256<float> mz2 = Avx.Add(my3, my7);
+            Vector256<float> my5 = s.V5;
+            Vector256<float> mz1 = Avx.Add(my3, my5);
+            Vector256<float> mz3 = Avx.Add(my1, my5);
+
+            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);
+
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
+            mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
+            mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);
+
+            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
+            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
+            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
+            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);
+
+            Vector256<float> my2 = s.V2;
+            Vector256<float> my6 = s.V6;
+            mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
+            Vector256<float> my0 = s.V0;
+            Vector256<float> my4 = s.V4;
+            mz0 = Avx.Add(my0, my4);
+            mz1 = Avx.Subtract(my0, my4);
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);
+
+            my0 = Avx.Add(mz0, mz3);
+            my3 = Avx.Subtract(mz0, mz3);
+            my1 = Avx.Add(mz1, mz2);
+            my2 = Avx.Subtract(mz1, mz2);
+
+            d.V0 = Avx.Add(my0, mb0);
+            d.V7 = Avx.Subtract(my0, mb0);
+            d.V1 = Avx.Add(my1, mb1);
+            d.V6 = Avx.Subtract(my1, mb1);
+            d.V2 = Avx.Add(my2, mb2);
+            d.V5 = Avx.Subtract(my2, mb2);
+            d.V3 = Avx.Add(my3, mb3);
+            d.V4 = Avx.Subtract(my3, mb3);
+#endif
+        }
     }
 }
 #endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index a554e8577..181f18185 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -43,216 +43,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private const float C_0_765367 = 0.765366865f;
 
         private const float C_0_125 = 0.1250f;
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-        private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
-        private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
-
-        private static readonly Vector256<float> C_V_n1_9615 = Vector256.Create(-1.961570560f);
-        private static readonly Vector256<float> C_V_n0_3901 = Vector256.Create(-0.390180644f);
-        private static readonly Vector256<float> C_V_n0_8999 = Vector256.Create(-0.899976223f);
-        private static readonly Vector256<float> C_V_n2_5629 = Vector256.Create(-2.562915447f);
-        private static readonly Vector256<float> C_V_0_2986 = Vector256.Create(0.298631336f);
-        private static readonly Vector256<float> C_V_2_0531 = Vector256.Create(2.053119869f);
-        private static readonly Vector256<float> C_V_3_0727 = Vector256.Create(3.072711026f);
-        private static readonly Vector256<float> C_V_1_5013 = Vector256.Create(1.501321110f);
-        private static readonly Vector256<float> C_V_n1_8477 = Vector256.Create(-1.847759065f);
-        private static readonly Vector256<float> C_V_0_7653 = Vector256.Create(0.765366865f);
-#endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
-        /// <summary>
-        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                IDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                IDCT8x4_LeftPart(ref s, ref d);
-                IDCT8x4_RightPart(ref s, ref d);
-            }
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
-        /// using AVX commands.
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
-            Vector256<float> my1 = s.V1;
-            Vector256<float> my7 = s.V7;
-            Vector256<float> mz0 = Avx.Add(my1, my7);
-
-            Vector256<float> my3 = s.V3;
-            Vector256<float> mz2 = Avx.Add(my3, my7);
-            Vector256<float> my5 = s.V5;
-            Vector256<float> mz1 = Avx.Add(my3, my5);
-            Vector256<float> mz3 = Avx.Add(my1, my5);
-
-            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
-
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
-            mz0 = Avx.Multiply(mz0, C_V_n0_8999);
-            mz1 = Avx.Multiply(mz1, C_V_n2_5629);
-
-            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
-            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
-            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
-            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
-
-            Vector256<float> my2 = s.V2;
-            Vector256<float> my6 = s.V6;
-            mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
-            Vector256<float> my0 = s.V0;
-            Vector256<float> my4 = s.V4;
-            mz0 = Avx.Add(my0, my4);
-            mz1 = Avx.Subtract(my0, my4);
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
-
-            my0 = Avx.Add(mz0, mz3);
-            my3 = Avx.Subtract(mz0, mz3);
-            my1 = Avx.Add(mz1, mz2);
-            my2 = Avx.Subtract(mz1, mz2);
-
-            d.V0 = Avx.Add(my0, mb0);
-            d.V7 = Avx.Subtract(my0, mb0);
-            d.V1 = Avx.Add(my1, mb1);
-            d.V6 = Avx.Subtract(my1, mb1);
-            d.V2 = Avx.Add(my2, mb2);
-            d.V5 = Avx.Subtract(my2, mb2);
-            d.V3 = Avx.Add(my3, mb3);
-            d.V4 = Avx.Subtract(my3, mb3);
-#endif
-        }
 
         /// <summary>
         /// Apply floating point IDCT inplace.
@@ -267,10 +58,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             temp.Transpose();
             IDCT8x8(ref temp, ref block);
 
-            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
+            // TODO: This scaling by 1/8 can be fused into the quantization table step
             block.MultiplyInPlace(C_0_125);
         }
 
+        /// <summary>
+        /// Apply 2D floating point FDCT inplace.
+        /// </summary>
+        /// <param name="block">Input matrix.</param>
+        public static void TransformFDCT(ref Block8x8F block)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported || Sse.IsSupported)
+            {
+                ForwardTransformSimd(ref block);
+            }
+            else
+#endif
+            {
+                ForwardTransformScalar(ref block);
+            }
+        }
+
         /// <summary>
         /// Apply 2D floating point FDCT inplace using scalar operations.
         /// </summary>
@@ -380,23 +189,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 dataRef = ref Unsafe.Add(ref dataRef, 1);
             }
         }
-
-        /// <summary>
-        /// Apply 2D floating point FDCT inplace.
-        /// </summary>
-        /// <param name="block">Input matrix.</param>
-        public static void TransformFDCT(ref Block8x8F block)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported || Sse.IsSupported)
-            {
-                ForwardTransformSimd(ref block);
-            }
-            else
-#endif
-            {
-                ForwardTransformScalar(ref block);
-            }
-        }
     }
 }

From 9973e8da3b531f272f6079054e69d80303494ea7 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 10 Sep 2021 09:01:41 +0300
Subject: [PATCH 29/56] Removed excess code, added benchmarks

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 30 +++++--------------
 .../FastFloatingPointDCT.Intrinsic.cs         |  3 +-
 .../Jpeg/Components/FastFloatingPointDCT.cs   |  3 --
 .../BlockOperations/Block8x8F_Quantize.cs     | 23 ++++++++++++++
 .../BlockOperations/Block8x8F_Transpose.cs    | 16 +++++-----
 tests/ImageSharp.Benchmarks/Program.cs        | 11 +++----
 6 files changed, 44 insertions(+), 42 deletions(-)
 create mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index a25c572ae..d93375f39 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -424,32 +424,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             else
 #endif
             {
-                Multiply(ref block, ref qt);
-                block.RoundInto(ref dest);
+                for (int i = 0; i < Size; i++)
+                {
+                    int idx = ZigZag.ZigZagOrder[i];
+                    float quantizedVal = block[idx] * qt[idx];
+                    quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f;
+                    dest[i] = (short)quantizedVal;
+                }
             }
         }
 
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void Multiply(ref Block8x8F a, ref Block8x8F b)
-        {
-            a.V0L *= b.V0L;
-            a.V0R *= b.V0R;
-            a.V1L *= b.V1L;
-            a.V1R *= b.V1R;
-            a.V2L *= b.V2L;
-            a.V2R *= b.V2R;
-            a.V3L *= b.V3L;
-            a.V3R *= b.V3R;
-            a.V4L *= b.V4L;
-            a.V4R *= b.V4R;
-            a.V5L *= b.V5L;
-            a.V5R *= b.V5R;
-            a.V6L *= b.V6L;
-            a.V6R *= b.V6R;
-            a.V7L *= b.V7L;
-            a.V7R *= b.V7R;
-        }
-
         public void RoundInto(ref Block8x8 dest)
         {
             for (int i = 0; i < Size; i++)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index acc83e279..d9a04befb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -3,12 +3,11 @@
 
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System;
-using System.Collections.Generic;
+using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
-using System.Text;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 181f18185..6f68881cd 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,11 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
-using System.Diagnostics;
-using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 #endif
 
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
new file mode 100644
index 000000000..b826193c3
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
@@ -0,0 +1,23 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
+{
+    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
+    public class Block8x8F_Quantize
+    {
+        private Block8x8F block = default;
+        private Block8x8F quant = default;
+        private Block8x8 result = default;
+
+        [Benchmark]
+        public short Quantize()
+        {
+            Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant);
+            return this.result[0];
+        }
+    }
+}
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
index 8e8787475..47f7d2fbc 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@@ -9,25 +9,27 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
     [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
     public class Block8x8F_Transpose
     {
-        private static readonly Block8x8F Source = Create8x8FloatData();
+        private Block8x8F source = Create8x8FloatData();
 
         [Benchmark]
-        public void TransposeInto() => Source.Transpose();
+        public float TransposeInto()
+        {
+            this.source.Transpose();
+            return this.source[0];
+        }
 
         private static Block8x8F Create8x8FloatData()
         {
-            float[] result = new float[64];
+            Block8x8F block = default;
             for (int i = 0; i < 8; i++)
             {
                 for (int j = 0; j < 8; j++)
                 {
-                    result[(i * 8) + j] = (i * 10) + j;
+                    block[(i * 8) + j] = (i * 10) + j;
                 }
             }
 
-            var source = default(Block8x8F);
-            source.LoadFrom(result);
-            return source;
+            return block;
         }
     }
 }
diff --git a/tests/ImageSharp.Benchmarks/Program.cs b/tests/ImageSharp.Benchmarks/Program.cs
index 8080825d9..f6ffa6f80 100644
--- a/tests/ImageSharp.Benchmarks/Program.cs
+++ b/tests/ImageSharp.Benchmarks/Program.cs
@@ -1,8 +1,6 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
-using System.Reflection;
-
 using BenchmarkDotNet.Running;
 
 namespace SixLabors.ImageSharp.Benchmarks
@@ -15,9 +13,8 @@ namespace SixLabors.ImageSharp.Benchmarks
         /// <param name="args">
         /// The arguments to pass to the program.
         /// </param>
-        public static void Main(string[] args)
-        {
-            new BenchmarkSwitcher(typeof(Program).GetTypeInfo().Assembly).Run(args);
-        }
+        public static void Main(string[] args) => BenchmarkSwitcher
+            .FromAssembly(typeof(Program).Assembly)
+            .Run(args);
     }
 }

From d21e374e86cca30c97ffbe7f3d31dedbe9d4dc7f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 10 Sep 2021 12:27:35 +0300
Subject: [PATCH 30/56] Tidied up the code, added benchmarks

---
 .../Formats/Jpeg/Components/Block8x8.cs       |   2 +
 .../Jpeg/Components/Block8x8F.Intrinsic.cs    |  29 +-
 .../FastFloatingPointDCT.Intrinsic.cs         | 172 -----------
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 173 +++++++++++
 .../Jpeg/Components/ZigZag.Intrinsic.cs       | 290 +++++++-----------
 .../BlockOperations/Block8x8F_Quantize.cs     |  31 +-
 .../BlockOperations/Block8x8F_Transpose.cs    |  14 +
 .../Config.HwIntrinsics.cs                    |  10 +-
 8 files changed, 348 insertions(+), 373 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index c76eb942f..71077675d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -5,8 +5,10 @@ using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+#endif
 using System.Text;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
index 83227ff07..733d32892 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -35,33 +35,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         [FieldOffset(224)]
         public Vector256<float> V7;
 
-        private static ReadOnlySpan<int> DivideIntoInt16_Avx2_ShuffleMask => new int[] {
-            0, 1, 4, 5, 2, 3, 6, 7
-        };
+        private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
 
         private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
 
-            fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask)
-            {
-                Vector256<int> crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32();
-
-                ref Vector256<float> aBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref a);
-                ref Vector256<float> bBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref b);
+            ref Vector256<float> aBase = ref a.V0;
+            ref Vector256<float> bBase = ref b.V0;
 
-                ref Vector256<short> destBase = ref Unsafe.As<Block8x8, Vector256<short>>(ref dest);
+            ref Vector256<short> destRef = ref dest.V01;
 
-                for (int i = 0; i < 8; i += 2)
-                {
-                    Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
-                    Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+            for (int i = 0; i < 8; i += 2)
+            {
+                Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
 
-                    Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
-                    row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
+                Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
+                row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();
 
-                    Unsafe.Add(ref destBase, i / 2) = row;
-                }
+                Unsafe.Add(ref destRef, i / 2) = row;
             }
         }
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index d9a04befb..7a2b0a78c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -2,7 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 #if SUPPORTS_RUNTIME_INTRINSICS
-using System;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
@@ -37,42 +36,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
 #pragma warning restore SA1310, SA1311, IDE1006
 
-        /// <summary>
-        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
-        /// </summary>
-        /// <remarks>
-        /// <para>
-        /// Current FDCT implementation expects its results to be multiplied by
-        /// a reciprocal quantization table. Values in this table must be divided
-        /// by quantization table values scaled with quality settings.
-        /// </para>
-        /// <para>
-        /// These values were calculates with this formula:
-        /// <code>
-        /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
-        /// </code>
-        /// Where:
-        /// <code>
-        /// scalefactor[0] = 1
-        /// </code>
-        /// <code>
-        /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-        /// </code>
-        /// Values are also scaled by 8 so DCT code won't do unnecessary division.
-        /// </para>
-        /// </remarks>
-        public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
-        {
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
-            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
-            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
-            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
-            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
-        };
-
         /// <summary>
         /// Apply floating point FDCT inplace using simd operations.
         /// </summary>
@@ -217,141 +180,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             block.V7 = Avx.Subtract(z11, z4);
         }
 
-        /// <summary>
-        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                IDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                IDCT8x4_LeftPart(ref s, ref d);
-                IDCT8x4_RightPart(ref s, ref d);
-            }
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
-        }
-
         /// <summary>
         /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
         /// using AVX commands.
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 6f68881cd..91b92d8cf 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,6 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
@@ -42,6 +44,42 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private const float C_0_125 = 0.1250f;
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
 
+        /// <summary>
+        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// Current FDCT implementation expects its results to be multiplied by
+        /// a reciprocal quantization table. Values in this table must be divided
+        /// by quantization table values scaled with quality settings.
+        /// </para>
+        /// <para>
+        /// These values were calculated with this formula:
+        /// <code>
+        /// value[row * 8 + col] = 1 / (scalefactor[row] * scalefactor[col] * 8);
+        /// </code>
+        /// Where:
+        /// <code>
+        /// scalefactor[0] = 1
+        /// </code>
+        /// <code>
+        /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+        /// </code>
+        /// Values are also scaled by 8 so DCT code won't do unnecessary division.
+        /// </para>
+        /// </remarks>
+        public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
+        {
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
+            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
+            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
+            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
+            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+        };
+
         /// <summary>
         /// Apply floating point IDCT inplace.
         /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
@@ -186,5 +224,140 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 dataRef = ref Unsafe.Add(ref dataRef, 1);
             }
         }
+
+        /// <summary>
+        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                IDCT8x8_Avx(ref s, ref d);
+            }
+            else
+#endif
+            {
+                IDCT8x4_LeftPart(ref s, ref d);
+                IDCT8x4_RightPart(ref s, ref d);
+            }
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the left part of the block. Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">Destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1L;
+            Vector4 my7 = s.V7L;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3L;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5L;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2L;
+            Vector4 my6 = s.V6L;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0L;
+            Vector4 my4 = s.V4L;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0L = my0 + mb0;
+            d.V7L = my0 - mb0;
+            d.V1L = my1 + mb1;
+            d.V6L = my1 - mb1;
+            d.V2L = my2 + mb2;
+            d.V5L = my2 - mb2;
+            d.V3L = my3 + mb3;
+            d.V4L = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the right part of the block.
+        /// Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">The destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1R;
+            Vector4 my7 = s.V7R;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3R;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5R;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2R;
+            Vector4 my6 = s.V6R;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0R;
+            Vector4 my4 = s.V4R;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0R = my0 + mb0;
+            d.V7R = my0 - mb0;
+            d.V1R = my1 + mb1;
+            d.V6R = my1 - mb1;
+            d.V2R = my2 + mb2;
+            d.V5R = my2 - mb2;
+            d.V3R = my3 + mb3;
+            d.V4R = my3 - mb3;
+        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index 878a67b50..abe02d040 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -23,82 +23,65 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// </summary>
         private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
         {
-            // 0_A
+            // row0
+            // A B C
             0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
-            // 0_B
             _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
-            // 0_C
             _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
 
-            // 1_A
+            // row1
+            // A B C D E
             _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
-            // 1_B
             _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _,
-            // 1_C
             2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
-            // 1_D
             _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
-            // 1_E
             _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
 
-            // 2_B
+            // row2
+            // B C D E F G
             8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
-            // 2_C
             _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
-            // 2_D
             _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _,
-            // 2_E
             _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
-            // 2_F
             _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
-            // 2_G
             _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _,
 
-            // 3_A
+            // row3
+            // A B C D
+            // The row3 D shuffle mask is also reused as the row4 E shuffle mask
             _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
-            // 3_B
             _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
-            // 3_C
             _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
-            // 3_D/4_E
             6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
 
-            // 4_F
+            // row4
+            // E F G H
+            // 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
             _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
-            // 4_G
             _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
-            // 4_H
             _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
 
-            // 5_B
+            // row5
+            // B C D E F G
             _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
-            // 5_C
             _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
-            // 5_D
             10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
-            // 5_E
             _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _,
-            // 5_F
             _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _,
-            // 5_G
             _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
 
-            // 6_D
+            // row6
+            // D E F G H
             _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _,
-            // 6_E
             _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
-            // 6_F
             _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
-            // 6_G
             _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _,
-            // 6_H
             4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
 
-            // 7_F
+            // row7
+            // F G H
             _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _,
-            // 7_G
             10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
-            // 7_H
             _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
         };
 
@@ -177,95 +160,95 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             fixed (byte* maskPtr = SseShuffleMasks)
             {
-                Vector128<byte> A = source.V0.AsByte();
-                Vector128<byte> B = source.V1.AsByte();
-                Vector128<byte> C = source.V2.AsByte();
-                Vector128<byte> D = source.V3.AsByte();
-                Vector128<byte> E = source.V4.AsByte();
-                Vector128<byte> F = source.V5.AsByte();
-                Vector128<byte> G = source.V6.AsByte();
-                Vector128<byte> H = source.V7.AsByte();
+                Vector128<byte> rowA = source.V0.AsByte();
+                Vector128<byte> rowB = source.V1.AsByte();
+                Vector128<byte> rowC = source.V2.AsByte();
+                Vector128<byte> rowD = source.V3.AsByte();
+                Vector128<byte> rowE = source.V4.AsByte();
+                Vector128<byte> rowF = source.V5.AsByte();
+                Vector128<byte> rowG = source.V6.AsByte();
+                Vector128<byte> rowH = source.V7.AsByte();
 
                 // row0
-                Vector128<short> row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
-                Vector128<short> row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
-                Vector128<short> row0 = Sse2.Or(row0_A, row0_B);
-                Vector128<short> row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
-                row0 = Sse2.Or(row0, row0_C);
+                Vector128<short> row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
+                Vector128<short> row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
+                Vector128<short> row0 = Sse2.Or(row0A, row0B);
+                Vector128<short> row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
+                row0 = Sse2.Or(row0, row0C);
 
                 // row1
-                Vector128<short> row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
-                Vector128<short> row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
-                Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
-                Vector128<short> row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1_C);
-                Vector128<short> row1_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1_D);
-                Vector128<short> row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1_E);
+                Vector128<short> row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
+                Vector128<short> row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
+                Vector128<short> row1 = Sse2.Or(row1A, row1B);
+                Vector128<short> row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1C);
+                Vector128<short> row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1D);
+                Vector128<short> row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1E);
 
                 // row2
-                Vector128<short> row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
-                Vector128<short> row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
-                Vector128<short> row2 = Sse2.Or(row2_B, row2_C);
-                Vector128<short> row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_D);
-                Vector128<short> row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_E);
-                Vector128<short> row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_F);
-                Vector128<short> row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_G);
+                Vector128<short> row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
+                Vector128<short> row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
+                Vector128<short> row2 = Sse2.Or(row2B, row2C);
+                Vector128<short> row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2D);
+                Vector128<short> row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2E);
+                Vector128<short> row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2F);
+                Vector128<short> row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2G);
 
                 // row3
-                Vector128<short> A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
-                Vector128<short> B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
-                Vector128<short> row3 = Sse2.Or(A_3, B_3);
-                Vector128<short> C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
-                row3 = Sse2.Or(row3, C_3);
-                Vector128<byte> D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
-                Vector128<short> D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16();
-                row3 = Sse2.Or(row3, D_3);
+                Vector128<short> row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
+                Vector128<short> row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
+                Vector128<short> row3 = Sse2.Or(row3A, row3B);
+                Vector128<short> row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
+                row3 = Sse2.Or(row3, row3C);
+                Vector128<byte> row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
+                Vector128<short> row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16();
+                row3 = Sse2.Or(row3, row3D);
 
                 // row4
-                Vector128<short> E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16();
-                Vector128<short> F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
-                Vector128<short> row4 = Sse2.Or(E_4, F_4);
-                Vector128<short> G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
-                row4 = Sse2.Or(row4, G_4);
-                Vector128<short> H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
-                row4 = Sse2.Or(row4, H_4);
+                Vector128<short> row4E = Ssse3.Shuffle(rowE, row3D_row4E_shuffleMask).AsInt16();
+                Vector128<short> row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
+                Vector128<short> row4 = Sse2.Or(row4E, row4F);
+                Vector128<short> row4G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, row4G);
+                Vector128<short> row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, row4H);
 
                 // row5
-                Vector128<short> B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
-                Vector128<short> C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
-                Vector128<short> row5 = Sse2.Or(B_5, C_5);
-                Vector128<short> D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, D_5);
-                Vector128<short> E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, E_5);
-                Vector128<short> F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, F_5);
-                Vector128<short> G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, G_5);
+                Vector128<short> row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
+                Vector128<short> row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
+                Vector128<short> row5 = Sse2.Or(row5B, row5C);
+                Vector128<short> row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5D);
+                Vector128<short> row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5E);
+                Vector128<short> row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5F);
+                Vector128<short> row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5G);
 
                 // row6
-                Vector128<short> D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
-                Vector128<short> E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
-                Vector128<short> row6 = Sse2.Or(D_6, E_6);
-                Vector128<short> F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, F_6);
-                Vector128<short> G_6 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, G_6);
-                Vector128<short> H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, H_6);
+                Vector128<short> row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
+                Vector128<short> row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
+                Vector128<short> row6 = Sse2.Or(row6D, row6E);
+                Vector128<short> row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, row6F);
+                Vector128<short> row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, row6G);
+                Vector128<short> row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, row6H);
 
                 // row7
-                Vector128<short> F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
-                Vector128<short> G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
-                Vector128<short> row7 = Sse2.Or(F_7, G_7);
-                Vector128<short> H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16();
-                row7 = Sse2.Or(row7, H_7);
+                Vector128<short> row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
+                Vector128<short> row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
+                Vector128<short> row7 = Sse2.Or(row7F, row7G);
+                Vector128<short> row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16();
+                row7 = Sse2.Or(row7, row7H);
 
                 dest.V0 = row0;
                 dest.V1 = row1;
@@ -292,105 +275,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
             {
-                // 18 loads
-                // 10 cross-lane shuffles (permutations)
-                // 14 shuffles
-                // 10 bitwise or's
-                // 4 stores
-
-                // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7
-                // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7
-                // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7
-                // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7
-                Vector256<byte> AB = source.V01.AsByte();
-                Vector256<byte> CD = source.V23.AsByte();
-                Vector256<byte> EF = source.V45.AsByte();
-                Vector256<byte> GH = source.V67.AsByte();
-
-                // row01 - A0  A1  B0  C0  B1  A2  A3  B2 | C1  D0  E0  D1  C2  B3  A4  A5
-                Vector256<int> AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
-
-                // row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X  X)  (X  X)
-                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
-                // row01_AB - (A0 A1) (B0  X) (B1 A2) (A3 B2) | (X  X)  (X  X)  (X  B3) (A4 A5)
+                Vector256<byte> rowsAB = source.V01.AsByte();
+                Vector256<byte> rowsCD = source.V23.AsByte();
+                Vector256<byte> rowsEF = source.V45.AsByte();
+                Vector256<byte> rowsGH = source.V67.AsByte();
+
+                // rows 0 1
+                Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
+                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                 row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
 
-                Vector256<int> CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
-
-                // row01_CD - (C0 C1) (X X)  (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X)
-                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
-                // row01_CD - (X  X)  (X C0) (X X) (X X) | (C1 D0) (X  D1)  (C2 X)  (X X)
+                Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
+                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                 row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
 
-                // row01_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X)  (X X) (X X)
-                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
-                // row01_EF - (X X) (X X) (X X) (X X) | (X  X)  (E0 X) (X X) (X X)
+                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                 Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
 
                 Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
 
-
-                // row23 - B4  C3  D2  E1  F0  G0  F1  E2 | D3  C4  B5  A6  A7  B6  C5  D4
-
-                Vector256<int> AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
-
-                // row23_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X)
-                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
-                // row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X)
+                // rows 2 3
+                Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
+                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                 Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
 
-                // row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X)
-                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
-                // row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4)
+                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                 row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
 
-                // row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X)
                 Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
 
-                // row23_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X)
-                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
-                // row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                 Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
 
                 Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
 
-
-                // row45 - E3  F2  G1  H0  H1  G2  F3  E4 | D5  C6  B7  C7  D6  E5  F4  G3
-
-                // row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X)
+                // rows 4 5
                 Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
-
-                // row45_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X)
-                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
-                // row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X)
+                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                 Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
 
-                Vector256<int> EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
-
-                // row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X)
-                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
-                // row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X)
+                Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
+                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                 row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
 
-                // row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3)
                 Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
 
                 Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
 
-
-                // row67 - H2  H3  G4  F5  E6  D7  E7  F6 | G5  H4  H5  G6  F7  G7  H6  H7
-
-                // row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X)
+                // rows 6 7
                 Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
 
-                // row67_EF - (E6 E7) (F4 F5) (F6 F7) (X X) | (F6 F7) (X X) (X X) (X X)
-                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
-                // row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X)
+                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                 row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
 
-                // row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7)
-                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
-                // row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7)
+                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                 row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
 
                 Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
index b826193c3..898bbdb45 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
@@ -9,8 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
     [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
     public class Block8x8F_Quantize
     {
-        private Block8x8F block = default;
-        private Block8x8F quant = default;
+        private Block8x8F block = CreateFromScalar(1);
+        private Block8x8F quant = CreateFromScalar(1);
         private Block8x8 result = default;
 
         [Benchmark]
@@ -19,5 +19,32 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant);
             return this.result[0];
         }
+
+        private static Block8x8F CreateFromScalar(float scalar)
+        {
+            Block8x8F block = default;
+            for (int i = 0; i < 64; i++)
+            {
+                block[i] = scalar;
+            }
+
+            return block;
+        }
     }
 }
+
+/*
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update)
+Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
+.NET SDK=6.0.100-preview.3.21202.5
+  [Host]             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  2. SSE             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  3. AVX             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+
+|   Method |             Job |     Mean |    Error |   StdDev | Ratio |
+|--------- |-----------------|---------:|---------:|---------:|------:|
+| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns |  1.00 |
+| Quantize |             SSE | 24.11 ns | 0.298 ns | 0.279 ns |  0.33 |
+| Quantize |             AVX | 15.90 ns | 0.074 ns | 0.065 ns |  0.22 |
+ */
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
index 47f7d2fbc..28899b51e 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@@ -33,3 +33,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
         }
     }
 }
+
+/*
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update)
+Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
+.NET SDK=6.0.100-preview.3.21202.5
+  [Host]          : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  AVX             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+
+|        Method |             Job |      Mean |     Error |    StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
+|-------------- |---------------- |----------:|----------:|----------:|------:|------:|------:|------:|----------:|
+| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns |  1.00 |     - |     - |     - |         - |
+| TransposeInto |             AVX |  8.613 ns | 0.0249 ns | 0.0208 ns |  0.44 |     - |     - |     - |         - |
+*/
diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
index 5ceb4c8a0..ffe0f4c02 100644
--- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
+++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
@@ -65,17 +65,17 @@ namespace SixLabors.ImageSharp.Benchmarks
                     .WithId("1. No HwIntrinsics").AsBaseline());
 
 #if SUPPORTS_RUNTIME_INTRINSICS
-                if (Avx.IsSupported)
+                if (Sse.IsSupported)
                 {
                     this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
-                        .WithId("2. AVX"));
+                        .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
+                        .WithId("2. SSE"));
                 }
 
-                if (Sse.IsSupported)
+                if (Avx.IsSupported)
                 {
                     this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
-                        .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
-                        .WithId("3. SSE"));
+                        .WithId("3. AVX"));
                 }
 #endif
             }

From f297fce021ef03e988d7c61c5641e78bcdb895bd Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 10 Sep 2021 12:27:35 +0300
Subject: [PATCH 31/56] Tidied up the code, added benchmarks

---
 .../Formats/Jpeg/Components/Block8x8.cs       |   2 +
 .../Jpeg/Components/Block8x8F.Intrinsic.cs    |  29 +-
 .../Components/Encoder/HuffmanScanEncoder.cs  |   8 +-
 .../FastFloatingPointDCT.Intrinsic.cs         | 172 -----------
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 173 +++++++++++
 .../Jpeg/Components/ZigZag.Intrinsic.cs       | 290 +++++++-----------
 .../BlockOperations/Block8x8F_Quantize.cs     |  31 +-
 .../BlockOperations/Block8x8F_Transpose.cs    |  14 +
 .../Config.HwIntrinsics.cs                    |  10 +-
 9 files changed, 352 insertions(+), 377 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index c76eb942f..71077675d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -5,8 +5,10 @@ using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+#endif
 using System.Text;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
index 83227ff07..733d32892 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -35,33 +35,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         [FieldOffset(224)]
         public Vector256<float> V7;
 
-        private static ReadOnlySpan<int> DivideIntoInt16_Avx2_ShuffleMask => new int[] {
-            0, 1, 4, 5, 2, 3, 6, 7
-        };
+        private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
 
         private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
 
-            fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask)
-            {
-                Vector256<int> crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32();
-
-                ref Vector256<float> aBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref a);
-                ref Vector256<float> bBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref b);
+            ref Vector256<float> aBase = ref a.V0;
+            ref Vector256<float> bBase = ref b.V0;
 
-                ref Vector256<short> destBase = ref Unsafe.As<Block8x8, Vector256<short>>(ref dest);
+            ref Vector256<short> destRef = ref dest.V01;
 
-                for (int i = 0; i < 8; i += 2)
-                {
-                    Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
-                    Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+            for (int i = 0; i < 8; i += 2)
+            {
+                Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
 
-                    Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
-                    row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
+                Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
+                row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();
 
-                    Unsafe.Add(ref destBase, i / 2) = row;
-                }
+                Unsafe.Add(ref destRef, i / 2) = row;
             }
         }
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index da4723e21..75f384848 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
                 luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
                 chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
@@ -197,7 +197,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
                 luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
                 chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
@@ -270,7 +270,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
                 luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
             }
@@ -321,7 +321,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
                 luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
             }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index d9a04befb..7a2b0a78c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -2,7 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 #if SUPPORTS_RUNTIME_INTRINSICS
-using System;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
@@ -37,42 +36,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
 #pragma warning restore SA1310, SA1311, IDE1006
 
-        /// <summary>
-        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
-        /// </summary>
-        /// <remarks>
-        /// <para>
-        /// Current FDCT implementation expects its results to be multiplied by
-        /// a reciprocal quantization table. Values in this table must be divided
-        /// by quantization table values scaled with quality settings.
-        /// </para>
-        /// <para>
-        /// These values were calculates with this formula:
-        /// <code>
-        /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
-        /// </code>
-        /// Where:
-        /// <code>
-        /// scalefactor[0] = 1
-        /// </code>
-        /// <code>
-        /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-        /// </code>
-        /// Values are also scaled by 8 so DCT code won't do unnecessary division.
-        /// </para>
-        /// </remarks>
-        public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
-        {
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
-            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
-            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
-            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
-            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
-        };
-
         /// <summary>
         /// Apply floating point FDCT inplace using simd operations.
         /// </summary>
@@ -217,141 +180,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             block.V7 = Avx.Subtract(z11, z4);
         }
 
-        /// <summary>
-        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                IDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                IDCT8x4_LeftPart(ref s, ref d);
-                IDCT8x4_RightPart(ref s, ref d);
-            }
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
-        }
-
         /// <summary>
         /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
         /// using AVX commands.
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 6f68881cd..1c5cfc8d6 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,6 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
@@ -42,6 +44,42 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private const float C_0_125 = 0.1250f;
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
 
+        /// <summary>
+        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// Current FDCT implementation expects its results to be multiplied by
+        /// a reciprocal quantization table. To get the 8x8 reciprocal block, values in this
+        /// table must be divided by quantization table values scaled with quality settings.
+        /// </para>
+        /// <para>
+        /// These values were calculated with this formula:
+        /// <code>
+        /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
+        /// </code>
+        /// Where:
+        /// <code>
+        /// scalefactor[0] = 1
+        /// </code>
+        /// <code>
+        /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+        /// </code>
+        /// Values are also scaled by 8 so DCT code won't do unnecessary division.
+        /// </para>
+        /// </remarks>
+        public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
+        {
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
+            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
+            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
+            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
+            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
+            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+        };
+
         /// <summary>
         /// Apply floating point IDCT inplace.
         /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
@@ -186,5 +224,140 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 dataRef = ref Unsafe.Add(ref dataRef, 1);
             }
         }
+
+        /// <summary>
+        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                IDCT8x8_Avx(ref s, ref d);
+            }
+            else
+#endif
+            {
+                IDCT8x4_LeftPart(ref s, ref d);
+                IDCT8x4_RightPart(ref s, ref d);
+            }
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the left part of the block. Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">Destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1L;
+            Vector4 my7 = s.V7L;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3L;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5L;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2L;
+            Vector4 my6 = s.V6L;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0L;
+            Vector4 my4 = s.V4L;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0L = my0 + mb0;
+            d.V7L = my0 - mb0;
+            d.V1L = my1 + mb1;
+            d.V6L = my1 - mb1;
+            d.V2L = my2 + mb2;
+            d.V5L = my2 - mb2;
+            d.V3L = my3 + mb3;
+            d.V4L = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the right part of the block.
+        /// Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">The destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1R;
+            Vector4 my7 = s.V7R;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3R;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5R;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2R;
+            Vector4 my6 = s.V6R;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0R;
+            Vector4 my4 = s.V4R;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0R = my0 + mb0;
+            d.V7R = my0 - mb0;
+            d.V1R = my1 + mb1;
+            d.V6R = my1 - mb1;
+            d.V2R = my2 + mb2;
+            d.V5R = my2 - mb2;
+            d.V3R = my3 + mb3;
+            d.V4R = my3 - mb3;
+        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index 878a67b50..abe02d040 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -23,82 +23,65 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// </summary>
         private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
         {
-            // 0_A
+            // row0
+            // A B C
             0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
-            // 0_B
             _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
-            // 0_C
             _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
 
-            // 1_A
+            // row1
+            // A B C D E
             _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
-            // 1_B
             _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _,
-            // 1_C
             2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
-            // 1_D
             _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
-            // 1_E
             _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
 
-            // 2_B
+            // row2
+            // B C D E F G
             8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
-            // 2_C
             _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
-            // 2_D
             _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _,
-            // 2_E
             _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
-            // 2_F
             _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
-            // 2_G
             _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _,
 
-            // 3_A
+            // row3
+            // A B C D
+            // D shuffle mask is the same as the row4 E shuffle mask
             _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
-            // 3_B
             _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
-            // 3_C
             _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
-            // 3_D/4_E
             6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
 
-            // 4_F
+            // row4
+            // E F G H
+            // 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
             _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
-            // 4_G
             _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
-            // 4_H
             _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
 
-            // 5_B
+            // row5
+            // B C D E F G
             _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
-            // 5_C
             _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
-            // 5_D
             10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
-            // 5_E
             _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _,
-            // 5_F
             _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _,
-            // 5_G
             _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
 
-            // 6_D
+            // row6
+            // D E F G H
             _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _,
-            // 6_E
             _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
-            // 6_F
             _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
-            // 6_G
             _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _,
-            // 6_H
             4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
 
-            // 7_F
+            // row7
+            // F G H
             _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _,
-            // 7_G
             10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
-            // 7_H
             _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
         };
 
@@ -177,95 +160,95 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             fixed (byte* maskPtr = SseShuffleMasks)
             {
-                Vector128<byte> A = source.V0.AsByte();
-                Vector128<byte> B = source.V1.AsByte();
-                Vector128<byte> C = source.V2.AsByte();
-                Vector128<byte> D = source.V3.AsByte();
-                Vector128<byte> E = source.V4.AsByte();
-                Vector128<byte> F = source.V5.AsByte();
-                Vector128<byte> G = source.V6.AsByte();
-                Vector128<byte> H = source.V7.AsByte();
+                Vector128<byte> rowA = source.V0.AsByte();
+                Vector128<byte> rowB = source.V1.AsByte();
+                Vector128<byte> rowC = source.V2.AsByte();
+                Vector128<byte> rowD = source.V3.AsByte();
+                Vector128<byte> rowE = source.V4.AsByte();
+                Vector128<byte> rowF = source.V5.AsByte();
+                Vector128<byte> rowG = source.V6.AsByte();
+                Vector128<byte> rowH = source.V7.AsByte();
 
                 // row0
-                Vector128<short> row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
-                Vector128<short> row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
-                Vector128<short> row0 = Sse2.Or(row0_A, row0_B);
-                Vector128<short> row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
-                row0 = Sse2.Or(row0, row0_C);
+                Vector128<short> row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
+                Vector128<short> row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
+                Vector128<short> row0 = Sse2.Or(row0A, row0B);
+                Vector128<short> row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
+                row0 = Sse2.Or(row0, row0C);
 
                 // row1
-                Vector128<short> row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
-                Vector128<short> row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
-                Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
-                Vector128<short> row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1_C);
-                Vector128<short> row1_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1_D);
-                Vector128<short> row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1_E);
+                Vector128<short> row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
+                Vector128<short> row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
+                Vector128<short> row1 = Sse2.Or(row1A, row1B);
+                Vector128<short> row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1C);
+                Vector128<short> row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1D);
+                Vector128<short> row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
+                row1 = Sse2.Or(row1, row1E);
 
                 // row2
-                Vector128<short> row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
-                Vector128<short> row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
-                Vector128<short> row2 = Sse2.Or(row2_B, row2_C);
-                Vector128<short> row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_D);
-                Vector128<short> row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_E);
-                Vector128<short> row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_F);
-                Vector128<short> row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2_G);
+                Vector128<short> row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
+                Vector128<short> row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
+                Vector128<short> row2 = Sse2.Or(row2B, row2C);
+                Vector128<short> row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2D);
+                Vector128<short> row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2E);
+                Vector128<short> row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2F);
+                Vector128<short> row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
+                row2 = Sse2.Or(row2, row2G);
 
                 // row3
-                Vector128<short> A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
-                Vector128<short> B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
-                Vector128<short> row3 = Sse2.Or(A_3, B_3);
-                Vector128<short> C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
-                row3 = Sse2.Or(row3, C_3);
-                Vector128<byte> D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
-                Vector128<short> D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16();
-                row3 = Sse2.Or(row3, D_3);
+                Vector128<short> row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
+                Vector128<short> row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
+                Vector128<short> row3 = Sse2.Or(row3A, row3B);
+                Vector128<short> row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
+                row3 = Sse2.Or(row3, row3C);
+                Vector128<byte> row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
+                Vector128<short> row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16();
+                row3 = Sse2.Or(row3, row3D);
 
                 // row4
-                Vector128<short> E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16();
-                Vector128<short> F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
-                Vector128<short> row4 = Sse2.Or(E_4, F_4);
-                Vector128<short> G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
-                row4 = Sse2.Or(row4, G_4);
-                Vector128<short> H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
-                row4 = Sse2.Or(row4, H_4);
+                Vector128<short> row4E = Ssse3.Shuffle(rowE, row3D_row4E_shuffleMask).AsInt16();
+                Vector128<short> row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
+                Vector128<short> row4 = Sse2.Or(row4E, row4F);
+                Vector128<short> row4G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, row4G);
+                Vector128<short> row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
+                row4 = Sse2.Or(row4, row4H);
 
                 // row5
-                Vector128<short> B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
-                Vector128<short> C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
-                Vector128<short> row5 = Sse2.Or(B_5, C_5);
-                Vector128<short> D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, D_5);
-                Vector128<short> E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, E_5);
-                Vector128<short> F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, F_5);
-                Vector128<short> G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, G_5);
+                Vector128<short> row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
+                Vector128<short> row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
+                Vector128<short> row5 = Sse2.Or(row5B, row5C);
+                Vector128<short> row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5D);
+                Vector128<short> row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5E);
+                Vector128<short> row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5F);
+                Vector128<short> row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
+                row5 = Sse2.Or(row5, row5G);
 
                 // row6
-                Vector128<short> D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
-                Vector128<short> E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
-                Vector128<short> row6 = Sse2.Or(D_6, E_6);
-                Vector128<short> F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, F_6);
-                Vector128<short> G_6 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, G_6);
-                Vector128<short> H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, H_6);
+                Vector128<short> row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
+                Vector128<short> row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
+                Vector128<short> row6 = Sse2.Or(row6D, row6E);
+                Vector128<short> row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, row6F);
+                Vector128<short> row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, row6G);
+                Vector128<short> row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
+                row6 = Sse2.Or(row6, row6H);
 
                 // row7
-                Vector128<short> F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
-                Vector128<short> G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
-                Vector128<short> row7 = Sse2.Or(F_7, G_7);
-                Vector128<short> H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16();
-                row7 = Sse2.Or(row7, H_7);
+                Vector128<short> row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
+                Vector128<short> row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
+                Vector128<short> row7 = Sse2.Or(row7F, row7G);
+                Vector128<short> row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (34 * 16))).AsInt16(); // mask 34 is the last of the 35 SseShuffleMasks entries; 35 would read past the table
+                row7 = Sse2.Or(row7, row7H);
 
                 dest.V0 = row0;
                 dest.V1 = row1;
@@ -292,105 +275,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
             {
-                // 18 loads
-                // 10 cross-lane shuffles (permutations)
-                // 14 shuffles
-                // 10 bitwise or's
-                // 4 stores
-
-                // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7
-                // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7
-                // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7
-                // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7
-                Vector256<byte> AB = source.V01.AsByte();
-                Vector256<byte> CD = source.V23.AsByte();
-                Vector256<byte> EF = source.V45.AsByte();
-                Vector256<byte> GH = source.V67.AsByte();
-
-                // row01 - A0  A1  B0  C0  B1  A2  A3  B2 | C1  D0  E0  D1  C2  B3  A4  A5
-                Vector256<int> AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
-
-                // row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X  X)  (X  X)
-                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
-                // row01_AB - (A0 A1) (B0  X) (B1 A2) (A3 B2) | (X  X)  (X  X)  (X  B3) (A4 A5)
+                Vector256<byte> rowsAB = source.V01.AsByte();
+                Vector256<byte> rowsCD = source.V23.AsByte();
+                Vector256<byte> rowsEF = source.V45.AsByte();
+                Vector256<byte> rowsGH = source.V67.AsByte();
+
+                // rows 0 1
+                Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
+                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                 row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
 
-                Vector256<int> CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
-
-                // row01_CD - (C0 C1) (X X)  (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X)
-                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
-                // row01_CD - (X  X)  (X C0) (X X) (X X) | (C1 D0) (X  D1)  (C2 X)  (X X)
+                Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
+                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                 row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
 
-                // row01_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X)  (X X) (X X)
-                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
-                // row01_EF - (X X) (X X) (X X) (X X) | (X  X)  (E0 X) (X X) (X X)
+                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                 Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
 
                 Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
 
-
-                // row23 - B4  C3  D2  E1  F0  G0  F1  E2 | D3  C4  B5  A6  A7  B6  C5  D4
-
-                Vector256<int> AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
-
-                // row23_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X)
-                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
-                // row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X)
+                // rows 2 3
+                Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
+                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                 Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
 
-                // row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X)
-                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
-                // row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4)
+                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                 row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
 
-                // row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X)
                 Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
 
-                // row23_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X)
-                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
-                // row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X)
+                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                 Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
 
                 Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
 
-
-                // row45 - E3  F2  G1  H0  H1  G2  F3  E4 | D5  C6  B7  C7  D6  E5  F4  G3
-
-                // row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X)
+                // rows 4 5
                 Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
-
-                // row45_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X)
-                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
-                // row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X)
+                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                 Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
 
-                Vector256<int> EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
-
-                // row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X)
-                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
-                // row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X)
+                Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
+                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                 row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
 
-                // row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3)
                 Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
 
                 Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
 
-
-                // row67 - H2  H3  G4  F5  E6  D7  E7  F6 | G5  H4  H5  G6  F7  G7  H6  H7
-
-                // row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X)
+                // rows 6 7
                 Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
 
-                // row67_EF - (E6 E7) (F4 F5) (F6 F7) (X X) | (F6 F7) (X X) (X X) (X X)
-                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
-                // row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X)
+                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                 row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
 
-                // row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7)
-                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
-                // row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7)
+                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                 row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
 
                 Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
index b826193c3..898bbdb45 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
@@ -9,8 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
     [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
     public class Block8x8F_Quantize
     {
-        private Block8x8F block = default;
-        private Block8x8F quant = default;
+        private Block8x8F block = CreateFromScalar(1);
+        private Block8x8F quant = CreateFromScalar(1);
         private Block8x8 result = default;
 
         [Benchmark]
@@ -19,5 +19,32 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant);
             return this.result[0];
         }
+
+        private static Block8x8F CreateFromScalar(float scalar)
+        {
+            Block8x8F block = default;
+            for (int i = 0; i < 64; i++)
+            {
+                block[i] = scalar;
+            }
+
+            return block;
+        }
     }
 }
+
+/*
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update)
+Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
+.NET SDK=6.0.100-preview.3.21202.5
+  [Host]             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  2. SSE             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  3. AVX             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+
+|   Method |             Job |     Mean |    Error |   StdDev | Ratio |
+|--------- |-----------------|---------:|---------:|---------:|------:|
+| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns |  1.00 |
+| Quantize |             SSE | 24.11 ns | 0.298 ns | 0.279 ns |  0.33 |
+| Quantize |             AVX | 15.90 ns | 0.074 ns | 0.065 ns |  0.22 |
+ */
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
index 47f7d2fbc..28899b51e 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@@ -33,3 +33,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
         }
     }
 }
+
+/*
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update)
+Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
+.NET SDK=6.0.100-preview.3.21202.5
+  [Host]          : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  AVX             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+
+|        Method |             Job |      Mean |     Error |    StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
+|-------------- |---------------- |----------:|----------:|----------:|------:|------:|------:|------:|----------:|
+| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns |  1.00 |     - |     - |     - |         - |
+| TransposeInto |             AVX |  8.613 ns | 0.0249 ns | 0.0208 ns |  0.44 |     - |     - |     - |         - |
+*/
diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
index 5ceb4c8a0..ffe0f4c02 100644
--- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
+++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
@@ -65,17 +65,17 @@ namespace SixLabors.ImageSharp.Benchmarks
                     .WithId("1. No HwIntrinsics").AsBaseline());
 
 #if SUPPORTS_RUNTIME_INTRINSICS
-                if (Avx.IsSupported)
+                if (Sse.IsSupported)
                 {
                     this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
-                        .WithId("2. AVX"));
+                        .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
+                        .WithId("2. SSE"));
                 }
 
-                if (Sse.IsSupported)
+                if (Avx.IsSupported)
                 {
                     this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
-                        .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
-                        .WithId("3. SSE"));
+                        .WithId("3. AVX"));
                 }
 #endif
             }

From 96f8717b12599af180aafd8c3915eea09811c204 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 11 Sep 2021 06:13:05 +0300
Subject: [PATCH 32/56] Optimized runLength calculation

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 75f384848..ad279b577 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -408,22 +408,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // Emit the AC components.
             int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;
 
-            int runLength = 0;
             int lastValuableIndex = spectralBlock.GetLastNonZeroIndex();
+
+            int runLength = 0;
             for (int zig = 1; zig <= lastValuableIndex; zig++)
             {
-                int ac = spectralBlock[zig];
+                const int zeroRun1 = 1 << 4;
+                const int zeroRun16 = 16 << 4;
 
+                int ac = spectralBlock[zig];
                 if (ac == 0)
                 {
-                    runLength++;
+                    runLength += zeroRun1;
                 }
                 else
                 {
-                    while (runLength > 15)
+                    while (runLength >= zeroRun16)
                     {
                         this.EmitHuff(acHuffTable, 0xf0);
-                        runLength -= 16;
+                        runLength -= zeroRun16;
                     }
 
                     this.EmitHuffRLE(acHuffTable, runLength, ac);
@@ -498,14 +501,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         /// <summary>
-        /// Emits a run of runLength copies of value encoded with the given Huffman encoder.
+        /// Emits given value via huffman rle encoding.
         /// </summary>
         /// <param name="table">Compiled Huffman spec values.</param>
-        /// <param name="runLength">The number of copies to encode.</param>
+        /// <param name="runLength">The number of preceding zeroes, preshifted by 4 to the left.</param>
         /// <param name="value">The value to encode.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void EmitHuffRLE(int[] table, int runLength, int value)
         {
+            DebugGuard.IsTrue((runLength & 0xf) == 0, $"{nameof(runLength)} parameter must be shifted to the left by 4 bits");
+
             int a = value;
             int b = value;
             if (a < 0)
@@ -517,7 +522,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int valueLen = GetHuffmanEncodingLength((uint)a);
 
             // Huffman prefix code
-            int huffPackage = table[(runLength << 4) | valueLen];
+            int huffPackage = table[runLength | valueLen];
             int prefixLen = huffPackage & 0xff;
             uint prefix = (uint)huffPackage & 0xffff_0000u;
 

From 91a95b581404b9f32f773e6672c3c98b9f4cfb48 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 12 Sep 2021 21:44:11 +0300
Subject: [PATCH 33/56] Implemented fallback code for big-endian machines

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 101 +++++++++++++-----
 1 file changed, 77 insertions(+), 24 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index ad279b577..08f676e40 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -445,21 +445,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             return dc;
         }
 
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void FlushRemainingBytes()
-        {
-            // Bytes count we want to write to the output stream
-            int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
-
-            // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits
-            uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount);
-
-            int writeIndex = this.emitWriteIndex;
-            this.emitBuffer[writeIndex - 1] = packedBytes;
-
-            this.FlushToStream((writeIndex * 4) - valuableBytesCount);
-        }
-
         /// <summary>
         /// Emits the least significant count of bits to the stream write buffer.
         /// The precondition is bits
@@ -568,28 +553,96 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
         }
 
+        /// <summary>
+        /// Flushes cached bytes to the output stream respecting stuff bytes.
+        /// </summary>
+        /// <remarks>
+        /// Bytes cached via <see cref="Emit"/> are stored in 4-bytes blocks which makes
+        /// this method endianness dependent.
+        /// </remarks>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void FlushToStream() => this.FlushToStream(this.emitWriteIndex * 4);
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void FlushToStream(int endIndex)
+        private void FlushToStream()
         {
             Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
 
             int writeIdx = 0;
             int startIndex = emitBytes.Length - 1;
-            for (int i = startIndex; i >= endIndex; i--)
+            int endIndex = this.emitWriteIndex * sizeof(uint);
+
+            // Some platforms may fail to eliminate this if-else branching
+            // Even if it happens - buffer is flushed in big packs,
+            // branching overhead shouldn't be noticeable
+            if (BitConverter.IsLittleEndian)
             {
-                byte value = emitBytes[i];
-                this.streamWriteBuffer[writeIdx++] = value;
-                if (value == 0xff)
+                // For little endian case bytes are ordered and can be
+                // safely written to the stream with stuff bytes
+                // First byte is cached on the most significant index
+                // so we are going from the end of the array to its beginning:
+                // ... [  double word #1   ] [  double word #0   ]
+                // ... [idx3|idx2|idx1|idx0] [idx3|idx2|idx1|idx0]
+                for (int i = startIndex; i >= endIndex; i--)
                 {
-                    this.streamWriteBuffer[writeIdx++] = 0x00;
+                    byte value = emitBytes[i];
+                    this.streamWriteBuffer[writeIdx++] = value;
+
+                    // Inserting stuff byte
+                    if (value == 0xff)
+                    {
+                        this.streamWriteBuffer[writeIdx++] = 0x00;
+                    }
+                }
+            }
+            else
+            {
+                // For big endian case bytes are ordered in 4-byte packs
+                // which are ordered like bytes in the little endian case but in 4-byte packs:
+                // ... [  double word #1   ] [  double word #0   ]
+                // ... [idx0|idx1|idx2|idx3] [idx0|idx1|idx2|idx3]
+                // So we must write each 4-bytes in 'natural order'
+                for (int i = startIndex; i >= endIndex; i -= 4)
+                {
+                    // This loop is caused by the nature of underlying byte buffer
+                    // implementation and indeed degrades performance by roughly 5%
+                    // compared to little endian scenario
+                    // Even with this performance drop this cached buffer implementation
+                    // is faster than individually writing bytes using binary shifts and binary and(s)
+                    for (int j = i - 3; j <= i; j++)
+                    {
+                        byte value = emitBytes[j];
+                        this.streamWriteBuffer[writeIdx++] = value;
+
+                        // Inserting stuff byte
+                        if (value == 0xff)
+                        {
+                            this.streamWriteBuffer[writeIdx++] = 0x00;
+                        }
+                    }
                 }
             }
 
             this.target.Write(this.streamWriteBuffer, 0, writeIdx);
             this.emitWriteIndex = this.emitBuffer.Length;
         }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void FlushRemainingBytes()
+        {
+            // Flush full 4-byte blocks
+            this.FlushToStream();
+
+            // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits
+            // And writing only valuable count of bytes count we want to write to the output stream
+            int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
+            uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount);
+
+            Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
+            for (int i = 0; i < valuableBytesCount; i++)
+            {
+                emitBytes[i] = (byte)((packedBytes >> ((3 - i) * 8)) & 0xff);
+            }
+
+            // Flush remaining 'tail' bytes
+            this.target.Write(emitBytes, 0, valuableBytesCount);
+        }
     }
 }

From 775610d5a0221e11096bbe500adc5bd31d6cbe63 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 13 Sep 2021 00:35:29 +0300
Subject: [PATCH 34/56] Fixed tests, fixed compilation, added DHT marker
 decoding more meaningful exception messages, fixed invalid jpeg encoding

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 22 ++++++++-----------
 .../Formats/Jpeg/JpegDecoderCore.cs           |  4 ++--
 .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs  | 20 +++++------------
 3 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 08f676e40..3e6b0e5f4 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -561,13 +561,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// this method endianness dependent.
         /// </remarks>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void FlushToStream()
+        private void FlushToStream(int endIndex)
         {
             Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
 
             int writeIdx = 0;
             int startIndex = emitBytes.Length - 1;
-            int endIndex = this.emitWriteIndex * sizeof(uint);
 
             // Some platforms may fail to eliminate this if-else branching
             // Even if it happens - buffer is flushed in big packs,
@@ -621,28 +620,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
 
             this.target.Write(this.streamWriteBuffer, 0, writeIdx);
+        }
+
+        private void FlushToStream()
+        {
+            this.FlushToStream(this.emitWriteIndex * 4);
             this.emitWriteIndex = this.emitBuffer.Length;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private void FlushRemainingBytes()
         {
-            // Flush full 4-byte blocks
-            this.FlushToStream();
-
             // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits
             // And writing only valuable count of bytes count we want to write to the output stream
             int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
             uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount);
+            this.emitBuffer[--this.emitWriteIndex] = packedBytes;
 
-            Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
-            for (int i = 0; i < valuableBytesCount; i++)
-            {
-                emitBytes[i] = (byte)((packedBytes >> ((3 - i) * 8)) & 0xff);
-            }
-
-            // Flush remaining 'tail' bytes
-            this.target.Write(emitBytes, 0, valuableBytesCount);
+            // Flush cached bytes to the output stream with padding bits
+            this.FlushToStream((this.emitWriteIndex * 4) - 4 + valuableBytesCount);
         }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
index 024743ddb..a0f69bb7b 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
@@ -1071,13 +1071,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                     // Types 0..1 DC..AC
                     if (tableType > 1)
                     {
-                        JpegThrowHelper.ThrowInvalidImageContentException("Bad Huffman Table type.");
+                        JpegThrowHelper.ThrowInvalidImageContentException($"Bad huffman table type: {tableType}");
                     }
 
                     // Max tables of each type
                     if (tableIndex > 3)
                     {
-                        JpegThrowHelper.ThrowInvalidImageContentException("Bad Huffman Table index.");
+                        JpegThrowHelper.ThrowInvalidImageContentException($"Bad huffman table index: {tableIndex}");
                     }
 
                     stream.Read(huffmanDataSpan, 0, 16);
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 55d208c5a..b4d3769d7 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -2,9 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.Intrinsics;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
 #endif
@@ -121,24 +118,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             public void IDCT8x8_Avx(int seed)
             {
 #if SUPPORTS_RUNTIME_INTRINSICS
-                var skip = !Avx.IsSupported;
-#else
-                var skip = true;
-#endif
-
-                if (skip)
+                if (!Avx.IsSupported)
                 {
                     this.Output.WriteLine("No AVX present, skipping test!");
-                    return;
                 }
 
                 Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
+                Block8x8F srcBlock = default;
                 srcBlock.LoadFrom(src);
 
-                var destBlock = default(Block8x8F);
+                Block8x8F destBlock = default;
 
-                var expectedDest = new float[64];
+                float[] expectedDest = new float[64];
 
                 // reference, left part
                 ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
@@ -149,10 +140,11 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 // testee, whole 8x8
                 FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock);
 
-                var actualDest = new float[64];
+                float[] actualDest = new float[64];
                 destBlock.ScaledCopyTo(actualDest);
 
                 Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+#endif
             }
 
             [Theory]

From a7dada1d4d47260b1f82ba4df310d9698cf7542a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 13 Sep 2021 00:44:02 +0300
Subject: [PATCH 35/56] Fixed huffman lut summary

---
 .../Jpeg/Components/Encoder/HuffmanLut.cs     | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
index f563e74e0..44b39dfd7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
@@ -4,12 +4,26 @@
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     /// <summary>
-    /// TODO: THIS IS NO LONGER TRUE, INTERNAL REPRESENTATION WAS CHANGED AND THIS DOC SHOULD BE CHANGED TOO!!!
     /// A compiled look-up table representation of a huffmanSpec.
-    /// Each value maps to a int32 of which the 24 most significant bits hold the
-    /// codeword in bits and the 8 least significant bits hold the codeword size.
     /// The maximum codeword size is 16 bits.
     /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Each value maps to an int32 of which the 24 most significant bits hold the
+    /// codeword in bits and the 8 least significant bits hold the codeword size.
+    /// </para>
+    /// <para>
+    /// Code value occupies 24 most significant bits as integer value.
+    /// This value is shifted to the MSB position for performance reasons.
+    /// For example, decimal value 10 is stored like this:
+    /// <code>
+    /// MSB                                LSB
+    /// 1010 0000 00000000 00000000 | 00000100
+    /// </code>
+    /// This was done to eliminate extra binary shifts in the encoder.
+    /// The code length is stored unshifted as an 8-bit integer value in the 8 least significant bits.
+    /// </para>
+    /// </remarks>
     internal readonly struct HuffmanLut
     {
         /// <summary>

From 24bf7c111d9e7e3fbdae1c1f5002e0735bdddd20 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 13 Sep 2021 01:20:54 +0300
Subject: [PATCH 36/56] Restored sandbox

---
 .../Program.cs                                | 81 ++-----------------
 1 file changed, 8 insertions(+), 73 deletions(-)

diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
index 7f1817e5d..51d616fc7 100644
--- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
+++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs
@@ -1,10 +1,4 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
 using System;
-using System.Diagnostics;
-using System.IO;
-using SixLabors.ImageSharp.Formats.Jpeg;
 using SixLabors.ImageSharp.Tests.Formats.Jpg;
 using SixLabors.ImageSharp.Tests.PixelFormats.PixelOperations;
 using SixLabors.ImageSharp.Tests.ProfilingBenchmarks;
@@ -34,73 +28,14 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox
         /// </param>
         public static void Main(string[] args)
         {
-            BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio444);
-            BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio444);
-            BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio444);
-            BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCrRatio444);
-
-            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio420);
-            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio420);
-            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio420);
-            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCrRatio420);
-
-            //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance);
-            //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance);
-            //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance);
-            //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance);
-
-            //ReEncodeImage("snow_main", 100);
-            //ReEncodeImage("snow_main", 90);
-            //ReEncodeImage("snow_main", 75);
-            //ReEncodeImage("snow_main", 50);
-
-            Console.WriteLine("Done.");
-        }
-
-        const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg";
-
-        private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color)
-        {
-            string loadPath = String.Format(pathTemplate, fileName);
-
-            using var inputStream = new FileStream(loadPath, FileMode.Open);
-            using var saveStream = new MemoryStream();
-
-            var decoder = new JpegDecoder { IgnoreMetadata = true };
-            using Image img = decoder.Decode(Configuration.Default, inputStream);
-
-            var encoder = new JpegEncoder()
-            {
-                Quality = quality,
-                ColorType = color
-            };
-
-            Stopwatch sw = new Stopwatch();
-            sw.Start();
-            for (int i = 0; i < iterations; i++)
-            {
-                img.SaveAsJpeg(saveStream, encoder);
-                saveStream.Position = 0;
-            }
-            sw.Stop();
-
-            Console.WriteLine($"// Encoding q={quality} | color={color}\n" +
-                $"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n" +
-                $"// Average: {(double)sw.ElapsedMilliseconds / iterations}ms");
-        }
-
-        private static void ReEncodeImage(string fileName, int quality)
-        {
-            string loadPath = String.Format(pathTemplate, fileName);
-            using Image img = Image.Load(loadPath);
-
-            string savePath = String.Format(pathTemplate, $"q{quality}_test_{fileName}");
-            var encoder = new JpegEncoder()
-            {
-                Quality = quality,
-                ColorType = JpegColorType.YCbCrRatio444
-            };
-            img.SaveAsJpeg(savePath, encoder);
+            LoadResizeSaveParallelMemoryStress.Run();
+            // RunJpegEncoderProfilingTests();
+            // RunJpegColorProfilingTests();
+            // RunDecodeJpegProfilingTests();
+            // RunToVector4ProfilingTest();
+            // RunResizeProfilingTest();
+
+            // Console.ReadLine();
         }
 
         private static void RunJpegEncoderProfilingTests()

From 4fd912b9dd84f6a5c8774f110d719f188488f55f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 13 Sep 2021 09:21:35 +0300
Subject: [PATCH 37/56] Fixed Ssse3 zig-zag implementation

---
 .../Formats/Jpeg/Components/Block8x8F.cs      |   4 +-
 .../Jpeg/Components/ZigZag.Intrinsic.cs       | 228 ++++++++++--------
 .../Formats/Jpg/Block8x8FTests.cs             |  49 ++--
 .../Formats/Jpg/Utils/JpegFixture.cs          |  32 +++
 .../FeatureTesting/FeatureTestRunner.cs       |  46 ++++
 5 files changed, 241 insertions(+), 118 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index d93375f39..24177c556 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -414,12 +414,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             if (Avx2.IsSupported)
             {
                 MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
-                ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest);
+                ZigZag.ApplyZigZagOrderingAvx(ref dest);
             }
             else if (Ssse3.IsSupported)
             {
                 MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
-                ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest);
+                ZigZag.ApplyZigZagOrderingSse(ref dest);
             }
             else
 #endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index abe02d040..eb15c8b55 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -21,6 +21,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
         /// zig zag implementation.
         /// </summary>
+        private static ReadOnlySpan<byte> SseShuffleMasks1 => new byte[]
+        {
+            // row0
+            0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
+            _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
+            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
+
+            // row1
+            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
+            2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
+            _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
+
+            // row2
+            _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
+            _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
+
+            // row3
+            _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
+            _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
+            _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
+            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
+
+            // row4
+            _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
+            _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
+            _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
+
+            // row5
+            _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
+            10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
+
+            // row6
+            _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
+            _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
+            4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
+
+            // row7
+            10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
+            _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
+        };
+
         private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
         {
             // row0
@@ -56,7 +97,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             // row4
             // E F G H
-            // 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
+            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
             _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
             _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
             _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
@@ -152,112 +193,99 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <remarks>
         /// Requires Ssse3 support.
         /// </remarks>
-        /// <param name="source">Input matrix.</param>
+        /// <param name="block">Input matrix.</param>
         /// <param name="dest">Matrix to store the result. Can be a reference to input matrix.</param>
-        public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 source, ref Block8x8 dest)
+        public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
 
-            fixed (byte* maskPtr = SseShuffleMasks)
+            fixed (byte* maskPtr = SseShuffleMasks1)
             {
-                Vector128<byte> rowA = source.V0.AsByte();
-                Vector128<byte> rowB = source.V1.AsByte();
-                Vector128<byte> rowC = source.V2.AsByte();
-                Vector128<byte> rowD = source.V3.AsByte();
-                Vector128<byte> rowE = source.V4.AsByte();
-                Vector128<byte> rowF = source.V5.AsByte();
-                Vector128<byte> rowG = source.V6.AsByte();
-                Vector128<byte> rowH = source.V7.AsByte();
-
-                // row0
-                Vector128<short> row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
-                Vector128<short> row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
-                Vector128<short> row0 = Sse2.Or(row0A, row0B);
-                Vector128<short> row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
-                row0 = Sse2.Or(row0, row0C);
-
-                // row1
-                Vector128<short> row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
-                Vector128<short> row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
-                Vector128<short> row1 = Sse2.Or(row1A, row1B);
-                Vector128<short> row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1C);
-                Vector128<short> row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1D);
-                Vector128<short> row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
-                row1 = Sse2.Or(row1, row1E);
+                Vector128<byte> rowA = block.V0.AsByte();
+                Vector128<byte> rowB = block.V1.AsByte();
+                Vector128<byte> rowC = block.V2.AsByte();
+                Vector128<byte> rowD = block.V3.AsByte();
+                Vector128<byte> rowE = block.V4.AsByte();
+                Vector128<byte> rowF = block.V5.AsByte();
+                Vector128<byte> rowG = block.V6.AsByte();
+                Vector128<byte> rowH = block.V7.AsByte();
+
+                // row0 - A0  A1  B0  C0  B1  A2  A3  B2
+                Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
+                Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
+                Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
+                Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
+                row0 = Sse2.Or(row0, rowC0);
+
+                // row1 - C1  D0  E0  D1  C2  B3  A4  A5
+                Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
+                Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
+                Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
+                Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
+                row1 = Sse2.Or(row1, rowD1);
+                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
+                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();
 
                 // row2
-                Vector128<short> row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
-                Vector128<short> row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
-                Vector128<short> row2 = Sse2.Or(row2B, row2C);
-                Vector128<short> row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2D);
-                Vector128<short> row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2E);
-                Vector128<short> row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2F);
-                Vector128<short> row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
-                row2 = Sse2.Or(row2, row2G);
+                Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
+                Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
+                Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
+                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
+                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
+                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
+                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();
 
                 // row3
-                Vector128<short> row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
-                Vector128<short> row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
-                Vector128<short> row3 = Sse2.Or(row3A, row3B);
-                Vector128<short> row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
-                row3 = Sse2.Or(row3, row3C);
-                Vector128<byte> row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
-                Vector128<short> row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16();
-                row3 = Sse2.Or(row3, row3D);
+                Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16().AsInt16();
+                Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16().AsInt16();
+                Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
+                Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
+                row3 = Sse2.Or(row3, rowC3);
+                Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
+                Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
+                row3 = Sse2.Or(row3, rowD3);
 
                 // row4
-                Vector128<short> row4E = Ssse3.Shuffle(rowE, row3D_row4E_shuffleMask).AsInt16();
-                Vector128<short> row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
-                Vector128<short> row4 = Sse2.Or(row4E, row4F);
-                Vector128<short> row4G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
-                row4 = Sse2.Or(row4, row4G);
-                Vector128<short> row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
-                row4 = Sse2.Or(row4, row4H);
+                Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
+                Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
+                Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
+                Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
+                row4 = Sse2.Or(row4, rowG4);
+                Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
+                row4 = Sse2.Or(row4, rowH4);
 
                 // row5
-                Vector128<short> row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
-                Vector128<short> row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
-                Vector128<short> row5 = Sse2.Or(row5B, row5C);
-                Vector128<short> row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, row5D);
-                Vector128<short> row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, row5E);
-                Vector128<short> row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, row5F);
-                Vector128<short> row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
-                row5 = Sse2.Or(row5, row5G);
+                Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
+                Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
+                Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
+                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
+                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
+                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
+                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();
 
                 // row6
-                Vector128<short> row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
-                Vector128<short> row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
-                Vector128<short> row6 = Sse2.Or(row6D, row6E);
-                Vector128<short> row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, row6F);
-                Vector128<short> row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, row6G);
-                Vector128<short> row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
-                row6 = Sse2.Or(row6, row6H);
+                Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
+                Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
+                Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
+                Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
+                row6 = Sse2.Or(row6, rowH6);
+                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
+                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();
 
                 // row7
-                Vector128<short> row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
-                Vector128<short> row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
-                Vector128<short> row7 = Sse2.Or(row7F, row7G);
-                Vector128<short> row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16();
-                row7 = Sse2.Or(row7, row7H);
-
-                dest.V0 = row0;
-                dest.V1 = row1;
-                dest.V2 = row2;
-                dest.V3 = row3;
-                dest.V4 = row4;
-                dest.V5 = row5;
-                dest.V6 = row6;
-                dest.V7 = row7;
+                Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
+                Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
+                Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
+                row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();
+
+                block.V0 = row0;
+                block.V1 = row1;
+                block.V2 = row2;
+                block.V3 = row3;
+                block.V4 = row4;
+                block.V5 = row5;
+                block.V6 = row6;
+                block.V7 = row7;
             }
         }
 
@@ -267,18 +295,18 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <remarks>
         /// Requires Avx2 support.
         /// </remarks>
-        /// <param name="source">Input matrix.</param>
+        /// <param name="block">Input matrix.</param>
         /// <param name="dest">Matrix to store the result. Can be a reference to input matrix.</param>
-        public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 source, ref Block8x8 dest)
+        public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
 
             fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
             {
-                Vector256<byte> rowsAB = source.V01.AsByte();
-                Vector256<byte> rowsCD = source.V23.AsByte();
-                Vector256<byte> rowsEF = source.V45.AsByte();
-                Vector256<byte> rowsGH = source.V67.AsByte();
+                Vector256<byte> rowsAB = block.V01.AsByte();
+                Vector256<byte> rowsCD = block.V23.AsByte();
+                Vector256<byte> rowsEF = block.V45.AsByte();
+                Vector256<byte> rowsGH = block.V67.AsByte();
 
                 // rows 0 1
                 Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
@@ -333,10 +361,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
                 Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
 
-                dest.V01 = row01.AsInt16();
-                dest.V23 = row23.AsInt16();
-                dest.V45 = row45.AsInt16();
-                dest.V67 = row67.AsInt16();
+                block.V01 = row01.AsInt16();
+                block.V23 = row23.AsInt16();
+                block.V45 = row45.AsInt16();
+                block.V67 = row67.AsInt16();
             }
         }
     }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index 89ef74d8b..40e42acb3 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -4,7 +4,9 @@
 // Uncomment this to turn unit tests into benchmarks:
 // #define BENCHMARKING
 using System;
-using System.Diagnostics;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics.X86;
+#endif
 
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
@@ -247,30 +249,45 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             this.CompareBlocks(expected, actual, 0);
         }
 
-        // TODO: intrinsic tests
         [Theory]
         [InlineData(1, 2)]
         [InlineData(2, 1)]
         public void Quantize(int srcSeed, int qtSeed)
         {
-            Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
-            Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed);
+            static void RunTest(string srcSeedSerialized, string qtSeedSerialized)
+            {
+                int srcSeed = FeatureTestRunner.Deserialize<int>(srcSeedSerialized);
+                int qtSeed = FeatureTestRunner.Deserialize<int>(qtSeedSerialized);
 
-            // Reference implementation quantizes given block via division
-            Block8x8 expected = default;
-            ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
+                Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
 
-            // Actual current implementation quantizes given block via multiplication
-            // With quantization table reciprocal
-            for (int i = 0; i < Block8x8F.Size; i++)
-            {
-                quant[i] = 1f / quant[i];
-            }
+                // Quantization code is used only in jpeg where it's guaranteed that
+                // quantization values are greater than 1.
+                // Quantize method supports negative numbers, but very small numbers can cause trouble.
+                Block8x8F quant = CreateRandomFloatBlock(1, 2000, qtSeed);
+
+                // Reference implementation quantizes given block via division
+                Block8x8 expected = default;
+                ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
+
+                // Actual current implementation quantizes given block via multiplication
+                // With quantization table reciprocal
+                for (int i = 0; i < Block8x8F.Size; i++)
+                {
+                    quant[i] = 1f / quant[i];
+                }
 
-            Block8x8 actual = default;
-            Block8x8F.Quantize(ref source, ref actual, ref quant);
+                Block8x8 actual = default;
+                Block8x8F.Quantize(ref source, ref actual, ref quant);
 
-            this.CompareBlocks(expected, actual, 1);
+                Assert.True(CompareBlocks(expected, actual, 1, out int diff), $"Blocks are not equal, diff={diff}");
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                srcSeed,
+                qtSeed,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE);
         }
 
         [Fact]
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
index ccb7f6f1e..1cf9bc4ae 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
@@ -190,6 +190,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
             Assert.False(failed);
         }
 
+        internal static bool CompareBlocks(Block8x8 a, Block8x8 b, int tolerance, out int diff)
+        {
+            bool res = CompareBlocks(a.AsFloatBlock(), b.AsFloatBlock(), tolerance + 1e-5f, out float fdiff);
+            diff = (int)fdiff;
+            return res;
+        }
+
+        internal static bool CompareBlocks(Block8x8F a, Block8x8F b, float tolerance, out float diff) =>
+            CompareBlocks(a.ToArray(), b.ToArray(), tolerance, out diff);
+
+        internal static bool CompareBlocks(Span<float> a, Span<float> b, float tolerance, out float diff)
+        {
+            var comparer = new ApproximateFloatComparer(tolerance);
+            bool failed = false;
+
+            diff = 0;
+
+            for (int i = 0; i < 64; i++)
+            {
+                float expected = a[i];
+                float actual = b[i];
+                diff += Math.Abs(expected - actual);
+
+                if (!comparer.Equals(expected, actual))
+                {
+                    failed = true;
+                }
+            }
+
+            return !failed;
+        }
+
         internal static JpegDecoderCore ParseJpegStream(string testFileName, bool metaDataOnly = false)
         {
             byte[] bytes = TestFile.Create(testFileName).Bytes;
diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
index fa0f02ca1..0d2f3fcef 100644
--- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
@@ -301,6 +301,52 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
             }
         }
 
+        /// <summary>
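+        /// Runs the given test <paramref name="action"/> once for each of the given <paramref name="intrinsics"/> features,
+        /// using a separate process with that feature switched off for every flag except <see cref="HwIntrinsics.AllowAll"/>.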
+        /// </summary>
+        /// <param name="action">The test action to run.</param>
+        /// <param name="arg0">The value to pass as a parameter #0 to the test action.</param>
+        /// <param name="arg1">The value to pass as a parameter #1 to the test action.</param>
+        /// <param name="intrinsics">The intrinsics features.</param>
+        public static void RunWithHwIntrinsicsFeature<T>(
+            Action<string, string> action,
+            T arg0,
+            T arg1,
+            HwIntrinsics intrinsics)
+            where T : IConvertible
+        {
+            if (!RemoteExecutor.IsSupported)
+            {
+                return;
+            }
+
+            foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection())
+            {
+                var processStartInfo = new ProcessStartInfo();
+                if (intrinsic.Key != HwIntrinsics.AllowAll)
+                {
+                    processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0";
+
+                    RemoteExecutor.Invoke(
+                        action,
+                        arg0.ToString(),
+                        arg1.ToString(),
+                        new RemoteInvokeOptions
+                        {
+                            StartInfo = processStartInfo
+                        })
+                        .Dispose();
+                }
+                else
+                {
+                    // Since we are running using the default architecture there is no
+                    // point creating the overhead of running the action in a separate process.
+                    action(arg0.ToString(), arg1.ToString());
+                }
+            }
+        }
+
         internal static Dictionary<HwIntrinsics, string> ToFeatureKeyValueCollection(this HwIntrinsics intrinsics)
         {
             // Loop through and translate the given values into COMPlus equivaluents

From 8cd4c9724c79645a074ee250f2d1739b7464520b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 13 Sep 2021 09:31:32 +0300
Subject: [PATCH 38/56] Removed debug ssse3 zig-zag shuffle table

---
 .../Jpeg/Components/ZigZag.Intrinsic.cs       | 66 +------------------
 1 file changed, 1 insertion(+), 65 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index eb15c8b55..01a00180a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -21,107 +21,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
         /// zig zag implementation.
         /// </summary>
-        private static ReadOnlySpan<byte> SseShuffleMasks1 => new byte[]
-        {
-            // row0
-            0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
-            _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
-            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
-
-            // row1
-            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
-            2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
-            _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
-
-            // row2
-            _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
-            _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
-
-            // row3
-            _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
-            _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
-            _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
-            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
-
-            // row4
-            _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
-            _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
-            _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
-
-            // row5
-            _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
-            10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
-
-            // row6
-            _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
-            _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
-            4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
-
-            // row7
-            10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
-            _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
-        };
-
         private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
         {
             // row0
-            // A B C
             0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
             _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
             _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
 
             // row1
-            // A B C D E
             _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
-            _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _,
             2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
             _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
-            _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
 
             // row2
-            // B C D E F G
-            8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
-            _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
-            _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _,
             _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
             _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
-            _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _,
 
             // row3
-            // A B C D
-            // D shuffle mask is the for row4 E row shuffle mask
             _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
             _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
             _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
             6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
 
             // row4
-            // E F G H
-            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
             _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
             _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
             _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
 
             // row5
-            // B C D E F G
-            _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
             _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
             10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
-            _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _,
-            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _,
-            _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
 
             // row6
-            // D E F G H
-            _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _,
             _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
             _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
-            _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _,
             4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
 
             // row7
-            // F G H
-            _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _,
             10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
             _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
         };
@@ -199,7 +135,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         {
             DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
 
-            fixed (byte* maskPtr = SseShuffleMasks1)
+            fixed (byte* maskPtr = SseShuffleMasks)
             {
                 Vector128<byte> rowA = block.V0.AsByte();
                 Vector128<byte> rowB = block.V1.AsByte();

From c6c9f2beefba2f21c609328930b42a06be86be42 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 13 Sep 2021 09:38:26 +0300
Subject: [PATCH 39/56] Fixed docs

---
 src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index 01a00180a..6fa776e2a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -130,7 +130,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Requires Ssse3 support.
         /// </remarks>
         /// <param name="block">Input matrix.</param>
-        /// <param name="dest">Matrix to store the result. Can be a reference to input matrix.</param>
         public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
@@ -232,7 +231,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Requires Avx2 support.
         /// </remarks>
         /// <param name="block">Input matrix.</param>
-        /// <param name="dest">Matrix to store the result. Can be a reference to input matrix.</param>
         public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

From 6b3f0f7bd9d838b47b97aa676cbd6b3253dabb14 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 14 Sep 2021 01:12:39 +0300
Subject: [PATCH 40/56] gfoidl fixes

---
 .../Formats/Jpeg/Components/Block8x8.cs          | 10 +++++-----
 .../Jpeg/Components/Block8x8F.Intrinsic.cs       |  6 +++---
 .../Formats/Jpeg/Components/Block8x8F.cs         |  4 ++--
 .../Components/Encoder/HuffmanScanEncoder.cs     |  9 +++++----
 .../Jpeg/Components/FastFloatingPointDCT.cs      |  4 ++--
 .../Formats/Jpeg/Components/ZigZag.Intrinsic.cs  | 14 ++++----------
 .../Formats/Jpg/Block8x8Tests.cs                 | 16 ++++++++--------
 7 files changed, 29 insertions(+), 34 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index 71077675d..9cefedc1d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -172,7 +172,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
         public static Block8x8 Load(Span<short> data)
         {
-            Block8x8 result = default;
+            Unsafe.SkipInit(out Block8x8 result);
             result.LoadFrom(data);
             return result;
         }
@@ -204,7 +204,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         {
             ref byte selfRef = ref Unsafe.As<Block8x8, byte>(ref this);
             ref byte destRef = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<short, byte>(destination));
-            Unsafe.CopyBlock(ref destRef, ref selfRef, Size * sizeof(short));
+            Unsafe.CopyBlockUnaligned(ref destRef, ref selfRef, Size * sizeof(short));
         }
 
         /// <summary>
@@ -287,7 +287,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Index of the last non-zero element. Returns -1 if all elements are equal to zero.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public int GetLastNonZeroIndex()
+        public nint GetLastNonZeroIndex()
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx2.IsSupported)
@@ -298,7 +298,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
                 ref Vector256<short> mcuStride = ref Unsafe.As<Block8x8, Vector256<short>>(ref this);
 
-                for (int i = 3; i >= 0; i--)
+                for (nint i = 3; i >= 0; i--)
                 {
                     int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i), zero16).AsByte());
 
@@ -325,7 +325,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             else
 #endif
             {
-                int index = Size - 1;
+                nint index = Size - 1;
                 ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
 
                 while (index >= 0 && Unsafe.Add(ref elemRef, index) == 0)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
index 733d32892..e78802472 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -46,7 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             ref Vector256<short> destRef = ref dest.V01;
 
-            for (int i = 0; i < 8; i += 2)
+            for (nint i = 0; i < 8; i += 2)
             {
                 Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
                 Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
@@ -54,7 +54,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
                 row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();
 
-                Unsafe.Add(ref destRef, i / 2) = row;
+                Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row;
             }
         }
 
@@ -73,7 +73,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
 
                 Vector128<short> row = Sse2.PackSignedSaturate(left, right);
-                Unsafe.Add(ref destBase, i / 2) = row;
+                Unsafe.Add(ref destBase, (IntPtr)((uint)i / 2)) = row;
             }
         }
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 24177c556..986af3417 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -414,12 +414,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             if (Avx2.IsSupported)
             {
                 MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
-                ZigZag.ApplyZigZagOrderingAvx(ref dest);
+                ZigZag.ApplyZigZagOrderingAvx2(ref dest);
             }
             else if (Ssse3.IsSupported)
             {
                 MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
-                ZigZag.ApplyZigZagOrderingSse(ref dest);
+                ZigZag.ApplyZigZagOrderingSsse3(ref dest);
             }
             else
 #endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 3e6b0e5f4..35e0e2648 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -115,7 +115,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private bool IsFlushNeeded
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            get => this.emitWriteIndex < this.emitBuffer.Length / 2;
+            get => this.emitWriteIndex < (uint)this.emitBuffer.Length / 2;
         }
 
         /// <summary>
@@ -408,15 +408,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // Emit the AC components.
             int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;
 
-            int lastValuableIndex = spectralBlock.GetLastNonZeroIndex();
+            nint lastValuableIndex = spectralBlock.GetLastNonZeroIndex();
 
             int runLength = 0;
-            for (int zig = 1; zig <= lastValuableIndex; zig++)
+            ref short blockRef = ref Unsafe.As<Block8x8, short>(ref spectralBlock);
+            for (nint zig = 1; zig <= lastValuableIndex; zig++)
             {
                 const int zeroRun1 = 1 << 4;
                 const int zeroRun16 = 16 << 4;
 
-                int ac = spectralBlock[zig];
+                int ac = Unsafe.Add(ref blockRef, zig);
                 if (ac == 0)
                 {
                     runLength += zeroRun1;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 1c5cfc8d6..4f7db7c59 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -68,7 +68,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Values are also scaled by 8 so DCT code won't do unnecessary division.
         /// </para>
         /// </remarks>
-        public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
+        public static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
         {
             0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
             0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
@@ -104,7 +104,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         public static void TransformFDCT(ref Block8x8F block)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported || Sse.IsSupported)
+            if (Sse.IsSupported)
             {
                 ForwardTransformSimd(ref block);
             }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index 6fa776e2a..6577739c1 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 #pragma warning restore SA1309
 
         /// <summary>
-        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
+        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
         /// zig zag implementation.
         /// </summary>
         private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
@@ -63,7 +63,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         };
 
         /// <summary>
-        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx"/>
+        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
         /// zig zag implementation.
         /// </summary>
         private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
@@ -126,11 +126,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <summary>
         /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
         /// </summary>
-        /// <remarks>
-        /// Requires Ssse3 support.
-        /// </remarks>
         /// <param name="block">Input matrix.</param>
-        public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 block)
+        public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
 
@@ -227,11 +224,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <summary>
         /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
         /// </summary>
-        /// <remarks>
-        /// Requires Avx2 support.
-        /// </remarks>
         /// <param name="block">Input matrix.</param>
-        public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 block)
+        public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
 
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
index 69375ae1b..3737cce80 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@@ -130,9 +130,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             {
                 Block8x8 data = default;
 
-                int expected = -1;
+                nint expected = -1;
 
-                int actual = data.GetLastNonZeroIndex();
+                nint actual = data.GetLastNonZeroIndex();
 
                 Assert.Equal(expected, actual);
             }
@@ -153,9 +153,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     data[i] = 10;
                 }
 
-                int expected = Block8x8.Size - 1;
+                nint expected = Block8x8.Size - 1;
 
-                int actual = data.GetLastNonZeroIndex();
+                nint actual = data.GetLastNonZeroIndex();
 
                 Assert.Equal(expected, actual);
             }
@@ -182,9 +182,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     int setIndex = rng.Next(1, Block8x8.Size);
                     data[setIndex] = (short)rng.Next(-2000, 2000);
 
-                    int expected = setIndex;
+                    nint expected = setIndex;
 
-                    int actual = data.GetLastNonZeroIndex();
+                    nint actual = data.GetLastNonZeroIndex();
 
                     Assert.Equal(expected, actual);
                 }
@@ -219,7 +219,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = lastIndex;
 
-                    int actual = data.GetLastNonZeroIndex();
+                    nint actual = data.GetLastNonZeroIndex();
 
                     Assert.Equal(expected, actual);
                 }
@@ -265,7 +265,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                     int expected = secondChunkEnd;
 
-                    int actual = data.GetLastNonZeroIndex();
+                    nint actual = data.GetLastNonZeroIndex();
 
                     Assert.True(expected == actual, $"Expected: {expected}\nActual: {actual}\nInput matrix: {data}");
                 }

From d934bad69e554517df55c204a7e7f482f58ddef4 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 17 Sep 2021 05:01:12 +0300
Subject: [PATCH 41/56] gfoidl fixes

---
 src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index 9cefedc1d..9d49b8c45 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -225,10 +225,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         [MethodImpl(InliningOptions.ShortMethod)]
         public void LoadFrom(Span<short> source)
         {
-            ref byte s = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(source));
-            ref byte d = ref Unsafe.As<Block8x8, byte>(ref this);
+            ref byte sourceRef = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(source));
+            ref byte destRef = ref Unsafe.As<Block8x8, byte>(ref this);
 
-            Unsafe.CopyBlock(ref d, ref s, Size * sizeof(short));
+            Unsafe.CopyBlockUnaligned(ref destRef, ref sourceRef, Size * sizeof(short));
         }
 
         /// <summary>

From f7bc8d77479781899924afa2f28773fe61ec48ce Mon Sep 17 00:00:00 2001
From: Gerard Gunnewijk <gerard.gunnewijk@live.nl>
Date: Wed, 22 Sep 2021 18:23:07 +0200
Subject: [PATCH 42/56] Added test image & test method

---
 ImageSharp.sln                                 |  4 ++--
 .../Formats/Png/PngDecoderTests.cs             | 18 ++++++++++++++++++
 tests/ImageSharp.Tests/TestImages.cs           |  3 +++
 tests/Images/Input/Png/issues/Issue_1765.png   |  3 +++
 4 files changed, 26 insertions(+), 2 deletions(-)
 create mode 100644 tests/Images/Input/Png/issues/Issue_1765.png

diff --git a/ImageSharp.sln b/ImageSharp.sln
index bf1f3579c..c71ec11d7 100644
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@@ -1,6 +1,6 @@
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 16
-VisualStudioVersion = 16.0.28902.138
+# Visual Studio Version 17
+VisualStudioVersion = 17.0.31710.8
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1-D75E-4C6D-83EB-80367343E0D7}"
 	ProjectSection(SolutionItems) = preProject
diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
index 9832aeb7b..a517c4a4a 100644
--- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
@@ -368,6 +368,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png
             Assert.Null(ex);
         }
 
+        // https://github.com/SixLabors/ImageSharp/issues/1765
+        [Theory]
+        [WithFile(TestImages.Png.Issue1765, PixelTypes.Rgba32)]
+        public void Issue1765<TPixel>(TestImageProvider<TPixel> provider)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            System.Exception ex = Record.Exception(
+                () =>
+                {
+                    using (Image<TPixel> image = provider.GetImage(PngDecoder))
+                    {
+                        image.DebugSave(provider);
+                        image.CompareToOriginal(provider, ImageComparer.Exact);
+                    }
+                });
+            Assert.Null(ex);
+        }
+
         // https://github.com/SixLabors/ImageSharp/issues/410
         [Theory]
         [WithFile(TestImages.Png.Bad.Issue410_MalformedApplePng, PixelTypes.Rgba32)]
diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs
index d1a6624af..ee85029ce 100644
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@@ -111,6 +111,9 @@ namespace SixLabors.ImageSharp.Tests
             // Issue 935: https://github.com/SixLabors/ImageSharp/issues/935
             public const string Issue935 = "Png/issues/Issue_935.png";
 
+            // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765
+            public const string Issue1765 = "png/issues/Issue_1765.png";
+
             public static class Bad
             {
                 public const string MissingDataChunk = "Png/xdtn0g01.png";
diff --git a/tests/Images/Input/Png/issues/Issue_1765.png b/tests/Images/Input/Png/issues/Issue_1765.png
new file mode 100644
index 000000000..c9705550f
--- /dev/null
+++ b/tests/Images/Input/Png/issues/Issue_1765.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86ea14567bcd259d76dc782ee366c23a5755714c6d48f636524b23e75b89e5b6
+size 775275

From c1e8c15b88296c01d4c2976c949b4442b7bb73ad Mon Sep 17 00:00:00 2001
From: Gerard Gunnewijk <gerard.gunnewijk@live.nl>
Date: Wed, 22 Sep 2021 18:23:07 +0200
Subject: [PATCH 43/56] Added test image & test method

---
 ImageSharp.sln                                 |  5 +++--
 .../Formats/Png/PngDecoderTests.cs             | 18 ++++++++++++++++++
 tests/ImageSharp.Tests/TestImages.cs           |  3 +++
 tests/Images/Input/Png/issues/Issue_1765.png   |  3 +++
 4 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 tests/Images/Input/Png/issues/Issue_1765.png

diff --git a/ImageSharp.sln b/ImageSharp.sln
index bf1f3579c..b6f3b5a0f 100644
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@@ -1,6 +1,6 @@
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 16
-VisualStudioVersion = 16.0.28902.138
+# Visual Studio Version 17
+VisualStudioVersion = 17.0.31710.8
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1-D75E-4C6D-83EB-80367343E0D7}"
 	ProjectSection(SolutionItems) = preProject
@@ -403,6 +403,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "issues", "issues", "{670DD4
 		tests\Images\Input\Png\issues\Issue_1127.png = tests\Images\Input\Png\issues\Issue_1127.png
 		tests\Images\Input\Png\issues\Issue_1177_1.png = tests\Images\Input\Png\issues\Issue_1177_1.png
 		tests\Images\Input\Png\issues\Issue_1177_2.png = tests\Images\Input\Png\issues\Issue_1177_2.png
+		tests\Images\Input\Png\issues\Issue_1765.png = tests\Images\Input\Png\issues\Issue_1765.png
 		tests\Images\Input\Png\issues\Issue_410.png = tests\Images\Input\Png\issues\Issue_410.png
 		tests\Images\Input\Png\issues\Issue_935.png = tests\Images\Input\Png\issues\Issue_935.png
 	EndProjectSection
diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
index 9832aeb7b..a517c4a4a 100644
--- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
@@ -368,6 +368,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png
             Assert.Null(ex);
         }
 
+        // https://github.com/SixLabors/ImageSharp/issues/1765
+        [Theory]
+        [WithFile(TestImages.Png.Issue1765, PixelTypes.Rgba32)]
+        public void Issue1765<TPixel>(TestImageProvider<TPixel> provider)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            System.Exception ex = Record.Exception(
+                () =>
+                {
+                    using (Image<TPixel> image = provider.GetImage(PngDecoder))
+                    {
+                        image.DebugSave(provider);
+                        image.CompareToOriginal(provider, ImageComparer.Exact);
+                    }
+                });
+            Assert.Null(ex);
+        }
+
         // https://github.com/SixLabors/ImageSharp/issues/410
         [Theory]
         [WithFile(TestImages.Png.Bad.Issue410_MalformedApplePng, PixelTypes.Rgba32)]
diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs
index d1a6624af..ee85029ce 100644
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@@ -111,6 +111,9 @@ namespace SixLabors.ImageSharp.Tests
             // Issue 935: https://github.com/SixLabors/ImageSharp/issues/935
             public const string Issue935 = "Png/issues/Issue_935.png";
 
+            // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765
+            public const string Issue1765 = "png/issues/Issue_1765.png";
+
             public static class Bad
             {
                 public const string MissingDataChunk = "Png/xdtn0g01.png";
diff --git a/tests/Images/Input/Png/issues/Issue_1765.png b/tests/Images/Input/Png/issues/Issue_1765.png
new file mode 100644
index 000000000..c9705550f
--- /dev/null
+++ b/tests/Images/Input/Png/issues/Issue_1765.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86ea14567bcd259d76dc782ee366c23a5755714c6d48f636524b23e75b89e5b6
+size 775275

From 7b7ee4a9fb16e27b556061d87e38e64d5f583758 Mon Sep 17 00:00:00 2001
From: Gerard Gunnewijk <gerard.gunnewijk@live.nl>
Date: Wed, 22 Sep 2021 18:27:41 +0200
Subject: [PATCH 44/56] Reverted sln version change

---
 ImageSharp.sln | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ImageSharp.sln b/ImageSharp.sln
index b6f3b5a0f..6ae369f2d 100644
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@@ -1,6 +1,6 @@
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.0.31710.8
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.28902.138
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1-D75E-4C6D-83EB-80367343E0D7}"
 	ProjectSection(SolutionItems) = preProject

From c967e3653ad1b1f9dd1a54a713e7e0d6709c637d Mon Sep 17 00:00:00 2001
From: Gerard Gunnewijk <gerard.gunnewijk@live.nl>
Date: Wed, 22 Sep 2021 18:39:40 +0200
Subject: [PATCH 45/56] Was it a capital letter issue?

---
 tests/ImageSharp.Tests/TestImages.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs
index ee85029ce..fb6cc7b67 100644
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.Tests
             public const string Issue935 = "Png/issues/Issue_935.png";
 
             // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765
-            public const string Issue1765 = "png/issues/Issue_1765.png";
+            public const string Issue1765 = "Png/issues/Issue_1765.png";
 
             public static class Bad
             {

From b055e8b14bb9e75d8093fb99b6b2cba24b873495 Mon Sep 17 00:00:00 2001
From: Gerard Gunnewijk <gerard.gunnewijk@live.nl>
Date: Thu, 23 Sep 2021 17:02:55 +0200
Subject: [PATCH 46/56] Renamed the file and file reference

---
 ImageSharp.sln                                                  | 2 +-
 tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs           | 2 +-
 tests/ImageSharp.Tests/TestImages.cs                            | 2 +-
 .../{Issue_1765.png => Issue_1765_Net6DeflateStreamRead.png}    | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename tests/Images/Input/Png/issues/{Issue_1765.png => Issue_1765_Net6DeflateStreamRead.png} (100%)

diff --git a/ImageSharp.sln b/ImageSharp.sln
index 6ae369f2d..c433d22f5 100644
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@@ -403,7 +403,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "issues", "issues", "{670DD4
 		tests\Images\Input\Png\issues\Issue_1127.png = tests\Images\Input\Png\issues\Issue_1127.png
 		tests\Images\Input\Png\issues\Issue_1177_1.png = tests\Images\Input\Png\issues\Issue_1177_1.png
 		tests\Images\Input\Png\issues\Issue_1177_2.png = tests\Images\Input\Png\issues\Issue_1177_2.png
-		tests\Images\Input\Png\issues\Issue_1765.png = tests\Images\Input\Png\issues\Issue_1765.png
+		tests\Images\Input\Png\issues\Issue_1765_Net6DeflateStreamRead.png = tests\Images\Input\Png\issues\Issue_1765_Net6DeflateStreamRead.png
 		tests\Images\Input\Png\issues\Issue_410.png = tests\Images\Input\Png\issues\Issue_410.png
 		tests\Images\Input\Png\issues\Issue_935.png = tests\Images\Input\Png\issues\Issue_935.png
 	EndProjectSection
diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
index a517c4a4a..9fc4d03dd 100644
--- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
@@ -370,7 +370,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png
 
         // https://github.com/SixLabors/ImageSharp/issues/1765
         [Theory]
-        [WithFile(TestImages.Png.Issue1765, PixelTypes.Rgba32)]
+        [WithFile(TestImages.Png.Issue1765_Net6DeflateStreamRead, PixelTypes.Rgba32)]
         public void Issue1765<TPixel>(TestImageProvider<TPixel> provider)
             where TPixel : unmanaged, IPixel<TPixel>
         {
diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs
index fb6cc7b67..b0a219711 100644
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.Tests
             public const string Issue935 = "Png/issues/Issue_935.png";
 
             // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765
-            public const string Issue1765 = "Png/issues/Issue_1765.png";
+            public const string Issue1765_Net6DeflateStreamRead = "Png/issues/Issue_1765_Net6DeflateStreamRead.png";
 
             public static class Bad
             {
diff --git a/tests/Images/Input/Png/issues/Issue_1765.png b/tests/Images/Input/Png/issues/Issue_1765_Net6DeflateStreamRead.png
similarity index 100%
rename from tests/Images/Input/Png/issues/Issue_1765.png
rename to tests/Images/Input/Png/issues/Issue_1765_Net6DeflateStreamRead.png

From 8d29205076b5bd7265e507a7d3c9a85ae5e410bc Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 23 Sep 2021 22:29:16 +0300
Subject: [PATCH 47/56] Updated encoder benchmark

---
 .../Codecs/Jpeg/EncodeJpeg.cs                 | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index 508b4b3b0..0e9bed1d9 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -111,24 +111,24 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 }
 
 /*
-BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042
 Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
-.NET Core SDK=6.0.100-preview.3.21202.5
-  [Host]     : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT
-  DefaultJob : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT
+.NET SDK=6.0.100-preview.3.21202.5
+  [Host]     : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
 
 
 |                      Method | Quality |     Mean |    Error |   StdDev | Ratio |
 |---------------------------- |-------- |---------:|---------:|---------:|------:|
-| 'System.Drawing Jpeg 4:2:0' |      75 | 29.41 ms | 0.108 ms | 0.096 ms |  1.00 |
-|     'ImageSharp Jpeg 4:2:0' |      75 | 26.30 ms | 0.131 ms | 0.109 ms |  0.89 |
-|     'ImageSharp Jpeg 4:4:4' |      75 | 36.70 ms | 0.303 ms | 0.269 ms |  1.25 |
+| 'System.Drawing Jpeg 4:2:0' |      75 | 30.04 ms | 0.540 ms | 0.479 ms |  1.00 |
+|     'ImageSharp Jpeg 4:2:0' |      75 | 19.32 ms | 0.290 ms | 0.257 ms |  0.64 |
+|     'ImageSharp Jpeg 4:4:4' |      75 | 26.76 ms | 0.332 ms | 0.294 ms |  0.89 |
 |                             |         |          |          |          |       |
-| 'System.Drawing Jpeg 4:2:0' |      90 | 32.67 ms | 0.226 ms | 0.211 ms |  1.00 |
-|     'ImageSharp Jpeg 4:2:0' |      90 | 33.56 ms | 0.237 ms | 0.222 ms |  1.03 |
-|     'ImageSharp Jpeg 4:4:4' |      90 | 44.82 ms | 0.250 ms | 0.234 ms |  1.37 |
+| 'System.Drawing Jpeg 4:2:0' |      90 | 32.82 ms | 0.184 ms | 0.163 ms |  1.00 |
+|     'ImageSharp Jpeg 4:2:0' |      90 | 25.00 ms | 0.408 ms | 0.361 ms |  0.76 |
+|     'ImageSharp Jpeg 4:4:4' |      90 | 31.83 ms | 0.636 ms | 0.595 ms |  0.97 |
 |                             |         |          |          |          |       |
-| 'System.Drawing Jpeg 4:2:0' |     100 | 39.06 ms | 0.233 ms | 0.218 ms |  1.00 |
-|     'ImageSharp Jpeg 4:2:0' |     100 | 40.23 ms | 0.225 ms | 0.277 ms |  1.03 |
-|     'ImageSharp Jpeg 4:4:4' |     100 | 63.35 ms | 0.486 ms | 0.431 ms |  1.62 |
+| 'System.Drawing Jpeg 4:2:0' |     100 | 39.30 ms | 0.359 ms | 0.318 ms |  1.00 |
+|     'ImageSharp Jpeg 4:2:0' |     100 | 34.49 ms | 0.265 ms | 0.235 ms |  0.88 |
+|     'ImageSharp Jpeg 4:4:4' |     100 | 56.40 ms | 0.565 ms | 0.501 ms |  1.44 |
 */

From 6532552b6b8041a7b33f0392476014da29da1208 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 28 Sep 2021 18:50:50 +0300
Subject: [PATCH 48/56] Rename transpose/FDCT SIMD methods & drop redundant Sse.IsSupported check

---
 .../Jpeg/Components/Block8x8F.Intrinsic.cs    |  2 +-
 .../Formats/Jpeg/Components/Block8x8F.cs      | 10 ++++----
 .../FastFloatingPointDCT.Intrinsic.cs         | 24 +++++++++----------
 .../Jpeg/Components/FastFloatingPointDCT.cs   |  4 ++--
 .../BlockOperations/Block8x8F_Transpose.cs    |  4 ++--
 .../Formats/Jpg/Block8x8FTests.cs             |  4 ++--
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
index e78802472..5a00ccd3d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
-        private void TransposeAvx()
+        private void Transpose_Avx()
         {
             // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
             Vector256<float> r0 = Avx.InsertVector128(
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 986af3417..1d2b19a7b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -612,25 +612,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Transpose the block inplace.
         /// </summary>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public void Transpose()
+        public void TransposeInplace()
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx.IsSupported)
             {
-                this.TransposeAvx();
+                this.Transpose_Avx();
             }
             else
 #endif
             {
-                this.TransposeScalar();
+                this.TransposeInplace_Scalar();
             }
         }
 
         /// <summary>
-        /// Scalar inplace transpose implementation for <see cref="Transpose"/>
+        /// Scalar inplace transpose implementation for <see cref="TransposeInplace"/>
         /// </summary>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void TransposeScalar()
+        private void TransposeInplace_Scalar()
         {
             float tmp;
             int horIndex, verIndex;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index 7a2b0a78c..0ebe9dbf9 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -45,33 +45,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation.");
 
             // First pass - process rows
-            block.Transpose();
+            block.TransposeInplace();
             if (Avx.IsSupported)
             {
-                FDCT8x8_avx(ref block);
+                FDCT8x8_Avx(ref block);
             }
-            else if (Sse.IsSupported)
+            else
             {
                 // Left part
-                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
+                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
 
                 // Right part
-                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
+                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
             }
 
             // Second pass - process columns
-            block.Transpose();
+            block.TransposeInplace();
             if (Avx.IsSupported)
             {
-                FDCT8x8_avx(ref block);
+                FDCT8x8_Avx(ref block);
             }
-            else if (Sse.IsSupported)
+            else
             {
                 // Left part
-                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
+                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
 
                 // Right part
-                FDCT8x4_sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
+                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
             }
         }
 
@@ -83,7 +83,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Must be called on both 8x4 matrix parts for the full FDCT transform.
         /// </remarks>
         /// <param name="blockRef">Input reference to the first </param>
-        public static void FDCT8x4_sse(ref Vector128<float> blockRef)
+        public static void FDCT8x4_Sse(ref Vector128<float> blockRef)
         {
             DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation.");
 
@@ -135,7 +135,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Requires Avx support.
         /// </remarks>
         /// <param name="block">Input matrix.</param>
-        public static void FDCT8x8_avx(ref Block8x8F block)
+        public static void FDCT8x8_Avx(ref Block8x8F block)
         {
             DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 4f7db7c59..51f29fd51 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -88,9 +88,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <param name="temp">Matrix to store temporal results.</param>
         public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
         {
-            block.Transpose();
+            block.TransposeInplace();
             IDCT8x8(ref block, ref temp);
-            temp.Transpose();
+            temp.TransposeInplace();
             IDCT8x8(ref temp, ref block);
 
             // TODO: This can be fused into quantization table step
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
index 28899b51e..f60121d33 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@@ -12,9 +12,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
         private Block8x8F source = Create8x8FloatData();
 
         [Benchmark]
-        public float TransposeInto()
+        public float TransposeInplace()
         {
-            this.source.Transpose();
+            this.source.TransposeInplace();
             return this.source[0];
         }
 
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index 40e42acb3..d01b4b501 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -166,7 +166,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         }
 
         [Fact]
-        public void Transpose()
+        public void TransposeInplace()
         {
             static void RunTest()
             {
@@ -176,7 +176,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 var block8x8 = default(Block8x8F);
                 block8x8.LoadFrom(Create8x8FloatData());
 
-                block8x8.Transpose();
+                block8x8.TransposeInplace();
 
                 float[] actual = new float[64];
                 block8x8.ScaledCopyTo(actual);

From 7831caab950e21d093b7eac8349ea6fd92d8ae2d Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 28 Sep 2021 18:59:51 +0300
Subject: [PATCH 49/56] DCT fixes: drop inner #if in IDCT8x8_Avx & make IDCT8x8 private

---
 .../Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs   | 2 --
 src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs  | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index 0ebe9dbf9..7d92c3468 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -188,7 +188,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <param name="d">Destination</param>
         public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
         {
-#if SUPPORTS_RUNTIME_INTRINSICS
             Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
 
             Vector256<float> my1 = s.V1;
@@ -236,7 +235,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             d.V5 = Avx.Subtract(my2, mb2);
             d.V3 = Avx.Add(my3, mb3);
             d.V4 = Avx.Subtract(my3, mb3);
-#endif
         }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 51f29fd51..985dac1bd 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -230,7 +230,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// </summary>
         /// <param name="s">Source</param>
         /// <param name="d">Destination</param>
-        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx.IsSupported)

From dce87fe2f8ffbd37ddf993e22aa83fc4dbefe69b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 28 Sep 2021 19:02:07 +0300
Subject: [PATCH 50/56] Rename Transpose_Avx to TransposeInplace_Avx

---
 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs | 2 +-
 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
index 5a00ccd3d..0971ccdca 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
-        private void Transpose_Avx()
+        private void TransposeInplace_Avx()
         {
             // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
             Vector256<float> r0 = Avx.InsertVector128(
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 1d2b19a7b..0bd20b441 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -617,7 +617,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx.IsSupported)
             {
-                this.Transpose_Avx();
+                this.TransposeInplace_Avx();
             }
             else
 #endif

From e4b32dbf28cabb982b14225db773ccf4110dec69 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 28 Sep 2021 21:54:08 +0300
Subject: [PATCH 51/56] Improved scalar transpose implementation

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 63 ++++++++++++++-----
 .../BlockOperations/Block8x8F_Transpose.cs    | 21 ++++---
 2 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 0bd20b441..02f5a1324 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -632,22 +632,55 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         [MethodImpl(InliningOptions.ShortMethod)]
         private void TransposeInplace_Scalar()
         {
-            float tmp;
-            int horIndex, verIndex;
-
-            // We don't care about the last row as it consists of a single element
-            // Which won't be swapped with anything
-            for (int i = 0; i < 7; i++)
+            ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this);
+
+            // row #0
+            Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8));
+            Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16));
+            Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24));
+            Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32));
+            Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40));
+            Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48));
+            Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56));
+
+            // row #1
+            Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17));
+            Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25));
+            Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33));
+            Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41));
+            Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49));
+            Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57));
+
+            // row #2
+            Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26));
+            Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34));
+            Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42));
+            Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50));
+            Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58));
+
+            // row #3
+            Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35));
+            Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43));
+            Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51));
+            Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59));
+
+            // row #4
+            Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44));
+            Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52));
+            Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60));
+
+            // row #5
+            Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53));
+            Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61));
+
+            // row #6
+            Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
+
+            static void Swap(ref float a, ref float b)
             {
-                // We don't care about the first element in each row as it's not swapped
-                for (int j = i + 1; j < 8; j++)
-                {
-                    horIndex = (i * 8) + j;
-                    verIndex = (j * 8) + i;
-                    tmp = this[horIndex];
-                    this[horIndex] = this[verIndex];
-                    this[verIndex] = tmp;
-                }
+                float tmp = a;
+                a = b;
+                b = tmp;
             }
         }
 
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
index f60121d33..c2efb517a 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@@ -35,15 +35,18 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
 }
 
 /*
-BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update)
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1237 (20H2/October2020Update)
 Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
 .NET SDK=6.0.100-preview.3.21202.5
-  [Host]          : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
-  AVX             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
-  No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
-
-|        Method |             Job |      Mean |     Error |    StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
-|-------------- |---------------- |----------:|----------:|----------:|------:|------:|------:|------:|----------:|
-| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns |  1.00 |     - |     - |     - |         - |
-| TransposeInto |             AVX |  8.613 ns | 0.0249 ns | 0.0208 ns |  0.44 |     - |     - |     - |         - |
+  [Host]             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  2. SSE             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  3. AVX             : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+
+Runtime=.NET Core 3.1
+
+|           Method |             Job |      Mean |     Error |    StdDev | Ratio |
+|----------------- |----------------:|----------:|----------:|----------:|------:|
+| TransposeInplace | No HwIntrinsics | 12.531 ns | 0.0637 ns | 0.0565 ns |  1.00 |
+| TransposeInplace |             AVX |  5.767 ns | 0.0529 ns | 0.0495 ns |  0.46 |
 */

From bd9f06f42be1d11df0b5080b04e52e577935aa26 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 28 Sep 2021 23:20:03 +0300
Subject: [PATCH 52/56] FDCT sse path via Vector4

---
 .../FastFloatingPointDCT.Intrinsic.cs         |  88 +----------
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 142 ++++++++++++++----
 2 files changed, 114 insertions(+), 116 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index 7d92c3468..f40ae6e87 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
         private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
 
-        private static readonly Vector128<float> mm128_F_0_7071 = Vector128.Create(0.707106781f);
-        private static readonly Vector128<float> mm128_F_0_3826 = Vector128.Create(0.382683433f);
-        private static readonly Vector128<float> mm128_F_0_5411 = Vector128.Create(0.541196100f);
-        private static readonly Vector128<float> mm128_F_1_3065 = Vector128.Create(1.306562965f);
-
         private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f);
         private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
         private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
@@ -40,92 +35,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Apply floating point FDCT inplace using simd operations.
         /// </summary>
         /// <param name="block">Input matrix.</param>
-        private static void ForwardTransformSimd(ref Block8x8F block)
+        private static void ForwardTransform_Avx(ref Block8x8F block)
         {
-            DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation.");
+            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
 
             // First pass - process rows
             block.TransposeInplace();
-            if (Avx.IsSupported)
-            {
-                FDCT8x8_Avx(ref block);
-            }
-            else
-            {
-                // Left part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
-
-                // Right part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
-            }
+            FDCT8x8_Avx(ref block);
 
             // Second pass - process columns
             block.TransposeInplace();
-            if (Avx.IsSupported)
-            {
-                FDCT8x8_Avx(ref block);
-            }
-            else
-            {
-                // Left part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
-
-                // Right part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
-            }
-        }
-
-        /// <summary>
-        /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix.
-        /// </summary>
-        /// <remarks>
-        /// Requires Sse support.
-        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
-        /// </remarks>
-        /// <param name="blockRef">Input reference to the first </param>
-        public static void FDCT8x4_Sse(ref Vector128<float> blockRef)
-        {
-            DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation.");
-
-            Vector128<float> tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
-            Vector128<float> tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
-            Vector128<float> tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
-            Vector128<float> tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
-            Vector128<float> tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
-            Vector128<float> tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
-            Vector128<float> tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
-            Vector128<float> tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
-
-            // Even part
-            Vector128<float> tmp10 = Sse.Add(tmp0, tmp3);
-            Vector128<float> tmp13 = Sse.Subtract(tmp0, tmp3);
-            Vector128<float> tmp11 = Sse.Add(tmp1, tmp2);
-            Vector128<float> tmp12 = Sse.Subtract(tmp1, tmp2);
-
-            Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11);
-            Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11);
-
-            Vector128<float> z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071);
-            Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1);
-            Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1);
-
-            // Odd part
-            tmp10 = Sse.Add(tmp4, tmp5);
-            tmp11 = Sse.Add(tmp5, tmp6);
-            tmp12 = Sse.Add(tmp6, tmp7);
-
-            Vector128<float> z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826);
-            Vector128<float> z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5);
-            Vector128<float> z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5);
-            Vector128<float> z3 = Sse.Multiply(tmp11, mm128_F_0_7071);
-
-            Vector128<float> z11 = Sse.Add(tmp7, z3);
-            Vector128<float> z13 = Sse.Subtract(tmp7, z3);
-
-            Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2);
-            Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2);
-            Unsafe.Add(ref blockRef, 2) = Sse.Add(z11, z4);
-            Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4);
+            FDCT8x8_Avx(ref block);
         }
 
         /// <summary>
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 985dac1bd..43f6b7a1f 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -18,30 +18,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     {
 #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
         private const float C_1_175876 = 1.175875602f;
-
         private const float C_1_961571 = -1.961570560f;
-
         private const float C_0_390181 = -0.390180644f;
-
         private const float C_0_899976 = -0.899976223f;
-
         private const float C_2_562915 = -2.562915447f;
-
         private const float C_0_298631 = 0.298631336f;
-
         private const float C_2_053120 = 2.053119869f;
-
         private const float C_3_072711 = 3.072711026f;
-
         private const float C_1_501321 = 1.501321110f;
-
         private const float C_0_541196 = 0.541196100f;
-
         private const float C_1_847759 = -1.847759065f;
-
         private const float C_0_765367 = 0.765366865f;
 
         private const float C_0_125 = 0.1250f;
+
+#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
+        private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
+        private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
+        private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
+        private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
+#pragma warning restore SA1311, IDE1006
+
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
 
         /// <summary>
@@ -80,23 +77,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
         };
 
-        /// <summary>
-        /// Apply floating point IDCT inplace.
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
-        /// </summary>
-        /// <param name="block">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
-        {
-            block.TransposeInplace();
-            IDCT8x8(ref block, ref temp);
-            temp.TransposeInplace();
-            IDCT8x8(ref temp, ref block);
-
-            // TODO: This can be fused into quantization table step
-            block.MultiplyInPlace(C_0_125);
-        }
-
         /// <summary>
         /// Apply 2D floating point FDCT inplace.
         /// </summary>
@@ -104,14 +84,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         public static void TransformFDCT(ref Block8x8F block)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
             {
-                ForwardTransformSimd(ref block);
+                ForwardTransform_Avx(ref block);
             }
             else
 #endif
+            if (Vector.IsHardwareAccelerated)
             {
-                ForwardTransformScalar(ref block);
+                ForwardTransform_Vector4(ref block);
+            }
+            else
+            {
+                ForwardTransform_Scalar(ref block);
             }
         }
 
@@ -122,7 +107,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
         /// </remarks>
         /// <param name="block">Input matrix.</param>
-        private static void ForwardTransformScalar(ref Block8x8F block)
+        private static void ForwardTransform_Scalar(ref Block8x8F block)
         {
             const int dctSize = 8;
 
@@ -225,6 +210,99 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
+        /// <summary>
+        /// Apply floating point FDCT inplace using <see cref="Vector4"/> API.
+        /// </summary>
+        /// <remarks>
+        /// This implementation must be called only if the hardware supports
+        /// vectors of 4 floating point numbers. Otherwise the explicit scalar
+        /// implementation <see cref="ForwardTransform_Scalar"/> is faster
+        /// because it does not rely on matrix transposition.
+        /// </remarks>
+        /// <param name="block">Input matrix.</param>
+        private static void ForwardTransform_Vector4(ref Block8x8F block)
+        {
+            DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
+
+            // First pass - process rows
+            block.TransposeInplace();
+            FDCT8x4_Vector4(ref block.V0L);
+            FDCT8x4_Vector4(ref block.V0R);
+
+            // Second pass - process columns
+            block.TransposeInplace();
+            FDCT8x4_Vector4(ref block.V0L);
+            FDCT8x4_Vector4(ref block.V0R);
+        }
+
+        /// <summary>
+        /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
+        /// </summary>
+        /// <remarks>
+        /// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
+        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
+        /// </remarks>
+        /// <param name="blockRef">Reference to the first <see cref="Vector4"/> of the 8x4 matrix part to transform.</param>
+        private static void FDCT8x4_Vector4(ref Vector4 blockRef)
+        {
+            Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14);
+            Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
+            Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
+            Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
+            Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
+            Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
+            Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
+            Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
+
+            // Even part
+            Vector4 tmp10 = tmp0 + tmp3;
+            Vector4 tmp13 = tmp0 - tmp3;
+            Vector4 tmp11 = tmp1 + tmp2;
+            Vector4 tmp12 = tmp1 - tmp2;
+
+            Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
+            Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
+
+            Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
+            Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
+            Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
+
+            // Odd part
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
+            Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
+            Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
+            Vector4 z3 = tmp11 * mm128_F_0_7071;
+
+            Vector4 z11 = tmp7 + z3;
+            Vector4 z13 = tmp7 - z3;
+
+            Unsafe.Add(ref blockRef, 10) = z13 + z2;
+            Unsafe.Add(ref blockRef, 6) = z13 - z2;
+            Unsafe.Add(ref blockRef, 2) = z11 + z4;
+            Unsafe.Add(ref blockRef, 14) = z11 - z4;
+        }
+
+        /// <summary>
+        /// Apply floating point IDCT inplace.
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
+        /// </summary>
+        /// <param name="block">Input matrix.</param>
+        /// <param name="temp">Matrix to store temporary results.</param>
+        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
+        {
+            block.TransposeInplace();
+            IDCT8x8(ref block, ref temp);
+            temp.TransposeInplace();
+            IDCT8x8(ref temp, ref block);
+
+            // TODO: This can be fused into quantization table step
+            block.MultiplyInPlace(C_0_125);
+        }
+
         /// <summary>
         /// Performs 8x8 matrix Inverse Discrete Cosine Transform
         /// </summary>

From e9eaa5222e63ca9b11c6eaeb283060b714f2becf Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 28 Sep 2021 23:29:57 +0300
Subject: [PATCH 53/56] FDCT fma usage

---
 .../Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index f40ae6e87..ab9462632 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -87,8 +87,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             tmp12 = Avx.Add(tmp6, tmp7);
 
             Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
-            Vector256<float> z2 = Avx.Add(Avx.Multiply(mm256_F_0_5411, tmp10), z5);
-            Vector256<float> z4 = Avx.Add(Avx.Multiply(mm256_F_1_3065, tmp12), z5);
+            Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
+            Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
             Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
 
             Vector256<float> z11 = Avx.Add(tmp7, z3);

From 4ff29844febdc5e59c0fbd33461741f197d293cf Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 1 Oct 2021 22:35:36 +0300
Subject: [PATCH 54/56] Docs

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 120 +++++++++++++-----
 1 file changed, 90 insertions(+), 30 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 35e0e2648..bbdd3220f 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -65,6 +65,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <remarks>Yields codewords by index consisting of [run length | bitsize].</remarks>
         private HuffmanLut[] huffmanTables;
 
+        /// <summary>
+        /// Emitted bits 'micro buffer' before being transferred to the <see cref="emitBuffer"/>.
+        /// </summary>
+        private uint accumulatedBits;
+
         /// <summary>
         /// Buffer for temporal storage of huffman rle encoding bit data.
         /// </summary>
@@ -82,18 +87,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </remarks>
         private readonly byte[] streamWriteBuffer;
 
-        private int emitWriteIndex;
-
-        /// <summary>
-        /// Emitted bits 'micro buffer' before being transferred to the <see cref="emitBuffer"/>.
-        /// </summary>
-        private uint accumulatedBits;
-
         /// <summary>
         /// Number of jagged bits stored in <see cref="accumulatedBits"/>
         /// </summary>
         private int bitCount;
 
+        private int emitWriteIndex;
+
         private Block8x8 tempBlock;
 
         /// <summary>
@@ -101,9 +101,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private readonly Stream target;
 
-        public HuffmanScanEncoder(int componentCount, Stream outputStream)
+        /// <summary>
+        /// Initializes a new instance of the <see cref="HuffmanScanEncoder"/> class.
+        /// </summary>
+        /// <param name="blocksPerCodingUnit">Amount of encoded 8x8 blocks per single jpeg macroblock.</param>
+        /// <param name="outputStream">Output stream for saving encoded data.</param>
+        public HuffmanScanEncoder(int blocksPerCodingUnit, Stream outputStream)
         {
-            int emitBufferByteLength = MaxBytesPerBlock * componentCount;
+            int emitBufferByteLength = MaxBytesPerBlock * blocksPerCodingUnit;
             this.emitBuffer = new uint[emitBufferByteLength / sizeof(uint)];
             this.emitWriteIndex = this.emitBuffer.Length;
 
@@ -112,7 +117,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.target = outputStream;
         }
 
-        private bool IsFlushNeeded
+        /// <summary>
+        /// Gets a value indicating whether <see cref="emitBuffer"/> is full
+        /// and must be flushed using <see cref="FlushToStream()"/>
+        /// before encoding next 8x8 coding block.
+        /// </summary>
+        private bool IsStreamFlushNeeded
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get => this.emitWriteIndex < (uint)this.emitBuffer.Length / 2;
@@ -174,7 +184,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref pixelConverter.Cr,
                         ref chrominanceQuantTable);
 
-                    if (this.IsFlushNeeded)
+                    if (this.IsStreamFlushNeeded)
                     {
                         this.FlushToStream();
                     }
@@ -249,7 +259,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref pixelConverter.Cr,
                         ref chrominanceQuantTable);
 
-                    if (this.IsFlushNeeded)
+                    if (this.IsStreamFlushNeeded)
                     {
                         this.FlushToStream();
                     }
@@ -300,7 +310,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref pixelConverter.Y,
                         ref luminanceQuantTable);
 
-                    if (this.IsFlushNeeded)
+                    if (this.IsStreamFlushNeeded)
                     {
                         this.FlushToStream();
                     }
@@ -364,7 +374,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref pixelConverter.B,
                         ref luminanceQuantTable);
 
-                    if (this.IsFlushNeeded)
+                    if (this.IsStreamFlushNeeded)
                     {
                         this.FlushToStream();
                     }
@@ -447,15 +457,48 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         /// <summary>
-        /// Emits the least significant count of bits to the stream write buffer.
-        /// The precondition is bits
-        /// <example>
-        /// &lt; 1&lt;&lt;nBits &amp;&amp; nBits &lt;= 16
-        /// </example>
-        /// .
+        /// Emits the <paramref name="count"/> most significant bits to the buffer.
         /// </summary>
-        /// <param name="bits">The packed bits.</param>
-        /// <param name="count">The number of bits</param>
+        /// <remarks>
+        /// <para>
+        /// Supports a count of up to 32 bits but, generally speaking, the JPEG
+        /// standard guarantees that there won't be more than 16 bits per single
+        /// value.
+        /// </para>
+        /// <para>
+        /// Emitting algorithm uses 3 intermediate buffers for caching before
+        /// writing to the stream:
+        /// <list type="number">
+        /// <item>
+        /// <term>uint32</term>
+        /// <description>
+        /// Bit buffer. Encoded spectral values can occupy up to 16 bits, bits
+        /// are assembled to whole bytes via this intermediate buffer.
+        /// </description>
+        /// </item>
+        /// <item>
+        /// <term>uint32[]</term>
+        /// <description>
+        /// Assembled bytes from uint32 buffer are saved into this buffer.
+        /// uint32 buffer values are saved using indices from the last to the first.
+        /// As bytes are saved to the memory as 4-byte packages endianness matters:
+        /// Jpeg stream is big-endian, indexing buffer bytes from the last index to the
+        /// first eliminates all operations to extract separate bytes. This only works for
+        /// little-endian machines (there are no known examples of big-endian users atm).
+        /// For big-endians this approach is slower due to the separate byte extraction.
+        /// </description>
+        /// </item>
+        /// <item>
+        /// <term>byte[]</term>
+        /// <description>
+        /// Byte buffer used only during <see cref="FlushToStream(int)"/> method.
+        /// </description>
+        /// </item>
+        /// </list>
+        /// </para>
+        /// </remarks>
+        /// <param name="bits">Bits to emit, must be shifted to the left.</param>
+        /// <param name="count">Bits count stored in the bits parameter.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void Emit(uint bits, int count)
         {
@@ -475,10 +518,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         /// <summary>
-        /// Emits the given value with the given Huffman encoder.
+        /// Emits the given value with the given Huffman table.
         /// </summary>
-        /// <param name="table">Compiled Huffman spec values.</param>
-        /// <param name="value">The value to encode.</param>
+        /// <param name="table">Huffman table.</param>
+        /// <param name="value">Value to encode.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void EmitHuff(int[] table, int value)
         {
@@ -489,9 +532,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Emits given value via huffman rle encoding.
         /// </summary>
-        /// <param name="table">Compiled Huffman spec values.</param>
+        /// <param name="table">Huffman table.</param>
         /// <param name="runLength">The number of preceding zeroes, preshifted by 4 to the left.</param>
-        /// <param name="value">The value to encode.</param>
+        /// <param name="value">Value to encode.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void EmitHuffRLE(int[] table, int runLength, int value)
         {
@@ -555,11 +598,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         /// <summary>
-        /// Flushes cached bytes to the ouput stream respecting stuff bytes.
+        /// General method for flushing cached spectral data bytes to
+        /// the output stream respecting stuff bytes.
         /// </summary>
         /// <remarks>
-        /// Bytes cached via <see cref="Emit"/> are stored in 4-bytes blocks which makes
-        /// this method endianness dependent.
+        /// Bytes cached via <see cref="Emit"/> are stored in 4-bytes blocks
+        /// which makes this method endianness dependent.
         /// </remarks>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void FlushToStream(int endIndex)
@@ -623,12 +667,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.target.Write(this.streamWriteBuffer, 0, writeIdx);
         }
 
+        /// <summary>
+        /// Flushes spectral data bytes after encoding all channel blocks
+        /// in a single jpeg macroblock using <see cref="WriteBlock"/>.
+        /// </summary>
+        /// <remarks>
+        /// This must be called only during the macroblock encoding routine,
+        /// and only when <see cref="IsStreamFlushNeeded"/> is true.
+        /// </remarks>
         private void FlushToStream()
         {
             this.FlushToStream(this.emitWriteIndex * 4);
             this.emitWriteIndex = this.emitBuffer.Length;
         }
 
+        /// <summary>
+        /// Flushes the final cached bits to the stream, padding with 1's to
+        /// complete full bytes.
+        /// </summary>
+        /// <remarks>
+        /// This must be called only once at the end of the encoding routine.
+        /// <see cref="IsStreamFlushNeeded"/> check is not needed.
+        /// </remarks>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void FlushRemainingBytes()
         {

From aae451c84408afccc499c1d4e2af4de289c726c2 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 1 Oct 2021 22:46:45 +0300
Subject: [PATCH 55/56] Quant table adjustment method

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 38 ++++++-------------
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 19 +++++++++-
 2 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index bbdd3220f..b3cdbf0a0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -139,12 +139,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < Block8x8F.Size; i++)
-            {
-                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
-                chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
-            }
+            FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
+            FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable);
 
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
@@ -206,12 +202,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < Block8x8F.Size; i++)
-            {
-                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
-                chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
-            }
+            FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
+            FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable);
 
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
@@ -279,11 +271,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < Block8x8F.Size; i++)
-            {
-                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
-            }
+            FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
 
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
@@ -325,16 +313,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee.</param>
+        /// <param name="quantTable">Quantization table provided by the callee.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        public void EncodeRgb<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
+        public void EncodeRgb<TPixel>(Image<TPixel> pixels, ref Block8x8F quantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // Calculate reciprocal quantization tables for FDCT method
-            for (int i = 0; i < Block8x8F.Size; i++)
-            {
-                luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
-            }
+            FastFloatingPointDCT.AdjustToFDCT(ref quantTable);
 
             this.huffmanTables = HuffmanLut.TheHuffmanLut;
 
@@ -360,19 +344,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCR,
                         ref pixelConverter.R,
-                        ref luminanceQuantTable);
+                        ref quantTable);
 
                     prevDCG = this.WriteBlock(
                         QuantIndex.Luminance,
                         prevDCG,
                         ref pixelConverter.G,
-                        ref luminanceQuantTable);
+                        ref quantTable);
 
                     prevDCB = this.WriteBlock(
                         QuantIndex.Luminance,
                         prevDCB,
                         ref pixelConverter.B,
-                        ref luminanceQuantTable);
+                        ref quantTable);
 
                     if (this.IsStreamFlushNeeded)
                     {
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 43f6b7a1f..dc88255c5 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -62,10 +62,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <code>
         /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
         /// </code>
-        /// Values are also scaled by 8 so DCT code won't do unnecessary division.
+        /// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
         /// </para>
         /// </remarks>
-        public static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
+        private static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
         {
             0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
             0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
@@ -77,6 +77,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
         };
 
+        /// <summary>
+        /// Adjusts the given quantization table to be compliant with the FDCT implementation.
+        /// </summary>
+        /// <remarks>
+        /// See <see cref="DctReciprocalAdjustmentCoefficients"/> docs for explanation.
+        /// </remarks>
+        /// <param name="quantizationtable">Quantization table to adjust.</param>
+        public static void AdjustToFDCT(ref Block8x8F quantizationtable)
+        {
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i];
+            }
+        }
+
         /// <summary>
         /// Apply 2D floating point FDCT inplace.
         /// </summary>

From 2dfbff5a90b4ec118829ba345ee5b97e4311f76b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 1 Oct 2021 22:56:40 +0300
Subject: [PATCH 56/56] Access modifier fix

---
 src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index dc88255c5..6963c3636 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,7 +1,6 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
-using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -65,7 +64,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
         /// </para>
         /// </remarks>
-        private static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
+        internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
         {
             0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
             0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,