From 7229dbf73f6c1898641128b9b9af5728a37ad174 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 18 May 2021 12:03:42 +0300
Subject: [PATCH 01/99] Block8x8F explicit layout & 256bit rows support

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 37 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 2d19f5ce2..dbc22eaea 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     /// <summary>
     /// Represents a Jpeg block with <see cref="float"/> coefficients.
     /// </summary>
-    [StructLayout(LayoutKind.Sequential)]
+    [StructLayout(LayoutKind.Explicit)]
     internal partial struct Block8x8F : IEquatable<Block8x8F>
     {
         /// <summary>
@@ -27,29 +27,64 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         public const int Size = 64;
 
 #pragma warning disable SA1600 // ElementsMustBeDocumented
+        [FieldOffset(0)]
         public Vector4 V0L;
+        [FieldOffset(16)]
         public Vector4 V0R;
 
+        [FieldOffset(32)]
         public Vector4 V1L;
+        [FieldOffset(48)]
         public Vector4 V1R;
 
+        [FieldOffset(64)]
         public Vector4 V2L;
+        [FieldOffset(80)]
         public Vector4 V2R;
 
+        [FieldOffset(96)]
         public Vector4 V3L;
+        [FieldOffset(112)]
         public Vector4 V3R;
 
+        [FieldOffset(128)]
         public Vector4 V4L;
+        [FieldOffset(144)]
         public Vector4 V4R;
 
+        [FieldOffset(160)]
         public Vector4 V5L;
+        [FieldOffset(176)]
         public Vector4 V5R;
 
+        [FieldOffset(192)]
         public Vector4 V6L;
+        [FieldOffset(208)]
         public Vector4 V6R;
 
+        [FieldOffset(224)]
         public Vector4 V7L;
+        [FieldOffset(240)]
         public Vector4 V7R;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [FieldOffset(0)]
+        public Vector256<float> V0;
+        [FieldOffset(32)]
+        public Vector256<float> V1;
+        [FieldOffset(64)]
+        public Vector256<float> V2;
+        [FieldOffset(96)]
+        public Vector256<float> V3;
+        [FieldOffset(128)]
+        public Vector256<float> V4;
+        [FieldOffset(160)]
+        public Vector256<float> V5;
+        [FieldOffset(192)]
+        public Vector256<float> V6;
+        [FieldOffset(224)]
+        public Vector256<float> V7;
+#endif
 #pragma warning restore SA1600 // ElementsMustBeDocumented
 
         /// <summary>

From fbf0ff1466ef410de2fb77d22c6cdef074cad6ce Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 18 May 2021 12:08:26 +0300
Subject: [PATCH 02/99] Block8x8F.MultiplyInPlace no longer uses unsafe casts

Improved performance, no need for Unsafe calls.
---
 .../Formats/Jpeg/Components/Block8x8F.cs         | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index dbc22eaea..52a1a7aa9 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -313,14 +313,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             if (Avx.IsSupported)
             {
                 var valueVec = Vector256.Create(value);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V0L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V1L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V2L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V3L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V4L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V5L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V6L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V7L), valueVec);
+                this.V0 = Avx.Multiply(this.V0, valueVec);
+                this.V1 = Avx.Multiply(this.V1, valueVec);
+                this.V2 = Avx.Multiply(this.V2, valueVec);
+                this.V3 = Avx.Multiply(this.V3, valueVec);
+                this.V4 = Avx.Multiply(this.V4, valueVec);
+                this.V5 = Avx.Multiply(this.V5, valueVec);
+                this.V6 = Avx.Multiply(this.V6, valueVec);
+                this.V7 = Avx.Multiply(this.V7, valueVec);
             }
             else
 #endif

From 20236b8c756ecbd6fd75c789b58dca5ed028d1e9 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 18 May 2021 12:18:37 +0300
Subject: [PATCH 03/99] Block8x8F.TransposeInto no longer uses unsafe casts
 (partially)

---
 .../Formats/Jpeg/Components/Block8x8F.cs         | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 52a1a7aa9..9072ca196 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -840,26 +840,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 Vector256<float> t0 = Avx.UnpackLow(r0, r1);
                 Vector256<float> t2 = Avx.UnpackLow(r2, r3);
                 Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V0L) = Avx.Blend(t0, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V1L) = Avx.Blend(t2, v, 0x33);
+                d.V0 = Avx.Blend(t0, v, 0xCC);
+                d.V1 = Avx.Blend(t2, v, 0x33);
 
                 Vector256<float> t4 = Avx.UnpackLow(r4, r5);
                 Vector256<float> t6 = Avx.UnpackLow(r6, r7);
                 v = Avx.Shuffle(t4, t6, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V4L) = Avx.Blend(t4, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V5L) = Avx.Blend(t6, v, 0x33);
+                d.V4 = Avx.Blend(t4, v, 0xCC);
+                d.V5 = Avx.Blend(t6, v, 0x33);
 
                 Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
                 Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
                 v = Avx.Shuffle(t1, t3, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V2L) = Avx.Blend(t1, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V3L) = Avx.Blend(t3, v, 0x33);
+                d.V2 = Avx.Blend(t1, v, 0xCC);
+                d.V3 = Avx.Blend(t3, v, 0x33);
 
                 Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
                 Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
                 v = Avx.Shuffle(t5, t7, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V6L) = Avx.Blend(t5, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V7L) = Avx.Blend(t7, v, 0x33);
+                d.V6 = Avx.Blend(t5, v, 0xCC);
+                d.V7 = Avx.Blend(t7, v, 0x33);
             }
             else
 #endif

From e5188fe4f4b2060ed3329d696d4efb16bb7a51ca Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 18 May 2021 12:56:53 +0300
Subject: [PATCH 04/99] Implemented FDCT8x8 using avx instruction set, added
 backward compatibility for FDCT8x4 calls using FDCT8x8(ref Block8x8F, ref
 Block8x8F) method

---
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 120 +++++++++++++++++-
 1 file changed, 114 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index a6d0622dd..ad47aa05f 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -3,6 +3,10 @@
 
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
@@ -38,6 +42,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private const float C_0_765367 = 0.765366865f;
 
         private const float C_0_125 = 0.1250f;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
+        private static readonly Vector256<float> C_V_1_3065 = Vector256.Create(1.306563f);
+        private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
+        private static readonly Vector256<float> C_V_0_7856 = Vector256.Create(0.785695f);
+        private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
+        private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);
+
+        private static Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f);
+#endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
         private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
 
@@ -308,12 +323,107 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             d.V7R = c0 - c3;
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
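+        /// Performs one FDCT pass over all eight 256-bit rows of the block using AVX instructions (FMA when supported).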
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        private static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector256<float> t0 = Avx.Add(s.V0, s.V7);
+            Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
+            Vector256<float> t1 = Avx.Add(s.V1, s.V6);
+            Vector256<float> t6 = Avx.Subtract(s.V1, s.V6);
+            Vector256<float> t2 = Avx.Add(s.V2, s.V5);
+            Vector256<float> t5 = Avx.Subtract(s.V2, s.V5);
+            Vector256<float> t3 = Avx.Add(s.V3, s.V4);
+            Vector256<float> t4 = Avx.Subtract(s.V3, s.V4);
+
+            Vector256<float> c0 = Avx.Add(t0, t3);
+            Vector256<float> c1 = Avx.Add(t1, t2);
+
+            // 0 4
+            d.V0 = Avx.Add(c0, c1);
+            d.V4 = Avx.Subtract(c0, c1);
+
+            Vector256<float> c3 = Avx.Subtract(t0, t3);
+            Vector256<float> c2 = Avx.Subtract(t1, t2);
+
+            // 2 6
+            if (Fma.IsSupported)
+            {
+                d.V2 = Fma.MultiplyAdd(c2, C_V_0_5411, Avx.Multiply(c3, C_V_1_3065));
+                d.V6 = Fma.MultiplySubtract(c3, C_V_0_5411, Avx.Multiply(c2, C_V_1_3065));
+            }
+            else
+            {
+                d.V2 = Avx.Add(Avx.Multiply(c2, C_V_0_5411), Avx.Multiply(c3, C_V_1_3065));
+                d.V6 = Avx.Subtract(Avx.Multiply(c3, C_V_0_5411), Avx.Multiply(c2, C_V_1_3065));
+            }
+
+            if (Fma.IsSupported)
+            {
+                c3 = Fma.MultiplyAdd(t4, C_V_1_1758, Avx.Multiply(t7, C_V_0_7856));
+                c0 = Fma.MultiplySubtract(t7, C_V_1_1758, Avx.Multiply(t4, C_V_0_7856));
+            }
+            else
+            {
+                c3 = Avx.Add(Avx.Multiply(t4, C_V_1_1758), Avx.Multiply(t7, C_V_0_7856));
+                c0 = Avx.Subtract(Avx.Multiply(t7, C_V_1_1758), Avx.Multiply(t4, C_V_0_7856));
+            }
+
+            if (Fma.IsSupported)
+            {
+                c2 = Fma.MultiplyAdd(t5, C_V_1_3870, Avx.Multiply(C_V_0_2758, t6));
+                c1 = Fma.MultiplySubtract(t6, C_V_1_3870, Avx.Multiply(C_V_0_2758, t5));
+            }
+            else
+            {
+                c2 = Avx.Add(Avx.Multiply(t5, C_V_1_3870), Avx.Multiply(C_V_0_2758, t6));
+                c1 = Avx.Subtract(Avx.Multiply(t6, C_V_1_3870), Avx.Multiply(C_V_0_2758, t5));
+            }
+
+            // 3 5
+            d.V3 = Avx.Subtract(c0, c2);
+            d.V5 = Avx.Subtract(c3, c1);
+
+            c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
+            c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
+
+            // 1 7
+            d.V1 = Avx.Add(c0, c3);
+            d.V7 = Avx.Subtract(c0, c3);
+        }
+#endif
+
         /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization)
+        /// Performs 8x8 matrix Forward Discrete Cosine Transform
         /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_Avx(ref s, ref d);
+            }
+            else
+#endif
+            {
+                FDCT8x4_LeftPart(ref s, ref d);
+                FDCT8x4_RightPart(ref s, ref d);
+            }
+        }
+
+        /// <summary>
+        /// Apply floating point FDCT from src into dest
+        /// </summary>
+        /// <remarks>Runs <see cref="FDCT8x8"/> twice with a transpose in between, then scales the result by 0.125.</remarks>
         /// <param name="src">Source</param>
         /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
+        /// <param name="temp">Temporary block provided by the caller for optimization</param>
         /// <param name="offsetSourceByNeg128">If true, a constant -128.0 offset is applied for all values before FDCT </param>
         public static void TransformFDCT(
             ref Block8x8F src,
@@ -327,13 +437,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 temp.AddInPlace(-128F);
             }
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
 
             dest.TransposeInto(ref temp);
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
 
             dest.MultiplyInPlace(C_0_125);
         }

From 513e86a904d2352bfb23773aafd221cab71711f8 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 18 May 2021 15:37:14 +0300
Subject: [PATCH 05/99] Implemented IDCT algorithm with avx/fma, move IDCT code
 to a different file

---
 .../Components/FastFloatingPointDCT.IDCT.cs   | 263 ++++++++++++++++++
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 151 +---------
 2 files changed, 275 insertions(+), 139 deletions(-)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
new file mode 100644
index 000000000..1c990db6b
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
@@ -0,0 +1,263 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.Numerics;
+using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
+
+// ReSharper disable InconsistentNaming
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    /// <summary>
+    /// Contains inaccurate, but fast forward and inverse DCT implementations.
+    /// </summary>
+    internal static partial class FastFloatingPointDCT
+    {
+        /// <summary>
+        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
+        /// </summary>
+        /// <param name="src">Source</param>
+        /// <param name="dest">Destination</param>
+        /// <param name="temp">Temporary block provided by the caller</param>
+        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
+        {
+            src.TransposeInto(ref temp);
+
+            IDCT8x8(ref temp, ref dest);
+            dest.TransposeInto(ref temp);
+            IDCT8x8(ref temp, ref dest);
+
+            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
+            dest.MultiplyInPlace(C_0_125);
+        }
+
+        /// <summary>
+        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                IDCT8x8_Avx(ref s, ref d);
+            }
+            else
+#endif
+            {
+                IDCT8x4_LeftPart(ref s, ref d);
+                IDCT8x4_RightPart(ref s, ref d);
+            }
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the left part of the block. Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">Destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1L;
+            Vector4 my7 = s.V7L;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3L;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5L;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2L;
+            Vector4 my6 = s.V6L;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0L;
+            Vector4 my4 = s.V4L;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0L = my0 + mb0;
+            d.V7L = my0 - mb0;
+            d.V1L = my1 + mb1;
+            d.V6L = my1 - mb1;
+            d.V2L = my2 + mb2;
+            d.V5L = my2 - mb2;
+            d.V3L = my3 + mb3;
+            d.V4L = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the right part of the block.
+        /// Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">The destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1R;
+            Vector4 my7 = s.V7R;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3R;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5R;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2R;
+            Vector4 my6 = s.V6R;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0R;
+            Vector4 my4 = s.V4R;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0R = my0 + mb0;
+            d.V7R = my0 - mb0;
+            d.V1R = my1 + mb1;
+            d.V6R = my1 - mb1;
+            d.V2R = my2 + mb2;
+            d.V5R = my2 - mb2;
+            d.V3R = my3 + mb3;
+            d.V4R = my3 - mb3;
+        }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// Do IDCT internal operations on the given block.
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector256<float> my1 = s.V1;
+            Vector256<float> my7 = s.V7;
+            Vector256<float> mz0 = Avx.Add(my1, my7);
+
+            Vector256<float> my3 = s.V3;
+            Vector256<float> mz2 = Avx.Add(my3, my7);
+            Vector256<float> my5 = s.V5;
+            Vector256<float> mz1 = Avx.Add(my3, my5);
+            Vector256<float> mz3 = Avx.Add(my1, my5);
+
+            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), w1_1758);
+
+            if (Fma.IsSupported)
+            {
+                mz2 = Fma.MultiplyAdd(mz2, C_V_n1_9615, mz4);
+                mz3 = Fma.MultiplyAdd(mz3, C_V_n0_3901, mz4);
+            }
+            else
+            {
+                mz2 = Avx.Add(Avx.Multiply(mz2, C_V_n1_9615), mz4);
+                mz3 = Avx.Add(Avx.Multiply(mz3, C_V_n0_3901), mz4);
+            }
+
+            mz0 = Avx.Multiply(mz0, C_V_n0_8999);
+            mz1 = Avx.Multiply(mz1, C_V_n2_5629);
+
+
+            Unsafe.SkipInit(out Vector256<float> mb3);
+            Unsafe.SkipInit(out Vector256<float> mb2);
+            Unsafe.SkipInit(out Vector256<float> mb1);
+            Unsafe.SkipInit(out Vector256<float> mb0);
+
+            if (Fma.IsSupported)
+            {
+                mb3 = Avx.Add(Fma.MultiplyAdd(my7, C_V_0_2986, mz0), mz2);
+                mb2 = Avx.Add(Fma.MultiplyAdd(my5, C_V_2_0531, mz1), mz3);
+                mb1 = Avx.Add(Fma.MultiplyAdd(my3, C_V_3_0727, mz1), mz2);
+                mb0 = Avx.Add(Fma.MultiplyAdd(my1, C_V_1_5013, mz0), mz3);
+            }
+            else
+            {
+                mb3 = Avx.Add(Avx.Add(Avx.Multiply(my7, C_V_0_2986), mz0), mz2);
+                mb2 = Avx.Add(Avx.Add(Avx.Multiply(my5, C_V_2_0531), mz1), mz3);
+                mb1 = Avx.Add(Avx.Add(Avx.Multiply(my3, C_V_3_0727), mz1), mz2);
+                mb0 = Avx.Add(Avx.Add(Avx.Multiply(my1, C_V_1_5013), mz0), mz3);
+            }
+
+            Vector256<float> my2 = s.V2;
+            Vector256<float> my6 = s.V6;
+            mz4 = Avx.Multiply(Avx.Add(my2, my6), w0_5411);
+            Vector256<float> my0 = s.V0;
+            Vector256<float> my4 = s.V4;
+            mz0 = Avx.Add(my0, my4);
+            mz1 = Avx.Subtract(my0, my4);
+
+            if (Fma.IsSupported)
+            {
+                mz2 = Fma.MultiplyAdd(my6, C_V_n1_8477, mz4);
+                mz3 = Fma.MultiplyAdd(my2, C_V_0_7653, mz4);
+            }
+            else
+            {
+                mz2 = Avx.Add(Avx.Multiply(my6, C_V_n1_8477), mz4);
+                mz3 = Avx.Add(Avx.Multiply(my2, C_V_0_7653), mz4);
+            }
+
+            my0 = Avx.Add(mz0, mz3);
+            my3 = Avx.Subtract(mz0, mz3);
+            my1 = Avx.Add(mz1, mz2);
+            my2 = Avx.Subtract(mz1, mz2);
+
+            d.V0 = Avx.Add(my0, mb0);
+            d.V7 = Avx.Subtract(my0, mb0);
+            d.V1 = Avx.Add(my1, mb1);
+            d.V6 = Avx.Subtract(my1, mb1);
+            d.V2 = Avx.Add(my2, mb2);
+            d.V5 = Avx.Subtract(my2, mb2);
+            d.V3 = Avx.Add(my3, mb3);
+            d.V4 = Avx.Subtract(my3, mb3);
+        }
+#endif
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index ad47aa05f..4ef4ab7b0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     /// <summary>
     /// Contains inaccurate, but fast forward and inverse DCT implementations.
     /// </summary>
-    internal static class FastFloatingPointDCT
+    internal static partial class FastFloatingPointDCT
     {
 #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
         private const float C_1_175876 = 1.175875602f;
@@ -51,149 +51,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
         private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);
 
+        private static readonly Vector256<float> C_V_n1_9615 = Vector256.Create(-1.961570560f);
+        private static readonly Vector256<float> C_V_n0_3901 = Vector256.Create(-0.390180644f);
+        private static readonly Vector256<float> C_V_n0_8999 = Vector256.Create(-0.899976223f);
+        private static readonly Vector256<float> C_V_n2_5629 = Vector256.Create(-2.562915447f);
+        private static readonly Vector256<float> C_V_0_2986 = Vector256.Create(0.298631336f);
+        private static readonly Vector256<float> C_V_2_0531 = Vector256.Create(2.053119869f);
+        private static readonly Vector256<float> C_V_3_0727 = Vector256.Create(3.072711026f);
+        private static readonly Vector256<float> C_V_1_5013 = Vector256.Create(1.501321110f);
+        private static readonly Vector256<float> C_V_n1_8477 = Vector256.Create(-1.847759065f);
+        private static readonly Vector256<float> C_V_0_7653 = Vector256.Create(0.765366865f);
+
         private static Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f);
 #endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
         private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
 
-        /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
-        /// </summary>
-        /// <param name="src">Source</param>
-        /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
-        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
-        {
-            src.TransposeInto(ref temp);
-
-            IDCT8x4_LeftPart(ref temp, ref dest);
-            IDCT8x4_RightPart(ref temp, ref dest);
-
-            dest.TransposeInto(ref temp);
-
-            IDCT8x4_LeftPart(ref temp, ref dest);
-            IDCT8x4_RightPart(ref temp, ref dest);
-
-            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
-            dest.MultiplyInPlace(C_0_125);
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
-        }
-
         /// <summary>
         /// Original:
         /// <see>

From 81c21e5af42088dccea6ce40115034cc84d928f2 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 18 May 2021 15:50:24 +0300
Subject: [PATCH 06/99] Fixed "constant" vectors naming

---
 .../Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
index 1c990db6b..fd3ad8d5f 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
@@ -1,6 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -188,7 +189,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             Vector256<float> mz1 = Avx.Add(my3, my5);
             Vector256<float> mz3 = Avx.Add(my1, my5);
 
-            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), w1_1758);
+            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
 
             if (Fma.IsSupported)
             {
@@ -227,7 +228,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             Vector256<float> my2 = s.V2;
             Vector256<float> my6 = s.V6;
-            mz4 = Avx.Multiply(Avx.Add(my2, my6), w0_5411);
+            mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
             Vector256<float> my0 = s.V0;
             Vector256<float> my4 = s.V4;
             mz0 = Avx.Add(my0, my4);

From 9bf9644e650b2a67b324e37506e56b435bc2676e Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 20 May 2021 09:41:58 +0300
Subject: [PATCH 07/99] RgbToYCbCrConverterLut.Convert main loop routine now
 uses named constant instead of a 'magic value'

---
 .../Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 3c1a02c5a..1ceea1e08 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -119,7 +119,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             ref Rgb24 rgbStart = ref rgbSpan[0];
 
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
                 ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
 

From 347ac360ec56e0e63ec97ba32f05d5bf8ea35b32 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 20 May 2021 14:09:32 +0300
Subject: [PATCH 08/99] LuminanceForwardConverter.Convert main loop routine now
 uses named constant instead of a 'magic value'

---
 .../Components/Encoder/LuminanceForwardConverter{TPixel}.cs     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
index cc81130dd..fc5b9a868 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
@@ -49,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             ref Block8x8F yBlock = ref this.Y;
             ref L8 l8Start = ref l8Span[0];
 
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
                 ref L8 c = ref Unsafe.Add(ref l8Start, i);
                 yBlock[i] = c.PackedValue;

From 86a6d8be975df1ec74963b3201a4b10eaa8aef51 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 20 May 2021 16:06:13 +0300
Subject: [PATCH 09/99] WriteDefineHuffmanTables(...) no longer relies on
 external buffer for stream writes

---
 .../Formats/Jpeg/JpegEncoderCore.cs           | 44 ++++++++++---------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index f5dc1c79f..79f0d3022 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -41,12 +41,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// </summary>
         private readonly byte[] emitBuffer = new byte[64];
 
-        /// <summary>
-        /// A buffer for reducing the number of stream writes when emitting Huffman tables. Max combined table lengths +
-        /// identifier.
-        /// </summary>
-        private readonly byte[] huffmanBuffer = new byte[179];
-
         /// <summary>
         /// Gets or sets the subsampling method to use.
         /// </summary>
@@ -635,30 +629,40 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                 markerlen += 1 + 16 + s.Values.Length;
             }
 
+            // TODO: this magic constant (array size) should be defined by HuffmanSpec class
+            // This is a one-time call which can be stackalloc'ed or allocated directly in memory as method local array
+            // Allocation here would be better for GC so it won't live for entire encoding process
+            // TODO: if this is allocated on the heap - pin it right here or following copy code will corrupt memory
+            Span<byte> huffmanBuffer = stackalloc byte[179];
+            byte* huffmanBufferPtr = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(huffmanBuffer));
+
             this.WriteMarkerHeader(JpegConstants.Markers.DHT, markerlen);
             for (int i = 0; i < specs.Length; i++)
             {
                 ref HuffmanSpec spec = ref specs[i];
+
                 int len = 0;
 
-                fixed (byte* huffman = this.huffmanBuffer)
-                fixed (byte* count = spec.Count)
-                fixed (byte* values = spec.Values)
-                {
-                    huffman[len++] = headers[i];
+                // header
+                huffmanBuffer[len++] = headers[i];
 
-                    for (int c = 0; c < spec.Count.Length; c++)
-                    {
-                        huffman[len++] = count[c];
-                    }
+                // count
+                fixed (byte* countPtr = spec.Count)
+                {
+                    int countLen = spec.Count.Length;
+                    Unsafe.CopyBlockUnaligned(huffmanBufferPtr + len, countPtr, (uint)countLen);
+                    len += countLen;
+                }
 
-                    for (int v = 0; v < spec.Values.Length; v++)
-                    {
-                        huffman[len++] = values[v];
-                    }
+                // values
+                fixed (byte* valuesPtr = spec.Values)
+                {
+                    int valuesLen = spec.Values.Length;
+                    Unsafe.CopyBlockUnaligned(huffmanBufferPtr + len, valuesPtr, (uint)valuesLen);
+                    len += valuesLen;
                 }
 
-                this.outputStream.Write(this.huffmanBuffer, 0, len);
+                this.outputStream.Write(huffmanBuffer, 0, len);
             }
         }
 

From f0017556cf06ee0d881b723f1fd6277b858732e4 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 20 May 2021 16:46:55 +0300
Subject: [PATCH 10/99] [WIP] Partially moved encoding logic to a separate
 class

---
 .../Encoder/YCbCrEncoder{TPixel}.cs           | 532 ++++++++++++++++++
 .../Formats/Jpeg/JpegEncoderCore.cs           |  28 +-
 2 files changed, 539 insertions(+), 21 deletions(-)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
new file mode 100644
index 000000000..2ef053eb1
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -0,0 +1,532 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Runtime.CompilerServices;
+using System.Text;
+using System.Threading;
+using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    internal class YCbCrEncoder<TPixel>
+    {
+        /// <summary>
+        /// A buffer for reducing the number of stream writes when emitting Huffman tables. 64 seems to be enough.
+        /// </summary>
+        private byte[] emitBuffer = new byte[64];
+
+        /// <summary>
+        /// The accumulated bits to write to the stream.
+        /// </summary>
+        private uint accumulatedBits;
+
+        /// <summary>
+        /// The accumulated bit count.
+        /// </summary>
+        private uint bitCount;
+
+        /// <summary>
+        /// The scaled chrominance table, in zig-zag order.
+        /// </summary>
+        private Block8x8F chrominanceQuantTable;
+
+        /// <summary>
+        /// The scaled luminance table, in zig-zag order.
+        /// </summary>
+        private Block8x8F luminanceQuantTable;
+
+        /// <summary>
+        /// The output stream. All attempted writes after the first error become no-ops.
+        /// </summary>
+        private Stream outputStream;
+
+        /// <summary>
+        /// Gets the lookup table giving the number of bits needed to hold each byte value.
+        /// </summary>
+        // The C# compiler emits this as a compile-time constant embedded in the PE file.
+        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
+        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
+        private static ReadOnlySpan<byte> BitCountLut => new byte[]
+            {
+                0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
+                5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+                6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8,
+            };
+
+        /// <summary>
+        /// Gets the unscaled quantization tables in zig-zag order. Each
+        /// encoder copies and scales the tables according to its quality parameter.
+        /// The values are derived from section K.1 after converting from natural to
+        /// zig-zag order.
+        /// </summary>
+        // The C# compiler emits this as a compile-time constant embedded in the PE file.
+        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
+        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
+        private static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
+            {
+                // Luminance.
+                16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
+                40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
+                57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
+                109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
+                100, 120, 92, 101, 103, 99,
+            };
+
+        /// <summary>
+        /// Gets the unscaled quantization tables in zig-zag order. Each
+        /// encoder copies and scales the tables according to its quality parameter.
+        /// The values are derived from section K.1 after converting from natural to
+        /// zig-zag order.
+        /// </summary>
+        // The C# compiler emits this as a compile-time constant embedded in the PE file.
+        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
+        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
+        private static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
+            {
+                // Chrominance.
+                17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99,
+            };
+
+
+        public ref Block8x8F ChrominanceQuantizationTable => ref this.chrominanceQuantTable;
+
+        public ref Block8x8F LuminanceQuantizationTable => ref this.luminanceQuantTable;
+
+
+        public YCbCrEncoder(Stream outputStream, int componentCount, int quality)
+        {
+            this.outputStream = outputStream;
+
+            // Convert from a quality rating to a scaling factor.
+            int scale;
+            if (quality < 50)
+            {
+                scale = 5000 / quality;
+            }
+            else
+            {
+                scale = 200 - (quality * 2);
+            }
+
+            // Initialize the quantization tables.
+            InitQuantizationTable(0, scale, ref this.luminanceQuantTable);
+            if (componentCount > 1)
+            {
+                InitQuantizationTable(1, scale, ref this.chrominanceQuantTable);
+            }
+        }
+
+        /// <summary>
+        /// Encodes the image with no subsampling.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
+        public void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
+            // (Partially done with YCbCrForwardConverter<TPixel>)
+            Block8x8F temp1 = default;
+            Block8x8F temp2 = default;
+
+            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
+            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
+
+            var unzig = ZigZag.CreateUnzigTable();
+
+            // ReSharper disable once InconsistentNaming
+            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
+
+            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
+            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
+            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
+            RowOctet<TPixel> currentRows = default;
+
+            for (int y = 0; y < pixels.Height; y += 8)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+                currentRows.Update(pixelBuffer, y);
+
+                for (int x = 0; x < pixels.Width; x += 8)
+                {
+                    pixelConverter.Convert(frame, x, y, ref currentRows);
+
+                    prevDCY = this.WriteBlock(
+                        QuantIndex.Luminance,
+                        prevDCY,
+                        ref pixelConverter.Y,
+                        ref temp1,
+                        ref temp2,
+                        ref onStackLuminanceQuantTable,
+                        ref unzig,
+                        ref emitBufferBase);
+
+                    prevDCCb = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCb,
+                        ref pixelConverter.Cb,
+                        ref temp1,
+                        ref temp2,
+                        ref onStackChrominanceQuantTable,
+                        ref unzig,
+                        ref emitBufferBase);
+
+                    prevDCCr = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCr,
+                        ref pixelConverter.Cr,
+                        ref temp1,
+                        ref temp2,
+                        ref onStackChrominanceQuantTable,
+                        ref unzig,
+                        ref emitBufferBase);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Encodes the image with subsampling. The Cb and Cr components are each subsampled
+        /// at a factor of 2 both horizontally and vertically.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
+        public void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
+            Block8x8F b = default;
+            Span<Block8x8F> cb = stackalloc Block8x8F[4];
+            Span<Block8x8F> cr = stackalloc Block8x8F[4];
+
+            Block8x8F temp1 = default;
+            Block8x8F temp2 = default;
+
+            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
+            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
+
+            var unzig = ZigZag.CreateUnzigTable();
+
+            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
+
+            // ReSharper disable once InconsistentNaming
+            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
+            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
+            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
+            RowOctet<TPixel> currentRows = default;
+
+            for (int y = 0; y < pixels.Height; y += 16)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+                for (int x = 0; x < pixels.Width; x += 16)
+                {
+                    for (int i = 0; i < 4; i++)
+                    {
+                        int xOff = (i & 1) * 8;
+                        int yOff = (i & 2) * 4;
+
+                        currentRows.Update(pixelBuffer, y + yOff);
+                        pixelConverter.Convert(frame, x + xOff, y + yOff, ref currentRows);
+
+                        cb[i] = pixelConverter.Cb;
+                        cr[i] = pixelConverter.Cr;
+
+                        prevDCY = this.WriteBlock(
+                            QuantIndex.Luminance,
+                            prevDCY,
+                            ref pixelConverter.Y,
+                            ref temp1,
+                            ref temp2,
+                            ref onStackLuminanceQuantTable,
+                            ref unzig,
+                            ref emitBufferBase);
+                    }
+
+                    Block8x8F.Scale16X16To8X8(ref b, cb);
+                    prevDCCb = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCb,
+                        ref b,
+                        ref temp1,
+                        ref temp2,
+                        ref onStackChrominanceQuantTable,
+                        ref unzig,
+                        ref emitBufferBase);
+
+                    Block8x8F.Scale16X16To8X8(ref b, cr);
+                    prevDCCr = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCr,
+                        ref b,
+                        ref temp1,
+                        ref temp2,
+                        ref onStackChrominanceQuantTable,
+                        ref unzig,
+                        ref emitBufferBase);
+                }
+            }
+        }
+
+
+        /// <summary>
+        /// Encodes the image with no chroma, just luminance.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
+        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
+            // (Partially done with YCbCrForwardConverter<TPixel>)
+            Block8x8F temp1 = default;
+            Block8x8F temp2 = default;
+
+            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
+
+            var unzig = ZigZag.CreateUnzigTable();
+
+            // ReSharper disable once InconsistentNaming
+            int prevDCY = 0;
+
+            var pixelConverter = LuminanceForwardConverter<TPixel>.Create();
+            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
+            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
+            RowOctet<TPixel> currentRows = default;
+
+            for (int y = 0; y < pixels.Height; y += 8)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+                currentRows.Update(pixelBuffer, y);
+
+                for (int x = 0; x < pixels.Width; x += 8)
+                {
+                    pixelConverter.Convert(frame, x, y, ref currentRows);
+
+                    prevDCY = this.WriteBlock(
+                        QuantIndex.Luminance,
+                        prevDCY,
+                        ref pixelConverter.Y,
+                        ref temp1,
+                        ref temp2,
+                        ref onStackLuminanceQuantTable,
+                        ref unzig,
+                        ref emitBufferBase);
+                }
+            }
+        }
+
+
+        /// <summary>
+        /// Writes a block of pixel data using the given quantization table,
+        /// returning the post-quantized DC value of the DCT-transformed block.
+        /// The block is in natural (not zig-zag) order.
+        /// </summary>
+        /// <param name="index">The quantization table index.</param>
+        /// <param name="prevDC">The previous DC value.</param>
+        /// <param name="src">Source block</param>
+        /// <param name="tempDest1">Temporary block to be used as FDCT destination</param>
+        /// <param name="tempDest2">Temporary block 2</param>
+        /// <param name="quant">Quantization table</param>
+        /// <param name="unZig">The 8x8 Unzig block.</param>
+        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
+        /// <returns>The <see cref="int"/>.</returns>
+        private int WriteBlock(
+            QuantIndex index,
+            int prevDC,
+            ref Block8x8F src,
+            ref Block8x8F tempDest1,
+            ref Block8x8F tempDest2,
+            ref Block8x8F quant,
+            ref ZigZag unZig,
+            ref byte emitBufferBase)
+        {
+            FastFloatingPointDCT.TransformFDCT(ref src, ref tempDest1, ref tempDest2);
+
+            Block8x8F.Quantize(ref tempDest1, ref tempDest2, ref quant, ref unZig);
+
+            int dc = (int)tempDest2[0];
+
+            // Emit the DC delta.
+            this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC, ref emitBufferBase);
+
+            // Emit the AC components.
+            var h = (HuffIndex)((2 * (int)index) + 1);
+            int runLength = 0;
+
+            for (int zig = 1; zig < Block8x8F.Size; zig++)
+            {
+                int ac = (int)tempDest2[zig];
+
+                if (ac == 0)
+                {
+                    runLength++;
+                }
+                else
+                {
+                    while (runLength > 15)
+                    {
+                        this.EmitHuff(h, 0xf0, ref emitBufferBase);
+                        runLength -= 16;
+                    }
+
+                    this.EmitHuffRLE(h, runLength, ac, ref emitBufferBase);
+                    runLength = 0;
+                }
+            }
+
+            if (runLength > 0)
+            {
+                this.EmitHuff(h, 0x00, ref emitBufferBase);
+            }
+
+            return dc;
+        }
+
+        /// <summary>
+        /// Emits the least significant <paramref name="count"/> bits of <paramref name="bits"/> to the bit-stream.
+        /// The precondition is bits
+        /// <example>
+        /// &lt; 1&lt;&lt;count &amp;&amp; count &lt;= 16
+        /// </example>
+        /// .
+        /// </summary>
+        /// <param name="bits">The packed bits.</param>
+        /// <param name="count">The number of bits</param>
+        /// <param name="emitBufferBase">The reference to the emitBuffer.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void Emit(uint bits, uint count, ref byte emitBufferBase)
+        {
+            count += this.bitCount;
+            bits <<= (int)(32 - count);
+            bits |= this.accumulatedBits;
+
+            // Only write once at least 8 bits have accumulated.
+            if (count >= 8)
+            {
+                // Track length
+                int len = 0;
+                while (count >= 8)
+                {
+                    byte b = (byte)(bits >> 24);
+                    Unsafe.Add(ref emitBufferBase, len++) = b;
+                    if (b == byte.MaxValue)
+                    {
+                        Unsafe.Add(ref emitBufferBase, len++) = byte.MinValue;
+                    }
+
+                    bits <<= 8;
+                    count -= 8;
+                }
+
+                if (len > 0)
+                {
+                    this.outputStream.Write(this.emitBuffer, 0, len);
+                }
+            }
+
+            this.accumulatedBits = bits;
+            this.bitCount = count;
+        }
+
+        /// <summary>
+        /// Emits the given value with the given Huffman encoder.
+        /// </summary>
+        /// <param name="index">The index of the Huffman encoder</param>
+        /// <param name="value">The value to encode.</param>
+        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void EmitHuff(HuffIndex index, int value, ref byte emitBufferBase)
+        {
+            uint x = HuffmanLut.TheHuffmanLut[(int)index].Values[value];
+            this.Emit(x & ((1 << 24) - 1), x >> 24, ref emitBufferBase);
+        }
+
+        /// <summary>
+        /// Emits a run of runLength copies of value encoded with the given Huffman encoder.
+        /// </summary>
+        /// <param name="index">The index of the Huffman encoder</param>
+        /// <param name="runLength">The number of copies to encode.</param>
+        /// <param name="value">The value to encode.</param>
+        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void EmitHuffRLE(HuffIndex index, int runLength, int value, ref byte emitBufferBase)
+        {
+            int a = value;
+            int b = value;
+            if (a < 0)
+            {
+                a = -value;
+                b = value - 1;
+            }
+
+            uint bt;
+            if (a < 0x100)
+            {
+                bt = BitCountLut[a];
+            }
+            else
+            {
+                bt = 8 + (uint)BitCountLut[a >> 8];
+            }
+
+            this.EmitHuff(index, (int)((uint)(runLength << 4) | bt), ref emitBufferBase);
+            if (bt > 0)
+            {
+                this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt, ref emitBufferBase);
+            }
+        }
+
+
+        /// <summary>
+        /// Initializes quantization table.
+        /// </summary>
+        /// <param name="i">The quantization index.</param>
+        /// <param name="scale">The scaling factor.</param>
+        /// <param name="quant">The quantization table.</param>
+        private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant)
+        {
+            DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i));
+            ReadOnlySpan<byte> unscaledQuant = (i == 0) ? UnscaledQuant_Luminance : UnscaledQuant_Chrominance;
+
+            for (int j = 0; j < Block8x8F.Size; j++)
+            {
+                int x = unscaledQuant[j];
+                x = ((x * scale) + 50) / 100;
+                if (x < 1)
+                {
+                    x = 1;
+                }
+
+                if (x > 255)
+                {
+                    x = 255;
+                }
+
+                quant[j] = x;
+            }
+        }
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 79f0d3022..14cb87af3 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -183,23 +183,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             int qlty = Numerics.Clamp(this.quality ?? metadata.GetJpegMetadata().Quality, 1, 100);
             this.subsample ??= qlty >= 91 ? JpegSubsample.Ratio444 : JpegSubsample.Ratio420;
 
-            // Convert from a quality rating to a scaling factor.
-            int scale;
-            if (qlty < 50)
-            {
-                scale = 5000 / qlty;
-            }
-            else
-            {
-                scale = 200 - (qlty * 2);
-            }
-
-            // Initialize the quantization tables.
-            InitQuantizationTable(0, scale, ref this.luminanceQuantTable);
-            if (componentCount > 1)
-            {
-                InitQuantizationTable(1, scale, ref this.chrominanceQuantTable);
-            }
+            YCbCrEncoder<TPixel> scanEncoder = new YCbCrEncoder<TPixel>(stream, componentCount, qlty);
+            this.luminanceQuantTable = scanEncoder.LuminanceQuantizationTable;
+            this.chrominanceQuantTable = scanEncoder.ChrominanceQuantizationTable;
 
             // Write the Start Of Image marker.
             this.WriteApplicationHeader(metadata);
@@ -208,7 +194,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.WriteProfiles(metadata);
 
             // Write the quantization tables.
-            this.WriteDefineQuantizationTables();
+            this.WriteDefineQuantizationTables(ref scanEncoder.LuminanceQuantizationTable, ref scanEncoder.ChrominanceQuantizationTable);
 
             // Write the image dimensions.
             this.WriteStartOfFrame(image.Width, image.Height, componentCount);
@@ -669,7 +655,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// <summary>
         /// Writes the Define Quantization Marker and tables.
         /// </summary>
-        private void WriteDefineQuantizationTables()
+        private void WriteDefineQuantizationTables(ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable)
         {
             // Marker + quantization table lengths
             int markerlen = 2 + (QuantizationTableCount * (1 + Block8x8F.Size));
@@ -681,8 +667,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             byte[] dqt = new byte[dqtCount];
             int offset = 0;
 
-            WriteDataToDqt(dqt, ref offset, QuantIndex.Luminance, ref this.luminanceQuantTable);
-            WriteDataToDqt(dqt, ref offset, QuantIndex.Chrominance, ref this.chrominanceQuantTable);
+            WriteDataToDqt(dqt, ref offset, QuantIndex.Luminance, ref luminanceQuantTable);
+            WriteDataToDqt(dqt, ref offset, QuantIndex.Chrominance, ref chrominanceQuantTable);
 
             this.outputStream.Write(dqt, 0, dqtCount);
         }

From d91fc408bce53d853e01d55c14c1785b6769b350 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 07:47:51 +0300
Subject: [PATCH 11/99] Removed write buffer parameter injection

---
 .../Encoder/YCbCrEncoder{TPixel}.cs           | 54 ++++++++-----------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index 2ef053eb1..6c8183244 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -141,7 +141,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        public void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
+        public void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -178,8 +178,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref temp1,
                         ref temp2,
                         ref onStackLuminanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
+                        ref unzig);
 
                     prevDCCb = this.WriteBlock(
                         QuantIndex.Chrominance,
@@ -188,8 +187,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref temp1,
                         ref temp2,
                         ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
+                        ref unzig);
 
                     prevDCCr = this.WriteBlock(
                         QuantIndex.Chrominance,
@@ -198,8 +196,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref temp1,
                         ref temp2,
                         ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
+                        ref unzig);
                 }
             }
         }
@@ -212,7 +209,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        public void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
+        public void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -259,8 +256,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                             ref temp1,
                             ref temp2,
                             ref onStackLuminanceQuantTable,
-                            ref unzig,
-                            ref emitBufferBase);
+                            ref unzig);
                     }
 
                     Block8x8F.Scale16X16To8X8(ref b, cb);
@@ -271,8 +267,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref temp1,
                         ref temp2,
                         ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
+                        ref unzig);
 
                     Block8x8F.Scale16X16To8X8(ref b, cr);
                     prevDCCr = this.WriteBlock(
@@ -282,8 +277,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref temp1,
                         ref temp2,
                         ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
+                        ref unzig);
                 }
             }
         }
@@ -296,7 +290,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
+        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -332,8 +326,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref temp1,
                         ref temp2,
                         ref onStackLuminanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
+                        ref unzig);
                 }
             }
         }
@@ -360,8 +353,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             ref Block8x8F tempDest1,
             ref Block8x8F tempDest2,
             ref Block8x8F quant,
-            ref ZigZag unZig,
-            ref byte emitBufferBase)
+            ref ZigZag unZig)
         {
             FastFloatingPointDCT.TransformFDCT(ref src, ref tempDest1, ref tempDest2);
 
@@ -370,7 +362,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int dc = (int)tempDest2[0];
 
             // Emit the DC delta.
-            this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC, ref emitBufferBase);
+            this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC);
 
             // Emit the AC components.
             var h = (HuffIndex)((2 * (int)index) + 1);
@@ -388,18 +380,18 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 {
                     while (runLength > 15)
                     {
-                        this.EmitHuff(h, 0xf0, ref emitBufferBase);
+                        this.EmitHuff(h, 0xf0);
                         runLength -= 16;
                     }
 
-                    this.EmitHuffRLE(h, runLength, ac, ref emitBufferBase);
+                    this.EmitHuffRLE(h, runLength, ac);
                     runLength = 0;
                 }
             }
 
             if (runLength > 0)
             {
-                this.EmitHuff(h, 0x00, ref emitBufferBase);
+                this.EmitHuff(h, 0x00);
             }
 
             return dc;
@@ -417,7 +409,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="count">The number of bits</param>
         /// <param name="emitBufferBase">The reference to the emitBuffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void Emit(uint bits, uint count, ref byte emitBufferBase)
+        private void Emit(uint bits, uint count)
         {
             count += this.bitCount;
             bits <<= (int)(32 - count);
@@ -431,10 +423,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 while (count >= 8)
                 {
                     byte b = (byte)(bits >> 24);
-                    Unsafe.Add(ref emitBufferBase, len++) = b;
+                    this.emitBuffer[len++] = b;
                     if (b == byte.MaxValue)
                     {
-                        Unsafe.Add(ref emitBufferBase, len++) = byte.MinValue;
+                        this.emitBuffer[len++] = byte.MinValue;
                     }
 
                     bits <<= 8;
@@ -458,10 +450,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="value">The value to encode.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuff(HuffIndex index, int value, ref byte emitBufferBase)
+        private void EmitHuff(HuffIndex index, int value)
         {
             uint x = HuffmanLut.TheHuffmanLut[(int)index].Values[value];
-            this.Emit(x & ((1 << 24) - 1), x >> 24, ref emitBufferBase);
+            this.Emit(x & ((1 << 24) - 1), x >> 24);
         }
 
         /// <summary>
@@ -472,7 +464,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="value">The value to encode.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuffRLE(HuffIndex index, int runLength, int value, ref byte emitBufferBase)
+        private void EmitHuffRLE(HuffIndex index, int runLength, int value)
         {
             int a = value;
             int b = value;
@@ -492,10 +484,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 bt = 8 + (uint)BitCountLut[a >> 8];
             }
 
-            this.EmitHuff(index, (int)((uint)(runLength << 4) | bt), ref emitBufferBase);
+            this.EmitHuff(index, (int)((uint)(runLength << 4) | bt));
             if (bt > 0)
             {
-                this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt, ref emitBufferBase);
+                this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt);
             }
         }
 

From 66b5a8df67437cb66dad2756e2a598df2aad1385 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 08:07:47 +0300
Subject: [PATCH 12/99] [WIP] Moved SOS writing logic to separate class

---
 .../Encoder/YCbCrEncoder{TPixel}.cs           | 29 ++++++++++--
 .../Formats/Jpeg/JpegEncoderCore.cs           | 44 ++++++++++---------
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index 6c8183244..a8411e218 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -141,7 +141,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        public void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
+        private void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -209,7 +209,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        public void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
+        private void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -290,7 +290,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
+        private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -331,6 +331,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
         }
 
+        public void WriteStartOfScan<TPixel>(Image<TPixel> image, JpegColorType? colorType, JpegSubsample? subsample, CancellationToken cancellationToken)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            if (colorType == JpegColorType.Luminance)
+            {
+                this.EncodeGrayscale(image, cancellationToken);
+            }
+            else
+            {
+                switch (subsample)
+                {
+                    case JpegSubsample.Ratio444:
+                        this.Encode444(image, cancellationToken);
+                        break;
+                    case JpegSubsample.Ratio420:
+                        this.Encode420(image, cancellationToken);
+                        break;
+                }
+            }
+
+            // Pad the last byte with 1's.
+            this.Emit(0x7f, 7);
+        }
 
         /// <summary>
         /// Writes a block of pixel data using the given quantization table,
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 14cb87af3..f1dd7f6bf 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -203,7 +203,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.WriteDefineHuffmanTables(componentCount);
 
             // Write the image data.
-            this.WriteStartOfScan(image, componentCount, cancellationToken);
+            this.WriteStartOfScan(scanEncoder, image, componentCount, cancellationToken);
 
             // Write the End Of Image marker.
             this.buffer[0] = JpegConstants.Markers.XFF;
@@ -969,7 +969,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// <param name="image">The pixel accessor providing access to the image pixels.</param>
         /// <param name="componentCount">The number of components in a pixel.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void WriteStartOfScan<TPixel>(Image<TPixel> image, int componentCount, CancellationToken cancellationToken)
+        private void WriteStartOfScan<TPixel>(YCbCrEncoder<TPixel> scanEncoder, Image<TPixel> image, int componentCount, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -1015,26 +1015,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.buffer[sosSize + 1] = 0x00; // Ah + Ah (Successive approximation bit position high + low)
             this.outputStream.Write(this.buffer, 0, sosSize + 2);
 
-            ref byte emitBufferBase = ref MemoryMarshal.GetReference<byte>(this.emitBuffer);
-            if (this.colorType == JpegColorType.Luminance)
-            {
-                this.EncodeGrayscale(image, cancellationToken, ref emitBufferBase);
-            }
-            else
-            {
-                switch (this.subsample)
-                {
-                    case JpegSubsample.Ratio444:
-                        this.Encode444(image, cancellationToken, ref emitBufferBase);
-                        break;
-                    case JpegSubsample.Ratio420:
-                        this.Encode420(image, cancellationToken, ref emitBufferBase);
-                        break;
-                }
-            }
 
-            // Pad the last byte with 1's.
-            this.Emit(0x7f, 7, ref emitBufferBase);
+            scanEncoder.WriteStartOfScan(image, this.colorType, this.subsample, cancellationToken);
+            //ref byte emitBufferBase = ref MemoryMarshal.GetReference<byte>(this.emitBuffer);
+            //if (this.colorType == JpegColorType.Luminance)
+            //{
+            //    scanEncoder.EncodeGrayscale(image, cancellationToken);
+            //}
+            //else
+            //{
+            //    switch (this.subsample)
+            //    {
+            //        case JpegSubsample.Ratio444:
+            //            scanEncoder.Encode444(image, cancellationToken);
+            //            break;
+            //        case JpegSubsample.Ratio420:
+            //            scanEncoder.Encode420(image, cancellationToken);
+            //            break;
+            //    }
+            //}
+
+            //// Pad the last byte with 1's.
+            //this.Emit(0x7f, 7, ref emitBufferBase);
         }
 
         /// <summary>

From 0d7e4b13f2df0a33bb9e1b36aa7878cf1c82f4a9 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 08:27:41 +0300
Subject: [PATCH 13/99] Removed irrelevant code from JpegEncoderCore

---
 .../Formats/Jpeg/JpegEncoderCore.cs           | 473 ------------------
 1 file changed, 473 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index f1dd7f6bf..019be629b 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -92,67 +92,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.colorType = options.ColorType;
         }
 
-        /// <summary>
-        /// Gets the counts the number of bits needed to hold an integer.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> BitCountLut => new byte[]
-            {
-                0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
-                5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8,
-            };
-
-        /// <summary>
-        /// Gets the unscaled quantization tables in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from section K.1 after converting from natural to
-        /// zig-zag order.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
-            {
-                // Luminance.
-                16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
-                40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
-                57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
-                109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
-                100, 120, 92, 101, 103, 99,
-            };
-
-        /// <summary>
-        /// Gets the unscaled quantization tables in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from section K.1 after converting from natural to
-        /// zig-zag order.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
-            {
-                // Chrominance.
-                17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99,
-            };
-
         /// <summary>
         /// Encode writes the image to the jpeg baseline format with the given options.
         /// </summary>
@@ -228,248 +167,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             }
         }
 
-        /// <summary>
-        /// Initializes quantization table.
-        /// </summary>
-        /// <param name="i">The quantization index.</param>
-        /// <param name="scale">The scaling factor.</param>
-        /// <param name="quant">The quantization table.</param>
-        private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant)
-        {
-            DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i));
-            ReadOnlySpan<byte> unscaledQuant = (i == 0) ? UnscaledQuant_Luminance : UnscaledQuant_Chrominance;
-
-            for (int j = 0; j < Block8x8F.Size; j++)
-            {
-                int x = unscaledQuant[j];
-                x = ((x * scale) + 50) / 100;
-                if (x < 1)
-                {
-                    x = 1;
-                }
-
-                if (x > 255)
-                {
-                    x = 255;
-                }
-
-                quant[j] = x;
-            }
-        }
-
-        /// <summary>
-        /// Emits the least significant count of bits of bits to the bit-stream.
-        /// The precondition is bits
-        /// <example>
-        /// &lt; 1&lt;&lt;nBits &amp;&amp; nBits &lt;= 16
-        /// </example>
-        /// .
-        /// </summary>
-        /// <param name="bits">The packed bits.</param>
-        /// <param name="count">The number of bits</param>
-        /// <param name="emitBufferBase">The reference to the emitBuffer.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void Emit(uint bits, uint count, ref byte emitBufferBase)
-        {
-            count += this.bitCount;
-            bits <<= (int)(32 - count);
-            bits |= this.accumulatedBits;
-
-            // Only write if more than 8 bits.
-            if (count >= 8)
-            {
-                // Track length
-                int len = 0;
-                while (count >= 8)
-                {
-                    byte b = (byte)(bits >> 24);
-                    Unsafe.Add(ref emitBufferBase, len++) = b;
-                    if (b == byte.MaxValue)
-                    {
-                        Unsafe.Add(ref emitBufferBase, len++) = byte.MinValue;
-                    }
-
-                    bits <<= 8;
-                    count -= 8;
-                }
-
-                if (len > 0)
-                {
-                    this.outputStream.Write(this.emitBuffer, 0, len);
-                }
-            }
-
-            this.accumulatedBits = bits;
-            this.bitCount = count;
-        }
-
-        /// <summary>
-        /// Emits the given value with the given Huffman encoder.
-        /// </summary>
-        /// <param name="index">The index of the Huffman encoder</param>
-        /// <param name="value">The value to encode.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuff(HuffIndex index, int value, ref byte emitBufferBase)
-        {
-            uint x = HuffmanLut.TheHuffmanLut[(int)index].Values[value];
-            this.Emit(x & ((1 << 24) - 1), x >> 24, ref emitBufferBase);
-        }
-
-        /// <summary>
-        /// Emits a run of runLength copies of value encoded with the given Huffman encoder.
-        /// </summary>
-        /// <param name="index">The index of the Huffman encoder</param>
-        /// <param name="runLength">The number of copies to encode.</param>
-        /// <param name="value">The value to encode.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuffRLE(HuffIndex index, int runLength, int value, ref byte emitBufferBase)
-        {
-            int a = value;
-            int b = value;
-            if (a < 0)
-            {
-                a = -value;
-                b = value - 1;
-            }
-
-            uint bt;
-            if (a < 0x100)
-            {
-                bt = BitCountLut[a];
-            }
-            else
-            {
-                bt = 8 + (uint)BitCountLut[a >> 8];
-            }
-
-            this.EmitHuff(index, (int)((uint)(runLength << 4) | bt), ref emitBufferBase);
-            if (bt > 0)
-            {
-                this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt, ref emitBufferBase);
-            }
-        }
-
-        /// <summary>
-        /// Encodes the image with no subsampling.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        private void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            // (Partially done with YCbCrForwardConverter<TPixel>)
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            // ReSharper disable once InconsistentNaming
-            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
-
-            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
-            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
-            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
-            RowOctet<TPixel> currentRows = default;
-
-            for (int y = 0; y < pixels.Height; y += 8)
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-                currentRows.Update(pixelBuffer, y);
-
-                for (int x = 0; x < pixels.Width; x += 8)
-                {
-                    pixelConverter.Convert(frame, x, y, ref currentRows);
-
-                    prevDCY = this.WriteBlock(
-                        QuantIndex.Luminance,
-                        prevDCY,
-                        ref pixelConverter.Y,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackLuminanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-
-                    prevDCCb = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCb,
-                        ref pixelConverter.Cb,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-
-                    prevDCCr = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCr,
-                        ref pixelConverter.Cr,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-                }
-            }
-        }
-
-        /// <summary>
-        /// Encodes the image with no chroma, just luminance.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            // (Partially done with YCbCrForwardConverter<TPixel>)
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            // ReSharper disable once InconsistentNaming
-            int prevDCY = 0;
-
-            var pixelConverter = LuminanceForwardConverter<TPixel>.Create();
-            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
-            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
-            RowOctet<TPixel> currentRows = default;
-
-            for (int y = 0; y < pixels.Height; y += 8)
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-                currentRows.Update(pixelBuffer, y);
-
-                for (int x = 0; x < pixels.Width; x += 8)
-                {
-                    pixelConverter.Convert(frame, x, y, ref currentRows);
-
-                    prevDCY = this.WriteBlock(
-                        QuantIndex.Luminance,
-                        prevDCY,
-                        ref pixelConverter.Y,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackLuminanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-                }
-            }
-        }
-
         /// <summary>
         /// Writes the application header containing the JFIF identifier plus extra data.
         /// </summary>
@@ -519,72 +216,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.outputStream.Write(this.buffer, 0, 20);
         }
 
-        /// <summary>
-        /// Writes a block of pixel data using the given quantization table,
-        /// returning the post-quantized DC value of the DCT-transformed block.
-        /// The block is in natural (not zig-zag) order.
-        /// </summary>
-        /// <param name="index">The quantization table index.</param>
-        /// <param name="prevDC">The previous DC value.</param>
-        /// <param name="src">Source block</param>
-        /// <param name="tempDest1">Temporal block to be used as FDCT Destination</param>
-        /// <param name="tempDest2">Temporal block 2</param>
-        /// <param name="quant">Quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        /// <returns>The <see cref="int"/>.</returns>
-        private int WriteBlock(
-            QuantIndex index,
-            int prevDC,
-            ref Block8x8F src,
-            ref Block8x8F tempDest1,
-            ref Block8x8F tempDest2,
-            ref Block8x8F quant,
-            ref ZigZag unZig,
-            ref byte emitBufferBase)
-        {
-            FastFloatingPointDCT.TransformFDCT(ref src, ref tempDest1, ref tempDest2);
-
-            Block8x8F.Quantize(ref tempDest1, ref tempDest2, ref quant, ref unZig);
-
-            int dc = (int)tempDest2[0];
-
-            // Emit the DC delta.
-            this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC, ref emitBufferBase);
-
-            // Emit the AC components.
-            var h = (HuffIndex)((2 * (int)index) + 1);
-            int runLength = 0;
-
-            for (int zig = 1; zig < Block8x8F.Size; zig++)
-            {
-                int ac = (int)tempDest2[zig];
-
-                if (ac == 0)
-                {
-                    runLength++;
-                }
-                else
-                {
-                    while (runLength > 15)
-                    {
-                        this.EmitHuff(h, 0xf0, ref emitBufferBase);
-                        runLength -= 16;
-                    }
-
-                    this.EmitHuffRLE(h, runLength, ac, ref emitBufferBase);
-                    runLength = 0;
-                }
-            }
-
-            if (runLength > 0)
-            {
-                this.EmitHuff(h, 0x00, ref emitBufferBase);
-            }
-
-            return dc;
-        }
-
         /// <summary>
         /// Writes the Define Huffman Table marker and tables.
         /// </summary>
@@ -1017,110 +648,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
 
 
             scanEncoder.WriteStartOfScan(image, this.colorType, this.subsample, cancellationToken);
-            //ref byte emitBufferBase = ref MemoryMarshal.GetReference<byte>(this.emitBuffer);
-            //if (this.colorType == JpegColorType.Luminance)
-            //{
-            //    scanEncoder.EncodeGrayscale(image, cancellationToken);
-            //}
-            //else
-            //{
-            //    switch (this.subsample)
-            //    {
-            //        case JpegSubsample.Ratio444:
-            //            scanEncoder.Encode444(image, cancellationToken);
-            //            break;
-            //        case JpegSubsample.Ratio420:
-            //            scanEncoder.Encode420(image, cancellationToken);
-            //            break;
-            //    }
-            //}
-
-            //// Pad the last byte with 1's.
-            //this.Emit(0x7f, 7, ref emitBufferBase);
-        }
-
-        /// <summary>
-        /// Encodes the image with subsampling. The Cb and Cr components are each subsampled
-        /// at a factor of 2 both horizontally and vertically.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        private void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            Block8x8F b = default;
-            Span<Block8x8F> cb = stackalloc Block8x8F[4];
-            Span<Block8x8F> cr = stackalloc Block8x8F[4];
-
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
-
-            // ReSharper disable once InconsistentNaming
-            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
-            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
-            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
-            RowOctet<TPixel> currentRows = default;
-
-            for (int y = 0; y < pixels.Height; y += 16)
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-                for (int x = 0; x < pixels.Width; x += 16)
-                {
-                    for (int i = 0; i < 4; i++)
-                    {
-                        int xOff = (i & 1) * 8;
-                        int yOff = (i & 2) * 4;
-
-                        currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert(frame, x + xOff, y + yOff, ref currentRows);
-
-                        cb[i] = pixelConverter.Cb;
-                        cr[i] = pixelConverter.Cr;
-
-                        prevDCY = this.WriteBlock(
-                            QuantIndex.Luminance,
-                            prevDCY,
-                            ref pixelConverter.Y,
-                            ref temp1,
-                            ref temp2,
-                            ref onStackLuminanceQuantTable,
-                            ref unzig,
-                            ref emitBufferBase);
-                    }
-
-                    Block8x8F.Scale16X16To8X8(ref b, cb);
-                    prevDCCb = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCb,
-                        ref b,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-
-                    Block8x8F.Scale16X16To8X8(ref b, cr);
-                    prevDCCr = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCr,
-                        ref b,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-                }
-            }
         }
 
         /// <summary>

From d593479a8d692e3bdb593c658acbce4ce33f9d29 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 08:34:26 +0300
Subject: [PATCH 14/99] Removed remaining irrelevant code from JpegEncoderCore

---
 .../Formats/Jpeg/JpegEncoderCore.cs           | 27 -------------------
 1 file changed, 27 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 019be629b..2625d490c 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -36,11 +36,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// </summary>
         private readonly byte[] buffer = new byte[20];
 
-        /// <summary>
-        /// A buffer for reducing the number of stream writes when emitting Huffman tables. 64 seems to be enough.
-        /// </summary>
-        private readonly byte[] emitBuffer = new byte[64];
-
         /// <summary>
         /// Gets or sets the subsampling method to use.
         /// </summary>
@@ -56,26 +51,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// </summary>
         private readonly JpegColorType? colorType;
 
-        /// <summary>
-        /// The accumulated bits to write to the stream.
-        /// </summary>
-        private uint accumulatedBits;
-
-        /// <summary>
-        /// The accumulated bit count.
-        /// </summary>
-        private uint bitCount;
-
-        /// <summary>
-        /// The scaled chrominance table, in zig-zag order.
-        /// </summary>
-        private Block8x8F chrominanceQuantTable;
-
-        /// <summary>
-        /// The scaled luminance table, in zig-zag order.
-        /// </summary>
-        private Block8x8F luminanceQuantTable;
-
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
         /// </summary>
@@ -123,8 +98,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.subsample ??= qlty >= 91 ? JpegSubsample.Ratio444 : JpegSubsample.Ratio420;
 
             YCbCrEncoder<TPixel> scanEncoder = new YCbCrEncoder<TPixel>(stream, componentCount, qlty);
-            this.luminanceQuantTable = scanEncoder.LuminanceQuantizationTable;
-            this.chrominanceQuantTable = scanEncoder.ChrominanceQuantizationTable;
 
             // Write the Start Of Image marker.
             this.WriteApplicationHeader(metadata);

From 296ee10c91f008c2627fe96b0e800e9eda7fffe9 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 11:43:30 +0300
Subject: [PATCH 15/99] Optimized jpeg encoder stream Write calls by a lot ->
 huge performance gain

---
 .../Encoder/YCbCrEncoder{TPixel}.cs           | 24 +++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index a8411e218..7412b4d91 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -14,10 +14,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     internal class YCbCrEncoder<TPixel>
     {
+        private const int EmitBufferSizeInBytes = 1024;
+
         /// <summary>
         /// A buffer for reducing the number of stream writes when emitting Huffman tables. 64 seems to be enough.
         /// </summary>
-        private byte[] emitBuffer = new byte[64];
+        private byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
 
         /// <summary>
         /// The accumulated bits to write to the stream.
@@ -353,6 +355,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             // Pad the last byte with 1's.
             this.Emit(0x7f, 7);
+            this.outputStream.Write(this.emitBuffer, 0, this.emitLen);
         }
 
         /// <summary>
@@ -420,8 +423,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             return dc;
         }
 
+        private int emitLen = 0;
+
         /// <summary>
-        /// Emits the least significant count of bits of bits to the bit-stream.
+        /// Emits the least significant count of bits to the stream write buffer.
         /// The precondition is bits
         /// <example>
         /// &lt; 1&lt;&lt;nBits &amp;&amp; nBits &lt;= 16
@@ -442,23 +447,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             if (count >= 8)
             {
                 // Track length
-                int len = 0;
                 while (count >= 8)
                 {
                     byte b = (byte)(bits >> 24);
-                    this.emitBuffer[len++] = b;
+                    this.emitBuffer[this.emitLen++] = b;
                     if (b == byte.MaxValue)
                     {
-                        this.emitBuffer[len++] = byte.MinValue;
+                        this.emitBuffer[this.emitLen++] = byte.MinValue;
                     }
 
                     bits <<= 8;
                     count -= 8;
                 }
 
-                if (len > 0)
+                // This can emit 4 times of:
+                // 1 byte guaranteed
+                // 1 extra byte.MinValue byte if previous one was byte.MaxValue
+                // Thus writing (1 + 1) * 4 = 8 bytes max
+                // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write
+                if (this.emitLen > EmitBufferSizeInBytes - 8)
                 {
-                    this.outputStream.Write(this.emitBuffer, 0, len);
+                    this.outputStream.Write(this.emitBuffer, 0, this.emitLen);
+                    this.emitLen = 0;
                 }
             }
 

From 56822d1bcc1f19c58601bc3e1ae541d8203e658d Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 11:46:53 +0300
Subject: [PATCH 16/99] Removed obsolete parameter config from various methods

---
 .../Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs        | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index 7412b4d91..d5bf797bb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -142,7 +142,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         private void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
@@ -210,7 +209,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         private void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
@@ -291,7 +289,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
@@ -370,7 +367,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="tempDest2">Temporal block 2</param>
         /// <param name="quant">Quantization table</param>
         /// <param name="unZig">The 8x8 Unzig block.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         /// <returns>The <see cref="int"/>.</returns>
         private int WriteBlock(
             QuantIndex index,
@@ -435,7 +431,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <param name="bits">The packed bits.</param>
         /// <param name="count">The number of bits</param>
-        /// <param name="emitBufferBase">The reference to the emitBuffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void Emit(uint bits, uint count)
         {
@@ -481,7 +476,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <param name="index">The index of the Huffman encoder</param>
         /// <param name="value">The value to encode.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void EmitHuff(HuffIndex index, int value)
         {
@@ -495,7 +489,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="index">The index of the Huffman encoder</param>
         /// <param name="runLength">The number of copies to encode.</param>
         /// <param name="value">The value to encode.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         private void EmitHuffRLE(HuffIndex index, int runLength, int value)
         {

From 690e80cf69800038debc08856e2bfe4a3254a60f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 12:29:11 +0300
Subject: [PATCH 17/99] YCbCrEncoder now has builtin temporal 8x8F blocks for
 internal calculations

---
 .../Encoder/YCbCrEncoder{TPixel}.cs           | 38 +++++--------------
 1 file changed, 10 insertions(+), 28 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index d5bf797bb..5b63d0588 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -41,6 +41,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private Block8x8F luminanceQuantTable;
 
+        private Block8x8F temporalBlock1;
+        private Block8x8F temporalBlock2;
+
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
         /// </summary>
@@ -145,11 +148,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            // (Partially done with YCbCrForwardConverter<TPixel>)
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
             Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
             Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
 
@@ -176,8 +174,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCY,
                         ref pixelConverter.Y,
-                        ref temp1,
-                        ref temp2,
                         ref onStackLuminanceQuantTable,
                         ref unzig);
 
@@ -185,8 +181,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Chrominance,
                         prevDCCb,
                         ref pixelConverter.Cb,
-                        ref temp1,
-                        ref temp2,
                         ref onStackChrominanceQuantTable,
                         ref unzig);
 
@@ -194,8 +188,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Chrominance,
                         prevDCCr,
                         ref pixelConverter.Cr,
-                        ref temp1,
-                        ref temp2,
                         ref onStackChrominanceQuantTable,
                         ref unzig);
                 }
@@ -217,9 +209,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             Span<Block8x8F> cb = stackalloc Block8x8F[4];
             Span<Block8x8F> cr = stackalloc Block8x8F[4];
 
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
             Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
             Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
 
@@ -253,8 +242,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                             QuantIndex.Luminance,
                             prevDCY,
                             ref pixelConverter.Y,
-                            ref temp1,
-                            ref temp2,
                             ref onStackLuminanceQuantTable,
                             ref unzig);
                     }
@@ -264,8 +251,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Chrominance,
                         prevDCCb,
                         ref b,
-                        ref temp1,
-                        ref temp2,
                         ref onStackChrominanceQuantTable,
                         ref unzig);
 
@@ -274,8 +259,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Chrominance,
                         prevDCCr,
                         ref b,
-                        ref temp1,
-                        ref temp2,
                         ref onStackChrominanceQuantTable,
                         ref unzig);
                 }
@@ -322,8 +305,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCY,
                         ref pixelConverter.Y,
-                        ref temp1,
-                        ref temp2,
                         ref onStackLuminanceQuantTable,
                         ref unzig);
                 }
@@ -372,16 +353,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             QuantIndex index,
             int prevDC,
             ref Block8x8F src,
-            ref Block8x8F tempDest1,
-            ref Block8x8F tempDest2,
             ref Block8x8F quant,
             ref ZigZag unZig)
         {
-            FastFloatingPointDCT.TransformFDCT(ref src, ref tempDest1, ref tempDest2);
+            ref Block8x8F refTemp1 = ref this.temporalBlock1;
+            ref Block8x8F refTemp2 = ref this.temporalBlock2;
+
+            FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
 
-            Block8x8F.Quantize(ref tempDest1, ref tempDest2, ref quant, ref unZig);
+            Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig);
 
-            int dc = (int)tempDest2[0];
+            int dc = (int)refTemp2[0];
 
             // Emit the DC delta.
             this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC);
@@ -392,7 +374,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             for (int zig = 1; zig < Block8x8F.Size; zig++)
             {
-                int ac = (int)tempDest2[zig];
+                int ac = (int)refTemp2[zig];
 
                 if (ac == 0)
                 {

From b3a993806c64331c633ce154b53590a4f48e8bf6 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 13:06:51 +0300
Subject: [PATCH 18/99] Updated & fixed xml documentation

---
 .../Encoder/YCbCrEncoder{TPixel}.cs           | 33 ++++++++++++-------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index 5b63d0588..a10f40b09 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -13,21 +13,34 @@ using SixLabors.ImageSharp.PixelFormats;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     internal class YCbCrEncoder<TPixel>
+        where TPixel : unmanaged, IPixel<TPixel>
     {
+        /// <summary>
+        /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count).
+        /// </summary>
+        /// <remarks>
+        /// This is subject to change, 1024 seems to be the best value in terms of performance.
+        /// <see cref="YCbCrEncoder{TPixel}.Emit(uint, uint)"/> expects it to be at least 8 (see comments in method body).
+        /// </remarks>
         private const int EmitBufferSizeInBytes = 1024;
 
         /// <summary>
-        /// A buffer for reducing the number of stream writes when emitting Huffman tables. 64 seems to be enough.
+        /// A buffer for reducing the number of stream writes when emitting Huffman tables.
         /// </summary>
         private byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
 
         /// <summary>
-        /// The accumulated bits to write to the stream.
+        /// Number of filled bytes in <see cref="emitBuffer"/> buffer
+        /// </summary>
+        private int emitLen = 0;
+
+        /// <summary>
+        /// Emmited bits 'micro buffer' before being transfered to the <see cref="YCbCrEncoder{TPixel}.emitBuffer"/>.
         /// </summary>
         private uint accumulatedBits;
 
         /// <summary>
-        /// The accumulated bit count.
+        /// Number of jagged bits stored in <see cref="accumulatedBits"/>
         /// </summary>
         private uint bitCount;
 
@@ -44,10 +57,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private Block8x8F temporalBlock1;
         private Block8x8F temporalBlock2;
 
+        private ImageFrame<TPixel> source;
+
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
         /// </summary>
-        private Stream outputStream;
+        private Stream target;
 
         /// <summary>
         /// Gets the counts the number of bits needed to hold an integer.
@@ -118,7 +133,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
         public YCbCrEncoder(Stream outputStream, int componentCount, int quality)
         {
-            this.outputStream = outputStream;
+            this.target = outputStream;
 
             // Convert from a quality rating to a scaling factor.
             int scale;
@@ -333,7 +348,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             // Pad the last byte with 1's.
             this.Emit(0x7f, 7);
-            this.outputStream.Write(this.emitBuffer, 0, this.emitLen);
+            this.target.Write(this.emitBuffer, 0, this.emitLen);
         }
 
         /// <summary>
@@ -344,8 +359,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="index">The quantization table index.</param>
         /// <param name="prevDC">The previous DC value.</param>
         /// <param name="src">Source block</param>
-        /// <param name="tempDest1">Temporal block to be used as FDCT Destination</param>
-        /// <param name="tempDest2">Temporal block 2</param>
         /// <param name="quant">Quantization table</param>
         /// <param name="unZig">The 8x8 Unzig block.</param>
         /// <returns>The <see cref="int"/>.</returns>
@@ -401,8 +414,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             return dc;
         }
 
-        private int emitLen = 0;
-
         /// <summary>
         /// Emits the least significant count of bits to the stream write buffer.
         /// The precondition is bits
@@ -444,7 +455,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write
                 if (this.emitLen > EmitBufferSizeInBytes - 8)
                 {
-                    this.outputStream.Write(this.emitBuffer, 0, this.emitLen);
+                    this.target.Write(this.emitBuffer, 0, this.emitLen);
                     this.emitLen = 0;
                 }
             }

From 4e73471d96f1ed4c6078f75bc4d1b4f14a342ed7 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 13:09:08 +0300
Subject: [PATCH 19/99] Small QoL fixes

---
 .../Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs  | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index a10f40b09..051acf0e8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -2,18 +2,15 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Collections.Generic;
 using System.IO;
 using System.Runtime.CompilerServices;
-using System.Text;
 using System.Threading;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
-    internal class YCbCrEncoder<TPixel>
-        where TPixel : unmanaged, IPixel<TPixel>
+    internal class YCbCrEncoder
     {
         /// <summary>
         /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count).
@@ -57,8 +54,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private Block8x8F temporalBlock1;
         private Block8x8F temporalBlock2;
 
-        private ImageFrame<TPixel> source;
-
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
         /// </summary>
@@ -290,11 +285,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            // (Partially done with YCbCrForwardConverter<TPixel>)
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
             Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
 
             var unzig = ZigZag.CreateUnzigTable();

From 368f89e4509a053a35c5b52d9fc679ba6163c10a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 16:11:17 +0300
Subject: [PATCH 20/99] Moved quantization table initialization logic to
 JpegEncoderCore

---
 .../Encoder/YCbCrEncoder{TPixel}.cs           | 146 +++---------------
 .../Formats/Jpeg/JpegEncoderCore.cs           | 110 ++++++++++++-
 2 files changed, 123 insertions(+), 133 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index 051acf0e8..db2a3c354 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -41,16 +41,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private uint bitCount;
 
-        /// <summary>
-        /// The scaled chrominance table, in zig-zag order.
-        /// </summary>
-        private Block8x8F chrominanceQuantTable;
-
-        /// <summary>
-        /// The scaled luminance table, in zig-zag order.
-        /// </summary>
-        private Block8x8F luminanceQuantTable;
-
         private Block8x8F temporalBlock1;
         private Block8x8F temporalBlock2;
 
@@ -82,71 +72,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 8, 8, 8,
             };
 
-        /// <summary>
-        /// Gets the unscaled quantization tables in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from section K.1 after converting from natural to
-        /// zig-zag order.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
-            {
-                // Luminance.
-                16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
-                40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
-                57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
-                109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
-                100, 120, 92, 101, 103, 99,
-            };
-
-        /// <summary>
-        /// Gets the unscaled quantization tables in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from section K.1 after converting from natural to
-        /// zig-zag order.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
-            {
-                // Chrominance.
-                17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99,
-            };
-
-
-        public ref Block8x8F ChrominanceQuantizationTable => ref this.chrominanceQuantTable;
-
-        public ref Block8x8F LuminanceQuantizationTable => ref this.luminanceQuantTable;
-
-
-        public YCbCrEncoder(Stream outputStream, int componentCount, int quality)
+        public YCbCrEncoder(Stream outputStream)
         {
             this.target = outputStream;
-
-            // Convert from a quality rating to a scaling factor.
-            int scale;
-            if (quality < 50)
-            {
-                scale = 5000 / quality;
-            }
-            else
-            {
-                scale = 200 - (quality * 2);
-            }
-
-            // Initialize the quantization tables.
-            InitQuantizationTable(0, scale, ref this.luminanceQuantTable);
-            if (componentCount > 1)
-            {
-                InitQuantizationTable(1, scale, ref this.chrominanceQuantTable);
-            }
         }
 
         /// <summary>
@@ -155,12 +83,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
+        private void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
-
             var unzig = ZigZag.CreateUnzigTable();
 
             // ReSharper disable once InconsistentNaming
@@ -184,21 +109,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCY,
                         ref pixelConverter.Y,
-                        ref onStackLuminanceQuantTable,
+                        ref luminanceQuantTable,
                         ref unzig);
 
                     prevDCCb = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCb,
                         ref pixelConverter.Cb,
-                        ref onStackChrominanceQuantTable,
+                        ref chrominanceQuantTable,
                         ref unzig);
 
                     prevDCCr = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCr,
                         ref pixelConverter.Cr,
-                        ref onStackChrominanceQuantTable,
+                        ref chrominanceQuantTable,
                         ref unzig);
                 }
             }
@@ -211,7 +136,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
+        private void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -219,9 +144,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             Span<Block8x8F> cb = stackalloc Block8x8F[4];
             Span<Block8x8F> cr = stackalloc Block8x8F[4];
 
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
-
             var unzig = ZigZag.CreateUnzigTable();
 
             var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
@@ -252,7 +174,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                             QuantIndex.Luminance,
                             prevDCY,
                             ref pixelConverter.Y,
-                            ref onStackLuminanceQuantTable,
+                            ref luminanceQuantTable,
                             ref unzig);
                     }
 
@@ -261,7 +183,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Chrominance,
                         prevDCCb,
                         ref b,
-                        ref onStackChrominanceQuantTable,
+                        ref chrominanceQuantTable,
                         ref unzig);
 
                     Block8x8F.Scale16X16To8X8(ref b, cr);
@@ -269,7 +191,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Chrominance,
                         prevDCCr,
                         ref b,
-                        ref onStackChrominanceQuantTable,
+                        ref chrominanceQuantTable,
                         ref unzig);
                 }
             }
@@ -282,11 +204,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken)
+        private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-
             var unzig = ZigZag.CreateUnzigTable();
 
             // ReSharper disable once InconsistentNaming
@@ -310,28 +230,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         QuantIndex.Luminance,
                         prevDCY,
                         ref pixelConverter.Y,
-                        ref onStackLuminanceQuantTable,
+                        ref luminanceQuantTable,
                         ref unzig);
                 }
             }
         }
 
-        public void WriteStartOfScan<TPixel>(Image<TPixel> image, JpegColorType? colorType, JpegSubsample? subsample, CancellationToken cancellationToken)
+        public void WriteStartOfScan<TPixel>(
+            Image<TPixel> image,
+            JpegColorType? colorType,
+            JpegSubsample? subsample,
+            ref Block8x8F luminanceQuantTable,
+            ref Block8x8F chrominanceTable,
+            CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             if (colorType == JpegColorType.Luminance)
             {
-                this.EncodeGrayscale(image, cancellationToken);
+                this.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
             }
             else
             {
                 switch (subsample)
                 {
                     case JpegSubsample.Ratio444:
-                        this.Encode444(image, cancellationToken);
+                        this.Encode444(image, ref luminanceQuantTable, ref chrominanceTable, cancellationToken);
                         break;
                     case JpegSubsample.Ratio420:
-                        this.Encode420(image, cancellationToken);
+                        this.Encode420(image, ref luminanceQuantTable, ref chrominanceTable, cancellationToken);
                         break;
                 }
             }
@@ -499,35 +425,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt);
             }
         }
-
-
-        /// <summary>
-        /// Initializes quantization table.
-        /// </summary>
-        /// <param name="i">The quantization index.</param>
-        /// <param name="scale">The scaling factor.</param>
-        /// <param name="quant">The quantization table.</param>
-        private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant)
-        {
-            DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i));
-            ReadOnlySpan<byte> unscaledQuant = (i == 0) ? UnscaledQuant_Luminance : UnscaledQuant_Chrominance;
-
-            for (int j = 0; j < Block8x8F.Size; j++)
-            {
-                int x = unscaledQuant[j];
-                x = ((x * scale) + 50) / 100;
-                if (x < 1)
-                {
-                    x = 1;
-                }
-
-                if (x > 255)
-                {
-                    x = 255;
-                }
-
-                quant[j] = x;
-            }
-        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 2625d490c..6b58ef483 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -31,6 +31,45 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// </summary>
         private const int QuantizationTableCount = 2;
 
+        /// <summary>
+        /// Gets the unscaled quantization tables in zig-zag order. Each
+        /// encoder copies and scales the tables according to its quality parameter.
+        /// The values are derived from section K.1 after converting from natural to
+        /// zig-zag order.
+        /// </summary>
+        // The C# compiler emits this as a compile-time constant embedded in the PE file.
+        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
+        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
+        private static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
+            {
+                // Luminance.
+                16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
+                40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
+                57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
+                109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
+                100, 120, 92, 101, 103, 99,
+            };
+
+        /// <summary>
+        /// Gets the unscaled quantization tables in zig-zag order. Each
+        /// encoder copies and scales the tables according to its quality parameter.
+        /// The values are derived from section K.1 after converting from natural to
+        /// zig-zag order.
+        /// </summary>
+        // The C# compiler emits this as a compile-time constant embedded in the PE file.
+        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
+        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
+        private static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
+            {
+                // Chrominance.
+                17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99,
+            };
+
+
         /// <summary>
         /// A scratch buffer to reduce allocations.
         /// </summary>
@@ -97,7 +136,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             int qlty = Numerics.Clamp(this.quality ?? metadata.GetJpegMetadata().Quality, 1, 100);
             this.subsample ??= qlty >= 91 ? JpegSubsample.Ratio444 : JpegSubsample.Ratio420;
 
-            YCbCrEncoder<TPixel> scanEncoder = new YCbCrEncoder<TPixel>(stream, componentCount, qlty);
+            // Convert from a quality rating to a scaling factor.
+            int scale;
+            if (qlty < 50)
+            {
+                scale = 5000 / qlty;
+            }
+            else
+            {
+                scale = 200 - (qlty * 2);
+            }
+
+            // Initialize the quantization tables.
+            // TODO: This looks ugly, should we write chrominance table for luminance-only images?
+            // If not - this code can be simplified
+            Block8x8F luminanceQuantTable = default;
+            Block8x8F chrominanceQuantTable = default;
+            InitQuantizationTable(0, scale, ref luminanceQuantTable);
+            if (componentCount > 1)
+            {
+                InitQuantizationTable(1, scale, ref chrominanceQuantTable);
+            }
 
             // Write the Start Of Image marker.
             this.WriteApplicationHeader(metadata);
@@ -106,7 +165,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.WriteProfiles(metadata);
 
             // Write the quantization tables.
-            this.WriteDefineQuantizationTables(ref scanEncoder.LuminanceQuantizationTable, ref scanEncoder.ChrominanceQuantizationTable);
+            this.WriteDefineQuantizationTables(ref luminanceQuantTable, ref chrominanceQuantTable);
 
             // Write the image dimensions.
             this.WriteStartOfFrame(image.Width, image.Height, componentCount);
@@ -114,8 +173,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             // Write the Huffman tables.
             this.WriteDefineHuffmanTables(componentCount);
 
-            // Write the image data.
-            this.WriteStartOfScan(scanEncoder, image, componentCount, cancellationToken);
+            // Write the scan header.
+            this.WriteStartOfScan(image, componentCount, cancellationToken);
+
+            // Write the scan compressed data.
+            new YCbCrEncoder(stream).WriteStartOfScan(
+                image,
+                this.colorType,
+                this.subsample,
+                ref luminanceQuantTable,
+                ref chrominanceQuantTable,
+                cancellationToken);
 
             // Write the End Of Image marker.
             this.buffer[0] = JpegConstants.Markers.XFF;
@@ -573,7 +641,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         /// <param name="image">The pixel accessor providing access to the image pixels.</param>
         /// <param name="componentCount">The number of components in a pixel.</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void WriteStartOfScan<TPixel>(YCbCrEncoder<TPixel> scanEncoder, Image<TPixel> image, int componentCount, CancellationToken cancellationToken)
+        private void WriteStartOfScan<TPixel>(Image<TPixel> image, int componentCount, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -618,9 +686,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.buffer[sosSize] = 0x3f; // Se - End of spectral selection.
             this.buffer[sosSize + 1] = 0x00; // Ah + Ah (Successive approximation bit position high + low)
             this.outputStream.Write(this.buffer, 0, sosSize + 2);
-
-
-            scanEncoder.WriteStartOfScan(image, this.colorType, this.subsample, cancellationToken);
         }
 
         /// <summary>
@@ -637,5 +702,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.buffer[3] = (byte)(length & 0xff);
             this.outputStream.Write(this.buffer, 0, 4);
         }
+
+        /// <summary>
+        /// Initializes quantization table.
+        /// </summary>
+        /// <param name="i">The quantization index.</param>
+        /// <param name="scale">The scaling factor.</param>
+        /// <param name="quant">The quantization table.</param>
+        private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant)
+        {
+            DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i));
+            ReadOnlySpan<byte> unscaledQuant = (i == 0) ? UnscaledQuant_Luminance : UnscaledQuant_Chrominance;
+
+            for (int j = 0; j < Block8x8F.Size; j++)
+            {
+                int x = unscaledQuant[j];
+                x = ((x * scale) + 50) / 100;
+                if (x < 1)
+                {
+                    x = 1;
+                }
+
+                if (x > 255)
+                {
+                    x = 255;
+                }
+
+                quant[j] = x;
+            }
+        }
     }
 }

From 9d7adb6bf795a2941057ea20c335e9a747861078 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 16:14:38 +0300
Subject: [PATCH 21/99] Fixed comments

---
 .../Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs      | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
index db2a3c354..8256348a8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
@@ -17,7 +17,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <remarks>
         /// This is subject to change, 1024 seems to be the best value in terms of performance.
-        /// <see cref="YCbCrEncoder{TPixel}.Emit(uint, uint)"/> expects it to be at least 8 (see comments in method body).
+        /// <see cref="Emit(uint, uint)"/> expects it to be at least 8 (see comments in method body).
         /// </remarks>
         private const int EmitBufferSizeInBytes = 1024;
 
@@ -32,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private int emitLen = 0;
 
         /// <summary>
-        /// Emmited bits 'micro buffer' before being transfered to the <see cref="YCbCrEncoder{TPixel}.emitBuffer"/>.
+        /// Emitted bits 'micro buffer' before being transferred to the <see cref="emitBuffer"/>.
         /// </summary>
         private uint accumulatedBits;
 
@@ -82,6 +82,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
+        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the callee</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         private void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
@@ -135,6 +137,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
+        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the callee</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         private void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
@@ -203,6 +207,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
         private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>

From 3380bdf0d017dad810521d7e30197289f6495147 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 16:15:27 +0300
Subject: [PATCH 22/99] Renamed YCbCrEncoder to HuffmanScanEncoder as it is in
 decoding logic

---
 .../{YCbCrEncoder{TPixel}.cs => HuffmanScanEncoder.cs}        | 4 ++--
 src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs                | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename src/ImageSharp/Formats/Jpeg/Components/Encoder/{YCbCrEncoder{TPixel}.cs => HuffmanScanEncoder.cs} (99%)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
similarity index 99%
rename from src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
rename to src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 8256348a8..72300e6fb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrEncoder{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -10,7 +10,7 @@ using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
-    internal class YCbCrEncoder
+    internal class HuffmanScanEncoder
     {
         /// <summary>
         /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count).
@@ -72,7 +72,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 8, 8, 8,
             };
 
-        public YCbCrEncoder(Stream outputStream)
+        public HuffmanScanEncoder(Stream outputStream)
         {
             this.target = outputStream;
         }
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 6b58ef483..e9a5f7e02 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -177,7 +177,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.WriteStartOfScan(image, componentCount, cancellationToken);
 
             // Write the scan compressed data.
-            new YCbCrEncoder(stream).WriteStartOfScan(
+            new HuffmanScanEncoder(stream).WriteStartOfScan(
                 image,
                 this.colorType,
                 this.subsample,

From 7e0a317461e8eba128c97bb205396d71ae687a6d Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 16:54:09 +0300
Subject: [PATCH 23/99] Moved encode method choice to the JpegEncoderCore

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 41 +++++--------------
 .../Formats/Jpeg/JpegEncoderCore.cs           | 35 +++++++++-------
 2 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 72300e6fb..0b05b955d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -85,7 +85,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
         /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the callee</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
+        public void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             var unzig = ZigZag.CreateUnzigTable();
@@ -129,6 +129,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref unzig);
                 }
             }
+
+            // Pad the last byte with 1's.
+            this.Emit(0x7f, 7);
+            this.target.Write(this.emitBuffer, 0, this.emitLen);
         }
 
         /// <summary>
@@ -140,7 +144,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
         /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the callee</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
+        public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
@@ -199,6 +203,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref unzig);
                 }
             }
+
+            // Pad the last byte with 1's.
+            this.Emit(0x7f, 7);
+            this.target.Write(this.emitBuffer, 0, this.emitLen);
         }
 
 
@@ -209,7 +217,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
         /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
         /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
+        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             var unzig = ZigZag.CreateUnzigTable();
@@ -239,33 +247,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref unzig);
                 }
             }
-        }
-
-        public void WriteStartOfScan<TPixel>(
-            Image<TPixel> image,
-            JpegColorType? colorType,
-            JpegSubsample? subsample,
-            ref Block8x8F luminanceQuantTable,
-            ref Block8x8F chrominanceTable,
-            CancellationToken cancellationToken)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            if (colorType == JpegColorType.Luminance)
-            {
-                this.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
-            }
-            else
-            {
-                switch (subsample)
-                {
-                    case JpegSubsample.Ratio444:
-                        this.Encode444(image, ref luminanceQuantTable, ref chrominanceTable, cancellationToken);
-                        break;
-                    case JpegSubsample.Ratio420:
-                        this.Encode420(image, ref luminanceQuantTable, ref chrominanceTable, cancellationToken);
-                        break;
-                }
-            }
 
             // Pad the last byte with 1's.
             this.Emit(0x7f, 7);
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index e9a5f7e02..9ff334453 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -86,9 +86,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         private readonly int? quality;
 
         /// <summary>
-        /// Gets or sets the subsampling method to use.
+        /// Component count.
         /// </summary>
-        private readonly JpegColorType? colorType;
+        private readonly int componentCount;
 
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
@@ -103,7 +103,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         {
             this.quality = options.Quality;
             this.subsample = options.Subsample;
-            this.colorType = options.ColorType;
+            this.componentCount = (options.ColorType == JpegColorType.Luminance) ? 1 : 3;
         }
 
         /// <summary>
@@ -129,9 +129,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.outputStream = stream;
             ImageMetadata metadata = image.Metadata;
 
-            // Compute number of components based on color type in options.
-            int componentCount = (this.colorType == JpegColorType.Luminance) ? 1 : 3;
-
             // System.Drawing produces identical output for jpegs with a quality parameter of 0 and 1.
             int qlty = Numerics.Clamp(this.quality ?? metadata.GetJpegMetadata().Quality, 1, 100);
             this.subsample ??= qlty >= 91 ? JpegSubsample.Ratio444 : JpegSubsample.Ratio420;
@@ -153,7 +150,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             Block8x8F luminanceQuantTable = default;
             Block8x8F chrominanceQuantTable = default;
             InitQuantizationTable(0, scale, ref luminanceQuantTable);
-            if (componentCount > 1)
+            if (this.componentCount > 1)
             {
                 InitQuantizationTable(1, scale, ref chrominanceQuantTable);
             }
@@ -177,13 +174,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.WriteStartOfScan(image, componentCount, cancellationToken);
 
             // Write the scan compressed data.
-            new HuffmanScanEncoder(stream).WriteStartOfScan(
-                image,
-                this.colorType,
-                this.subsample,
-                ref luminanceQuantTable,
-                ref chrominanceQuantTable,
-                cancellationToken);
+            var scanEncoder = new HuffmanScanEncoder(stream);
+            if (this.componentCount == 1)
+            {
+                scanEncoder.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
+            }
+            else
+            {
+                switch (subsample)
+                {
+                    case JpegSubsample.Ratio444:
+                        scanEncoder.Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                        break;
+                    case JpegSubsample.Ratio420:
+                        scanEncoder.Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                        break;
+                }
+            }
 
             // Write the End Of Image marker.
             this.buffer[0] = JpegConstants.Markers.XFF;

From 1b1d136f8c860bed912809ef86e43100bb80987d Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 17:13:22 +0300
Subject: [PATCH 24/99] Fixed unresolved reference this.colorType

---
 src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 9ff334453..b8568c4ab 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -587,7 +587,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                 0x01
             };
 
-            if (this.colorType == JpegColorType.Luminance)
+            if (this.componentCount == 1)
             {
                 subsamples = stackalloc byte[]
                 {

From 5b05a0a1da0497661e98f499b5b482193c189c4e Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 17:35:40 +0300
Subject: [PATCH 25/99] Added QoL throw helper method for jpeg w/h size check
 before encoding

---
 src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs | 8 ++++----
 src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs | 3 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index b8568c4ab..169a3cbb7 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -118,14 +118,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         {
             Guard.NotNull(image, nameof(image));
             Guard.NotNull(stream, nameof(stream));
-            cancellationToken.ThrowIfCancellationRequested();
 
-            const ushort max = JpegConstants.MaxLength;
-            if (image.Width >= max || image.Height >= max)
+            if (image.Width >= JpegConstants.MaxLength || image.Height >= JpegConstants.MaxLength)
             {
-                throw new ImageFormatException($"Image is too large to encode at {image.Width}x{image.Height}.");
+                JpegThrowHelper.ThrowDimensionsTooLarge(image.Width, image.Height);
             }
 
+            cancellationToken.ThrowIfCancellationRequested();
+
             this.outputStream = stream;
             ImageMetadata metadata = image.Metadata;
 
diff --git a/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs b/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs
index fa9eb8391..cc75870e1 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs
@@ -46,5 +46,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
 
         [MethodImpl(InliningOptions.ColdPath)]
         public static void ThrowInvalidImageDimensions(int width, int height) => throw new InvalidImageContentException($"Invalid image dimensions: {width}x{height}.");
+
+        [MethodImpl(InliningOptions.ColdPath)]
+        public static void ThrowDimensionsTooLarge(int width, int height) => throw new ImageFormatException($"Image is too large to encode at {width}x{height} for JPEG format.");
     }
 }

From 84a143d0951b59c657730a7f0f4df57b4cfa92ce Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 21 May 2021 17:38:55 +0300
Subject: [PATCH 26/99] Moved end of image marker writing code to a separate
 method

---
 src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 169a3cbb7..744f82bda 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -193,9 +193,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             }
 
             // Write the End Of Image marker.
-            this.buffer[0] = JpegConstants.Markers.XFF;
-            this.buffer[1] = JpegConstants.Markers.EOI;
-            stream.Write(this.buffer, 0, 2);
+            this.WriteEndOfImageMarker();
+
             stream.Flush();
         }
 
@@ -695,6 +694,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.outputStream.Write(this.buffer, 0, sosSize + 2);
         }
 
+        /// <summary>
+        /// Writes the EndOfImage marker.
+        /// </summary>
+        private void WriteEndOfImageMarker()
+        {
+            this.buffer[0] = JpegConstants.Markers.XFF;
+            this.buffer[1] = JpegConstants.Markers.EOI;
+            this.outputStream.Write(this.buffer, 0, 2);
+        }
+
         /// <summary>
         /// Writes the header for a marker with the given length.
         /// </summary>

From d4fa8b254bce6c82ee8cdd2b7fa1a5d27e766508 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 22 May 2021 08:17:31 +0300
Subject: [PATCH 27/99] Rolled back to initial JpegEncoderCore options
 implementation.

---
 src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index 744f82bda..b7459bdc7 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -86,9 +86,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         private readonly int? quality;
 
         /// <summary>
-        /// Component count.
+        /// Gets or sets the subsampling method to use.
         /// </summary>
-        private readonly int componentCount;
+        private readonly JpegColorType? colorType;
 
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
@@ -103,7 +103,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         {
             this.quality = options.Quality;
             this.subsample = options.Subsample;
-            this.componentCount = (options.ColorType == JpegColorType.Luminance) ? 1 : 3;
+            this.colorType = options.ColorType;
         }
 
         /// <summary>
@@ -129,6 +129,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             this.outputStream = stream;
             ImageMetadata metadata = image.Metadata;
 
+            // Compute number of components based on color type in options.
+            int componentCount = (this.colorType == JpegColorType.Luminance) ? 1 : 3;
+
             // System.Drawing produces identical output for jpegs with a quality parameter of 0 and 1.
             int qlty = Numerics.Clamp(this.quality ?? metadata.GetJpegMetadata().Quality, 1, 100);
             this.subsample ??= qlty >= 91 ? JpegSubsample.Ratio444 : JpegSubsample.Ratio420;
@@ -150,7 +153,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
             Block8x8F luminanceQuantTable = default;
             Block8x8F chrominanceQuantTable = default;
             InitQuantizationTable(0, scale, ref luminanceQuantTable);
-            if (this.componentCount > 1)
+            if (componentCount > 1)
             {
                 InitQuantizationTable(1, scale, ref chrominanceQuantTable);
             }
@@ -175,7 +178,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
 
             // Write the scan compressed data.
             var scanEncoder = new HuffmanScanEncoder(stream);
-            if (this.componentCount == 1)
+            if (this.colorType == JpegColorType.Luminance)
             {
                 scanEncoder.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
             }
@@ -586,7 +589,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                 0x01
             };
 
-            if (this.componentCount == 1)
+            if (this.colorType == JpegColorType.Luminance)
             {
                 subsamples = stackalloc byte[]
                 {

From 980f2d2e7f17d98c7cad64b518590c23d457961e Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 22 May 2021 08:29:45 +0300
Subject: [PATCH 28/99] Revert "Block8x8F.MultiplyInPlace no longer use unsafe
 casts"

This reverts commit fbf0ff1466ef410de2fb77d22c6cdef074cad6ce.
---
 .../Formats/Jpeg/Components/Block8x8F.cs         | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 9072ca196..91aec3005 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -313,14 +313,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             if (Avx.IsSupported)
             {
                 var valueVec = Vector256.Create(value);
-                this.V0 = Avx.Multiply(this.V0, valueVec);
-                this.V1 = Avx.Multiply(this.V1, valueVec);
-                this.V2 = Avx.Multiply(this.V2, valueVec);
-                this.V3 = Avx.Multiply(this.V3, valueVec);
-                this.V4 = Avx.Multiply(this.V4, valueVec);
-                this.V5 = Avx.Multiply(this.V5, valueVec);
-                this.V6 = Avx.Multiply(this.V6, valueVec);
-                this.V7 = Avx.Multiply(this.V7, valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V0L), valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V1L), valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V2L), valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V3L), valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V4L), valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V5L), valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V6L), valueVec);
+                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V7L), valueVec);
             }
             else
 #endif

From f1886add1639105fe89050f18feb7fa8d00423f7 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 22 May 2021 08:29:48 +0300
Subject: [PATCH 29/99] Revert "Block8x8F.TransposeInto no longer uses unsafe
 casts (partially)"

This reverts commit 20236b8c756ecbd6fd75c789b58dca5ed028d1e9.
---
 .../Formats/Jpeg/Components/Block8x8F.cs         | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 91aec3005..dbc22eaea 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -840,26 +840,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 Vector256<float> t0 = Avx.UnpackLow(r0, r1);
                 Vector256<float> t2 = Avx.UnpackLow(r2, r3);
                 Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
-                d.V0 = Avx.Blend(t0, v, 0xCC);
-                d.V1 = Avx.Blend(t2, v, 0x33);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V0L) = Avx.Blend(t0, v, 0xCC);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V1L) = Avx.Blend(t2, v, 0x33);
 
                 Vector256<float> t4 = Avx.UnpackLow(r4, r5);
                 Vector256<float> t6 = Avx.UnpackLow(r6, r7);
                 v = Avx.Shuffle(t4, t6, 0x4E);
-                d.V4 = Avx.Blend(t4, v, 0xCC);
-                d.V5 = Avx.Blend(t6, v, 0x33);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V4L) = Avx.Blend(t4, v, 0xCC);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V5L) = Avx.Blend(t6, v, 0x33);
 
                 Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
                 Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
                 v = Avx.Shuffle(t1, t3, 0x4E);
-                d.V2 = Avx.Blend(t1, v, 0xCC);
-                d.V3 = Avx.Blend(t3, v, 0x33);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V2L) = Avx.Blend(t1, v, 0xCC);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V3L) = Avx.Blend(t3, v, 0x33);
 
                 Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
                 Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
                 v = Avx.Shuffle(t5, t7, 0x4E);
-                d.V6 = Avx.Blend(t5, v, 0xCC);
-                d.V7 = Avx.Blend(t7, v, 0x33);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V6L) = Avx.Blend(t5, v, 0xCC);
+                Unsafe.As<Vector4, Vector256<float>>(ref d.V7L) = Avx.Blend(t7, v, 0x33);
             }
             else
 #endif

From a8f717d7815e6a8c9b31e4a06b715368f7c1378b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 22 May 2021 09:50:40 +0300
Subject: [PATCH 30/99] Made DCT code prettier with SimdUtils, added summary to
 8x8 dct methods, added debug assertion

---
 .../Components/FastFloatingPointDCT.IDCT.cs   | 59 +++++--------------
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 19 +++---
 2 files changed, 25 insertions(+), 53 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
index fd3ad8d5f..369172a2d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
@@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -171,14 +172,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             d.V4R = my3 - mb3;
         }
 
-#if SUPPORTS_RUNTIME_INTRINSICS
         /// <summary>
-        /// Do IDCT internal operations on the given block.
+        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
+        /// using AVX commands.
         /// </summary>
         /// <param name="s">Source</param>
         /// <param name="d">Destination</param>
         public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
         {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
             Vector256<float> my1 = s.V1;
             Vector256<float> my7 = s.V7;
             Vector256<float> mz0 = Avx.Add(my1, my7);
@@ -191,40 +195,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
 
-            if (Fma.IsSupported)
-            {
-                mz2 = Fma.MultiplyAdd(mz2, C_V_n1_9615, mz4);
-                mz3 = Fma.MultiplyAdd(mz3, C_V_n0_3901, mz4);
-            }
-            else
-            {
-                mz2 = Avx.Add(Avx.Multiply(mz2, C_V_n1_9615), mz4);
-                mz3 = Avx.Add(Avx.Multiply(mz3, C_V_n0_3901), mz4);
-            }
-
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
             mz0 = Avx.Multiply(mz0, C_V_n0_8999);
             mz1 = Avx.Multiply(mz1, C_V_n2_5629);
 
+            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
+            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
+            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
+            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
 
-            Unsafe.SkipInit(out Vector256<float> mb3);
-            Unsafe.SkipInit(out Vector256<float> mb2);
-            Unsafe.SkipInit(out Vector256<float> mb1);
-            Unsafe.SkipInit(out Vector256<float> mb0);
-
-            if (Fma.IsSupported)
-            {
-                mb3 = Avx.Add(Fma.MultiplyAdd(my7, C_V_0_2986, mz0), mz2);
-                mb2 = Avx.Add(Fma.MultiplyAdd(my5, C_V_2_0531, mz1), mz3);
-                mb1 = Avx.Add(Fma.MultiplyAdd(my3, C_V_3_0727, mz1), mz2);
-                mb0 = Avx.Add(Fma.MultiplyAdd(my1, C_V_1_5013, mz0), mz3);
-            }
-            else
-            {
-                mb3 = Avx.Add(Avx.Add(Avx.Multiply(my7, C_V_0_2986), mz0), mz2);
-                mb2 = Avx.Add(Avx.Add(Avx.Multiply(my5, C_V_2_0531), mz1), mz3);
-                mb1 = Avx.Add(Avx.Add(Avx.Multiply(my3, C_V_3_0727), mz1), mz2);
-                mb0 = Avx.Add(Avx.Add(Avx.Multiply(my1, C_V_1_5013), mz0), mz3);
-            }
 
             Vector256<float> my2 = s.V2;
             Vector256<float> my6 = s.V6;
@@ -233,17 +213,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             Vector256<float> my4 = s.V4;
             mz0 = Avx.Add(my0, my4);
             mz1 = Avx.Subtract(my0, my4);
-
-            if (Fma.IsSupported)
-            {
-                mz2 = Fma.MultiplyAdd(my6, C_V_n1_8477, mz4);
-                mz3 = Fma.MultiplyAdd(my2, C_V_0_7653, mz4);
-            }
-            else
-            {
-                mz2 = Avx.Add(Avx.Multiply(my6, C_V_n1_8477), mz4);
-                mz3 = Avx.Add(Avx.Multiply(my2, C_V_0_7653), mz4);
-            }
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
 
             my0 = Avx.Add(mz0, mz3);
             my3 = Avx.Subtract(mz0, mz3);
@@ -258,7 +229,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             d.V5 = Avx.Subtract(my2, mb2);
             d.V3 = Avx.Add(my3, mb3);
             d.V4 = Avx.Subtract(my3, mb3);
-        }
 #endif
+        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 4ef4ab7b0..493c0a688 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,6 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -196,14 +197,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             d.V7R = c0 - c3;
         }
 
-#if SUPPORTS_RUNTIME_INTRINSICS
         /// <summary>
-        /// 
+        /// Combined operation of <see cref="FDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="FDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
+        /// using AVX commands.
         /// </summary>
         /// <param name="s">Source</param>
         /// <param name="d">Destination</param>
         private static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
         {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
             Vector256<float> t0 = Avx.Add(s.V0, s.V7);
             Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
             Vector256<float> t1 = Avx.Add(s.V1, s.V6);
@@ -224,36 +228,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             Vector256<float> c2 = Avx.Subtract(t1, t2);
 
             // 2 6
+            d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
             if (Fma.IsSupported)
             {
-                d.V2 = Fma.MultiplyAdd(c2, C_V_0_5411, Avx.Multiply(c3, C_V_1_3065));
                 d.V6 = Fma.MultiplySubtract(c3, C_V_0_5411, Avx.Multiply(c2, C_V_1_3065));
             }
             else
             {
-                d.V2 = Avx.Add(Avx.Multiply(c2, C_V_0_5411), Avx.Multiply(c3, C_V_1_3065));
                 d.V6 = Avx.Subtract(Avx.Multiply(c3, C_V_0_5411), Avx.Multiply(c2, C_V_1_3065));
             }
 
+            c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856);
             if (Fma.IsSupported)
             {
-                c3 = Fma.MultiplyAdd(t4, C_V_1_1758, Avx.Multiply(t7, C_V_0_7856));
                 c0 = Fma.MultiplySubtract(t7, C_V_1_1758, Avx.Multiply(t4, C_V_0_7856));
             }
             else
             {
-                c3 = Avx.Add(Avx.Multiply(t4, C_V_1_1758), Avx.Multiply(t7, C_V_0_7856));
                 c0 = Avx.Subtract(Avx.Multiply(t7, C_V_1_1758), Avx.Multiply(t4, C_V_0_7856));
             }
 
+            c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
             if (Fma.IsSupported)
             {
-                c2 = Fma.MultiplyAdd(t5, C_V_1_3870, Avx.Multiply(C_V_0_2758, t6));
                 c1 = Fma.MultiplySubtract(t6, C_V_1_3870, Avx.Multiply(C_V_0_2758, t5));
             }
             else
             {
-                c2 = Avx.Add(Avx.Multiply(t5, C_V_1_3870), Avx.Multiply(C_V_0_2758, t6));
                 c1 = Avx.Subtract(Avx.Multiply(t6, C_V_1_3870), Avx.Multiply(C_V_0_2758, t5));
             }
 
@@ -267,8 +268,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             // 1 7
             d.V1 = Avx.Add(c0, c3);
             d.V7 = Avx.Subtract(c0, c3);
-        }
 #endif
+        }
 
         /// <summary>
         /// Performs 8x8 matrix Forward Discrete Cosine Transform

From dfb181db8ab693224b7d1f88b669a501f50c409b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 22 May 2021 09:52:12 +0300
Subject: [PATCH 31/99] Combined FDCT and IDCT code into single file

---
 .../Components/FastFloatingPointDCT.IDCT.cs   | 235 ------------------
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 214 ++++++++++++++++
 2 files changed, 214 insertions(+), 235 deletions(-)
 delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
deleted file mode 100644
index 369172a2d..000000000
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-#endif
-
-// ReSharper disable InconsistentNaming
-namespace SixLabors.ImageSharp.Formats.Jpeg.Components
-{
-    /// <summary>
-    /// Contains inaccurate, but fast forward and inverse DCT implementations.
-    /// </summary>
-    internal static partial class FastFloatingPointDCT
-    {
-        /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
-        /// </summary>
-        /// <param name="src">Source</param>
-        /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
-        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
-        {
-            src.TransposeInto(ref temp);
-
-            IDCT8x8(ref temp, ref dest);
-            dest.TransposeInto(ref temp);
-            IDCT8x8(ref temp, ref dest);
-
-            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
-            dest.MultiplyInPlace(C_0_125);
-        }
-
-        /// <summary>
-        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                IDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                IDCT8x4_LeftPart(ref s, ref d);
-                IDCT8x4_RightPart(ref s, ref d);
-            }
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
-        /// using AVX commands.
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
-            Vector256<float> my1 = s.V1;
-            Vector256<float> my7 = s.V7;
-            Vector256<float> mz0 = Avx.Add(my1, my7);
-
-            Vector256<float> my3 = s.V3;
-            Vector256<float> mz2 = Avx.Add(my3, my7);
-            Vector256<float> my5 = s.V5;
-            Vector256<float> mz1 = Avx.Add(my3, my5);
-            Vector256<float> mz3 = Avx.Add(my1, my5);
-
-            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
-
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
-            mz0 = Avx.Multiply(mz0, C_V_n0_8999);
-            mz1 = Avx.Multiply(mz1, C_V_n2_5629);
-
-            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
-            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
-            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
-            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
-
-
-            Vector256<float> my2 = s.V2;
-            Vector256<float> my6 = s.V6;
-            mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
-            Vector256<float> my0 = s.V0;
-            Vector256<float> my4 = s.V4;
-            mz0 = Avx.Add(my0, my4);
-            mz1 = Avx.Subtract(my0, my4);
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
-
-            my0 = Avx.Add(mz0, mz3);
-            my3 = Avx.Subtract(mz0, mz3);
-            my1 = Avx.Add(mz1, mz2);
-            my2 = Avx.Subtract(mz1, mz2);
-
-            d.V0 = Avx.Add(my0, mb0);
-            d.V7 = Avx.Subtract(my0, mb0);
-            d.V1 = Avx.Add(my1, mb1);
-            d.V6 = Avx.Subtract(my1, mb1);
-            d.V2 = Avx.Add(my2, mb2);
-            d.V5 = Avx.Subtract(my2, mb2);
-            d.V3 = Avx.Add(my3, mb3);
-            d.V4 = Avx.Subtract(my3, mb3);
-#endif
-        }
-    }
-}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 493c0a688..d7101abfd 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -319,5 +319,219 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             dest.MultiplyInPlace(C_0_125);
         }
+
+        /// <summary>
+        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                IDCT8x8_Avx(ref s, ref d);
+            }
+            else
+#endif
+            {
+                IDCT8x4_LeftPart(ref s, ref d);
+                IDCT8x4_RightPart(ref s, ref d);
+            }
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the left part of the block. Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">Destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1L;
+            Vector4 my7 = s.V7L;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3L;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5L;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2L;
+            Vector4 my6 = s.V6L;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0L;
+            Vector4 my4 = s.V4L;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0L = my0 + mb0;
+            d.V7L = my0 - mb0;
+            d.V1L = my1 + mb1;
+            d.V6L = my1 - mb1;
+            d.V2L = my2 + mb2;
+            d.V5L = my2 - mb2;
+            d.V3L = my3 + mb3;
+            d.V4L = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the right part of the block.
+        /// Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">The destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1R;
+            Vector4 my7 = s.V7R;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3R;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5R;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2R;
+            Vector4 my6 = s.V6R;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0R;
+            Vector4 my4 = s.V4R;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0R = my0 + mb0;
+            d.V7R = my0 - mb0;
+            d.V1R = my1 + mb1;
+            d.V6R = my1 - mb1;
+            d.V2R = my2 + mb2;
+            d.V5R = my2 - mb2;
+            d.V3R = my3 + mb3;
+            d.V4R = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
+        /// using AVX commands.
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
+            Vector256<float> my1 = s.V1;
+            Vector256<float> my7 = s.V7;
+            Vector256<float> mz0 = Avx.Add(my1, my7);
+
+            Vector256<float> my3 = s.V3;
+            Vector256<float> mz2 = Avx.Add(my3, my7);
+            Vector256<float> my5 = s.V5;
+            Vector256<float> mz1 = Avx.Add(my3, my5);
+            Vector256<float> mz3 = Avx.Add(my1, my5);
+
+            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
+
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
+            mz0 = Avx.Multiply(mz0, C_V_n0_8999);
+            mz1 = Avx.Multiply(mz1, C_V_n2_5629);
+
+            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
+            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
+            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
+            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
+
+
+            Vector256<float> my2 = s.V2;
+            Vector256<float> my6 = s.V6;
+            mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
+            Vector256<float> my0 = s.V0;
+            Vector256<float> my4 = s.V4;
+            mz0 = Avx.Add(my0, my4);
+            mz1 = Avx.Subtract(my0, my4);
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
+
+            my0 = Avx.Add(mz0, mz3);
+            my3 = Avx.Subtract(mz0, mz3);
+            my1 = Avx.Add(mz1, mz2);
+            my2 = Avx.Subtract(mz1, mz2);
+
+            d.V0 = Avx.Add(my0, mb0);
+            d.V7 = Avx.Subtract(my0, mb0);
+            d.V1 = Avx.Add(my1, mb1);
+            d.V6 = Avx.Subtract(my1, mb1);
+            d.V2 = Avx.Add(my2, mb2);
+            d.V5 = Avx.Subtract(my2, mb2);
+            d.V3 = Avx.Add(my3, mb3);
+            d.V4 = Avx.Subtract(my3, mb3);
+#endif
+        }
+
+        /// <summary>
+        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
+        /// </summary>
+        /// <param name="src">Source</param>
+        /// <param name="dest">Destination</param>
+        /// <param name="temp">Temporary block provided by the caller</param>
+        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
+        {
+            src.TransposeInto(ref temp);
+
+            IDCT8x8(ref temp, ref dest);
+            dest.TransposeInto(ref temp);
+            IDCT8x8(ref temp, ref dest);
+
+            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
+            dest.MultiplyInPlace(C_0_125);
+        }
     }
 }

From 0424d8db71a9d216e51e118a83655b9a6d41be45 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 22 May 2021 11:31:55 +0300
Subject: [PATCH 32/99] Codestyle changes

---
 .../Jpeg/Components/Encoder/HuffmanScanEncoder.cs | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 0b05b955d..8b23211d3 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -24,7 +24,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// A buffer for reducing the number of stream writes when emitting Huffman tables.
         /// </summary>
-        private byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
+        private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
 
         /// <summary>
         /// Number of filled bytes in <see cref="emitBuffer"/> buffer
@@ -47,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
         /// </summary>
-        private Stream target;
+        private readonly Stream target;
+
+        public HuffmanScanEncoder(Stream outputStream)
+        {
+            this.target = outputStream;
+        }
 
         /// <summary>
         /// Gets the counts the number of bits needed to hold an integer.
@@ -72,11 +77,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 8, 8, 8,
             };
 
-        public HuffmanScanEncoder(Stream outputStream)
-        {
-            this.target = outputStream;
-        }
-
         /// <summary>
         /// Encodes the image with no subsampling.
         /// </summary>
@@ -209,7 +209,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.target.Write(this.emitBuffer, 0, this.emitLen);
         }
 
-
         /// <summary>
         /// Encodes the image with no chroma, just luminance.
         /// </summary>

From d12bb3e648d9dcb7242e49f36b80274063ea0c0b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 24 May 2021 15:47:32 +0300
Subject: [PATCH 33/99] Improved jpeg encoding benchmark, updated benchmark
 'baseline' for current encoding implementation

---
 .../Codecs/Jpeg/EncodeJpeg.cs                 | 63 ++++++++++++++-----
 1 file changed, 49 insertions(+), 14 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index 5a9ceea94..839f19e87 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -4,6 +4,7 @@
 using System.Drawing.Imaging;
 using System.IO;
 using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Tests;
 using SDImage = System.Drawing.Image;
@@ -12,10 +13,23 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 {
     public class EncodeJpeg
     {
-        // System.Drawing needs this.
-        private Stream bmpStream;
+        private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
+        private const int EncodingQuality = 100;
+
+        // GDI+ uses 4:1:1 subsampling - https://stackoverflow.com/questions/745610/how-to-disable-subsampling-with-net-gdi
+        // ImageSharp lowest subsampling is 4:2:0 which is an okay approximation
+        private const JpegSubsample EncodingSubsampling = JpegSubsample.Ratio420;
+
+        // System.Drawing
         private SDImage bmpDrawing;
+        private Stream bmpStream;
+        private ImageCodecInfo jpegCodec;
+        private EncoderParameters encoderParameters;
+
+        // ImageSharp
         private Image<Rgba32> bmpCore;
+        private JpegEncoder encoder;
+
         private MemoryStream destinationStream;
 
         [GlobalSetup]
@@ -23,12 +37,19 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
         {
             if (this.bmpStream == null)
             {
-                const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
                 this.bmpStream = File.OpenRead(Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, TestImage));
+
                 this.bmpCore = Image.Load<Rgba32>(this.bmpStream);
                 this.bmpCore.Metadata.ExifProfile = null;
+                this.encoder = new JpegEncoder { Quality = EncodingQuality, Subsample = EncodingSubsampling };
+
                 this.bmpStream.Position = 0;
                 this.bmpDrawing = SDImage.FromStream(this.bmpStream);
+                this.jpegCodec = GetEncoder(ImageFormat.Jpeg);
+                this.encoderParameters = new EncoderParameters(1);
+                // Quality cast to long is necessary
+                this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)EncodingQuality);
+
                 this.destinationStream = new MemoryStream();
             }
         }
@@ -45,29 +66,43 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
         [Benchmark(Baseline = true, Description = "System.Drawing Jpeg")]
         public void JpegSystemDrawing()
         {
-            this.bmpDrawing.Save(this.destinationStream, ImageFormat.Jpeg);
+            this.bmpDrawing.Save(this.destinationStream, this.jpegCodec, this.encoderParameters);
             this.destinationStream.Seek(0, SeekOrigin.Begin);
         }
 
         [Benchmark(Description = "ImageSharp Jpeg")]
         public void JpegCore()
         {
-            this.bmpCore.SaveAsJpeg(this.destinationStream);
+            this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder);
             this.destinationStream.Seek(0, SeekOrigin.Begin);
         }
+
+        // https://docs.microsoft.com/en-us/dotnet/api/system.drawing.imaging.encoderparameter?redirectedfrom=MSDN&view=net-5.0
+        private static ImageCodecInfo GetEncoder(ImageFormat format)
+        {
+            ImageCodecInfo[] codecs = ImageCodecInfo.GetImageEncoders(); // encoders: the result is passed to Image.Save
+            foreach (ImageCodecInfo codec in codecs)
+            {
+                if (codec.FormatID == format.Guid)
+                {
+                    return codec;
+                }
+            }
+            return null;
+        }
     }
 }
 
 /*
-BenchmarkDotNet=v0.12.1, OS=Windows 10.0.18363.959 (1909/November2018Update/19H2)
-Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
-.NET Core SDK=3.1.302
-  [Host]     : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT
-  DefaultJob : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT
+BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042
+Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
+.NET Core SDK=6.0.100-preview.3.21202.5
+  [Host]     : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT
+  DefaultJob : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT
 
 
-|                Method |     Mean |     Error |    StdDev | Ratio | RatioSD |
-|---------------------- |---------:|----------:|----------:|------:|--------:|
-| 'System.Drawing Jpeg' | 4.297 ms | 0.0244 ms | 0.0228 ms |  1.00 |    0.00 |
-|     'ImageSharp Jpeg' | 5.286 ms | 0.1034 ms | 0.0967 ms |  1.23 |    0.02 |
+|                Method |     Mean |    Error |   StdDev | Ratio | RatioSD |
+|---------------------- |---------:|---------:|---------:|------:|--------:|
+| 'System.Drawing Jpeg' | 39.54 ms | 0.269 ms | 0.225 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg' | 47.25 ms | 0.937 ms | 1.219 ms |  1.20 |    0.02 |
 */

From ae85722da6fe06f7ee68422e58af4f8830170aab Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 24 May 2021 16:33:47 +0300
Subject: [PATCH 34/99] Simplified WriteDefineHuffmanTables method

---
 .../Formats/Jpeg/JpegEncoderCore.cs           | 34 ++-----------------
 1 file changed, 3 insertions(+), 31 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index b7459bdc7..c68c0ffb0 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -296,40 +296,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                 markerlen += 1 + 16 + s.Values.Length;
             }
 
-            // TODO: this magic constant (array size) should be defined by HuffmanSpec class
-            // This is a one-time call which can be stackalloc'ed or allocated directly in memory as method local array
-            // Allocation here would be better for GC so it won't live for entire encoding process
-            // TODO: if this is allocated on the heap - pin it right here or following copy code will corrupt memory
-            Span<byte> huffmanBuffer = stackalloc byte[179];
-            byte* huffmanBufferPtr = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(huffmanBuffer));
-
             this.WriteMarkerHeader(JpegConstants.Markers.DHT, markerlen);
             for (int i = 0; i < specs.Length; i++)
             {
-                ref HuffmanSpec spec = ref specs[i];
-
-                int len = 0;
-
-                // header
-                huffmanBuffer[len++] = headers[i];
-
-                // count
-                fixed (byte* countPtr = spec.Count)
-                {
-                    int countLen = spec.Count.Length;
-                    Unsafe.CopyBlockUnaligned(huffmanBufferPtr + len, countPtr, (uint)countLen);
-                    len += countLen;
-                }
-
-                // values
-                fixed (byte* valuesPtr = spec.Values)
-                {
-                    int valuesLen = spec.Values.Length;
-                    Unsafe.CopyBlockUnaligned(huffmanBufferPtr + len, valuesPtr, (uint)valuesLen);
-                    len += valuesLen;
-                }
-
-                this.outputStream.Write(huffmanBuffer, 0, len);
+                this.outputStream.WriteByte(headers[i]);
+                this.outputStream.Write(specs[i].Count);
+                this.outputStream.Write(specs[i].Values);
             }
         }
 

From a65e50377de0c08c715d08b93ac5c2202e546150 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 25 May 2021 14:45:26 +0300
Subject: [PATCH 35/99] Added MultiplySubstract method to the HwIntrinsics

---
 .../Common/Helpers/SimdUtils.HwIntrinsics.cs  | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 4faf577fd..00c0d89f0 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -532,6 +532,7 @@ namespace SixLabors.ImageSharp
             /// <summary>
             /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
             /// </summary>
+            /// <remarks>ret = (vm0 * vm1) + va</remarks>
             /// <param name="va">The vector to add to the intermediate result.</param>
             /// <param name="vm0">The first vector to multiply.</param>
             /// <param name="vm1">The second vector to multiply.</param>
@@ -552,6 +553,31 @@ namespace SixLabors.ImageSharp
                 }
             }
 
+            /// <summary>
+            /// Performs a multiplication and a subtraction of the <see cref="Vector256{T}"/>.
+            /// </summary>
+            /// <remarks>ret = (vm0 * vm1) - vs</remarks>
+            /// <param name="vs">The vector to subtract from the intermediate result.</param>
+            /// <param name="vm0">The first vector to multiply.</param>
+            /// <param name="vm1">The second vector to multiply.</param>
+            /// <returns>The <see cref="Vector256{T}"/>.</returns>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public static Vector256<float> MultiplySubstract(
+                in Vector256<float> vs,
+                in Vector256<float> vm0,
+                in Vector256<float> vm1)
+            {
+                if (Fma.IsSupported)
+                {
+                    return Fma.MultiplySubtract(vm1, vm0, vs);
+                }
+                else
+                {
+                    return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+                }
+            }
+
+
             /// <summary>
             /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>

From 86abb73799c4792036713493d4ccfea2b355ad4a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 25 May 2021 14:57:48 +0300
Subject: [PATCH 36/99] Made FDCT8x8_Avx(...) method prettier with SimdUtils

---
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 27 +++----------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index d7101abfd..afcf4158b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -229,34 +229,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 
             // 2 6
             d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
-            if (Fma.IsSupported)
-            {
-                d.V6 = Fma.MultiplySubtract(c3, C_V_0_5411, Avx.Multiply(c2, C_V_1_3065));
-            }
-            else
-            {
-                d.V6 = Avx.Subtract(Avx.Multiply(c3, C_V_0_5411), Avx.Multiply(c2, C_V_1_3065));
-            }
+            d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411);
 
             c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856);
-            if (Fma.IsSupported)
-            {
-                c0 = Fma.MultiplySubtract(t7, C_V_1_1758, Avx.Multiply(t4, C_V_0_7856));
-            }
-            else
-            {
-                c0 = Avx.Subtract(Avx.Multiply(t7, C_V_1_1758), Avx.Multiply(t4, C_V_0_7856));
-            }
+            c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758);
 
             c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
-            if (Fma.IsSupported)
-            {
-                c1 = Fma.MultiplySubtract(t6, C_V_1_3870, Avx.Multiply(C_V_0_2758, t5));
-            }
-            else
-            {
-                c1 = Avx.Subtract(Avx.Multiply(t6, C_V_1_3870), Avx.Multiply(C_V_0_2758, t5));
-            }
+            c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870);
 
             // 3 5
             d.V3 = Avx.Subtract(c0, c2);

From 0664f298d9aa8f4abbfaad608144c762a3024f3c Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 26 May 2021 13:26:31 +0300
Subject: [PATCH 37/99] Replaced bit count lookup table with lzcnt
 implementation, added MinimumBitsToStore to Numerics.cs

---
 src/ImageSharp/Common/Helpers/Numerics.cs     | 12 +++++++
 .../Components/Encoder/HuffmanScanEncoder.cs  | 34 ++-----------------
 2 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 058199301..e8ba6dde6 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -825,5 +825,17 @@ namespace SixLabors.ImageSharp
             return Sse2.ConvertToInt32(vsum);
         }
 #endif
+
+        /// <summary>
+        /// Calculates the minimum number of bits needed to store the given value.
+        /// </summary>
+        /// <param name="number">Unsigned integer to store</param>
+        /// <returns>Minimum number of bits needed to store given value</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int MinimumBitsToStore(uint number)
+        {
+            const int bitInUnsignedInteger = sizeof(uint) * 8;
+            return bitInUnsignedInteger - BitOperations.LeadingZeroCount(number);
+        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 8b23211d3..0c1b4dedc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -3,6 +3,7 @@
 
 using System;
 using System.IO;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Threading;
 using SixLabors.ImageSharp.Memory;
@@ -54,29 +55,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.target = outputStream;
         }
 
-        /// <summary>
-        /// Gets the counts the number of bits needed to hold an integer.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> BitCountLut => new byte[]
-            {
-                0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
-                5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8,
-            };
-
         /// <summary>
         /// Encodes the image with no subsampling.
         /// </summary>
@@ -394,15 +372,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 b = value - 1;
             }
 
-            uint bt;
-            if (a < 0x100)
-            {
-                bt = BitCountLut[a];
-            }
-            else
-            {
-                bt = 8 + (uint)BitCountLut[a >> 8];
-            }
+            uint bt = (uint)Numerics.MinimumBitsToStore((uint)a);
 
             this.EmitHuff(index, (int)((uint)(runLength << 4) | bt));
             if (bt > 0)

From 28ea2adb08fef8c59ad50dfc0bc1ad6b7cbf3714 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 26 May 2021 14:15:48 +0300
Subject: [PATCH 38/99] Fixed comments, removed todo, updated benchmark results

---
 .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs  | 1 -
 tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs      | 7 +++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 0c1b4dedc..28eefadc7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -125,7 +125,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
             Block8x8F b = default;
             Span<Block8x8F> cb = stackalloc Block8x8F[4];
             Span<Block8x8F> cr = stackalloc Block8x8F[4];
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index 839f19e87..90b0501eb 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -16,8 +16,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
         private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
         private const int EncodingQuality = 100;
 
-        // GDI+ uses 4:1:1 subsampling - https://stackoverflow.com/questions/745610/how-to-disable-subsampling-with-net-gdi
-        // ImageSharp lowest subsampling is 4:2:0 which is an okay approximation
+        // GDI+ uses 4:2:0 subsampling
         private const JpegSubsample EncodingSubsampling = JpegSubsample.Ratio420;
 
         // System.Drawing
@@ -103,6 +102,6 @@ Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
 
 |                Method |     Mean |    Error |   StdDev | Ratio | RatioSD |
 |---------------------- |---------:|---------:|---------:|------:|--------:|
-| 'System.Drawing Jpeg' | 39.54 ms | 0.269 ms | 0.225 ms |  1.00 |    0.00 |
-|     'ImageSharp Jpeg' | 47.25 ms | 0.937 ms | 1.219 ms |  1.20 |    0.02 |
+| 'System.Drawing Jpeg' | 39.67 ms | 0.774 ms | 0.828 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg' | 45.39 ms | 0.415 ms | 0.346 ms |  1.14 |    0.03 |
 */

From d2510036a6e19180f0199d8ef37986d932c86f51 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 26 May 2021 21:53:27 +0300
Subject: [PATCH 39/99] Implemented fallback code for runtimes where
 BitOperations class is not supported.

---
 shared-infrastructure                         |  2 +-
 src/ImageSharp/Common/Helpers/Numerics.cs     | 35 ++++++++++++++++++-
 .../Components/Encoder/HuffmanScanEncoder.cs  |  2 +-
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/shared-infrastructure b/shared-infrastructure
index 48e73f455..25f565310 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 48e73f455f15eafefbe3175efc7433e5f277e506
+Subproject commit 25f56531057293e9f1fa8e070b2f780a0c3d7e0c
diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index e8ba6dde6..37d2a943c 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -23,6 +23,25 @@ namespace SixLabors.ImageSharp
         private const int ShuffleAlphaControl = 0b_11_11_11_11;
 #endif
 
+#if !SUPPORTS_BITOPERATIONS
+        private static ReadOnlySpan<byte> BitCountLut => new byte[]
+            {
+                0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
+                5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+                6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+                8, 8, 8,
+            };
+#endif
+
         /// <summary>
         /// Determine the Greatest CommonDivisor (GCD) of two numbers.
         /// </summary>
@@ -832,10 +851,24 @@ namespace SixLabors.ImageSharp
         /// <param name="number">Unsigned integer to store</param>
         /// <returns>Minimum number of bits needed to store given value</returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static int MinimumBitsToStore(uint number)
+        public static int MinimumBitsToStore16(uint number)
         {
+#if SUPPORTS_BITOPERATIONS
             const int bitInUnsignedInteger = sizeof(uint) * 8;
             return bitInUnsignedInteger - BitOperations.LeadingZeroCount(number);
+#else
+            int bt;
+            if (number < 0x100)
+            {
+                bt = BitCountLut[(int)number];
+            }
+            else
+            {
+                bt = 8 + BitCountLut[(int)(number >> 8)];
+            }
+
+            return bt;
+#endif
         }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 28eefadc7..8f133f0de 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -371,7 +371,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 b = value - 1;
             }
 
-            uint bt = (uint)Numerics.MinimumBitsToStore((uint)a);
+            uint bt = (uint)Numerics.MinimumBitsToStore16((uint)a);
 
             this.EmitHuff(index, (int)((uint)(runLength << 4) | bt));
             if (bt > 0)

From ceb4fdfae098187e1cce85e3803305d65085ee0f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 14:17:14 +0300
Subject: [PATCH 40/99] Replaced unsafe Block8x8F/Vector4<float> ->
 Vector256<float> casts

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 105 ++++++------------
 .../Encoder/RgbToYCbCrConverterVectorized.cs  |   8 +-
 2 files changed, 41 insertions(+), 72 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index dbc22eaea..340d8e5c5 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -313,14 +313,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             if (Avx.IsSupported)
             {
                 var valueVec = Vector256.Create(value);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V0L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V1L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V2L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V3L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V4L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V5L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V6L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V7L), valueVec);
+                this.V0 = Avx.Multiply(this.V0, valueVec);
+                this.V1 = Avx.Multiply(this.V1, valueVec);
+                this.V2 = Avx.Multiply(this.V2, valueVec);
+                this.V3 = Avx.Multiply(this.V3, valueVec);
+                this.V4 = Avx.Multiply(this.V4, valueVec);
+                this.V5 = Avx.Multiply(this.V5, valueVec);
+                this.V6 = Avx.Multiply(this.V6, valueVec);
+                this.V7 = Avx.Multiply(this.V7, valueVec);
             }
             else
 #endif
@@ -354,45 +354,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx.IsSupported)
             {
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V0L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V0L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V1L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V1L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V2L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V2L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V3L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V3L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V4L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V4L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V5L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V5L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V6L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V6L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V7L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V7L));
+                this.V0 = Avx.Multiply(this.V0, other.V0);
+                this.V1 = Avx.Multiply(this.V1, other.V1);
+                this.V2 = Avx.Multiply(this.V2, other.V2);
+                this.V3 = Avx.Multiply(this.V3, other.V3);
+                this.V4 = Avx.Multiply(this.V4, other.V4);
+                this.V5 = Avx.Multiply(this.V5, other.V5);
+                this.V6 = Avx.Multiply(this.V6, other.V6);
+                this.V7 = Avx.Multiply(this.V7, other.V7);
             }
             else
 #endif
@@ -427,14 +396,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             if (Avx.IsSupported)
             {
                 var valueVec = Vector256.Create(value);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V0L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V1L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V2L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V3L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V4L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V5L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V6L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V7L), valueVec);
+                this.V0 = Avx.Add(this.V0, valueVec);
+                this.V1 = Avx.Add(this.V1, valueVec);
+                this.V2 = Avx.Add(this.V2, valueVec);
+                this.V3 = Avx.Add(this.V3, valueVec);
+                this.V4 = Avx.Add(this.V4, valueVec);
+                this.V5 = Avx.Add(this.V5, valueVec);
+                this.V6 = Avx.Add(this.V6, valueVec);
+                this.V7 = Avx.Add(this.V7, valueVec);
             }
             else
 #endif
@@ -529,12 +498,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             var f2 = Vector256.Create(2f);
             var f025 = Vector256.Create(0.25f);
             Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
-            ref Vector256<float> destRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref destination);
+            ref Vector256<float> destRef = ref destination.V0;
 
             for (int i = 0; i < 2; i++)
             {
-                ref Vector256<float> in1 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), 2 * i));
-                ref Vector256<float> in2 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), (2 * i) + 1));
+                ref Vector256<float> in1 = ref Unsafe.Add(ref MemoryMarshal.GetReference(source), 2 * i).V0;
+                ref Vector256<float> in2 = ref Unsafe.Add(ref MemoryMarshal.GetReference(source), (2 * i) + 1).V0;
 
                 for (int j = 0; j < 8; j += 2)
                 {
@@ -588,8 +557,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 var vadd = Vector256.Create(.5F);
                 var vone = Vector256.Create(1f);
 
-                ref Vector256<float> aBase = ref Unsafe.AsRef(Unsafe.As<Vector4, Vector256<float>>(ref a.V0L));
-                ref Vector256<float> bBase = ref Unsafe.AsRef(Unsafe.As<Vector4, Vector256<float>>(ref b.V0L));
+                ref Vector256<float> aBase = ref a.V0;
+                ref Vector256<float> bBase = ref b.V0;
                 ref Vector256<float> aEnd = ref Unsafe.Add(ref aBase, 8);
 
                 do
@@ -840,26 +809,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 Vector256<float> t0 = Avx.UnpackLow(r0, r1);
                 Vector256<float> t2 = Avx.UnpackLow(r2, r3);
                 Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V0L) = Avx.Blend(t0, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V1L) = Avx.Blend(t2, v, 0x33);
+                d.V0 = Avx.Blend(t0, v, 0xCC);
+                d.V1 = Avx.Blend(t2, v, 0x33);
 
                 Vector256<float> t4 = Avx.UnpackLow(r4, r5);
                 Vector256<float> t6 = Avx.UnpackLow(r6, r7);
                 v = Avx.Shuffle(t4, t6, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V4L) = Avx.Blend(t4, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V5L) = Avx.Blend(t6, v, 0x33);
+                d.V4 = Avx.Blend(t4, v, 0xCC);
+                d.V5 = Avx.Blend(t6, v, 0x33);
 
                 Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
                 Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
                 v = Avx.Shuffle(t1, t3, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V2L) = Avx.Blend(t1, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V3L) = Avx.Blend(t3, v, 0x33);
+                d.V2 = Avx.Blend(t1, v, 0xCC);
+                d.V3 = Avx.Blend(t3, v, 0x33);
 
                 Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
                 Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
                 v = Avx.Shuffle(t5, t7, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V6L) = Avx.Blend(t5, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V7L) = Avx.Blend(t7, v, 0x33);
+                d.V6 = Avx.Blend(t5, v, 0xCC);
+                d.V7 = Avx.Blend(t7, v, 0x33);
             }
             else
 #endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 209cc3c6a..3ee1ca989 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 using System;
@@ -64,9 +64,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             var zero = Vector256.Create(0).AsByte();
 
             ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
-            ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
-            ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
-            ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);
+            ref Vector256<float> destYRef = ref yBlock.V0;
+            ref Vector256<float> destCbRef = ref cbBlock.V0;
+            ref Vector256<float> destCrRef = ref crBlock.V0;
 
             var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
             var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));

From 70474c8fae925037899579bef0a37cfe0f42a9ac Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 16:02:58 +0300
Subject: [PATCH 41/99] Removed redundant enum casting during huffman encoding

---
 .../Jpeg/Components/Encoder/HuffmanScanEncoder.cs      | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 8f133f0de..afd5acb4b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -257,10 +257,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int dc = (int)refTemp2[0];
 
             // Emit the DC delta.
-            this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC);
+            this.EmitHuffRLE((2 * (int)index) + 0, 0, dc - prevDC);
 
             // Emit the AC components.
-            var h = (HuffIndex)((2 * (int)index) + 1);
+            int h = (2 * (int)index) + 1;
             int runLength = 0;
 
             for (int zig = 1; zig < Block8x8F.Size; zig++)
@@ -348,9 +348,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="index">The index of the Huffman encoder</param>
         /// <param name="value">The value to encode.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuff(HuffIndex index, int value)
+        private void EmitHuff(int index, int value)
         {
-            uint x = HuffmanLut.TheHuffmanLut[(int)index].Values[value];
+            uint x = HuffmanLut.TheHuffmanLut[index].Values[value];
             this.Emit(x & ((1 << 24) - 1), x >> 24);
         }
 
@@ -361,7 +361,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="runLength">The number of copies to encode.</param>
         /// <param name="value">The value to encode.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuffRLE(HuffIndex index, int runLength, int value)
+        private void EmitHuffRLE(int index, int runLength, int value)
         {
             int a = value;
             int b = value;

From 52e60362680ed54d7d67e7722d885af1f36ea3e6 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 16:35:09 +0300
Subject: [PATCH 42/99] Reimplemented Emit methods in HuffmanScanEncoder to get
 rid of unreadable amount of int/uint casts

---
 .../Components/Encoder/HuffmanScanEncoder.cs     | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index afd5acb4b..bbc997018 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -35,12 +35,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Emmited bits 'micro buffer' before being transfered to the <see cref="emitBuffer"/>.
         /// </summary>
-        private uint accumulatedBits;
+        private int accumulatedBits;
 
         /// <summary>
         /// Number of jagged bits stored in <see cref="accumulatedBits"/>
         /// </summary>
-        private uint bitCount;
+        private int bitCount;
 
         private Block8x8F temporalBlock1;
         private Block8x8F temporalBlock2;
@@ -303,10 +303,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="bits">The packed bits.</param>
         /// <param name="count">The number of bits</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        private void Emit(uint bits, uint count)
+        private void Emit(int bits, int count)
         {
             count += this.bitCount;
-            bits <<= (int)(32 - count);
+            bits <<= 32 - count;
             bits |= this.accumulatedBits;
 
             // Only write if more than 8 bits.
@@ -350,7 +350,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(InliningOptions.ShortMethod)]
         private void EmitHuff(int index, int value)
         {
-            uint x = HuffmanLut.TheHuffmanLut[index].Values[value];
+            int x = (int)HuffmanLut.TheHuffmanLut[index].Values[value];
             this.Emit(x & ((1 << 24) - 1), x >> 24);
         }
 
@@ -371,12 +371,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 b = value - 1;
             }
 
-            uint bt = (uint)Numerics.MinimumBitsToStore16((uint)a);
+            int bt = Numerics.MinimumBitsToStore16((uint)a);
 
-            this.EmitHuff(index, (int)((uint)(runLength << 4) | bt));
+            this.EmitHuff(index, (runLength << 4) | bt);
             if (bt > 0)
             {
-                this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt);
+                this.Emit(b & ((1 << bt) - 1), bt);
             }
         }
     }

From 7fb8feef50df5417bbb467bca451e43987637705 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 17:10:36 +0300
Subject: [PATCH 43/99] Fixed xml docs

---
 shared-infrastructure                                      | 2 +-
 .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs  | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/shared-infrastructure b/shared-infrastructure
index 25f565310..1f7ee7028 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 25f56531057293e9f1fa8e070b2f780a0c3d7e0c
+Subproject commit 1f7ee702812f3a1713ab7f749c0faae0ef139ed7
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index bbc997018..571a80698 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <remarks>
         /// This is subject to change, 1024 seems to be the best value in terms of performance.
-        /// <see cref="Emit(uint, uint)"/> expects it to be at least 8 (see comments in method body).
+        /// <see cref="Emit(int, int)"/> expects it to be at least 8 (see comments in method body).
         /// </remarks>
         private const int EmitBufferSizeInBytes = 1024;
 
@@ -374,10 +374,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int bt = Numerics.MinimumBitsToStore16((uint)a);
 
             this.EmitHuff(index, (runLength << 4) | bt);
-            if (bt > 0)
-            {
-                this.Emit(b & ((1 << bt) - 1), bt);
-            }
+            this.Emit(b & ((1 << bt) - 1), bt);
         }
     }
 }

From d7fd9478762b59408021bdb4039beeca43502289 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 18:08:59 +0300
Subject: [PATCH 44/99] Updated default quality settings in jpeg encoding
 benchmark

---
 shared-infrastructure                                 | 2 +-
 tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/shared-infrastructure b/shared-infrastructure
index 1f7ee7028..48e73f455 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 1f7ee702812f3a1713ab7f749c0faae0ef139ed7
+Subproject commit 48e73f455f15eafefbe3175efc7433e5f277e506
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index 90b0501eb..e22259f76 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -14,7 +14,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
     public class EncodeJpeg
     {
         private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
-        private const int EncodingQuality = 100;
+        // GDI+ most likely uses 75 as default quality - https://stackoverflow.com/questions/3957477/what-quality-level-does-image-save-use-for-jpeg-files
+        private const int EncodingQuality = 75;
 
         // GDI+ uses 4:2:0 subsampling
         private const JpegSubsample EncodingSubsampling = JpegSubsample.Ratio420;

From 81979e0f29ccbd425158da6c49604550e437ff62 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 19:23:35 +0300
Subject: [PATCH 45/99] Improved flush logic after main encode methods run

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 32 +++++++++++++------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 571a80698..d69473124 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -108,9 +108,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 }
             }
 
-            // Pad the last byte with 1's.
-            this.Emit(0x7f, 7);
-            this.target.Write(this.emitBuffer, 0, this.emitLen);
+            this.FlushInternalBuffer();
         }
 
         /// <summary>
@@ -181,9 +179,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 }
             }
 
-            // Pad the last byte with 1's.
-            this.Emit(0x7f, 7);
-            this.target.Write(this.emitBuffer, 0, this.emitLen);
+            this.FlushInternalBuffer();
         }
 
         /// <summary>
@@ -224,9 +220,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 }
             }
 
-            // Pad the last byte with 1's.
-            this.Emit(0x7f, 7);
-            this.target.Write(this.emitBuffer, 0, this.emitLen);
+            this.FlushInternalBuffer();
         }
 
         /// <summary>
@@ -376,5 +370,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             this.EmitHuff(index, (runLength << 4) | bt);
             this.Emit(b & ((1 << bt) - 1), bt);
         }
+
+        /// <summary>
+        /// Writes remaining bytes from internal buffer to the target stream.
+        /// </summary>
+        /// <remarks>Pads the last byte with exactly as many 1's as are missing; a byte-aligned stream gets no padding.</remarks>
+        private void FlushInternalBuffer()
+        {
+            // pad with a mask of exactly padBitsCount 1's so bits already accumulated by Emit are not overwritten
+            int padBitsCount = (8 - (this.bitCount % 8)) % 8;
+            if (padBitsCount != 0)
+            {
+                this.Emit((1 << padBitsCount) - 1, padBitsCount);
+            }
+
+            // flush remaining bytes
+            if (this.emitLen != 0)
+            {
+                this.target.Write(this.emitBuffer, 0, this.emitLen);
+            }
+        }
     }
 }

From 16842496be84834e88a90fad70b33ede1d2ecf82 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 22:30:30 +0300
Subject: [PATCH 46/99] Brought back if check

---
 .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs    | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index d69473124..af8192749 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -368,7 +368,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int bt = Numerics.MinimumBitsToStore16((uint)a);
 
             this.EmitHuff(index, (runLength << 4) | bt);
-            this.Emit(b & ((1 << bt) - 1), bt);
+            if (bt > 0)
+            {
+                this.Emit(b & ((1 << bt) - 1), bt);
+            }
         }
 
         /// <summary>

From 9c0999e9db43f4adca0174d266d9eb49fb077aea Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 27 May 2021 23:54:50 +0300
Subject: [PATCH 47/99] Huffman lookup tables are now integers instead of
 unsigned integers

---
 .../Formats/Jpeg/Components/Encoder/HuffmanLut.cs      | 10 +++++-----
 .../Jpeg/Components/Encoder/HuffmanScanEncoder.cs      |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
index bc2c7634b..bc6c8c6cc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
@@ -44,7 +44,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 }
             }
 
-            this.Values = new uint[maxValue + 1];
+            this.Values = new int[maxValue + 1];
 
             int code = 0;
             int k = 0;
@@ -54,7 +54,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 int bits = (i + 1) << 24;
                 for (int j = 0; j < spec.Count[i]; j++)
                 {
-                    this.Values[spec.Values[k]] = (uint)(bits | code);
+                    this.Values[spec.Values[k]] = bits | code;
                     code++;
                     k++;
                 }
@@ -66,6 +66,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Gets the collection of huffman values.
         /// </summary>
-        public uint[] Values { get; }
+        public int[] Values { get; }
     }
-}
\ No newline at end of file
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index af8192749..0320229a2 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -344,7 +344,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(InliningOptions.ShortMethod)]
         private void EmitHuff(int index, int value)
         {
-            int x = (int)HuffmanLut.TheHuffmanLut[index].Values[value];
+            int x = HuffmanLut.TheHuffmanLut[index].Values[value];
             this.Emit(x & ((1 << 24) - 1), x >> 24);
         }
 

From 169e98bbcd15c42424f710b557a1471a88c150a5 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 28 May 2021 11:47:16 +0300
Subject: [PATCH 48/99] Simplified Block8x8F.DivideRoundAll() method

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 340d8e5c5..0acc6408e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -68,6 +68,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         public Vector4 V7R;
 
 #if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
+        /// </summary>
+        public const int RowCount = 8;
+
         [FieldOffset(0)]
         public Vector256<float> V0;
         [FieldOffset(32)]
@@ -557,19 +562,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 var vadd = Vector256.Create(.5F);
                 var vone = Vector256.Create(1f);
 
-                ref Vector256<float> aBase = ref a.V0;
-                ref Vector256<float> bBase = ref b.V0;
-                ref Vector256<float> aEnd = ref Unsafe.Add(ref aBase, 8);
-
-                do
+                for (int i = 0; i < RowCount; i++)
                 {
-                    Vector256<float> voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aBase), vone), vadd);
-                    Unsafe.Add(ref aBase, 0) = Avx.Add(Avx.Divide(aBase, bBase), voff);
-
-                    aBase = ref Unsafe.Add(ref aBase, 1);
-                    bBase = ref Unsafe.Add(ref bBase, 1);
+                    ref Vector256<float> aRow = ref Unsafe.Add(ref a.V0, i);
+                    ref Vector256<float> bRow = ref Unsafe.Add(ref b.V0, i);
+                    Vector256<float> voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd);
+                    aRow = Avx.Add(Avx.Divide(aRow, bRow), voff);
                 }
-                while (Unsafe.IsAddressLessThan(ref aBase, ref aEnd));
             }
             else
 #endif

From 6ac2b6660bf015ee95637c7af948bbffa18a1c4f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 29 May 2021 14:21:49 +0300
Subject: [PATCH 49/99] Added comments to vectorized rgb->ycbcr converter for
 further code changes

---
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 3ee1ca989..a6ff21bdc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -47,6 +47,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         };
 #endif
 
+        /// <summary>
+        /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices
+        /// </summary>
+        /// <remarks>Total size of rgb span must be 200 bytes</remarks>
+        /// <param name="rgbSpan">Span of rgb pixels with size of 64</param>
+        /// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>
+        /// <param name="cbBlock">8x8 destination matrix of Chrominance(Cb) converted data</param>
+        /// <param name="crBlock">8x8 destination matrix of Chrominance(Cr) converted data</param>
         public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
@@ -63,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             var f05 = Vector256.Create(0.5f);
             var zero = Vector256.Create(0).AsByte();
 
-            ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
             ref Vector256<float> destYRef = ref yBlock.V0;
             ref Vector256<float> destCbRef = ref cbBlock.V0;
             ref Vector256<float> destCrRef = ref crBlock.V0;
@@ -72,9 +80,31 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
             Vector256<byte> rgb, rg, bx;
             Vector256<float> r, g, b;
+
+            // TODO: probably remove this after the draft
+            // rgbByteSpan contains 8 strides by 8 pixels each, thus 64 pixels total
+            // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes
+            // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
+            // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride:
+            // stride 0    0    - 192  -(+64bits)-> 256
+            // stride 1    192  - 384  -(+64bits)-> 448
+            // stride 2    384  - 576  -(+64bits)-> 640
+            // stride 3    576  - 768  -(+64bits)-> 832
+            // stride 4    768  - 960  -(+64bits)-> 1024
+            // stride 5    960  - 1152 -(+64bits)-> 1216
+            // stride 6    1152 - 1344 -(+64bits)-> 1408
+            // stride 7    1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
+            //
+            // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
+            // This is not permitted - we are reading foreign memory
+            // That's why last stride is calculated outside of the for-loop loop with special extract shuffle mask involved
+            //
+            // Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need:
+            // Total pixel data size is 192 bytes, avx registers need it to be 200 bytes
+            const int bytesPerRgbStride = 24;
             for (int i = 0; i < 7; i++)
             {
-                rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();
+                rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * i)).AsUInt32(), extractToLanesMask).AsByte();
 
                 rgb = Avx2.Shuffle(rgb, extractRgbMask);
 
@@ -96,7 +126,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
 
             extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
-            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
+            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
             rgb = Avx2.Shuffle(rgb, extractRgbMask);
 
             rg = Avx2.UnpackLow(rgb, zero);

From a845c00f6f5698dc2ba5e11a39791d49bc443eb6 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 29 May 2021 14:47:06 +0300
Subject: [PATCH 50/99] Simplified RgbToYCbCrConverterVectorized.Convert()
 method

---
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 28 +------------------
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  | 17 +++++++----
 2 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index a6ff21bdc..62e82243c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -34,12 +34,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
         };
 
-        private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
-        {
-            2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
-            5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
-        };
-
         private static ReadOnlySpan<byte> ExtractRgb => new byte[]
         {
             0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
@@ -102,7 +96,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need:
             // Total pixel data size is 192 bytes, avx registers need it to be 200 bytes
             const int bytesPerRgbStride = 24;
-            for (int i = 0; i < 7; i++)
+            for (int i = 0; i < 8; i++)
             {
                 rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * i)).AsUInt32(), extractToLanesMask).AsByte();
 
@@ -124,26 +118,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
                 Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
             }
-
-            extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
-            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
-            rgb = Avx2.Shuffle(rgb, extractRgbMask);
-
-            rg = Avx2.UnpackLow(rgb, zero);
-            bx = Avx2.UnpackHigh(rgb, zero);
-
-            r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
-            g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
-            b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
-
-            // (0.299F * r) + (0.587F * g) + (0.114F * b);
-            Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
-
-            // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
-            Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
-
-            // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
-            Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
 #endif
         }
     }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 81e64b277..ee4626b86 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
 
@@ -42,14 +43,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Temporal RGB block
         /// </summary>
-        private GenericBlock8x8<Rgb24> rgbBlock;
+        private Span<Rgb24> rgbSpan;
 
         public static YCbCrForwardConverter<TPixel> Create()
         {
             var result = default(YCbCrForwardConverter<TPixel>);
+
+            // creating rgb pixel bufferr
+            // TODO: this is subject to discuss
+            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[200].AsSpan());
+
+            // Avoid creating lookup tables, when vectorized converter is supported
             if (!RgbToYCbCrConverterVectorized.IsSupported)
             {
-                // Avoid creating lookup tables, when vectorized converter is supported
                 result.colorTables = RgbToYCbCrConverterLut.Create();
             }
 
@@ -63,8 +69,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
 
-            Span<Rgb24> rgbSpan = this.rgbBlock.AsSpanUnsafe();
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), rgbSpan);
+            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), this.rgbSpan);
 
             ref Block8x8F yBlock = ref this.Y;
             ref Block8x8F cbBlock = ref this.Cb;
@@ -72,11 +77,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+                RgbToYCbCrConverterVectorized.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
             else
             {
-                this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+                this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
         }
     }

From 2ad3ddb0364784916b95bce180618da7b279783b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 29 May 2021 19:31:39 +0300
Subject: [PATCH 51/99] [WIP] Introduced RgbToYCbCrConverterVectorized 420
 sampling

---
 .../Components/Encoder/HuffmanScanEncoder.cs  |  21 +--
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 141 +++++++++++++++++-
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  |  36 +++++
 3 files changed, 184 insertions(+), 14 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 0320229a2..dc41e179e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -83,7 +83,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
                 for (int x = 0; x < pixels.Width; x += 8)
                 {
-                    pixelConverter.Convert(frame, x, y, ref currentRows);
+                    pixelConverter.Convert444(frame, x, y, ref currentRows);
 
                     prevDCY = this.WriteBlock(
                         QuantIndex.Luminance,
@@ -123,9 +123,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            Block8x8F b = default;
-            Span<Block8x8F> cb = stackalloc Block8x8F[4];
-            Span<Block8x8F> cr = stackalloc Block8x8F[4];
+            Span<Block8x8F> temporalBlocks = stackalloc Block8x8F[2];
 
             var unzig = ZigZag.CreateUnzigTable();
 
@@ -148,32 +146,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         int yOff = (i & 2) * 4;
 
                         currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert(frame, x + xOff, y + yOff, ref currentRows);
-
-                        cb[i] = pixelConverter.Cb;
-                        cr[i] = pixelConverter.Cr;
+                        pixelConverter.Convert420(frame, x + xOff, y + yOff, ref currentRows, ref temporalBlocks[0], i);
 
                         prevDCY = this.WriteBlock(
                             QuantIndex.Luminance,
                             prevDCY,
-                            ref pixelConverter.Y,
+                            ref temporalBlocks[0],
                             ref luminanceQuantTable,
                             ref unzig);
                     }
 
-                    Block8x8F.Scale16X16To8X8(ref b, cb);
+                    pixelConverter.ConvertCbCr(ref temporalBlocks[0], ref temporalBlocks[1]);
+
                     prevDCCb = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCb,
-                        ref b,
+                        ref temporalBlocks[0],
                         ref chrominanceQuantTable,
                         ref unzig);
 
-                    Block8x8F.Scale16X16To8X8(ref b, cr);
                     prevDCCr = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCr,
-                        ref b,
+                        ref temporalBlocks[1],
                         ref chrominanceQuantTable,
                         ref unzig);
                 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 62e82243c..9760e9e93 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -42,7 +42,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
 
         /// <summary>
-        /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices
+        /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:4:4 subsampling
         /// </summary>
         /// <remarks>Total size of rgb span must be 200 bytes</remarks>
         /// <param name="rgbSpan">Span of rgb pixels with size of 64</param>
@@ -120,5 +120,144 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
 #endif
         }
+
+        /// <summary>
+        /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling
+        /// </summary>
+        /// <remarks>Total size of rgb span must be 200 bytes</remarks>
+        /// <param name="rgbSpan">Span of rgb pixels with size of 64</param>
+        /// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>
+        /// <param name="rAcc"></param>
+        /// <param name="gAcc"></param>
+        /// <param name="bAcc"></param>
+        /// <param name="idx"></param>
+        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F rAcc, ref Block8x8F gAcc, ref Block8x8F bAcc, int idx)
+        {
+            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            var f0299 = Vector256.Create(0.299f);
+            var f0587 = Vector256.Create(0.587f);
+            var f0114 = Vector256.Create(0.114f);
+            var fn0168736 = Vector256.Create(-0.168736f);
+            var fn0331264 = Vector256.Create(-0.331264f);
+            var f128 = Vector256.Create(128f);
+            var fn0418688 = Vector256.Create(-0.418688f);
+            var fn0081312F = Vector256.Create(-0.081312F);
+            var f05 = Vector256.Create(0.5f);
+            var zero = Vector256.Create(0).AsByte();
+
+            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+            ref Vector256<float> destYRef = ref yBlock.V0;
+
+            int destOffset = (idx & 2) * 4 + (idx & 1);
+
+            ref Vector128<float> destRedRef   = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref rAcc), destOffset);
+            ref Vector128<float> destGreenRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref gAcc), destOffset);
+            ref Vector128<float> destBlueRef  = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref bAcc), destOffset);
+
+            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
+            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
+            Vector256<byte> rgb, rg, bx;
+            Vector256<float> r, g, b;
+
+            Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
+
+            const int bytesPerRgbStride = 24;
+            for (int i = 0; i < 2; i++)
+            {
+                // each 4 lanes - [0, 1, 2, 3] & [4, 5, 6, 7]
+                for (int j = 0; j < 4; j++)
+                {
+                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+                    rg = Avx2.UnpackLow(rgb, zero);
+                    bx = Avx2.UnpackHigh(rgb, zero);
+
+                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Unsafe.Add(ref destYRef, i * 4 + j) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+
+                    rDataLanes[j] = r;
+                    gDataLanes[j] = g;
+                    bDataLanes[j] = b;
+                }
+
+                int localDestOffset = (i & 1) * 4;
+
+                // red
+                Vector256<float> twoLane = Scale_8x4_4x2(rDataLanes);
+                Unsafe.Add(ref destRedRef, localDestOffset) = twoLane.GetLower();
+                Unsafe.Add(ref destRedRef, localDestOffset + 2) = twoLane.GetUpper();
+
+                // green
+                twoLane = Scale_8x4_4x2(gDataLanes);
+                Unsafe.Add(ref destGreenRef, localDestOffset) = twoLane.GetLower();
+                Unsafe.Add(ref destGreenRef, localDestOffset + 2) = twoLane.GetUpper();
+
+                // blue
+                twoLane = Scale_8x4_4x2(bDataLanes);
+                Unsafe.Add(ref destBlueRef, localDestOffset) = twoLane.GetLower();
+                Unsafe.Add(ref destBlueRef, localDestOffset + 2) = twoLane.GetUpper();
+            }
+#endif
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
+        {
+            Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
+            var f025 = Vector256.Create(0.25f);
+
+            Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[1]);
+            Vector256<float> botPairSum = SumHorizontalPairs(v[2], v[3]);
+
+            return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector256<float> SumHorizontalPairs(Vector256<float> v0, Vector256<float> v1)
+            => Avx.Add(Avx.Shuffle(v0, v1, 0b10_00_10_00), Avx.Shuffle(v0, v1, 0b11_01_11_01));
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector256<float> SumVerticalPairs(Vector256<float> v0, Vector256<float> v1)
+            => Avx.Add(Avx.Shuffle(v0, v1, 0b01_00_01_00), Avx.Shuffle(v0, v1, 0b11_10_11_10));
+
+        public static void ConvertCbCr(ref Block8x8F rBlock, ref Block8x8F gBlock, ref Block8x8F bBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        {
+            var fn0168736 = Vector256.Create(-0.168736f);
+            var fn0331264 = Vector256.Create(-0.331264f);
+            var f128 = Vector256.Create(128f);
+            var fn0418688 = Vector256.Create(-0.418688f);
+            var fn0081312F = Vector256.Create(-0.081312F);
+            var f05 = Vector256.Create(0.5f);
+
+            ref Vector256<float> destCbRef = ref cbBlock.V0;
+            ref Vector256<float> destCrRef = ref crBlock.V0;
+
+            ref Vector256<float> rRef = ref rBlock.V0;
+            ref Vector256<float> gRef = ref gBlock.V0;
+            ref Vector256<float> bRef = ref bBlock.V0;
+
+            for (int i = 0; i < 8; i++)
+            {
+                ref Vector256<float> r = ref Unsafe.Add(ref rRef, i);
+                ref Vector256<float> g = ref Unsafe.Add(ref gRef, i);
+                ref Vector256<float> b = ref Unsafe.Add(ref bRef, i);
+
+                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+
+                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+            }
+        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index ee4626b86..7bf7b8547 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -84,5 +84,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
         }
+
+        /// <summary>
+        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
+        /// </summary>
+        public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, ref Block8x8F yBlock, int idx)
+        {
+            this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
+
+            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), this.rgbSpan);
+
+            ref Block8x8F rSub = ref this.Y;
+            ref Block8x8F gSub = ref this.Cb;
+            ref Block8x8F bSub = ref this.Cr;
+
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref yBlock, ref rSub, ref gSub, ref bSub, idx);
+            }
+            else
+            {
+                throw new NotSupportedException("This is not yet implemented");
+                //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+        }
+
+        public void ConvertCbCr(ref Block8x8F cb, ref Block8x8F cr)
+        {
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.ConvertCbCr(ref this.Y, ref this.Cb, ref this.Cr, ref cb, ref cr);
+            }
+            else
+            {
+                throw new NotSupportedException("This is not yet implemented");
+            }
+        }
     }
 }

From 201c5341e69fbedcbe5bc619edb81ee85419321f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 29 May 2021 19:40:13 +0300
Subject: [PATCH 52/99] Fixed HuffmanScanEncoder error

---
 .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index dc41e179e..3d99a1b95 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -83,7 +83,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
                 for (int x = 0; x < pixels.Width; x += 8)
                 {
-                    pixelConverter.Convert444(frame, x, y, ref currentRows);
+                    pixelConverter.Convert(frame, x, y, ref currentRows);
 
                     prevDCY = this.WriteBlock(
                         QuantIndex.Luminance,

From 8a7749644ab7b1170fc86194b400007885144678 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 29 May 2021 21:19:36 +0300
Subject: [PATCH 53/99] Improved internal rgb -> ycbcr conversion api for 420
 subsampling

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 10 +++---
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 36 ++++++++-----------
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  | 20 ++---------
 3 files changed, 21 insertions(+), 45 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 3d99a1b95..ff5ce957e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -146,29 +146,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         int yOff = (i & 2) * 4;
 
                         currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert420(frame, x + xOff, y + yOff, ref currentRows, ref temporalBlocks[0], i);
+                        pixelConverter.Convert420(frame, x + xOff, y + yOff, ref currentRows, i);
 
                         prevDCY = this.WriteBlock(
                             QuantIndex.Luminance,
                             prevDCY,
-                            ref temporalBlocks[0],
+                            ref pixelConverter.Y,
                             ref luminanceQuantTable,
                             ref unzig);
                     }
 
-                    pixelConverter.ConvertCbCr(ref temporalBlocks[0], ref temporalBlocks[1]);
-
                     prevDCCb = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCb,
-                        ref temporalBlocks[0],
+                        ref pixelConverter.Cb,
                         ref chrominanceQuantTable,
                         ref unzig);
 
                     prevDCCr = this.WriteBlock(
                         QuantIndex.Chrominance,
                         prevDCCr,
-                        ref temporalBlocks[1],
+                        ref pixelConverter.Cr,
                         ref chrominanceQuantTable,
                         ref unzig);
                 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 9760e9e93..055c7176a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -126,12 +126,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <remarks>Total size of rgb span must be 200 bytes</remarks>
         /// <param name="rgbSpan">Span of rgb pixels with size of 64</param>
-        /// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>
-        /// <param name="rAcc"></param>
-        /// <param name="gAcc"></param>
-        /// <param name="bAcc"></param>
-        /// <param name="idx"></param>
-        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F rAcc, ref Block8x8F gAcc, ref Block8x8F bAcc, int idx)
+        /// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>ф
+        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
 
@@ -152,9 +148,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             int destOffset = (idx & 2) * 4 + (idx & 1);
 
-            ref Vector128<float> destRedRef   = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref rAcc), destOffset);
-            ref Vector128<float> destGreenRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref gAcc), destOffset);
-            ref Vector128<float> destBlueRef  = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref bAcc), destOffset);
+            ref Vector128<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref cbBlock), destOffset);
+            ref Vector128<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref crBlock), destOffset);
 
             var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
             var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
@@ -192,20 +187,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
                 int localDestOffset = (i & 1) * 4;
 
-                // red
-                Vector256<float> twoLane = Scale_8x4_4x2(rDataLanes);
-                Unsafe.Add(ref destRedRef, localDestOffset) = twoLane.GetLower();
-                Unsafe.Add(ref destRedRef, localDestOffset + 2) = twoLane.GetUpper();
+                r = Scale_8x4_4x2(rDataLanes);
+                g = Scale_8x4_4x2(gDataLanes);
+                b = Scale_8x4_4x2(bDataLanes);
 
-                // green
-                twoLane = Scale_8x4_4x2(gDataLanes);
-                Unsafe.Add(ref destGreenRef, localDestOffset) = twoLane.GetLower();
-                Unsafe.Add(ref destGreenRef, localDestOffset + 2) = twoLane.GetUpper();
+                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                Vector256<float> cb = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+                Unsafe.Add(ref destCbRef, localDestOffset) = cb.GetLower();
+                Unsafe.Add(ref destCbRef, localDestOffset + 2) = cb.GetUpper();
 
-                // blue
-                twoLane = Scale_8x4_4x2(bDataLanes);
-                Unsafe.Add(ref destBlueRef, localDestOffset) = twoLane.GetLower();
-                Unsafe.Add(ref destBlueRef, localDestOffset + 2) = twoLane.GetUpper();
+                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                Vector256<float> cr = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+                Unsafe.Add(ref destCrRef, localDestOffset) = cr.GetLower();
+                Unsafe.Add(ref destCrRef, localDestOffset + 2) = cr.GetUpper();
             }
 #endif
         }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 7bf7b8547..c835e8df8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -88,19 +88,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
         /// </summary>
-        public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, ref Block8x8F yBlock, int idx)
+        public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
         {
             this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
 
             PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), this.rgbSpan);
 
-            ref Block8x8F rSub = ref this.Y;
-            ref Block8x8F gSub = ref this.Cb;
-            ref Block8x8F bSub = ref this.Cr;
-
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref yBlock, ref rSub, ref gSub, ref bSub, idx);
+                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.Y, ref this.Cb, ref this.Cr, idx);
             }
             else
             {
@@ -108,17 +104,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
         }
-
-        public void ConvertCbCr(ref Block8x8F cb, ref Block8x8F cr)
-        {
-            if (RgbToYCbCrConverterVectorized.IsSupported)
-            {
-                RgbToYCbCrConverterVectorized.ConvertCbCr(ref this.Y, ref this.Cb, ref this.Cr, ref cb, ref cr);
-            }
-            else
-            {
-                throw new NotSupportedException("This is not yet implemented");
-            }
-        }
     }
 }

From 052ebde3ad4abd3a68d9648a66fc4ae9be37df82 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 29 May 2021 22:31:17 +0300
Subject: [PATCH 54/99] Replaced GenericBlock8x8 with Span in YCbCr converter

---
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  | 65 +++++++++++++++++--
 1 file changed, 60 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index c835e8df8..952dde111 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
@@ -38,7 +39,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Temporal 8x8 block to hold TPixel data
         /// </summary>
-        private GenericBlock8x8<TPixel> pixelBlock;
+        private Span<TPixel> pixelSpan;
 
         /// <summary>
         /// Temporal RGB block
@@ -52,6 +53,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // creating rgb pixel bufferr
             // TODO: this is subject to discuss
             result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[200].AsSpan());
+            result.pixelSpan = new TPixel[64].AsSpan();
 
             // Avoid creating lookup tables, when vectorized converter is supported
             if (!RgbToYCbCrConverterVectorized.IsSupported)
@@ -67,9 +69,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
         {
-            this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
+            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y);
 
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), this.rgbSpan);
+            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
 
             ref Block8x8F yBlock = ref this.Y;
             ref Block8x8F cbBlock = ref this.Cb;
@@ -90,9 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
         {
-            this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
+            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(8, buffer.Width - x), Math.Min(8, buffer.Height - y));
 
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), this.rgbSpan);
+            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
@@ -104,5 +108,56 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
         }
+
+        // TODO: add DebugGuard checks?
+        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height)
+        {
+            //Guard.MustBeBetweenOrEqualTo(width, 1, 8, nameof(width));
+            //Guard.MustBeBetweenOrEqualTo(height, 1, 8, nameof(width));
+
+            // TODO: this is a strange check, most likely it was introduces due to 2x 8x8 blocks subsampling, should be gone after new 4:2:0 implementation
+            if (width <= 0 || height <= 0)
+            {
+                return;
+            }
+
+            uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
+            int remainderXCount = 8 - width;
+
+            ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<TPixel, byte>(dest));
+            int rowSizeInBytes = 8 * Unsafe.SizeOf<TPixel>();
+
+            for (int y = 0; y < height; y++)
+            {
+                Span<TPixel> row = source[y];
+
+                ref byte s = ref Unsafe.As<TPixel, byte>(ref row[startX]);
+                ref byte d = ref Unsafe.Add(ref blockStart, y * rowSizeInBytes);
+
+                Unsafe.CopyBlock(ref d, ref s, byteWidth);
+
+                ref TPixel last = ref Unsafe.Add(ref Unsafe.As<byte, TPixel>(ref d), width - 1);
+
+                for (int x = 1; x <= remainderXCount; x++)
+                {
+                    Unsafe.Add(ref last, x) = last;
+                }
+            }
+
+            int remainderYCount = 8 - height;
+
+            if (remainderYCount == 0)
+            {
+                return;
+            }
+
+            ref byte lastRowStart = ref Unsafe.Add(ref blockStart, (height - 1) * rowSizeInBytes);
+
+            for (int y = 1; y <= remainderYCount; y++)
+            {
+                ref byte remStart = ref Unsafe.Add(ref lastRowStart, rowSizeInBytes * y);
+                Unsafe.CopyBlock(ref remStart, ref lastRowStart, (uint)rowSizeInBytes);
+            }
+        }
     }
 }

From d50e255c854cd3c3e46238f7588f102ea3298fd7 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 00:13:06 +0300
Subject: [PATCH 55/99] [WIP] Implemented 16x8 4:2:0 subsampling conversion

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 19 ++--
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 86 ++++++++++++++++++-
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  | 24 ++++--
 3 files changed, 110 insertions(+), 19 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index ff5ce957e..f6e55153a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -123,8 +123,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            Span<Block8x8F> temporalBlocks = stackalloc Block8x8F[2];
-
             var unzig = ZigZag.CreateUnzigTable();
 
             var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
@@ -140,18 +138,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 cancellationToken.ThrowIfCancellationRequested();
                 for (int x = 0; x < pixels.Width; x += 16)
                 {
-                    for (int i = 0; i < 4; i++)
+                    for(int i = 0; i < 2; i++)
                     {
-                        int xOff = (i & 1) * 8;
-                        int yOff = (i & 2) * 4;
-
+                        int yOff = i * 8;
                         currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert420(frame, x + xOff, y + yOff, ref currentRows, i);
+                        pixelConverter.Convert420(frame, x, y, ref currentRows, i);
+
+                        prevDCY = this.WriteBlock(
+                            QuantIndex.Luminance,
+                            prevDCY,
+                            ref pixelConverter.twinBlocksY[0],
+                            ref luminanceQuantTable,
+                            ref unzig);
 
                         prevDCY = this.WriteBlock(
                             QuantIndex.Luminance,
                             prevDCY,
-                            ref pixelConverter.Y,
+                            ref pixelConverter.twinBlocksY[1],
                             ref luminanceQuantTable,
                             ref unzig);
                     }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 055c7176a..a44b174d8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -204,14 +204,96 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
         }
 
+        /// <summary>
+        /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
+        /// </summary>
+        /// <param name="rgbSpan"></param>
+        /// <param name="yBlock0"></param>
+        /// <param name="yBlock1"></param>
+        /// <param name="cbBlock"></param>
+        /// <param name="crBlock"></param>
+        /// <param name="row"></param>
+        public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, Span<Block8x8F> yBlocks, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        {
+            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            var f0299 = Vector256.Create(0.299f);
+            var f0587 = Vector256.Create(0.587f);
+            var f0114 = Vector256.Create(0.114f);
+            var fn0168736 = Vector256.Create(-0.168736f);
+            var fn0331264 = Vector256.Create(-0.331264f);
+            var f128 = Vector256.Create(128f);
+            var fn0418688 = Vector256.Create(-0.418688f);
+            var fn0081312F = Vector256.Create(-0.081312F);
+            var f05 = Vector256.Create(0.5f);
+            var zero = Vector256.Create(0).AsByte();
+
+            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+
+            int destOffset = row * 4;
+
+            ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), destOffset);
+            ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), destOffset);
+
+            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
+            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
+            Vector256<byte> rgb, rg, bx;
+            Vector256<float> r, g, b;
+
+            Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
+
+            const int bytesPerRgbStride = 24;
+            for (int i = 0; i < 4; i++)
+            {
+                // 16x2 => 8x1
+                for (int j = 0; j < 4; j++)
+                {
+                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+                    rg = Avx2.UnpackLow(rgb, zero);
+                    bx = Avx2.UnpackHigh(rgb, zero);
+
+                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+
+                    int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Unsafe.Add(ref yBlocks[j & 1].V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+
+                    rDataLanes[j] = r;
+                    gDataLanes[j] = g;
+                    bDataLanes[j] = b;
+                }
+
+                r = Scale_8x4_4x2(rDataLanes);
+                g = Scale_8x4_4x2(gDataLanes);
+                b = Scale_8x4_4x2(bDataLanes);
+
+                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+
+                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+            }
+#endif
+        }
+
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
         {
             Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
             var f025 = Vector256.Create(0.25f);
 
-            Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[1]);
-            Vector256<float> botPairSum = SumHorizontalPairs(v[2], v[3]);
+            Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[2]);
+            Vector256<float> botPairSum = SumHorizontalPairs(v[1], v[3]);
 
             return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords);
         }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 952dde111..120b21e10 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -46,14 +46,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private Span<Rgb24> rgbSpan;
 
+        public Span<Block8x8F> twinBlocksY;
+
         public static YCbCrForwardConverter<TPixel> Create()
         {
             var result = default(YCbCrForwardConverter<TPixel>);
 
             // creating rgb pixel bufferr
             // TODO: this is subject to discuss
-            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[200].AsSpan());
-            result.pixelSpan = new TPixel[64].AsSpan();
+            const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
+            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
+            // TODO: this size should be configurable
+            result.pixelSpan = new TPixel[128].AsSpan();
+
+            result.twinBlocksY = new Block8x8F[2].AsSpan();
 
             // Avoid creating lookup tables, when vectorized converter is supported
             if (!RgbToYCbCrConverterVectorized.IsSupported)
@@ -70,7 +76,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
         {
             Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y);
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y, new Size(8));
 
             PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
 
@@ -94,13 +100,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
         {
             Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(8, buffer.Width - x), Math.Min(8, buffer.Height - y));
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(16, buffer.Width - x), Math.Min(8, buffer.Height - y), new Size(16, 8));
 
             PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.Y, ref this.Cb, ref this.Cr, idx);
+                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, this.twinBlocksY, ref this.Cb, ref this.Cr, idx);
             }
             else
             {
@@ -110,7 +116,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         // TODO: add DebugGuard checks?
-        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height)
+        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height, Size areaSize)
         {
             //Guard.MustBeBetweenOrEqualTo(width, 1, 8, nameof(width));
             //Guard.MustBeBetweenOrEqualTo(height, 1, 8, nameof(width));
@@ -122,10 +128,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
 
             uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
-            int remainderXCount = 8 - width;
+            int remainderXCount = areaSize.Width - width;
 
             ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<TPixel, byte>(dest));
-            int rowSizeInBytes = 8 * Unsafe.SizeOf<TPixel>();
+            int rowSizeInBytes = areaSize.Width * Unsafe.SizeOf<TPixel>();
 
             for (int y = 0; y < height; y++)
             {
@@ -144,7 +150,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 }
             }
 
-            int remainderYCount = 8 - height;
+            int remainderYCount = areaSize.Height - height;
 
             if (remainderYCount == 0)
             {

From 5ed7e2d1b734c57148e1f6253aee45ae944f9c14 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 01:09:41 +0300
Subject: [PATCH 56/99] Added quality params to the jpeg encoder benchmark

---
 tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index e22259f76..e807c416b 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -13,9 +13,10 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 {
     public class EncodeJpeg
     {
+        [Params(50, 75, 95, 100)]
+        public int Quality;
+
         private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
-        // GDI+ most likely uses 75 as default quality - https://stackoverflow.com/questions/3957477/what-quality-level-does-image-save-use-for-jpeg-files
-        private const int EncodingQuality = 75;
 
         // GDI+ uses 4:2:0 subsampling
         private const JpegSubsample EncodingSubsampling = JpegSubsample.Ratio420;
@@ -41,14 +42,14 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 
                 this.bmpCore = Image.Load<Rgba32>(this.bmpStream);
                 this.bmpCore.Metadata.ExifProfile = null;
-                this.encoder = new JpegEncoder { Quality = EncodingQuality, Subsample = EncodingSubsampling };
+                this.encoder = new JpegEncoder { Quality = Quality, Subsample = EncodingSubsampling };
 
                 this.bmpStream.Position = 0;
                 this.bmpDrawing = SDImage.FromStream(this.bmpStream);
                 this.jpegCodec = GetEncoder(ImageFormat.Jpeg);
                 this.encoderParameters = new EncoderParameters(1);
                 // Quality cast to long is necessary
-                this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)EncodingQuality);
+                this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)Quality);
 
                 this.destinationStream = new MemoryStream();
             }

From d6db6b6be75dbc73dbb238cc02c6fcca31131d0c Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 01:23:09 +0300
Subject: [PATCH 57/99] Fixed compilation errors for non-intrinsic platforms

---
 .../Encoder/RgbToYCbCrConverterVectorized.cs           | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index a44b174d8..e5fe4dea2 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -125,8 +125,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling
         /// </summary>
         /// <remarks>Total size of rgb span must be 200 bytes</remarks>
-        /// <param name="rgbSpan">Span of rgb pixels with size of 64</param>
-        /// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>ф
         public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
@@ -207,12 +205,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
         /// </summary>
-        /// <param name="rgbSpan"></param>
-        /// <param name="yBlock0"></param>
-        /// <param name="yBlock1"></param>
-        /// <param name="cbBlock"></param>
-        /// <param name="crBlock"></param>
-        /// <param name="row"></param>
         public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, Span<Block8x8F> yBlocks, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
@@ -286,6 +278,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
 
+#if SUPPORTS_RUNTIME_INTRINSICS
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
         {
@@ -335,5 +328,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
             }
         }
+#endif
     }
 }

From 39569866fc022d08e431dd11c4eda5b9985b40f8 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 11:43:32 +0300
Subject: [PATCH 58/99] Added debug guard checks to LoadAndStretchEdges

---
 .../Encoder/YCbCrForwardConverter{TPixel}.cs         | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 120b21e10..a059f978d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -115,17 +115,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
         }
 
-        // TODO: add DebugGuard checks?
+
         private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height, Size areaSize)
         {
-            //Guard.MustBeBetweenOrEqualTo(width, 1, 8, nameof(width));
-            //Guard.MustBeBetweenOrEqualTo(height, 1, 8, nameof(width));
-
-            // TODO: this is a strange check, most likely it was introduces due to 2x 8x8 blocks subsampling, should be gone after new 4:2:0 implementation
-            if (width <= 0 || height <= 0)
-            {
-                return;
-            }
+            DebugGuard.MustBeBetweenOrEqualTo(width, 1, areaSize.Width, nameof(width));
+            DebugGuard.MustBeBetweenOrEqualTo(height, 1, areaSize.Height, nameof(height));
 
             uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
             int remainderXCount = areaSize.Width - width;

From 0d94435d653d5dc9cf88e162182a7e3be84c15b1 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 12:55:23 +0300
Subject: [PATCH 59/99] Simplified LoadAndStretchEdges call logic

---
 .../Encoder/YCbCrForwardConverter{TPixel}.cs           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index a059f978d..963e6dd9e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -76,7 +76,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
         {
             Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y, new Size(8));
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, y, new Size(8), new Size(buffer.Width, buffer.Height));
 
             PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
 
@@ -100,7 +100,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
         {
             Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(16, buffer.Width - x), Math.Min(8, buffer.Height - y), new Size(16, 8));
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, y, new Size(16, 8), new Size(buffer.Width, buffer.Height));
 
             PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
 
@@ -116,10 +116,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
 
-        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height, Size areaSize)
+        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int startY, Size areaSize, Size borders)
         {
-            DebugGuard.MustBeBetweenOrEqualTo(width, 1, areaSize.Width, nameof(width));
-            DebugGuard.MustBeBetweenOrEqualTo(height, 1, areaSize.Height, nameof(height));
+            int width = Math.Min(areaSize.Width, borders.Width - startX);
+            int height = Math.Min(areaSize.Height, borders.Height - startY);
 
             uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
             int remainderXCount = areaSize.Width - width;

From 13e7cf358fb64b18aa06ba646f0d6feedac426fc Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 15:43:48 +0300
Subject: [PATCH 60/99] Divided YCbCr converters into 444/420 subsampling
 categories

---
 .../Components/Encoder/HuffmanScanEncoder.cs  |   4 +-
 .../YCbCrForwardConverter444{TPixel}.cs       | 118 +++++++++++++++++
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  | 122 ++----------------
 3 files changed, 130 insertions(+), 114 deletions(-)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index f6e55153a..4fbd9e4ec 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
 
-            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
+            var pixelConverter = YCbCrForwardConverter444<TPixel>.Create();
             ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
             Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
             RowOctet<TPixel> currentRows = default;
@@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             var unzig = ZigZag.CreateUnzigTable();
 
-            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
+            var pixelConverter = YCbCrForwardConverter444<TPixel>.Create();
 
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
new file mode 100644
index 000000000..58bb1d559
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -0,0 +1,118 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    /// <summary>
+    /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
+    /// </summary>
+    /// <typeparam name="TPixel">The pixel type to work on</typeparam>
+    internal ref struct YCbCrForwardConverter444<TPixel>
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        /// <summary>
+        /// The Y component
+        /// </summary>
+        public Block8x8F Y;
+
+        /// <summary>
+        /// The Cb component
+        /// </summary>
+        public Block8x8F Cb;
+
+        /// <summary>
+        /// The Cr component
+        /// </summary>
+        public Block8x8F Cr;
+
+        /// <summary>
+        /// The color conversion tables
+        /// </summary>
+        private RgbToYCbCrConverterLut colorTables;
+
+        /// <summary>
+        /// Temporal 8x8 block to hold TPixel data
+        /// </summary>
+        private Span<TPixel> pixelSpan;
+
+        /// <summary>
+        /// Temporal RGB block
+        /// </summary>
+        private Span<Rgb24> rgbSpan;
+
+        public Span<Block8x8F> twinBlocksY;
+
+        public static YCbCrForwardConverter444<TPixel> Create()
+        {
+            var result = default(YCbCrForwardConverter444<TPixel>);
+
+            // creating rgb pixel bufferr
+            // TODO: this is subject to discuss
+            const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
+            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
+            // TODO: this size should be configurable
+            result.pixelSpan = new TPixel[128].AsSpan();
+
+            result.twinBlocksY = new Block8x8F[2].AsSpan();
+
+            // Avoid creating lookup tables, when vectorized converter is supported
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                result.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
+        /// </summary>
+        public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
+        {
+            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), new Size(8), new Size(buffer.Width, buffer.Height));
+
+            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
+
+            ref Block8x8F yBlock = ref this.Y;
+            ref Block8x8F cbBlock = ref this.Cb;
+            ref Block8x8F crBlock = ref this.Cr;
+
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+            else
+            {
+                this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+        }
+
+        /// <summary>
+        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
+        /// </summary>
+        public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
+        {
+            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), new Size(16, 8), new Size(buffer.Width, buffer.Height));
+
+            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
+
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, this.twinBlocksY, ref this.Cb, ref this.Cr, idx);
+            }
+            else
+            {
+                throw new NotSupportedException("This is not yet implemented");
+                //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+        }
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 963e6dd9e..f5ef77091 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -4,134 +4,32 @@
 using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
-    /// <summary>
-    /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
-    /// </summary>
-    /// <typeparam name="TPixel">The pixel type to work on</typeparam>
-    internal ref struct YCbCrForwardConverter<TPixel>
+    internal static class YCbCrForwardConverter<TPixel>
         where TPixel : unmanaged, IPixel<TPixel>
     {
-        /// <summary>
-        /// The Y component
-        /// </summary>
-        public Block8x8F Y;
-
-        /// <summary>
-        /// The Cb component
-        /// </summary>
-        public Block8x8F Cb;
-
-        /// <summary>
-        /// The Cr component
-        /// </summary>
-        public Block8x8F Cr;
-
-        /// <summary>
-        /// The color conversion tables
-        /// </summary>
-        private RgbToYCbCrConverterLut colorTables;
-
-        /// <summary>
-        /// Temporal 8x8 block to hold TPixel data
-        /// </summary>
-        private Span<TPixel> pixelSpan;
-
-        /// <summary>
-        /// Temporal RGB block
-        /// </summary>
-        private Span<Rgb24> rgbSpan;
-
-        public Span<Block8x8F> twinBlocksY;
-
-        public static YCbCrForwardConverter<TPixel> Create()
-        {
-            var result = default(YCbCrForwardConverter<TPixel>);
-
-            // creating rgb pixel bufferr
-            // TODO: this is subject to discuss
-            const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
-            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
-            // TODO: this size should be configurable
-            result.pixelSpan = new TPixel[128].AsSpan();
-
-            result.twinBlocksY = new Block8x8F[2].AsSpan();
-
-            // Avoid creating lookup tables, when vectorized converter is supported
-            if (!RgbToYCbCrConverterVectorized.IsSupported)
-            {
-                result.colorTables = RgbToYCbCrConverterLut.Create();
-            }
-
-            return result;
-        }
-
-        /// <summary>
-        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
-        /// </summary>
-        public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
+        public static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, Point start, Size sampleSize, Size totalSize)
         {
-            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, y, new Size(8), new Size(buffer.Width, buffer.Height));
-
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
-
-            ref Block8x8F yBlock = ref this.Y;
-            ref Block8x8F cbBlock = ref this.Cb;
-            ref Block8x8F crBlock = ref this.Cr;
+            DebugGuard.MustBeBetweenOrEqualTo(start.X, 1, totalSize.Width - 1, nameof(start.X));
+            DebugGuard.MustBeBetweenOrEqualTo(start.Y, 1, totalSize.Height - 1, nameof(start.Y));
 
-            if (RgbToYCbCrConverterVectorized.IsSupported)
-            {
-                RgbToYCbCrConverterVectorized.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
-            }
-            else
-            {
-                this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
-            }
-        }
-
-        /// <summary>
-        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
-        /// </summary>
-        public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
-        {
-            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, y, new Size(16, 8), new Size(buffer.Width, buffer.Height));
-
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
-
-            if (RgbToYCbCrConverterVectorized.IsSupported)
-            {
-                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, this.twinBlocksY, ref this.Cb, ref this.Cr, idx);
-            }
-            else
-            {
-                throw new NotSupportedException("This is not yet implemented");
-                //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
-            }
-        }
-
-
-        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int startY, Size areaSize, Size borders)
-        {
-            int width = Math.Min(areaSize.Width, borders.Width - startX);
-            int height = Math.Min(areaSize.Height, borders.Height - startY);
+            int width = Math.Min(sampleSize.Width, totalSize.Width - start.X);
+            int height = Math.Min(sampleSize.Height, totalSize.Height - start.Y);
 
             uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
-            int remainderXCount = areaSize.Width - width;
+            int remainderXCount = sampleSize.Width - width;
 
             ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<TPixel, byte>(dest));
-            int rowSizeInBytes = areaSize.Width * Unsafe.SizeOf<TPixel>();
+            int rowSizeInBytes = sampleSize.Width * Unsafe.SizeOf<TPixel>();
 
             for (int y = 0; y < height; y++)
             {
                 Span<TPixel> row = source[y];
 
-                ref byte s = ref Unsafe.As<TPixel, byte>(ref row[startX]);
+                ref byte s = ref Unsafe.As<TPixel, byte>(ref row[start.X]);
                 ref byte d = ref Unsafe.Add(ref blockStart, y * rowSizeInBytes);
 
                 Unsafe.CopyBlock(ref d, ref s, byteWidth);
@@ -144,7 +42,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 }
             }
 
-            int remainderYCount = areaSize.Height - height;
+            int remainderYCount = sampleSize.Height - height;
 
             if (remainderYCount == 0)
             {

From 12b4b83cb6df5499d0b2211ae8ddf4d6b7e88363 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 18:21:36 +0300
Subject: [PATCH 61/99] 444 converter fixes

---
 .../YCbCrForwardConverter444{TPixel}.cs       | 42 ++++++-------------
 1 file changed, 12 insertions(+), 30 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index 58bb1d559..8fef55302 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -16,6 +16,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
     internal ref struct YCbCrForwardConverter444<TPixel>
         where TPixel : unmanaged, IPixel<TPixel>
     {
+        // TODO: documentation
+        private const int RgbSpanByteSize = 8 * 8 * 3;
+        // TODO: documentation
+        private const int PixelSpanSize = 8 * 8;
+
+
         /// <summary>
         /// The Y component
         /// </summary>
@@ -37,29 +43,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         private RgbToYCbCrConverterLut colorTables;
 
         /// <summary>
-        /// Temporal 8x8 block to hold TPixel data
+        /// Temporary span of 64 (8x8) elements holding unconverted TPixel data
         /// </summary>
         private Span<TPixel> pixelSpan;
 
         /// <summary>
-        /// Temporal RGB block
+        /// Temporary span holding the 8x8 block converted to Rgb24 data, plus padding bytes
         /// </summary>
         private Span<Rgb24> rgbSpan;
 
-        public Span<Block8x8F> twinBlocksY;
-
         public static YCbCrForwardConverter444<TPixel> Create()
         {
             var result = default(YCbCrForwardConverter444<TPixel>);
 
             // creating rgb pixel bufferr
             // TODO: this is subject to discuss
-            const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
-            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
-            // TODO: this size should be configurable
-            result.pixelSpan = new TPixel[128].AsSpan();
+            // converter.Convert comments for +8 padding
+            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + 8].AsSpan());
 
-            result.twinBlocksY = new Block8x8F[2].AsSpan();
+            // TODO: this is subject to discuss
+            result.pixelSpan = new TPixel[PixelSpanSize].AsSpan();
 
             // Avoid creating lookup tables, when vectorized converter is supported
             if (!RgbToYCbCrConverterVectorized.IsSupported)
@@ -93,26 +96,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
         }
-
-        /// <summary>
-        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
-        /// </summary>
-        public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
-        {
-            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), new Size(16, 8), new Size(buffer.Width, buffer.Height));
-
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
-
-            if (RgbToYCbCrConverterVectorized.IsSupported)
-            {
-                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, this.twinBlocksY, ref this.Cb, ref this.Cr, idx);
-            }
-            else
-            {
-                throw new NotSupportedException("This is not yet implemented");
-                //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
-            }
-        }
     }
 }

From 953095f1b981a59372bfc7b7c7c94ce8d4d68002 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 18:52:03 +0300
Subject: [PATCH 62/99] 420 converter fixes

---
 .../Components/Encoder/HuffmanScanEncoder.cs  | 10 +++---
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 35 +++++++++++++++++--
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 4fbd9e4ec..3231c5781 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             var unzig = ZigZag.CreateUnzigTable();
 
-            var pixelConverter = YCbCrForwardConverter444<TPixel>.Create();
+            var pixelConverter = YCbCrForwardConverter420<TPixel>.Create();
 
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
@@ -138,23 +138,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 cancellationToken.ThrowIfCancellationRequested();
                 for (int x = 0; x < pixels.Width; x += 16)
                 {
-                    for(int i = 0; i < 2; i++)
+                    for (int i = 0; i < 2; i++)
                     {
                         int yOff = i * 8;
                         currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert420(frame, x, y, ref currentRows, i);
+                        pixelConverter.Convert(frame, x, y, ref currentRows, i);
 
                         prevDCY = this.WriteBlock(
                             QuantIndex.Luminance,
                             prevDCY,
-                            ref pixelConverter.twinBlocksY[0],
+                            ref pixelConverter.YLeft,
                             ref luminanceQuantTable,
                             ref unzig);
 
                         prevDCY = this.WriteBlock(
                             QuantIndex.Luminance,
                             prevDCY,
-                            ref pixelConverter.twinBlocksY[1],
+                            ref pixelConverter.YRight,
                             ref luminanceQuantTable,
                             ref unzig);
                     }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index e5fe4dea2..cf4d47774 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -28,6 +28,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
 #if SUPPORTS_RUNTIME_INTRINSICS
+        // TODO: documentation
+        public const int AvxRegisterRgbCompatibilityOffset = 8;
+
         private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
         {
             0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
@@ -205,7 +208,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
         /// </summary>
-        public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, Span<Block8x8F> yBlocks, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
 
@@ -241,7 +244,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             for (int i = 0; i < 4; i++)
             {
                 // 16x2 => 8x1
-                for (int j = 0; j < 4; j++)
+                // left 8x8 column conversions
+                for (int j = 0; j < 4; j += 2)
+                {
+                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+                    rg = Avx2.UnpackLow(rgb, zero);
+                    bx = Avx2.UnpackHigh(rgb, zero);
+
+                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+
+                    int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Unsafe.Add(ref yBlockLeft.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+
+                    rDataLanes[j] = r;
+                    gDataLanes[j] = g;
+                    bDataLanes[j] = b;
+                }
+
+                // 16x2 => 8x1
+                // right 8x8 column conversions
+                for (int j = 1; j < 4; j += 2)
                 {
                     rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
 
@@ -257,7 +286,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
 
                     // (0.299F * r) + (0.587F * g) + (0.114F * b);
-                    Unsafe.Add(ref yBlocks[j & 1].V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+                    Unsafe.Add(ref yBlockRight.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
 
                     rDataLanes[j] = r;
                     gDataLanes[j] = g;

From 5fc29a2e9899171878b6c703868f657c62f8e735 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 18:52:39 +0300
Subject: [PATCH 63/99] Introduced separate 420 converter

---
 .../YCbCrForwardConverter420{TPixel}.cs       | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
new file mode 100644
index 000000000..c831b611c
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -0,0 +1,95 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    /// <summary>
+    /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 16x8 pixel areas with 4:2:0 chroma subsampling.
+    /// </summary>
+    /// <typeparam name="TPixel">The pixel type to work on</typeparam>
+    internal ref struct YCbCrForwardConverter420<TPixel>
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        /// <summary>
+        /// The left Y component
+        /// </summary>
+        public Block8x8F YLeft;
+
+        /// <summary>
+        /// The right Y component
+        /// </summary>
+        public Block8x8F YRight;
+
+        /// <summary>
+        /// The Cb component
+        /// </summary>
+        public Block8x8F Cb;
+
+        /// <summary>
+        /// The Cr component
+        /// </summary>
+        public Block8x8F Cr;
+
+        /// <summary>
+        /// The color conversion tables
+        /// </summary>
+        private RgbToYCbCrConverterLut colorTables;
+
+        /// <summary>
+        /// Temporal 16x8 block to hold TPixel data
+        /// </summary>
+        private Span<TPixel> pixelSpan;
+
+        /// <summary>
+        /// Temporal RGB block
+        /// </summary>
+        private Span<Rgb24> rgbSpan;
+
+        public static YCbCrForwardConverter420<TPixel> Create()
+        {
+            var result = default(YCbCrForwardConverter420<TPixel>);
+
+            // TODO: this is subject to discuss
+            const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
+            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
+
+            // TODO: this size should be configurable
+            result.pixelSpan = new TPixel[128].AsSpan();
+
+            // Avoid creating lookup tables, when vectorized converter is supported
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                result.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
+        /// </summary>
+        public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
+        {
+            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), new Size(16, 8), new Size(buffer.Width, buffer.Height));
+
+            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
+
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
+            }
+            else
+            {
+                throw new NotSupportedException("This is not yet implemented");
+                //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+        }
+    }
+}

From cb1acaec78c92688774f7245c6ae7345a2aeda6a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 22:30:45 +0300
Subject: [PATCH 64/99] Finished 420 subsampling converter

---
 .../Components/Encoder/HuffmanScanEncoder.cs  |  6 +-
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 16 +++++-
 .../YCbCrForwardConverter420{TPixel}.cs       | 55 +++++++++++++------
 3 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 3231c5781..283a98fab 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -125,14 +125,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         {
             var unzig = ZigZag.CreateUnzigTable();
 
-            var pixelConverter = YCbCrForwardConverter420<TPixel>.Create();
-
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
             ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
             Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
             RowOctet<TPixel> currentRows = default;
 
+            var pixelConverter = new YCbCrForwardConverter420<TPixel>(frame);
+
             for (int y = 0; y < pixels.Height; y += 16)
             {
                 cancellationToken.ThrowIfCancellationRequested();
@@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     {
                         int yOff = i * 8;
                         currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert(frame, x, y, ref currentRows, i);
+                        pixelConverter.Convert(x, y, ref currentRows, i);
 
                         prevDCY = this.WriteBlock(
                             QuantIndex.Luminance,
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index cf4d47774..b9f0fa427 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -27,9 +27,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
         }
 
+        /// <summary>
+        /// Gets the number of extra bytes callers append to their Rgb24 scratch buffers.
+        /// </summary>
+        /// <remarks>
+        /// Evaluates to 8 when <see cref="IsSupported"/> is true and to 0 otherwise.
+        /// NOTE(review): presumably the padding lets the AVX path load a whole register
+        /// at the last pixels without leaving the buffer - confirm against Convert.
+        /// </remarks>
+        public static int AvxRegisterRgbCompatibilityPadding
+        {
+            get => IsSupported ? 8 : 0;
+        }
+
 #if SUPPORTS_RUNTIME_INTRINSICS
-        // TODO: documentation
-        public const int AvxRegisterRgbCompatibilityOffset = 8;
 
         private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
         {
@@ -306,7 +317,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
         }
 
-
 #if SUPPORTS_RUNTIME_INTRINSICS
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
index c831b611c..fdb41a8e2 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -16,6 +16,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
     internal ref struct YCbCrForwardConverter420<TPixel>
         where TPixel : unmanaged, IPixel<TPixel>
     {
+        // Number of pixels in one conversion sample: a 16x8 area (two 8x8 blocks side by side).
+        private const int PixelsPerSample = 16 * 8;
+
+        // Byte size of one sample stored as Rgb24 (3 bytes per pixel); a true compile-time constant.
+        private const int RgbSpanByteSize = PixelsPerSample * 3;
+
+        // Dimensions of the pixel area loaded by each Convert call.
+        private static readonly Size SampleSize = new Size(16, 8);
+
         /// <summary>
         /// The left Y component
         /// </summary>
@@ -51,35 +60,45 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private Span<Rgb24> rgbSpan;
 
-        public static YCbCrForwardConverter420<TPixel> Create()
+        // Size of the source frame, passed to LoadAndStretchEdges on every Convert call.
+        private Size samplingAreaSize;
+
+        // Configuration of the source frame, passed to the pixel to Rgb24 conversion.
+        private Configuration config;
+
+        // Binds the converter to a frame: allocates the scratch buffers and caches the frame size and configuration.
+        public YCbCrForwardConverter420(ImageFrame<TPixel> frame)
         {
-            var result = default(YCbCrForwardConverter420<TPixel>);
+            // the coefficient blocks are filled in by the Convert calls
+            this.YLeft = default;
+            this.YRight = default;
+            this.Cb = default;
+            this.Cr = default;
 
-            // TODO: this is subject to discuss
-            const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
-            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
+            // temporary pixel buffers, reused by every Convert call
+            this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
 
-            // TODO: this size should be configurable
-            result.pixelSpan = new TPixel[128].AsSpan();
+            // frame data that stays the same for every Convert call
+            this.samplingAreaSize = new Size(frame.Width, frame.Height);
+            this.config = frame.GetConfiguration();
 
-            // Avoid creating lookup tables, when vectorized converter is supported
+            // lookup tables are only created when the vectorized converter is not supported
             if (!RgbToYCbCrConverterVectorized.IsSupported)
             {
-                result.colorTables = RgbToYCbCrConverterLut.Create();
+                this.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+            else
+            {
+                this.colorTables = default;
             }
-
-            return result;
         }
 
-        /// <summary>
-        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
-        /// </summary>
-        public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
+        public void Convert(int x, int y, ref RowOctet<TPixel> currentRows, int idx)
         {
-            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), new Size(16, 8), new Size(buffer.Width, buffer.Height));
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
 
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
+            PixelOperations<TPixel>.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {

From 672da457d340b2ae6df50d880dfdba0f12c9e2ec Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 30 May 2021 22:44:09 +0300
Subject: [PATCH 65/99] Finished 444 subsampling converter

---
 .../Components/Encoder/HuffmanScanEncoder.cs  |  5 +-
 .../YCbCrForwardConverter444{TPixel}.cs       | 53 +++++++++++++++----
 2 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 283a98fab..218b2b59c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -71,11 +71,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // ReSharper disable once InconsistentNaming
             int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
 
-            var pixelConverter = YCbCrForwardConverter444<TPixel>.Create();
             ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
             Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
             RowOctet<TPixel> currentRows = default;
 
+            var pixelConverter = new YCbCrForwardConverter444<TPixel>(frame);
+
             for (int y = 0; y < pixels.Height; y += 8)
             {
                 cancellationToken.ThrowIfCancellationRequested();
@@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
                 for (int x = 0; x < pixels.Width; x += 8)
                 {
-                    pixelConverter.Convert(frame, x, y, ref currentRows);
+                    pixelConverter.Convert(x, y, ref currentRows);
 
                     prevDCY = this.WriteBlock(
                         QuantIndex.Luminance,
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index 8fef55302..27f7e3ae9 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -16,10 +16,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
     internal ref struct YCbCrForwardConverter444<TPixel>
         where TPixel : unmanaged, IPixel<TPixel>
     {
-        // TODO: documentation
-        private const int RgbSpanByteSize = 8 * 8 * 3;
-        // TODO: documentation
-        private const int PixelSpanSize = 8 * 8;
+        // Number of pixels in one conversion sample: a single 8x8 block.
+        private const int PixelsPerSample = 8 * 8;
+
+        // Byte size of one sample stored as Rgb24 (3 bytes per pixel).
+        private const int RgbSpanByteSize = PixelsPerSample * 3;
+
+        // Dimensions of the pixel area loaded by each Convert call.
+        private static readonly Size SampleSize = new Size(8, 8);
 
 
         /// <summary>
@@ -52,6 +56,38 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private Span<Rgb24> rgbSpan;
 
+        // Size of the source frame, passed to LoadAndStretchEdges on every Convert call.
+        private Size samplingAreaSize;
+
+        // Configuration of the source frame, passed to the pixel to Rgb24 conversion.
+        private readonly Configuration config;
+
+        public YCbCrForwardConverter444(ImageFrame<TPixel> frame)
+        {
+            // the coefficient blocks are filled in by the Convert calls
+            this.Y = default;
+            this.Cb = default;
+            this.Cr = default;
+
+            // temporary pixel buffers, reused by every Convert call
+            this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
+
+            // frame data that stays the same for every Convert call
+            this.samplingAreaSize = new Size(frame.Width, frame.Height);
+            this.config = frame.GetConfiguration();
+
+            // lookup tables are only created when the vectorized converter is not supported
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                this.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+            else
+            {
+                this.colorTables = default;
+            }
+        }
+
         public static YCbCrForwardConverter444<TPixel> Create()
         {
             var result = default(YCbCrForwardConverter444<TPixel>);
@@ -62,7 +98,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + 8].AsSpan());
 
             // TODO: this is subject to discuss
-            result.pixelSpan = new TPixel[PixelSpanSize].AsSpan();
+            result.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
 
             // Avoid creating lookup tables, when vectorized converter is supported
             if (!RgbToYCbCrConverterVectorized.IsSupported)
@@ -76,12 +112,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
         /// </summary>
-        public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
+        public void Convert(int x, int y, ref RowOctet<TPixel> currentRows)
         {
-            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), new Size(8), new Size(buffer.Width, buffer.Height));
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
 
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
+            PixelOperations<TPixel>.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
 
             ref Block8x8F yBlock = ref this.Y;
             ref Block8x8F cbBlock = ref this.Cb;

From 1d54702dc1ae9b65cb471eeeaa331ded112479cc Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Wed, 2 Jun 2021 17:24:56 +0100
Subject: [PATCH 66/99] Update shared-infrastructure

---
 shared-infrastructure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared-infrastructure b/shared-infrastructure
index 48e73f455..1f7ee7028 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 48e73f455f15eafefbe3175efc7433e5f277e506
+Subproject commit 1f7ee702812f3a1713ab7f749c0faae0ef139ed7

From 5ea8da6c979f4e5a8dc2ba7131e0624ec1535ca1 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Wed, 2 Jun 2021 18:23:09 +0100
Subject: [PATCH 67/99] Fix BitOperations

---
 src/ImageSharp/Common/Helpers/Numerics.cs | 33 ++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index e8ba6dde6..6bf06150b 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -23,6 +23,28 @@ namespace SixLabors.ImageSharp
         private const int ShuffleAlphaControl = 0b_11_11_11_11;
 #endif
 
+#if !SUPPORTS_BITOPERATIONS
+        /// <summary>
+        /// Gets a 256-entry lookup table: the entry at index <c>i</c> is the number of bits needed to hold the byte value <c>i</c>.
+        /// </summary>
+        private static ReadOnlySpan<byte> BitCountLut => new byte[]
+        {
+            0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8,
+        };
+#endif
+
         /// <summary>
         /// Determine the Greatest CommonDivisor (GCD) of two numbers.
         /// </summary>
@@ -756,7 +778,7 @@ namespace SixLabors.ImageSharp
         /// widening them to 32-bit integers and performing four additions.
         /// </summary>
         /// <remarks>
-        /// <code>byte(1, 2, 3, 4,  5, 6, 7, 8,  9, 10, 11, 12,  13, 14, 15, 16)</code>
+        /// <c>byte(1, 2, 3, 4,  5, 6, 7, 8,  9, 10, 11, 12,  13, 14, 15, 16)</c>
         /// is widened and added onto <paramref name="accumulator"/> as such:
         /// <code>
         ///  accumulator += i32(1, 2, 3, 4);
@@ -834,8 +856,17 @@ namespace SixLabors.ImageSharp
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int MinimumBitsToStore(uint number)
         {
+#if !SUPPORTS_BITOPERATIONS
+            // The table only covers one byte: pick the highest non-zero byte of the full
+            // 32-bit value and add its bit offset, so the result is valid for every uint.
+            // The shift is applied to the unsigned value, so no sign extension can occur.
+            int shift = number >= 0x1000000 ? 24 : number >= 0x10000 ? 16 : number >= 0x100 ? 8 : 0;
+
+            return shift + BitCountLut[(int)(number >> shift)];
+#else
             const int bitInUnsignedInteger = sizeof(uint) * 8;
             return bitInUnsignedInteger - BitOperations.LeadingZeroCount(number);
+#endif
         }
     }
 }

From de176b699e377ce4da7f005c66a9351d77b8eed1 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 3 Jun 2021 17:35:23 +0300
Subject: [PATCH 68/99] Initial 420 subsampling lut conversion implementation

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 90 +++++++++++++++++++
 .../YCbCrForwardConverter420{TPixel}.cs       |  3 +-
 2 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 1ceea1e08..635e571b7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -115,6 +115,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ConvertPixelInto(
+            int r,
+            int g,
+            int b,
+            ref Block8x8F yResult,
+            int i)
+        {
+            // float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
+            yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ConvertPixelInto(
+            int r,
+            int g,
+            int b,
+            ref Block8x8F cbResult,
+            ref Block8x8F crResult,
+            int i)
+        {
+            // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
+            cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
+
+            // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
+            crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
+        }
+
         public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
             ref Rgb24 rgbStart = ref rgbSpan[0];
@@ -134,6 +162,68 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
         }
 
+        public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        {
+            ref Rgb24 rgbStart = ref rgbSpan[0];
+            for (int i = 0; i < 8; i += 2)
+            {
+                Span<int> r = stackalloc int[8];
+                Span<int> g = stackalloc int[8];
+                Span<int> b = stackalloc int[8];
+
+                for (int j = 0; j < 2; j++)
+                {
+                    // left
+                    ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, (i + j) * 16);
+                    for (int k = 0; k < 8; k += 2)
+                    {
+                        int r0 = Unsafe.Add(ref stride, k).R;
+                        int g0 = Unsafe.Add(ref stride, k).G;
+                        int b0 = Unsafe.Add(ref stride, k).B;
+                        this.ConvertPixelInto(r0, g0, b0, ref yBlockLeft, (i + j) * 8 + k);
+
+                        int r1 = Unsafe.Add(ref stride, k + 1).R;
+                        int g1 = Unsafe.Add(ref stride, k + 1).G;
+                        int b1 = Unsafe.Add(ref stride, k + 1).B;
+                        this.ConvertPixelInto(r1, g1, b1, ref yBlockLeft, (i + j) * 8 + k + 1);
+
+                        int idx = k / 2;
+                        r[idx] += r0 + r1;
+                        g[idx] += g0 + g1;
+                        b[idx] += b0 + b1;
+                    }
+
+                    // right
+                    stride = ref Unsafe.Add(ref stride, 8);
+                    for (int k = 0; k < 8; k += 2)
+                    {
+                        int r0 = Unsafe.Add(ref stride, k).R;
+                        int g0 = Unsafe.Add(ref stride, k).G;
+                        int b0 = Unsafe.Add(ref stride, k).B;
+                        this.ConvertPixelInto(r0, g0, b0, ref yBlockRight, (i + j) * 8 + k);
+
+                        int r1 = Unsafe.Add(ref stride, k + 1).R;
+                        int g1 = Unsafe.Add(ref stride, k + 1).G;
+                        int b1 = Unsafe.Add(ref stride, k + 1).B;
+                        this.ConvertPixelInto(r1, g1, b1, ref yBlockRight, (i + j) * 8 + k + 1);
+
+                        int idx = 4 + (k / 2);
+                        r[idx] += r0 + r1;
+                        g[idx] += g0 + g1;
+                        b[idx] += b0 + b1;
+                    }
+                }
+
+                int writeIdx =
+                    row * Block8x8F.Size / 2 // upper or lower part
+                    + (i / 2) * 8;           // which row
+                for (int j = 0; j < 8; j++)
+                {
+                    this.ConvertPixelInto(r[j] / 4, g[j] / 4, b[j] / 4, ref cbBlock, ref crBlock, writeIdx + j);
+                }
+            }
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int Fix(float x)
             => (int)((x * (1L << ScaleBits)) + 0.5F);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
index fdb41a8e2..2e8433cdc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -106,8 +106,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
             else
             {
-                throw new NotSupportedException("This is not yet implemented");
-                //this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+                this.colorTables.Convert(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
             }
         }
     }

From 7896e24606ba15500e43bcdaa856cebee9e42b67 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 4 Jun 2021 13:47:10 +0300
Subject: [PATCH 69/99] Improved non-simd ycbcr lut converter code

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 37 ++++++++-----------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 635e571b7..06e8f26b6 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -177,40 +177,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, (i + j) * 16);
                     for (int k = 0; k < 8; k += 2)
                     {
-                        int r0 = Unsafe.Add(ref stride, k).R;
-                        int g0 = Unsafe.Add(ref stride, k).G;
-                        int b0 = Unsafe.Add(ref stride, k).B;
-                        this.ConvertPixelInto(r0, g0, b0, ref yBlockLeft, (i + j) * 8 + k);
+                        Rgb24 px0 = Unsafe.Add(ref stride, k);
+                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockLeft, (i + j) * 8 + k);
 
-                        int r1 = Unsafe.Add(ref stride, k + 1).R;
-                        int g1 = Unsafe.Add(ref stride, k + 1).G;
-                        int b1 = Unsafe.Add(ref stride, k + 1).B;
-                        this.ConvertPixelInto(r1, g1, b1, ref yBlockLeft, (i + j) * 8 + k + 1);
+                        Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref yBlockLeft, (i + j) * 8 + k + 1);
 
                         int idx = k / 2;
-                        r[idx] += r0 + r1;
-                        g[idx] += g0 + g1;
-                        b[idx] += b0 + b1;
+                        r[idx] += px0.R + px1.R;
+                        g[idx] += px0.G + px1.G;
+                        b[idx] += px0.B + px1.B;
                     }
 
                     // right
                     stride = ref Unsafe.Add(ref stride, 8);
                     for (int k = 0; k < 8; k += 2)
                     {
-                        int r0 = Unsafe.Add(ref stride, k).R;
-                        int g0 = Unsafe.Add(ref stride, k).G;
-                        int b0 = Unsafe.Add(ref stride, k).B;
-                        this.ConvertPixelInto(r0, g0, b0, ref yBlockRight, (i + j) * 8 + k);
+                        Rgb24 px0 = Unsafe.Add(ref stride, k);
+                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRight, (i + j) * 8 + k);
 
-                        int r1 = Unsafe.Add(ref stride, k + 1).R;
-                        int g1 = Unsafe.Add(ref stride, k + 1).G;
-                        int b1 = Unsafe.Add(ref stride, k + 1).B;
-                        this.ConvertPixelInto(r1, g1, b1, ref yBlockRight, (i + j) * 8 + k + 1);
+                        Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref yBlockRight, (i + j) * 8 + k + 1);
 
                         int idx = 4 + (k / 2);
-                        r[idx] += r0 + r1;
-                        g[idx] += g0 + g1;
-                        b[idx] += b0 + b1;
+                        r[idx] += px0.R + px1.R;
+                        g[idx] += px0.G + px1.G;
+                        b[idx] += px0.B + px1.B;
+
                     }
                 }
 

From 2e25a3ee34ca3c21c9ade0a5c3c11131167a319b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 4 Jun 2021 14:16:32 +0300
Subject: [PATCH 70/99] Optimized non-simd ycbcr lut converter code

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 06e8f26b6..e26e73044 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -167,9 +167,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             ref Rgb24 rgbStart = ref rgbSpan[0];
             for (int i = 0; i < 8; i += 2)
             {
-                Span<int> r = stackalloc int[8];
-                Span<int> g = stackalloc int[8];
-                Span<int> b = stackalloc int[8];
+                Span<int> rgbTriplets = stackalloc int[24]; // 8 pixels by 3 integers
 
                 for (int j = 0; j < 2; j++)
                 {
@@ -183,10 +181,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
                         this.ConvertPixelInto(px1.R, px1.G, px1.B, ref yBlockLeft, (i + j) * 8 + k + 1);
 
-                        int idx = k / 2;
-                        r[idx] += px0.R + px1.R;
-                        g[idx] += px0.G + px1.G;
-                        b[idx] += px0.B + px1.B;
+                        int idx = 3 * (k / 2);
+                        rgbTriplets[idx] += px0.R + px1.R;
+                        rgbTriplets[idx + 1] += px0.G + px1.G;
+                        rgbTriplets[idx + 2] += px0.B + px1.B;
                     }
 
                     // right
@@ -199,10 +197,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
                         this.ConvertPixelInto(px1.R, px1.G, px1.B, ref yBlockRight, (i + j) * 8 + k + 1);
 
-                        int idx = 4 + (k / 2);
-                        r[idx] += px0.R + px1.R;
-                        g[idx] += px0.G + px1.G;
-                        b[idx] += px0.B + px1.B;
+                        int idx = 3 * (4 + (k / 2));
+                        rgbTriplets[idx] += px0.R + px1.R;
+                        rgbTriplets[idx + 1] += px0.G + px1.G;
+                        rgbTriplets[idx + 2] += px0.B + px1.B;
 
                     }
                 }
@@ -212,7 +210,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     + (i / 2) * 8;           // which row
                 for (int j = 0; j < 8; j++)
                 {
-                    this.ConvertPixelInto(r[j] / 4, g[j] / 4, b[j] / 4, ref cbBlock, ref crBlock, writeIdx + j);
+                    int idx = j * 3;
+                    this.ConvertPixelInto(rgbTriplets[idx] / 4, rgbTriplets[idx + 1] / 4, rgbTriplets[idx + 2] / 4, ref cbBlock, ref crBlock, writeIdx + j);
                 }
             }
         }

From 44bae0b79e8ee83dbbf5533c32f2eb34a33de490 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 4 Jun 2021 16:50:07 +0300
Subject: [PATCH 71/99]  Made non-simd ycbcr lut converter code more readable

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 54 ++++++++++++-------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index e26e73044..18f5ee0e7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -128,21 +128,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void ConvertPixelInto(
-            int r,
-            int g,
-            int b,
-            ref Block8x8F cbResult,
-            ref Block8x8F crResult,
-            int i)
+        private void ConvertPixelInto(int r, int g, int b, ref float yResult) =>
+            // float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
+            yResult = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ConvertPixelInto(int r, int g, int b, ref float cbResult, ref float crResult)
         {
             // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
-            cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
+            cbResult = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
 
             // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
-            crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
+            crResult = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
+
         public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
             ref Rgb24 rgbStart = ref rgbSpan[0];
@@ -164,10 +164,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
         public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
         {
+            ref float yBlockLeftRef = ref Unsafe.As<Block8x8F, float>(ref yBlockLeft);
+            ref float yBlockRightRef = ref Unsafe.As<Block8x8F, float>(ref yBlockRight);
+
+            // 0-31 or 32-63
+            // upper or lower part
+            int chromaWriteOffset = row * Block8x8F.Size / 2;
+            ref float cbBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref cbBlock), chromaWriteOffset);
+            ref float crBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref crBlock), chromaWriteOffset);
+
             ref Rgb24 rgbStart = ref rgbSpan[0];
+
             for (int i = 0; i < 8; i += 2)
             {
-                Span<int> rgbTriplets = stackalloc int[24]; // 8 pixels by 3 integers
+                // 8 pixels by 3 integers
+                Span<int> rgbTriplets = stackalloc int[24];
 
                 for (int j = 0; j < 2; j++)
                 {
@@ -175,11 +186,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, (i + j) * 16);
                     for (int k = 0; k < 8; k += 2)
                     {
+                        ref float yBlockRef = ref Unsafe.Add(ref yBlockLeftRef, (i + j) * 8 + k);
+
                         Rgb24 px0 = Unsafe.Add(ref stride, k);
-                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockLeft, (i + j) * 8 + k);
+                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
 
                         Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
-                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref yBlockLeft, (i + j) * 8 + k + 1);
+                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
 
                         int idx = 3 * (k / 2);
                         rgbTriplets[idx] += px0.R + px1.R;
@@ -191,11 +204,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     stride = ref Unsafe.Add(ref stride, 8);
                     for (int k = 0; k < 8; k += 2)
                     {
+                        ref float yBlockRef = ref Unsafe.Add(ref yBlockRightRef, (i + j) * 8 + k);
+
                         Rgb24 px0 = Unsafe.Add(ref stride, k);
-                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRight, (i + j) * 8 + k);
+                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
 
                         Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
-                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref yBlockRight, (i + j) * 8 + k + 1);
+                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
 
                         int idx = 3 * (4 + (k / 2));
                         rgbTriplets[idx] += px0.R + px1.R;
@@ -205,13 +220,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     }
                 }
 
-                int writeIdx =
-                    row * Block8x8F.Size / 2 // upper or lower part
-                    + (i / 2) * 8;           // which row
+                int writeIdx = 8 * (i / 2);
                 for (int j = 0; j < 8; j++)
                 {
                     int idx = j * 3;
-                    this.ConvertPixelInto(rgbTriplets[idx] / 4, rgbTriplets[idx + 1] / 4, rgbTriplets[idx + 2] / 4, ref cbBlock, ref crBlock, writeIdx + j);
+                    this.ConvertPixelInto(
+                        rgbTriplets[idx] / 4,       // r
+                        rgbTriplets[idx + 1] / 4,   // g
+                        rgbTriplets[idx + 2] / 4,   // b
+                        ref Unsafe.Add(ref cbBlockRef, writeIdx + j),
+                        ref Unsafe.Add(ref crBlockRef, writeIdx + j));
                 }
             }
         }

From 078703b595ecf204db96c34220b1d23ca9499b8a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 4 Jun 2021 17:28:57 +0300
Subject: [PATCH 72/99] Added docs, renamed LuT converter for 444 and 420
 subsampling methods, added debug guards

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 32 +++++++++++++++----
 .../YCbCrForwardConverter420{TPixel}.cs       |  2 +-
 .../YCbCrForwardConverter444{TPixel}.cs       |  2 +-
 .../Encoder/YCbCrForwardConverterBenchmark.cs |  2 +-
 .../Formats/Jpg/RgbToYCbCrConverterTests.cs   |  2 +-
 5 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 18f5ee0e7..3706b8062 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -142,8 +142,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             crResult = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
-
-        public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        /// <summary>
+        /// Converts Rgb24 pixels into YCbCr color space with 4:4:4 subsampling of luminance and chroma.
+        /// </summary>
+        /// <param name="rgbSpan">Span of Rgb24 pixel data</param>
+        /// <param name="yBlock">Resulting Y values block</param>
+        /// <param name="cbBlock">Resulting Cb values block</param>
+        /// <param name="crBlock">Resulting Cr values block</param>
+        public void Convert444(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
             ref Rgb24 rgbStart = ref rgbSpan[0];
 
@@ -162,8 +168,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
         }
 
-        public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        /// <summary>
+        /// Converts Rgb24 pixels into YCbCr color space with 4:2:0 subsampling of luminance and chroma.
+        /// </summary>
+        /// <remarks>Calculates 2 out of 4 luminance blocks and half of chroma blocks. This method must be called twice per 4x 8x8 DCT blocks with different row param.</remarks>
+        /// <param name="rgbSpan">Span of Rgb24 pixel data</param>
+        /// <param name="yBlockLeft">First or "left" resulting Y block</param>
+        /// <param name="yBlockRight">Second or "right" resulting Y block</param>
+        /// <param name="cbBlock">Resulting Cb values block</param>
+        /// <param name="crBlock">Resulting Cr values block</param>
+        /// <param name="row">Row index of the 16x16 block, 0 or 1</param>
+        public void Convert420(Span<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
         {
+            DebugGuard.MustBeBetweenOrEqualTo(row, 0, 1, nameof(row));
+
             ref float yBlockLeftRef = ref Unsafe.As<Block8x8F, float>(ref yBlockLeft);
             ref float yBlockRightRef = ref Unsafe.As<Block8x8F, float>(ref yBlockRight);
 
@@ -189,9 +207,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref float yBlockRef = ref Unsafe.Add(ref yBlockLeftRef, (i + j) * 8 + k);
 
                         Rgb24 px0 = Unsafe.Add(ref stride, k);
-                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
-
                         Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+
+                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
                         this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
 
                         int idx = 3 * (k / 2);
@@ -207,9 +225,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         ref float yBlockRef = ref Unsafe.Add(ref yBlockRightRef, (i + j) * 8 + k);
 
                         Rgb24 px0 = Unsafe.Add(ref stride, k);
-                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
-
                         Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+
+                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
                         this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
 
                         int idx = 3 * (4 + (k / 2));
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
index 2e8433cdc..e0e7854b0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -106,7 +106,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
             else
             {
-                this.colorTables.Convert(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
+                this.colorTables.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
             }
         }
     }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index 27f7e3ae9..f3ae33934 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -128,7 +128,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
             else
             {
-                this.colorTables.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+                this.colorTables.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
         }
     }
diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
index 1db407293..60a585384 100644
--- a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
+++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
@@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
             Block8x8F cb = default;
             Block8x8F cr = default;
 
-            this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+            this.converter.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr);
         }
 
         [Benchmark]
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index 9a6fc8d6f..c605a6cf8 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -32,7 +32,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             Block8x8F cb = default;
             Block8x8F cr = default;
 
-            target.Convert(data.AsSpan(), ref y, ref cb, ref cr);
+            target.Convert444(data.AsSpan(), ref y, ref cb, ref cr);
 
             Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
         }

From da1b85bee38b4e4ceded1c57d25ac13a2a0e8f22 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Fri, 4 Jun 2021 18:04:58 +0300
Subject: [PATCH 73/99] Final cleanup of the non-simd 420 rgb -> ycbcr
 conversion code

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 62 +++++++++----------
 1 file changed, 28 insertions(+), 34 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 3706b8062..7681063ee 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -200,45 +200,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
                 for (int j = 0; j < 2; j++)
                 {
-                    // left
+                    int yBlockWriteOffset = (i + j) * 8;
                     ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, (i + j) * 16);
-                    for (int k = 0; k < 8; k += 2)
-                    {
-                        ref float yBlockRef = ref Unsafe.Add(ref yBlockLeftRef, (i + j) * 8 + k);
-
-                        Rgb24 px0 = Unsafe.Add(ref stride, k);
-                        Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
-
-                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
-                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
 
-                        int idx = 3 * (k / 2);
-                        rgbTriplets[idx] += px0.R + px1.R;
-                        rgbTriplets[idx + 1] += px0.G + px1.G;
-                        rgbTriplets[idx + 2] += px0.B + px1.B;
-                    }
+                    // left
+                    this.ConvertChunk420(ref stride, ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset), rgbTriplets);
 
                     // right
-                    stride = ref Unsafe.Add(ref stride, 8);
-                    for (int k = 0; k < 8; k += 2)
-                    {
-                        ref float yBlockRef = ref Unsafe.Add(ref yBlockRightRef, (i + j) * 8 + k);
-
-                        Rgb24 px0 = Unsafe.Add(ref stride, k);
-                        Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
-
-                        this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
-                        this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
-
-                        int idx = 3 * (4 + (k / 2));
-                        rgbTriplets[idx] += px0.R + px1.R;
-                        rgbTriplets[idx + 1] += px0.G + px1.G;
-                        rgbTriplets[idx + 2] += px0.B + px1.B;
-
-                    }
+                    this.ConvertChunk420(ref Unsafe.Add(ref stride, 8), ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset), rgbTriplets.Slice(12));
                 }
 
                 int writeIdx = 8 * (i / 2);
+                ref float cbWriteRef = ref Unsafe.Add(ref cbBlockRef, writeIdx);
+                ref float crWriteRef = ref Unsafe.Add(ref crBlockRef, writeIdx);
                 for (int j = 0; j < 8; j++)
                 {
                     int idx = j * 3;
@@ -246,12 +220,32 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                         rgbTriplets[idx] / 4,       // r
                         rgbTriplets[idx + 1] / 4,   // g
                         rgbTriplets[idx + 2] / 4,   // b
-                        ref Unsafe.Add(ref cbBlockRef, writeIdx + j),
-                        ref Unsafe.Add(ref crBlockRef, writeIdx + j));
+                        ref Unsafe.Add(ref cbWriteRef, j),
+                        ref Unsafe.Add(ref crWriteRef, j));
                 }
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, Span<int> chromaRgbTriplet)
+        {
+            for (int k = 0; k < 8; k += 2)
+            {
+                ref float yBlockRef = ref Unsafe.Add(ref yBlock, k);
+
+                Rgb24 px0 = Unsafe.Add(ref stride, k);
+                Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+
+                this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
+                this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
+
+                int idx = 3 * (k / 2);
+                chromaRgbTriplet[idx] += px0.R + px1.R;
+                chromaRgbTriplet[idx + 1] += px0.G + px1.G;
+                chromaRgbTriplet[idx + 2] += px0.B + px1.B;
+            }
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int Fix(float x)
             => (int)((x * (1L << ScaleBits)) + 0.5F);

From 7135fc70963dd4c291375db79bd43fd8fb625f61 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 5 Jun 2021 03:08:13 +0300
Subject: [PATCH 74/99] Renamed MinimumBitsToStore to MinimumBitsToStore16 as
 it only works with values of up to 16 bits

---
 src/ImageSharp/Common/Helpers/Numerics.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 6bf06150b..ef457f7ce 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -854,7 +854,7 @@ namespace SixLabors.ImageSharp
         /// <param name="number">Unsigned integer to store</param>
         /// <returns>Minimum number of bits needed to store given value</returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static int MinimumBitsToStore(uint number)
+        public static int MinimumBitsToStore16(uint number)
         {
 #if !SUPPORTS_BITOPERATIONS
             if (number < 0x100)

From 743e34c489d68543f60935484aa0e7f1a847e0cd Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 5 Jun 2021 03:49:14 +0300
Subject: [PATCH 75/99] Fixed stream flush for jpeg encoder

---
 .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 218b2b59c..fdeecc9d8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -381,7 +381,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             int padBitsCount = 8 - (this.bitCount % 8);
             if (padBitsCount != 0)
             {
-                this.Emit(0xff, padBitsCount);
+                this.Emit((1 << padBitsCount) - 1, padBitsCount);
             }
 
             // flush remaining bytes

From 01f44a839ed0a3f3ec5362f0f661a80611ed6ea1 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 5 Jun 2021 20:05:50 +0300
Subject: [PATCH 76/99] Renamed vectorized rgb -> ycbcr converter for 444
 subsampling

---
 .../Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs  | 2 +-
 .../Components/Encoder/YCbCrForwardConverter444{TPixel}.cs    | 2 +-
 .../Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs | 2 +-
 .../ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs  | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index b9f0fa427..05a1b111f 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -63,7 +63,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>
         /// <param name="cbBlock">8x8 destination matrix of Chrominance(Cb) converted data</param>
         /// <param name="crBlock">8x8 destination matrix of Chrominance(Cr) converted data</param>
-        public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        public static void Convert444(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index f3ae33934..0b7438725 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -124,7 +124,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                RgbToYCbCrConverterVectorized.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+                RgbToYCbCrConverterVectorized.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
             else
             {
diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
index 60a585384..9aafb6936 100644
--- a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
+++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
@@ -49,7 +49,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+                RgbToYCbCrConverterVectorized.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr);
             }
         }
     }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index c605a6cf8..5f9d3f26d 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 using System;
@@ -52,7 +52,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             Block8x8F cb = default;
             Block8x8F cr = default;
 
-            RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr);
+            RgbToYCbCrConverterVectorized.Convert444(data.AsSpan(), ref y, ref cb, ref cr);
 
             Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
         }

From fcf202a913a3c623c877363cb4144a5b050dd15f Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sat, 5 Jun 2021 23:00:19 +0300
Subject: [PATCH 77/99] Added tests for 420 rgb -> ycbcr subsampling

---
 .../Formats/Jpg/RgbToYCbCrConverterTests.cs   | 165 ++++++++++++++++--
 1 file changed, 152 insertions(+), 13 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index 5f9d3f26d..fcc570c15 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -23,9 +23,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         private ITestOutputHelper Output { get; }
 
         [Fact]
-        public void TestLutConverter()
+        public void TestConverterLut444()
         {
-            Rgb24[] data = CreateTestData();
+            int dataSize = 8 * 8;
+            Rgb24[] data = CreateTestData(dataSize);
             var target = RgbToYCbCrConverterLut.Create();
 
             Block8x8F y = default;
@@ -34,11 +35,11 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             target.Convert444(data.AsSpan(), ref y, ref cb, ref cr);
 
-            Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
+            Verify444(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
         }
 
         [Fact]
-        public void TestVectorizedConverter()
+        public void TestConverterVectorized444()
         {
             if (!RgbToYCbCrConverterVectorized.IsSupported)
             {
@@ -46,7 +47,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 return;
             }
 
-            Rgb24[] data = CreateTestData();
+            int dataSize = 8 * 8;
+            Rgb24[] data = CreateTestData(dataSize);
 
             Block8x8F y = default;
             Block8x8F cb = default;
@@ -54,10 +56,141 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             RgbToYCbCrConverterVectorized.Convert444(data.AsSpan(), ref y, ref cb, ref cr);
 
-            Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
+            Verify444(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
         }
 
-        private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer)
+        [Fact]
+        public void TestConverterLut420()
+        {
+            int dataSize = 16 * 16;
+            Span<Rgb24> data = CreateTestData(dataSize).AsSpan();
+            var target = RgbToYCbCrConverterLut.Create();
+
+            var yBlocks = new Block8x8F[4];
+            var cb = default(Block8x8F);
+            var cr = default(Block8x8F);
+
+            target.Convert420(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
+            target.Convert420(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
+
+            Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F));
+        }
+
+        [Fact]
+        public void TestConverterVectorized420()
+        {
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                this.Output.WriteLine("No AVX and/or FMA present, skipping test!");
+                return;
+            }
+
+            int dataSize = 16 * 16;
+            Span<Rgb24> data = CreateTestData(dataSize).AsSpan();
+
+            var yBlocks = new Block8x8F[4];
+            var cb = default(Block8x8F);
+            var cr = default(Block8x8F);
+
+            RgbToYCbCrConverterVectorized.Convert420_16x8(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
+            RgbToYCbCrConverterVectorized.Convert420_16x8(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
+
+            Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F));
+        }
+
+
+        private static void Verify444(
+            ReadOnlySpan<Rgb24> data,
+            ref Block8x8F yResult,
+            ref Block8x8F cbResult,
+            ref Block8x8F crResult,
+            ApproximateColorSpaceComparer comparer)
+        {
+            Block8x8F y = default;
+            Block8x8F cb = default;
+            Block8x8F cr = default;
+
+            RgbToYCbCr(data, ref y, ref cb, ref cr);
+
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                Assert.True(comparer.Equals(new YCbCr(y[i], cb[i], cr[i]), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y[i]} == {yResult[i]}, {cb[i]} == {cbResult[i]}, {cr[i]} == {crResult[i]}");
+            }
+        }
+
+        private static void Verify420(
+            ReadOnlySpan<Rgb24> data,
+            Block8x8F[] yResult,
+            ref Block8x8F cbResult,
+            ref Block8x8F crResult,
+            ApproximateFloatComparer comparer)
+        {
+            var tempBlock = default(Block8x8F);
+            var cbTrue = new Block8x8F[4];
+            var crTrue = new Block8x8F[4];
+
+            Span<Rgb24> tempData = new Rgb24[8 * 8].AsSpan();
+
+            // top left
+            Copy8x8(data, tempData);
+            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[0], ref crTrue[0]);
+            VerifyBlock(ref yResult[0], ref tempBlock, comparer);
+
+            // top right
+            Copy8x8(data.Slice(8), tempData);
+            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[1], ref crTrue[1]);
+            VerifyBlock(ref yResult[1], ref tempBlock, comparer);
+
+            // bottom left
+            Copy8x8(data.Slice(8 * 16), tempData);
+            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[2], ref crTrue[2]);
+            VerifyBlock(ref yResult[2], ref tempBlock, comparer);
+
+            // bottom right
+            Copy8x8(data.Slice((8 * 16) + 8), tempData);
+            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[3], ref crTrue[3]);
+            VerifyBlock(ref yResult[3], ref tempBlock, comparer);
+
+            // verify Cb
+            Scale16X16To8X8(ref tempBlock, cbTrue);
+            VerifyBlock(ref cbResult, ref tempBlock, comparer);
+
+            // verify Cr
+            Scale16X16To8X8(ref tempBlock, crTrue);
+            VerifyBlock(ref crResult, ref tempBlock, comparer);
+
+
+            // extracts 8x8 blocks from 16x8 memory region
+            static void Copy8x8(ReadOnlySpan<Rgb24> source, Span<Rgb24> dest)
+            {
+                for (int i = 0; i < 8; i++)
+                {
+                    source.Slice(i * 16, 8).CopyTo(dest.Slice(i * 8));
+                }
+            }
+
+            // scales 16x16 to 8x8, used in chroma subsampling tests
+            static void Scale16X16To8X8(ref Block8x8F dest, ReadOnlySpan<Block8x8F> source)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    int dstOff = ((i & 2) << 4) | ((i & 1) << 2);
+                    Block8x8F iSource = source[i];
+
+                    for (int y = 0; y < 4; y++)
+                    {
+                        for (int x = 0; x < 4; x++)
+                        {
+                            int j = (16 * y) + (2 * x);
+                            float sum = iSource[j] + iSource[j + 1] + iSource[j + 8] + iSource[j + 9];
+                            dest[(8 * y) + x + dstOff] = (sum + 2) * .25F;
+                        }
+                    }
+                }
+            }
+        }
+
+        private static void RgbToYCbCr(ReadOnlySpan<Rgb24> data, ref Block8x8F y, ref Block8x8F cb, ref Block8x8F cr)
         {
             for (int i = 0; i < data.Length; i++)
             {
@@ -65,17 +198,23 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 int g = data[i].G;
                 int b = data[i].B;
 
-                float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
-                float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
-                float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
+                y[i] = (0.299F * r) + (0.587F * g) + (0.114F * b);
+                cb[i] = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
+                cr[i] = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
+            }
+        }
 
-                Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}");
+        private static void VerifyBlock(ref Block8x8F res, ref Block8x8F target, ApproximateFloatComparer comparer)
+        {
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                Assert.True(comparer.Equals(res[i], target[i]), $"Pos {i}, Expected {target[i]} == {res[i]}");
             }
         }
 
-        private static Rgb24[] CreateTestData()
+        private static Rgb24[] CreateTestData(int size)
         {
-            var data = new Rgb24[64];
+            var data = new Rgb24[size];
             var r = new Random();
 
             var random = new byte[3];

From ad333f6598c92a2b9faf6d02637b21880a0eb0d3 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 6 Jun 2021 15:39:12 +0300
Subject: [PATCH 78/99] Simplified Lut implementation

---
 .../Encoder/RgbToYCbCrConverterLut.cs          | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 7681063ee..b301e8320 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -229,20 +229,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, Span<int> chromaRgbTriplet)
         {
-            for (int k = 0; k < 8; k += 2)
+            for (int i = 0; i < 8; i++)
             {
-                ref float yBlockRef = ref Unsafe.Add(ref yBlock, k);
+                Rgb24 px0 = Unsafe.Add(ref stride, i);
 
-                Rgb24 px0 = Unsafe.Add(ref stride, k);
-                Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+                this.ConvertPixelInto(px0.R, px0.G, px0.B, ref Unsafe.Add(ref yBlock, i));
 
-                this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
-                this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
-
-                int idx = 3 * (k / 2);
-                chromaRgbTriplet[idx] += px0.R + px1.R;
-                chromaRgbTriplet[idx + 1] += px0.G + px1.G;
-                chromaRgbTriplet[idx + 2] += px0.B + px1.B;
+                int idx = 3 * (i / 2);
+                chromaRgbTriplet[idx] += px0.R;
+                chromaRgbTriplet[idx + 1] += px0.G;
+                chromaRgbTriplet[idx + 2] += px0.B;
             }
         }
 

From 0e053f0d6a62d621bd5d24b4685c19340815a4b5 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 04:34:37 +0300
Subject: [PATCH 79/99] Optimized 420 converter with higher precision

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 143 ++++++++++--------
 1 file changed, 78 insertions(+), 65 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index b301e8320..e1dcad1b6 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -92,6 +92,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             return tables;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateY(byte r, byte g, byte b)
+        {
+            return (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateCb(byte r, byte g, byte b)
+        {
+            return (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateCr(byte r, byte g, byte b)
+        {
+            return (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
+        }
+
+
         /// <summary>
         /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
         /// </summary>
@@ -115,33 +134,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void ConvertPixelInto(
-            int r,
-            int g,
-            int b,
-            ref Block8x8F yResult,
-            int i)
-        {
-            // float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
-            yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void ConvertPixelInto(int r, int g, int b, ref float yResult) =>
-            // float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
-            yResult = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void ConvertPixelInto(int r, int g, int b, ref float cbResult, ref float crResult)
-        {
-            // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
-            cbResult = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
-
-            // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
-            crResult = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
-        }
-
         /// <summary>
         /// Converts Rgb24 pixels into YCbCr color space with 4:4:4 subsampling sampling of luminance and chroma.
         /// </summary>
@@ -187,7 +179,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             // 0-31 or 32-63
             // upper or lower part
-            int chromaWriteOffset = row * Block8x8F.Size / 2;
+            int chromaWriteOffset = row * (Block8x8F.Size / 2);
             ref float cbBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref cbBlock), chromaWriteOffset);
             ref float crBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref crBlock), chromaWriteOffset);
 
@@ -195,51 +187,72 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             for (int i = 0; i < 8; i += 2)
             {
-                // 8 pixels by 3 integers
-                Span<int> rgbTriplets = stackalloc int[24];
-
-                for (int j = 0; j < 2; j++)
-                {
-                    int yBlockWriteOffset = (i + j) * 8;
-                    ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, (i + j) * 16);
-
-                    // left
-                    this.ConvertChunk420(ref stride, ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset), rgbTriplets);
-
-                    // right
-                    this.ConvertChunk420(ref Unsafe.Add(ref stride, 8), ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset), rgbTriplets.Slice(12));
-                }
-
-                int writeIdx = 8 * (i / 2);
-                ref float cbWriteRef = ref Unsafe.Add(ref cbBlockRef, writeIdx);
-                ref float crWriteRef = ref Unsafe.Add(ref crBlockRef, writeIdx);
-                for (int j = 0; j < 8; j++)
-                {
-                    int idx = j * 3;
-                    this.ConvertPixelInto(
-                        rgbTriplets[idx] / 4,       // r
-                        rgbTriplets[idx + 1] / 4,   // g
-                        rgbTriplets[idx + 2] / 4,   // b
-                        ref Unsafe.Add(ref cbWriteRef, j),
-                        ref Unsafe.Add(ref crWriteRef, j));
-                }
+                int yBlockWriteOffset = i * 8;
+                ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, i * 16);
+
+                int chromaOffset = 8 * (i / 2);
+
+                // left
+                this.ConvertChunk420(
+                    ref stride,
+                    ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset),
+                    ref Unsafe.Add(ref cbBlockRef, chromaOffset),
+                    ref Unsafe.Add(ref crBlockRef, chromaOffset));
+
+                // right
+                this.ConvertChunk420(
+                    ref Unsafe.Add(ref stride, 8),
+                    ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset),
+                    ref Unsafe.Add(ref cbBlockRef, chromaOffset + 4),
+                    ref Unsafe.Add(ref crBlockRef, chromaOffset + 4));
             }
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, Span<int> chromaRgbTriplet)
+        private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, ref float cbBlock, ref float crBlock)
         {
-            for (int i = 0; i < 8; i++)
+            // jpeg 8x8 blocks are processed as 16x16 blocks with 16x8 subpasses (this is done for performance reasons)
+            // each row is 16 pixels wide thus +16 stride reference offset
+            // resulting luminance (Y`) are sampled at original resolution thus +8 reference offset
+            for (int k = 0; k < 8; k += 2)
             {
-                Rgb24 px0 = Unsafe.Add(ref stride, i);
+                ref float yBlockRef = ref Unsafe.Add(ref yBlock, k);
+
+                // top row
+                Rgb24 px0 = Unsafe.Add(ref stride, k);
+                Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+                this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
+                this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
+
+                // bottom row
+                Rgb24 px2 = Unsafe.Add(ref stride, k + 16);
+                Rgb24 px3 = Unsafe.Add(ref stride, k + 17);
+                this.ConvertPixelInto(px2.R, px2.G, px2.B, ref Unsafe.Add(ref yBlockRef, 8));
+                this.ConvertPixelInto(px3.R, px3.G, px3.B, ref Unsafe.Add(ref yBlockRef, 9));
+
+                Unsafe.Add(ref cbBlock, k / 2) = this.CalculateAverageCb(px0, px1, px2, px3);
+                Unsafe.Add(ref crBlock, k / 2) = this.CalculateAverageCr(px0, px1, px2, px3);
+            }
+        }
 
-                this.ConvertPixelInto(px0.R, px0.G, px0.B, ref Unsafe.Add(ref yBlock, i));
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateAverageCb(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3)
+        {
+            return 0.25f
+                * (this.CalculateCb(px0.R, px0.G, px0.B)
+                + this.CalculateCb(px1.R, px1.G, px1.B)
+                + this.CalculateCb(px2.R, px2.G, px2.B)
+                + this.CalculateCb(px3.R, px3.G, px3.B));
+        }
 
-                int idx = 3 * (i / 2);
-                chromaRgbTriplet[idx] += px0.R;
-                chromaRgbTriplet[idx + 1] += px0.G;
-                chromaRgbTriplet[idx + 2] += px0.B;
-            }
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateAverageCr(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3)
+        {
+            return 0.25f
+                * (this.CalculateCr(px0.R, px0.G, px0.B)
+                + this.CalculateCr(px1.R, px1.G, px1.B)
+                + this.CalculateCr(px2.R, px2.G, px2.B)
+                + this.CalculateCr(px3.R, px3.G, px3.B));
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]

From 2d54226caef366fe6c7c1e210d47cb70c4bf771c Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 05:26:28 +0300
Subject: [PATCH 80/99] Both converters code cleanup

---
 .../Encoder/RgbToYCbCrConverterLut.cs         | 51 +++++--------------
 1 file changed, 13 insertions(+), 38 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index e1dcad1b6..15574a32a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -95,43 +95,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private float CalculateY(byte r, byte g, byte b)
         {
+            // float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
             return (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private float CalculateCb(byte r, byte g, byte b)
         {
+            // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
             return (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private float CalculateCr(byte r, byte g, byte b)
         {
-            return (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
-        }
-
-
-        /// <summary>
-        /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void ConvertPixelInto(
-            int r,
-            int g,
-            int b,
-            ref Block8x8F yResult,
-            ref Block8x8F cbResult,
-            ref Block8x8F crResult,
-            int i)
-        {
-            // float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
-            yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
-
-            // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
-            cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
-
             // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
-            crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
+            return (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
         /// <summary>
@@ -147,16 +126,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             for (int i = 0; i < Block8x8F.Size; i++)
             {
-                ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
-
-                this.ConvertPixelInto(
-                    c.R,
-                    c.G,
-                    c.B,
-                    ref yBlock,
-                    ref cbBlock,
-                    ref crBlock,
-                    i);
+                Rgb24 c = Unsafe.Add(ref rgbStart, i);
+
+                yBlock[i] = this.CalculateY(c.R, c.G, c.B);
+                cbBlock[i] = this.CalculateCb(c.R, c.G, c.B);
+                crBlock[i] = this.CalculateCr(c.R, c.G, c.B);
             }
         }
 
@@ -221,15 +195,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                 // top row
                 Rgb24 px0 = Unsafe.Add(ref stride, k);
                 Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
-                this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
-                this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
+                yBlockRef = this.CalculateY(px0.R, px0.G, px0.B);
+                Unsafe.Add(ref yBlockRef, 1) = this.CalculateY(px1.R, px1.G, px1.B);
 
                 // bottom row
                 Rgb24 px2 = Unsafe.Add(ref stride, k + 16);
                 Rgb24 px3 = Unsafe.Add(ref stride, k + 17);
-                this.ConvertPixelInto(px2.R, px2.G, px2.B, ref Unsafe.Add(ref yBlockRef, 8));
-                this.ConvertPixelInto(px3.R, px3.G, px3.B, ref Unsafe.Add(ref yBlockRef, 9));
+                Unsafe.Add(ref yBlockRef, 8) = this.CalculateY(px2.R, px2.G, px2.B);
+                Unsafe.Add(ref yBlockRef, 9) = this.CalculateY(px3.R, px3.G, px3.B);
 
+                // chroma average for 2x2 pixel block
                 Unsafe.Add(ref cbBlock, k / 2) = this.CalculateAverageCb(px0, px1, px2, px3);
                 Unsafe.Add(ref crBlock, k / 2) = this.CalculateAverageCr(px0, px1, px2, px3);
             }

From 2949145981a454ebce528cdd7dd56f70987adce1 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 05:27:02 +0300
Subject: [PATCH 81/99] Fixed failing tests output

---
 tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index fcc570c15..9ec1bf603 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -208,7 +208,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
         {
             for (int i = 0; i < Block8x8F.Size; i++)
             {
-                Assert.True(comparer.Equals(res[i], target[i]), $"Pos {i}, Expected {target[i]} == {res[i]}");
+                Assert.True(comparer.Equals(res[i], target[i]), $"Pos {i}, Expected: {target[i]}, Got: {res[i]}");
             }
         }
 

From 8f79eb93c2442da2e9c8331f9267997df8c79316 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 07:22:31 +0300
Subject: [PATCH 82/99] Converters tests/code cleanup, added comments for
 padding property

---
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 155 +++---------------
 .../YCbCrForwardConverter420{TPixel}.cs       |   4 +-
 .../YCbCrForwardConverter444{TPixel}.cs       |   2 +-
 .../Formats/Jpg/RgbToYCbCrConverterTests.cs   |  30 ++--
 4 files changed, 39 insertions(+), 152 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 05a1b111f..49b974404 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -27,15 +27,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
         }
 
-        public static int AvxRegisterRgbCompatibilityPadding
+        public static int AvxCompatibilityPadding
         {
+            // rgb byte matrices contain 8 strides by 8 pixels each, thus 64 pixels total
+            // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes
+            // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
+            // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride:
+            // stride 0    0    - 192  -(+64bits)-> 256
+            // stride 1    192  - 384  -(+64bits)-> 448
+            // stride 2    384  - 576  -(+64bits)-> 640
+            // stride 3    576  - 768  -(+64bits)-> 832
+            // stride 4    768  - 960  -(+64bits)-> 1024
+            // stride 5    960  - 1152 -(+64bits)-> 1216
+            // stride 6    1152 - 1344 -(+64bits)-> 1408
+            // stride 7    1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
+            //
+            // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
+            // This is not permitted - we are reading foreign memory
+            //
+            // 8 byte padding to rgb byte span will solve this problem without extra code in converters
             get
             {
+#if SUPPORTS_RUNTIME_INTRINSICS
                 if (IsSupported)
                 {
                     return 8;
                 }
-
+#endif
                 return 0;
             }
         }
@@ -89,26 +107,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             Vector256<byte> rgb, rg, bx;
             Vector256<float> r, g, b;
 
-            // TODO: probably remove this after the draft
-            // rgbByteSpan contains 8 strides by 8 pixels each, thus 64 pixels total
-            // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes
-            // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
-            // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride:
-            // stride 0    0    - 192  -(+64bits)-> 256
-            // stride 1    192  - 384  -(+64bits)-> 448
-            // stride 2    384  - 576  -(+64bits)-> 640
-            // stride 3    576  - 768  -(+64bits)-> 832
-            // stride 4    768  - 960  -(+64bits)-> 1024
-            // stride 5    960  - 1152 -(+64bits)-> 1216
-            // stride 6    1152 - 1344 -(+64bits)-> 1408
-            // stride 7    1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
-            //
-            // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
-            // This is not permitted - we are reading foreign memory
-            // That's why last stride is calculated outside of the for-loop loop with special extract shuffle mask involved
-            //
-            // Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need:
-            // Total pixel data size is 192 bytes, avx registers need it to be 200 bytes
             const int bytesPerRgbStride = 24;
             for (int i = 0; i < 8; i++)
             {
@@ -135,91 +133,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
         }
 
-        /// <summary>
-        /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling
-        /// </summary>
-        /// <remarks>Total size of rgb span must be 200 bytes</remarks>
-        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx)
-        {
-            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-            var f0299 = Vector256.Create(0.299f);
-            var f0587 = Vector256.Create(0.587f);
-            var f0114 = Vector256.Create(0.114f);
-            var fn0168736 = Vector256.Create(-0.168736f);
-            var fn0331264 = Vector256.Create(-0.331264f);
-            var f128 = Vector256.Create(128f);
-            var fn0418688 = Vector256.Create(-0.418688f);
-            var fn0081312F = Vector256.Create(-0.081312F);
-            var f05 = Vector256.Create(0.5f);
-            var zero = Vector256.Create(0).AsByte();
-
-            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
-            ref Vector256<float> destYRef = ref yBlock.V0;
-
-            int destOffset = (idx & 2) * 4 + (idx & 1);
-
-            ref Vector128<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref cbBlock), destOffset);
-            ref Vector128<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref crBlock), destOffset);
-
-            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
-            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
-            Vector256<byte> rgb, rg, bx;
-            Vector256<float> r, g, b;
-
-            Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
-            Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
-            Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
-
-            const int bytesPerRgbStride = 24;
-            for (int i = 0; i < 2; i++)
-            {
-                // each 4 lanes - [0, 1, 2, 3] & [4, 5, 6, 7]
-                for (int j = 0; j < 4; j++)
-                {
-                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
-
-                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
-
-                    rg = Avx2.UnpackLow(rgb, zero);
-                    bx = Avx2.UnpackHigh(rgb, zero);
-
-                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
-                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
-                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
-
-                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
-                    Unsafe.Add(ref destYRef, i * 4 + j) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
-
-                    rDataLanes[j] = r;
-                    gDataLanes[j] = g;
-                    bDataLanes[j] = b;
-                }
-
-                int localDestOffset = (i & 1) * 4;
-
-                r = Scale_8x4_4x2(rDataLanes);
-                g = Scale_8x4_4x2(gDataLanes);
-                b = Scale_8x4_4x2(bDataLanes);
-
-                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
-                Vector256<float> cb = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
-                Unsafe.Add(ref destCbRef, localDestOffset) = cb.GetLower();
-                Unsafe.Add(ref destCbRef, localDestOffset + 2) = cb.GetUpper();
-
-                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
-                Vector256<float> cr = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
-                Unsafe.Add(ref destCrRef, localDestOffset) = cr.GetLower();
-                Unsafe.Add(ref destCrRef, localDestOffset + 2) = cr.GetUpper();
-            }
-#endif
-        }
-
         /// <summary>
         /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
         /// </summary>
-        public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
 
@@ -337,36 +254,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector256<float> SumVerticalPairs(Vector256<float> v0, Vector256<float> v1)
             => Avx.Add(Avx.Shuffle(v0, v1, 0b01_00_01_00), Avx.Shuffle(v0, v1, 0b11_10_11_10));
-
-        public static void ConvertCbCr(ref Block8x8F rBlock, ref Block8x8F gBlock, ref Block8x8F bBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
-        {
-            var fn0168736 = Vector256.Create(-0.168736f);
-            var fn0331264 = Vector256.Create(-0.331264f);
-            var f128 = Vector256.Create(128f);
-            var fn0418688 = Vector256.Create(-0.418688f);
-            var fn0081312F = Vector256.Create(-0.081312F);
-            var f05 = Vector256.Create(0.5f);
-
-            ref Vector256<float> destCbRef = ref cbBlock.V0;
-            ref Vector256<float> destCrRef = ref crBlock.V0;
-
-            ref Vector256<float> rRef = ref rBlock.V0;
-            ref Vector256<float> gRef = ref gBlock.V0;
-            ref Vector256<float> bRef = ref bBlock.V0;
-
-            for (int i = 0; i < 8; i++)
-            {
-                ref Vector256<float> r = ref Unsafe.Add(ref rRef, i);
-                ref Vector256<float> g = ref Unsafe.Add(ref gRef, i);
-                ref Vector256<float> b = ref Unsafe.Add(ref bRef, i);
-
-                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
-                Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
-
-                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
-                Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
-            }
-        }
 #endif
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
index e0e7854b0..9288acc7e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             // temporal pixel buffers
             this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
-            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
 
             // frame data
             this.samplingAreaSize = new Size(frame.Width, frame.Height);
@@ -102,7 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
+                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
             }
             else
             {
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index 0b7438725..d611aaf9e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             // temporal pixel buffers
             this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
-            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
 
             // frame data
             this.samplingAreaSize = new Size(frame.Width, frame.Height);
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index 9ec1bf603..d95191ffe 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -92,8 +92,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             var cb = default(Block8x8F);
             var cr = default(Block8x8F);
 
-            RgbToYCbCrConverterVectorized.Convert420_16x8(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
-            RgbToYCbCrConverterVectorized.Convert420_16x8(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
+            RgbToYCbCrConverterVectorized.Convert420(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
+            RgbToYCbCrConverterVectorized.Convert420(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
 
             Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F));
         }
@@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             ref Block8x8F crResult,
             ApproximateFloatComparer comparer)
         {
-            var tempBlock = default(Block8x8F);
+            var trueBlock = default(Block8x8F);
             var cbTrue = new Block8x8F[4];
             var crTrue = new Block8x8F[4];
 
@@ -133,31 +133,31 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             // top left
             Copy8x8(data, tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[0], ref crTrue[0]);
-            VerifyBlock(ref yResult[0], ref tempBlock, comparer);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[0], ref crTrue[0]);
+            VerifyBlock(ref yResult[0], ref trueBlock, comparer);
 
             // top right
             Copy8x8(data.Slice(8), tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[1], ref crTrue[1]);
-            VerifyBlock(ref yResult[1], ref tempBlock, comparer);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[1], ref crTrue[1]);
+            VerifyBlock(ref yResult[1], ref trueBlock, comparer);
 
             // bottom left
             Copy8x8(data.Slice(8 * 16), tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[2], ref crTrue[2]);
-            VerifyBlock(ref yResult[2], ref tempBlock, comparer);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[2], ref crTrue[2]);
+            VerifyBlock(ref yResult[2], ref trueBlock, comparer);
 
             // bottom right
             Copy8x8(data.Slice((8 * 16) + 8), tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[3], ref crTrue[3]);
-            VerifyBlock(ref yResult[3], ref tempBlock, comparer);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[3], ref crTrue[3]);
+            VerifyBlock(ref yResult[3], ref trueBlock, comparer);
 
             // verify Cb
-            Scale16X16To8X8(ref tempBlock, cbTrue);
-            VerifyBlock(ref cbResult, ref tempBlock, comparer);
+            Scale16X16To8X8(ref trueBlock, cbTrue);
+            VerifyBlock(ref cbResult, ref trueBlock, comparer);
 
             // verify Cr
-            Scale16X16To8X8(ref tempBlock, crTrue);
-            VerifyBlock(ref crResult, ref tempBlock, comparer);
+            Scale16X16To8X8(ref trueBlock, crTrue);
+            VerifyBlock(ref crResult, ref trueBlock, comparer);
 
 
             // extracts 8x8 blocks from 16x8 memory region

From b1a21269a0d5bdfaf4315559b0803a8f0cd2a15a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 07:34:02 +0300
Subject: [PATCH 83/99] Added docs

---
 .../Encoder/YCbCrForwardConverter420{TPixel}.cs    | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
index 9288acc7e..987ca6463 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -16,13 +16,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
     internal ref struct YCbCrForwardConverter420<TPixel>
         where TPixel : unmanaged, IPixel<TPixel>
     {
-        // TODO: docs
+        /// <summary>
+        /// Number of pixels processed per single <see cref="Convert(int, int, ref RowOctet{TPixel}, int)"/> call
+        /// </summary>
         private const int PixelsPerSample = 16 * 8;
 
-        // TODO: docs
-        private static int RgbSpanByteSize = PixelsPerSample * 3;
+        /// <summary>
+        /// Total byte size of processed pixels converted from TPixel to <see cref="Rgb24"/>
+        /// </summary>
+        private const int RgbSpanByteSize = PixelsPerSample * 3;
 
-        // TODO: docs
+        /// <summary>
+        /// <see cref="Size"/> of sampling area from given frame pixel buffer
+        /// </summary>
         private static readonly Size SampleSize = new Size(16, 8);
 
         /// <summary>

From 2edb1a8bb96627a57f23588ab564dd04432c4c53 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 07:39:44 +0300
Subject: [PATCH 84/99] Removed obsolete code

---
 .../YCbCrForwardConverter444{TPixel}.cs       | 21 -------------------
 1 file changed, 21 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index d611aaf9e..91e56cab2 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -88,27 +88,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
         }
 
-        public static YCbCrForwardConverter444<TPixel> Create()
-        {
-            var result = default(YCbCrForwardConverter444<TPixel>);
-
-            // creating rgb pixel bufferr
-            // TODO: this is subject to discuss
-            // converter.Convert comments for +8 padding
-            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + 8].AsSpan());
-
-            // TODO: this is subject to discuss
-            result.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
-
-            // Avoid creating lookup tables, when vectorized converter is supported
-            if (!RgbToYCbCrConverterVectorized.IsSupported)
-            {
-                result.colorTables = RgbToYCbCrConverterLut.Create();
-            }
-
-            return result;
-        }
-
         /// <summary>
         /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
         /// </summary>

From 0aecbd023d0003fb8fb7baf157ae3bc781a0e4f7 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 07:41:15 +0300
Subject: [PATCH 85/99] Removed unused usings

---
 .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs       | 2 --
 .../Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs | 1 -
 2 files changed, 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index fdeecc9d8..ca352397b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -1,9 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
-using System;
 using System.IO;
-using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Threading;
 using SixLabors.ImageSharp.Memory;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index 91e56cab2..1ef8246ff 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -2,7 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;

From a4222fd91cfb1b9b5597455860417dff68d76526 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 08:11:43 +0300
Subject: [PATCH 86/99] Added DCT tests

---
 .../Jpeg/Components/FastFloatingPointDCT.cs   |   2 +-
 .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs  | 207 +++++++++++++-----
 2 files changed, 159 insertions(+), 50 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index afcf4158b..ad2e290f6 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -203,7 +203,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// </summary>
         /// <param name="s">Source</param>
         /// <param name="d">Destination</param>
-        private static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 75ad5427c..99dce57c7 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -2,7 +2,7 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-
+using System.Runtime.Intrinsics.X86;
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
 
@@ -22,94 +22,160 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             {
             }
 
-            [Fact]
-            public void IDCT2D8x4_LeftPart()
+            // Reference tests
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            [InlineData(3)]
+            public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
             {
-                float[] sourceArray = Create8x8FloatData();
-                var expectedDestArray = new float[64];
+                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
 
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(sourceArray, expectedDestArray);
+                var source = Block8x8F.Load(sourceArray);
 
-                var source = default(Block8x8F);
-                source.LoadFrom(sourceArray);
+                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source);
 
-                var dest = default(Block8x8F);
+                var temp = default(Block8x8F);
+                var actual = default(Block8x8F);
+                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
 
-                FastFloatingPointDCT.IDCT8x4_LeftPart(ref source, ref dest);
+                this.CompareBlocks(expected, actual, 1f);
+            }
 
-                var actualDestArray = new float[64];
-                dest.ScaledCopyTo(actualDestArray);
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            [InlineData(3)]
+            public void LLM_TransformIDCT_CompareToAccurate(int seed)
+            {
+                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+
+                var source = Block8x8F.Load(sourceArray);
+
+                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source);
 
-                this.Print8x8Data(expectedDestArray);
-                this.Output.WriteLine("**************");
-                this.Print8x8Data(actualDestArray);
+                var temp = default(Block8x8F);
+                var actual = default(Block8x8F);
+                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
 
-                Assert.Equal(expectedDestArray, actualDestArray);
+                this.CompareBlocks(expected, actual, 1f);
             }
 
-            [Fact]
-            public void IDCT2D8x4_RightPart()
+
+            // Inverse transform
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void IDCT8x4_LeftPart(int seed)
             {
-                float[] sourceArray = Create8x8FloatData();
-                var expectedDestArray = new float[64];
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
+
+                var destBlock = default(Block8x8F);
+
+                var expectedDest = new float[64];
+
+                // reference
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
 
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(sourceArray.AsSpan(4), expectedDestArray.AsSpan(4));
+                // testee
+                FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock);
+
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
+
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+            }
+
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void IDCT8x4_RightPart(int seed)
+            {
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
 
-                var source = default(Block8x8F);
-                source.LoadFrom(sourceArray);
+                var destBlock = default(Block8x8F);
 
-                var dest = default(Block8x8F);
+                var expectedDest = new float[64];
 
-                FastFloatingPointDCT.IDCT8x4_RightPart(ref source, ref dest);
+                // reference
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
 
-                var actualDestArray = new float[64];
-                dest.ScaledCopyTo(actualDestArray);
+                // testee
+                FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock);
 
-                this.Print8x8Data(expectedDestArray);
-                this.Output.WriteLine("**************");
-                this.Print8x8Data(actualDestArray);
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
 
-                Assert.Equal(expectedDestArray, actualDestArray);
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
-            [InlineData(3)]
-            public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
+            public void IDCT8x8_Avx(int seed)
             {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+                if (!Avx.IsSupported)
+                {
+                    this.Output.WriteLine("No AVX present, skipping test!");
+                    return;
+                }
 
-                var source = Block8x8F.Load(sourceArray);
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
 
-                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source);
+                var destBlock = default(Block8x8F);
 
-                var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                var expectedDest = new float[64];
 
-                this.CompareBlocks(expected, actual, 1f);
+                // reference, left part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
+
+                // reference, right part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+
+                // testee, whole 8x8
+                FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock);
+
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
+
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
-            [InlineData(3)]
-            public void LLM_TransformIDCT_CompareToAccurate(int seed)
+            public void TransformIDCT(int seed)
             {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
 
-                var source = Block8x8F.Load(sourceArray);
+                var destBlock = default(Block8x8F);
 
-                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source);
+                var expectedDest = new float[64];
+                var temp1 = new float[64];
+                var temp2 = default(Block8x8F);
 
-                var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                // reference
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
 
-                this.CompareBlocks(expected, actual, 1f);
+                // testee
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2);
+
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
+
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
+
+            // Forward transform
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
@@ -123,7 +189,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 var expectedDest = new float[64];
 
+                // reference
                 ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
+
+                // testee
                 FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
 
                 var actualDest = new float[64];
@@ -145,7 +214,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 var expectedDest = new float[64];
 
+                // reference
                 ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+
+                // testee
                 FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
 
                 var actualDest = new float[64];
@@ -154,6 +226,40 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void FDCT8x8_Avx(int seed)
+            {
+                if (!Avx.IsSupported)
+                {
+                    this.Output.WriteLine("No AVX present, skipping test!");
+                    return;
+                }
+
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
+
+                var destBlock = default(Block8x8F);
+
+                var expectedDest = new float[64];
+
+                // reference, left part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
+
+                // reference, right part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+
+                // testee, whole 8x8
+                FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock);
+
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
+
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+            }
+
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
@@ -169,7 +275,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 var temp1 = new float[64];
                 var temp2 = default(Block8x8F);
 
+                // reference
                 ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
+
+                // testee
                 FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
 
                 var actualDest = new float[64];

From 8a61048a5c73ee5cc025fcefb8860168cad97c94 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 08:37:13 +0300
Subject: [PATCH 87/99] Fixed DCT tests

---
 tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 99dce57c7..fd5e5b005 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -2,7 +2,9 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+#if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
+#endif
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
 
@@ -118,7 +120,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             [InlineData(2)]
             public void IDCT8x8_Avx(int seed)
             {
-                if (!Avx.IsSupported)
+#if SUPPORTS_RUNTIME_INTRINSICS
+                var skip = !Avx.IsSupported;
+#else
+                var skip = true;
+#endif
+
+                if (skip)
                 {
                     this.Output.WriteLine("No AVX present, skipping test!");
                     return;
@@ -231,7 +239,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             [InlineData(2)]
             public void FDCT8x8_Avx(int seed)
             {
-                if (!Avx.IsSupported)
+#if SUPPORTS_RUNTIME_INTRINSICS
+                var skip = !Avx.IsSupported;
+#else
+                var skip = true;
+#endif
+                if (skip)
                 {
                     this.Output.WriteLine("No AVX present, skipping test!");
                     return;

From b9b853b5239cbe5ada16370b624cad7794a2067e Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 16:42:26 +0300
Subject: [PATCH 88/99] Added docs & stylecop fixes

---
 .../YCbCrForwardConverter420{TPixel}.cs       | 10 ++++----
 .../YCbCrForwardConverter444{TPixel}.cs       | 23 +++++++++++++------
 .../Jpeg/Components/FastFloatingPointDCT.cs   |  2 --
 .../Formats/Jpeg/JpegEncoderCore.cs           |  5 ----
 .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs  |  2 --
 5 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
index 987ca6463..a4abd532b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -2,7 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
@@ -66,13 +65,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private Span<Rgb24> rgbSpan;
 
-        // TODO: docs
+        /// <summary>
+        /// Sampled pixel buffer size
+        /// </summary>
         private Size samplingAreaSize;
 
-        // TODO: docs
+        /// <summary>
+        /// <see cref="Configuration"/> for internal operations
+        /// </summary>
         private Configuration config;
 
-
         public YCbCrForwardConverter420(ImageFrame<TPixel> frame)
         {
             // matrices would be filled during convert calls
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
index 1ef8246ff..ef589272b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -15,16 +15,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
     internal ref struct YCbCrForwardConverter444<TPixel>
         where TPixel : unmanaged, IPixel<TPixel>
     {
-        // TODO: docs
+        /// <summary>
+        /// Number of pixels processed per single <see cref="Convert(int, int, ref RowOctet{TPixel})"/> call
+        /// </summary>
         private const int PixelsPerSample = 8 * 8;
 
-        // TODO: docs
+        /// <summary>
+        /// Total byte size of processed pixels converted from TPixel to <see cref="Rgb24"/>
+        /// </summary>
         private const int RgbSpanByteSize = PixelsPerSample * 3;
 
-        // TODO: docs
+        /// <summary>
+        /// <see cref="Size"/> of sampling area from given frame pixel buffer
+        /// </summary>
         private static readonly Size SampleSize = new Size(8, 8);
 
-
         /// <summary>
         /// The Y component
         /// </summary>
@@ -55,11 +60,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private Span<Rgb24> rgbSpan;
 
-        // TODO: docs
+        /// <summary>
+        /// Sampled pixel buffer size
+        /// </summary>
         private Size samplingAreaSize;
 
-        // TODO: docs
-        private readonly Configuration config;
+        /// <summary>
+        /// <see cref="Configuration"/> for internal operations
+        /// </summary>
+        private Configuration config;
 
         public YCbCrForwardConverter444(ImageFrame<TPixel> frame)
         {
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index ad2e290f6..f31d07efc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -273,7 +273,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <summary>
         /// Apply floating point FDCT from src into dest
         /// </summary>
-        /// <remarks></remarks>
         /// <param name="src">Source</param>
         /// <param name="dest">Destination</param>
         /// <param name="temp">Temporary block provided by the caller for optimization</param>
@@ -467,7 +466,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
             Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
 
-
             Vector256<float> my2 = s.V2;
             Vector256<float> my6 = s.V6;
             mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index c68c0ffb0..6020e6196 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -5,14 +5,11 @@ using System;
 using System.Buffers.Binary;
 using System.IO;
 using System.Linq;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using System.Threading;
 using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder;
 using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
-using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.Metadata;
 using SixLabors.ImageSharp.Metadata.Profiles.Exif;
 using SixLabors.ImageSharp.Metadata.Profiles.Icc;
@@ -69,7 +66,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                 99, 99, 99, 99, 99, 99, 99, 99,
             };
 
-
         /// <summary>
         /// A scratch buffer to reduce allocations.
         /// </summary>
@@ -625,7 +621,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
         private void WriteStartOfScan<TPixel>(Image<TPixel> image, int componentCount, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
             Span<byte> componentId = stackalloc byte[]
             {
                 0x01,
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index fd5e5b005..606a5678b 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -63,7 +63,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 this.CompareBlocks(expected, actual, 1f);
             }
 
-
             // Inverse transform
             [Theory]
             [InlineData(1)]
@@ -182,7 +181,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
-
             // Forward transform
             [Theory]
             [InlineData(1)]

From 8d321a5dc205252b540a30ccbed49cabe14c6320 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 17:49:10 +0300
Subject: [PATCH 89/99] Added DCT tests paths for nosimd/avx/avx+fma

---
 .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs  | 90 +++++++++++++------
 1 file changed, 61 insertions(+), 29 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 606a5678b..d49a6498c 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -7,7 +7,7 @@ using System.Runtime.Intrinsics.X86;
 #endif
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
-
+using SixLabors.ImageSharp.Tests.TestUtilities;
 using Xunit;
 using Xunit.Abstractions;
 
@@ -159,26 +159,42 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             [InlineData(2)]
             public void TransformIDCT(int seed)
             {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
+                static void RunTest(string serialized)
+                {
+                    int seed = FeatureTestRunner.Deserialize<int>(serialized);
 
-                var destBlock = default(Block8x8F);
+                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    var srcBlock = default(Block8x8F);
+                    srcBlock.LoadFrom(src);
 
-                var expectedDest = new float[64];
-                var temp1 = new float[64];
-                var temp2 = default(Block8x8F);
+                    var destBlock = default(Block8x8F);
 
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
+                    var expectedDest = new float[64];
+                    var temp1 = new float[64];
+                    var temp2 = default(Block8x8F);
 
-                // testee
-                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2);
+                    // reference
+                    ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
 
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
+                    // testee
+                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2);
 
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+                    var actualDest = new float[64];
+                    destBlock.ScaledCopyTo(actualDest);
+
+                    Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+                }
+
+                // 3 paths:
+                // 1. AllowAll - call avx/fma implementation
+                // 2. DisableFMA - call avx implementation without fma acceleration
+                // 3. DisableAvx - call fallback code of Vector4 implementation
+                //
+                // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
+                FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                    RunTest,
+                    seed,
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
             }
 
             // Forward transform
@@ -276,26 +292,42 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             [InlineData(2)]
             public void TransformFDCT(int seed)
             {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
+                static void RunTest(string serialized)
+                {
+                    int seed = FeatureTestRunner.Deserialize<int>(serialized);
 
-                var destBlock = default(Block8x8F);
+                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    var srcBlock = default(Block8x8F);
+                    srcBlock.LoadFrom(src);
 
-                var expectedDest = new float[64];
-                var temp1 = new float[64];
-                var temp2 = default(Block8x8F);
+                    var destBlock = default(Block8x8F);
 
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
+                    var expectedDest = new float[64];
+                    var temp1 = new float[64];
+                    var temp2 = default(Block8x8F);
 
-                // testee
-                FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
+                    // reference
+                    ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
 
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
+                    // testee
+                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
 
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+                    var actualDest = new float[64];
+                    destBlock.ScaledCopyTo(actualDest);
+
+                    Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
+                }
+
+                // 3 paths:
+                // 1. AllowAll - call avx/fma implementation
+                // 2. DisableFMA - call avx implementation without fma acceleration
+                // 3. DisableAvx - call fallback code of Vector4 implementation
+                //
+                // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
+                FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                    RunTest,
+                    seed,
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
             }
         }
     }

From 0e07a8ed6187721125b6a490b4f48a3bb6081a1b Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Mon, 7 Jun 2021 18:40:12 +0300
Subject: [PATCH 90/99] Removed obsolete code

---
 .../Formats/Jpeg/Components/Block8x8F.cs      | 75 -------------------
 .../Block8x8F_Scale16X16To8X8.cs              | 38 ----------
 2 files changed, 113 deletions(-)
 delete mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 0acc6408e..8ca7b0c80 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -477,81 +477,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             DivideRoundAll(ref dest, ref qt);
         }
 
-        /// <summary>
-        /// Scales the 16x16 region represented by the 4 source blocks to the 8x8 DST block.
-        /// </summary>
-        /// <param name="destination">The destination block.</param>
-        /// <param name="source">The source block.</param>
-        public static unsafe void Scale16X16To8X8(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
-            {
-                Scale16X16To8X8Vectorized(ref destination, source);
-                return;
-            }
-#endif
-
-            Scale16X16To8X8Scalar(ref destination, source);
-        }
-
-        private static void Scale16X16To8X8Vectorized(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            Debug.Assert(Avx2.IsSupported, "AVX2 is required to execute this method");
-
-            var f2 = Vector256.Create(2f);
-            var f025 = Vector256.Create(0.25f);
-            Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
-            ref Vector256<float> destRef = ref destination.V0;
-
-            for (int i = 0; i < 2; i++)
-            {
-                ref Vector256<float> in1 = ref Unsafe.Add(ref MemoryMarshal.GetReference(source), 2 * i).V0;
-                ref Vector256<float> in2 = ref Unsafe.Add(ref MemoryMarshal.GetReference(source), (2 * i) + 1).V0;
-
-                for (int j = 0; j < 8; j += 2)
-                {
-                    Vector256<float> a = Unsafe.Add(ref in1, j);
-                    Vector256<float> b = Unsafe.Add(ref in1, j + 1);
-                    Vector256<float> c = Unsafe.Add(ref in2, j);
-                    Vector256<float> d = Unsafe.Add(ref in2, j + 1);
-
-                    Vector256<float> calc1 = Avx.Shuffle(a, c, 0b10_00_10_00);
-                    Vector256<float> calc2 = Avx.Shuffle(a, c, 0b11_01_11_01);
-                    Vector256<float> calc3 = Avx.Shuffle(b, d, 0b10_00_10_00);
-                    Vector256<float> calc4 = Avx.Shuffle(b, d, 0b11_01_11_01);
-
-                    Vector256<float> sum = Avx.Add(Avx.Add(calc1, calc2), Avx.Add(calc3, calc4));
-                    Vector256<float> add = Avx.Add(sum, f2);
-                    Vector256<float> res = Avx.Multiply(add, f025);
-
-                    destRef = Avx2.PermuteVar8x32(res, switchInnerDoubleWords);
-                    destRef = ref Unsafe.Add(ref destRef, 1);
-                }
-            }
-#endif
-        }
-
-        private static unsafe void Scale16X16To8X8Scalar(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
-        {
-            for (int i = 0; i < 4; i++)
-            {
-                int dstOff = ((i & 2) << 4) | ((i & 1) << 2);
-                Block8x8F iSource = source[i];
-
-                for (int y = 0; y < 4; y++)
-                {
-                    for (int x = 0; x < 4; x++)
-                    {
-                        int j = (16 * y) + (2 * x);
-                        float sum = iSource[j] + iSource[j + 1] + iSource[j + 8] + iSource[j + 9];
-                        destination[(8 * y) + x + dstOff] = (sum + 2) * .25F;
-                    }
-                }
-            }
-        }
-
         [MethodImpl(InliningOptions.ShortMethod)]
         private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
         {
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs
deleted file mode 100644
index ebd3e4013..000000000
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using BenchmarkDotNet.Attributes;
-using SixLabors.ImageSharp.Formats.Jpeg.Components;
-
-namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components
-{
-    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
-    public class Block8x8F_Scale16X16To8X8
-    {
-        private Block8x8F source;
-        private readonly Block8x8F[] target = new Block8x8F[4];
-
-        [GlobalSetup]
-        public void Setup()
-        {
-            var random = new Random();
-
-            float[] f = new float[8 * 8];
-            for (int i = 0; i < f.Length; i++)
-            {
-                f[i] = (float)random.NextDouble();
-            }
-
-            for (int i = 0; i < 4; i++)
-            {
-                this.target[i] = Block8x8F.Load(f);
-            }
-
-            this.source = Block8x8F.Load(f);
-        }
-
-        [Benchmark]
-        public void Scale16X16To8X8() => Block8x8F.Scale16X16To8X8(ref this.source, this.target);
-    }
-}

From 0013c54460e1b775f3daa530305092eacc9623c5 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 9 Jun 2021 15:49:04 +0300
Subject: [PATCH 91/99] Optimized vector rgb pixel matrix scaling

---
 .../Common/Helpers/SimdUtils.HwIntrinsics.cs  | 18 ++++++++++++
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 28 ++-----------------
 2 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 00c0d89f0..caeb694a9 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -577,6 +577,24 @@ namespace SixLabors.ImageSharp
                 }
             }
 
+            /// <summary>
+            /// Scales 8x8 matrix to 4x2 using 2x2 average
+            /// </summary>
+            /// <param name="v">Input matrix consisting of 4 256bit vectors, first row: (v[0], v[2]), second row: (v[1], v[3])</param>
+            /// <returns>256bit vector containing upper and lower scaled parts of the input matrix</returns>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public static Vector256<float> Scale16x2_8x1(ReadOnlySpan<Vector256<float>> v)
+            {
+                DebugGuard.IsTrue(v.Length == 4, "Input span must consist of 4 elements");
+
+                var f025 = Vector256.Create(0.25f);
+
+                Vector256<float> left = Avx.Add(v[0], v[2]);
+                Vector256<float> right = Avx.Add(v[1], v[3]);
+                Vector256<float> avg2x2 = Avx.Multiply(Avx.HorizontalAdd(left, right), f025);
+
+                return Avx2.Permute4x64(avg2x2.AsDouble(), 0b11_01_10_00).AsSingle();
+            }
 
             /// <summary>
             /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 49b974404..56da8acc7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -221,9 +221,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     bDataLanes[j] = b;
                 }
 
-                r = Scale_8x4_4x2(rDataLanes);
-                g = Scale_8x4_4x2(gDataLanes);
-                b = Scale_8x4_4x2(bDataLanes);
+                r = SimdUtils.HwIntrinsics.Scale16x2_8x1(rDataLanes);
+                g = SimdUtils.HwIntrinsics.Scale16x2_8x1(gDataLanes);
+                b = SimdUtils.HwIntrinsics.Scale16x2_8x1(bDataLanes);
 
                 // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
                 Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
@@ -233,27 +233,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
 #endif
         }
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
-        {
-            Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
-            var f025 = Vector256.Create(0.25f);
-
-            Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[2]);
-            Vector256<float> botPairSum = SumHorizontalPairs(v[1], v[3]);
-
-            return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords);
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static Vector256<float> SumHorizontalPairs(Vector256<float> v0, Vector256<float> v1)
-            => Avx.Add(Avx.Shuffle(v0, v1, 0b10_00_10_00), Avx.Shuffle(v0, v1, 0b11_01_11_01));
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static Vector256<float> SumVerticalPairs(Vector256<float> v0, Vector256<float> v1)
-            => Avx.Add(Avx.Shuffle(v0, v1, 0b01_00_01_00), Avx.Shuffle(v0, v1, 0b11_10_11_10));
-#endif
     }
 }

From 35daf2110f2196ce47853e167d4eb1df2e265b26 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 10 Jun 2021 03:59:26 +0300
Subject: [PATCH 92/99] Added tests for vector rgb pixel matrix scaling

---
 .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 1f680aa6c..69f1b20fb 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -5,6 +5,8 @@ using System;
 using System.Linq;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
 #endif
@@ -358,6 +360,44 @@ namespace SixLabors.ImageSharp.Tests.Common
                     SimdUtils.PackFromRgbPlanes(Configuration.Default, r, g, b, actual));
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        [InlineData(3)]
+        public void Scale16x2_8x1(int seed)
+        {
+            if (!Avx.IsSupported)
+            {
+                return;
+            }
+
+            Span<float> data = new Random(seed).GenerateRandomFloatArray(Vector256<float>.Count * 4, -1000, 1000);
+
+            // Act:
+            Vector256<float> resultVector = SimdUtils.HwIntrinsics.Scale16x2_8x1(MemoryMarshal.Cast<float, Vector256<float>>(data));
+            ref float result = ref Unsafe.As<Vector256<float>, float>(ref resultVector);
+
+            // Assert:
+            // Comparison epsilon is tricky but 10^(-4) is good enough (?)
+            var comparer = new ApproximateFloatComparer(0.0001f);
+            for (int i = 0; i < Vector256<float>.Count; i++)
+            {
+                float actual = Unsafe.Add(ref result, i);
+                float expected = CalculateAverage16x2_8x1(data, i);
+
+                Assert.True(comparer.Equals(actual, expected), $"Pos {i}, Expected: {expected}, Actual: {actual}");
+            }
+
+            static float CalculateAverage16x2_8x1(Span<float> data, int index)
+            {
+                int upIdx = index * 2;
+                int lowIdx = (index + 8) * 2;
+                return 0.25f * (data[upIdx] + data[upIdx + 1] + data[lowIdx] + data[lowIdx + 1]);
+            }
+        }
+#endif
+
 #if SUPPORTS_RUNTIME_INTRINSICS
         [Fact]
         public void PackFromRgbPlanesAvx2Reduce_Rgb24()

From 121d1fa917da89c47a31a703862dfae77bed5f7a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 10 Jun 2021 04:13:18 +0300
Subject: [PATCH 93/99] Fixed build error due to invalid using

---
 tests/ImageSharp.Tests/Common/SimdUtilsTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 69f1b20fb..40f0e0c7b 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -6,8 +6,8 @@ using System.Linq;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics;
 #if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 #endif
 using SixLabors.ImageSharp.PixelFormats;

From 20a0d846768bb7662fc19cb6ae88648b5b3a0810 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 10 Jun 2021 05:09:53 +0300
Subject: [PATCH 94/99] Moved jpeg matrix scaler to jpeg converter

---
 .../Common/Helpers/SimdUtils.HwIntrinsics.cs  | 19 -------------
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 27 ++++++++++++++++---
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index caeb694a9..b530a37e7 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -577,25 +577,6 @@ namespace SixLabors.ImageSharp
                 }
             }
 
-            /// <summary>
-            /// Scales 8x8 matrix to 4x2 using 2x2 average
-            /// </summary>
-            /// <param name="v">Input matrix consisting of 4 256bit vectors, first row: (v[0], v[2]), second row: (v[1], v[3])</param>
-            /// <returns>256bit vector containing upper and lower scaled parts of the input matrix</returns>
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static Vector256<float> Scale16x2_8x1(ReadOnlySpan<Vector256<float>> v)
-            {
-                DebugGuard.IsTrue(v.Length == 4, "Input span must consist of 4 elements");
-
-                var f025 = Vector256.Create(0.25f);
-
-                Vector256<float> left = Avx.Add(v[0], v[2]);
-                Vector256<float> right = Avx.Add(v[1], v[3]);
-                Vector256<float> avg2x2 = Avx.Multiply(Avx.HorizontalAdd(left, right), f025);
-
-                return Avx2.Permute4x64(avg2x2.AsDouble(), 0b11_01_10_00).AsSingle();
-            }
-
             /// <summary>
             /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 56da8acc7..1b7df596c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -221,9 +221,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                     bDataLanes[j] = b;
                 }
 
-                r = SimdUtils.HwIntrinsics.Scale16x2_8x1(rDataLanes);
-                g = SimdUtils.HwIntrinsics.Scale16x2_8x1(gDataLanes);
-                b = SimdUtils.HwIntrinsics.Scale16x2_8x1(bDataLanes);
+                r = Scale16x2_8x1(rDataLanes);
+                g = Scale16x2_8x1(gDataLanes);
+                b = Scale16x2_8x1(bDataLanes);
 
                 // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
                 Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
@@ -233,5 +233,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             }
 #endif
         }
+
+#if SUPPORTS_RUNTIME_INTRINSICS 
+        /// <summary>
+        /// Scales 16x2 matrix to 8x1 using 2x2 average
+        /// </summary>
+        /// <param name="v">Input matrix consisting of 4 256bit vectors</param>
+        /// <returns>256bit vector containing upper and lower scaled parts of the input matrix</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<float> Scale16x2_8x1(ReadOnlySpan<Vector256<float>> v)
+        {
+            DebugGuard.IsTrue(v.Length == 4, "Input span must consist of 4 elements");
+
+            var f025 = Vector256.Create(0.25f);
+
+            Vector256<float> left = Avx.Add(v[0], v[2]);
+            Vector256<float> right = Avx.Add(v[1], v[3]);
+            Vector256<float> avg2x2 = Avx.Multiply(Avx.HorizontalAdd(left, right), f025);
+
+            return Avx2.Permute4x64(avg2x2.AsDouble(), 0b11_01_10_00).AsSingle();
+        }
     }
+#endif
 }

From 6d4e2ee23c4d2fb42d5039044b998c476f2a8c52 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 10 Jun 2021 05:12:40 +0300
Subject: [PATCH 95/99] Moved jpeg converter scaler tests to jpeg converter
 tests

---
 .../Encoder/RgbToYCbCrConverterVectorized.cs  |  2 +-
 .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 40 -----------------
 .../Formats/Jpg/RgbToYCbCrConverterTests.cs   | 43 +++++++++++++++++++
 3 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 1b7df596c..0fcffbc7e 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -234,7 +234,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
         }
 
-#if SUPPORTS_RUNTIME_INTRINSICS 
+#if SUPPORTS_RUNTIME_INTRINSICS
         /// <summary>
         /// Scales 16x2 matrix to 8x1 using 2x2 average
         /// </summary>
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 40f0e0c7b..1f680aa6c 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -5,9 +5,7 @@ using System;
 using System.Linq;
 using System.Numerics;
 using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 #endif
 using SixLabors.ImageSharp.PixelFormats;
@@ -360,44 +358,6 @@ namespace SixLabors.ImageSharp.Tests.Common
                     SimdUtils.PackFromRgbPlanes(Configuration.Default, r, g, b, actual));
         }
 
-#if SUPPORTS_RUNTIME_INTRINSICS
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        [InlineData(3)]
-        public void Scale16x2_8x1(int seed)
-        {
-            if (!Avx.IsSupported)
-            {
-                return;
-            }
-
-            Span<float> data = new Random(seed).GenerateRandomFloatArray(Vector256<float>.Count * 4, -1000, 1000);
-
-            // Act:
-            Vector256<float> resultVector = SimdUtils.HwIntrinsics.Scale16x2_8x1(MemoryMarshal.Cast<float, Vector256<float>>(data));
-            ref float result = ref Unsafe.As<Vector256<float>, float>(ref resultVector);
-
-            // Assert:
-            // Comparison epsilon is tricky but 10^(-4) is good enough (?)
-            var comparer = new ApproximateFloatComparer(0.0001f);
-            for (int i = 0; i < Vector256<float>.Count; i++)
-            {
-                float actual = Unsafe.Add(ref result, i);
-                float expected = CalculateAverage16x2_8x1(data, i);
-
-                Assert.True(comparer.Equals(actual, expected), $"Pos {i}, Expected: {expected}, Actual: {actual}");
-            }
-
-            static float CalculateAverage16x2_8x1(Span<float> data, int index)
-            {
-                int upIdx = index * 2;
-                int lowIdx = (index + 8) * 2;
-                return 0.25f * (data[upIdx] + data[upIdx + 1] + data[lowIdx] + data[lowIdx + 1]);
-            }
-        }
-#endif
-
 #if SUPPORTS_RUNTIME_INTRINSICS
         [Fact]
         public void PackFromRgbPlanesAvx2Reduce_Rgb24()
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index d95191ffe..0d5b55038 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -2,6 +2,12 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 using SixLabors.ImageSharp.ColorSpaces;
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
@@ -98,6 +104,43 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F));
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        [InlineData(3)]
+        public void Scale16x2_8x1(int seed)
+        {
+            if (!Avx2.IsSupported)
+            {
+                return;
+            }
+
+            Span<float> data = new Random(seed).GenerateRandomFloatArray(Vector256<float>.Count * 4, -1000, 1000);
+
+            // Act:
+            Vector256<float> resultVector = RgbToYCbCrConverterVectorized.Scale16x2_8x1(MemoryMarshal.Cast<float, Vector256<float>>(data));
+            ref float result = ref Unsafe.As<Vector256<float>, float>(ref resultVector);
+
+            // Assert:
+            // Comparison epsilon is tricky but 10^(-4) is good enough (?)
+            var comparer = new ApproximateFloatComparer(0.0001f);
+            for (int i = 0; i < Vector256<float>.Count; i++)
+            {
+                float actual = Unsafe.Add(ref result, i);
+                float expected = CalculateAverage16x2_8x1(data, i);
+
+                Assert.True(comparer.Equals(actual, expected), $"Pos {i}, Expected: {expected}, Actual: {actual}");
+            }
+
+            static float CalculateAverage16x2_8x1(Span<float> data, int index)
+            {
+                int upIdx = index * 2;
+                int lowIdx = (index + 8) * 2;
+                return 0.25f * (data[upIdx] + data[upIdx + 1] + data[lowIdx] + data[lowIdx + 1]);
+            }
+        }
+#endif
 
         private static void Verify444(
             ReadOnlySpan<Rgb24> data,

From ce1d9922004c45724b0c48ec1609688bd6dde33d Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 10 Jun 2021 05:17:28 +0300
Subject: [PATCH 96/99] Fixed invalid curly braces, added debug Avx2 check

---
 .../Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 0fcffbc7e..926e7d5a4 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -243,6 +243,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static Vector256<float> Scale16x2_8x1(ReadOnlySpan<Vector256<float>> v)
         {
+            Debug.Assert(Avx2.IsSupported, "AVX2 is required to run this converter");
             DebugGuard.IsTrue(v.Length == 4, "Input span must consist of 4 elements");
 
             var f025 = Vector256.Create(0.25f);
@@ -253,6 +254,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
             return Avx2.Permute4x64(avg2x2.AsDouble(), 0b11_01_10_00).AsSingle();
         }
-    }
 #endif
+    }
 }

From 8bbcd6519762a93fcd094e797b591ac4c11f5843 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 10 Jun 2021 17:26:18 +0300
Subject: [PATCH 97/99] Improved benchmark for jpeg encoder

---
 .../Codecs/Jpeg/EncodeJpeg.cs                 | 34 +++++++++++++------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index e807c416b..5e0a5aff3 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -13,14 +13,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 {
     public class EncodeJpeg
     {
-        [Params(50, 75, 95, 100)]
+        [Params(75, 90, 100)]
         public int Quality;
 
         private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
 
-        // GDI+ uses 4:2:0 subsampling
-        private const JpegSubsample EncodingSubsampling = JpegSubsample.Ratio420;
-
         // System.Drawing
         private SDImage bmpDrawing;
         private Stream bmpStream;
@@ -29,7 +26,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 
         // ImageSharp
         private Image<Rgba32> bmpCore;
-        private JpegEncoder encoder;
+        private JpegEncoder encoder420;
+        private JpegEncoder encoder444;
 
         private MemoryStream destinationStream;
 
@@ -42,14 +40,15 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 
                 this.bmpCore = Image.Load<Rgba32>(this.bmpStream);
                 this.bmpCore.Metadata.ExifProfile = null;
-                this.encoder = new JpegEncoder { Quality = Quality, Subsample = EncodingSubsampling };
+                this.encoder420 = new JpegEncoder { Quality = this.Quality, Subsample = JpegSubsample.Ratio420 };
+                this.encoder444 = new JpegEncoder { Quality = this.Quality, Subsample = JpegSubsample.Ratio444 };
 
                 this.bmpStream.Position = 0;
                 this.bmpDrawing = SDImage.FromStream(this.bmpStream);
                 this.jpegCodec = GetEncoder(ImageFormat.Jpeg);
                 this.encoderParameters = new EncoderParameters(1);
                 // Quality cast to long is necessary
-                this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)Quality);
+                this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)this.Quality);
 
                 this.destinationStream = new MemoryStream();
             }
@@ -60,21 +59,34 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
         {
             this.bmpStream.Dispose();
             this.bmpStream = null;
+
+            this.destinationStream.Dispose();
+            this.destinationStream = null;
+
             this.bmpCore.Dispose();
             this.bmpDrawing.Dispose();
+
+            this.encoderParameters.Dispose();
         }
 
-        [Benchmark(Baseline = true, Description = "System.Drawing Jpeg")]
+        [Benchmark(Baseline = true, Description = "System.Drawing Jpeg 4:2:0")]
         public void JpegSystemDrawing()
         {
             this.bmpDrawing.Save(this.destinationStream, this.jpegCodec, this.encoderParameters);
             this.destinationStream.Seek(0, SeekOrigin.Begin);
         }
 
-        [Benchmark(Description = "ImageSharp Jpeg")]
-        public void JpegCore()
+        [Benchmark(Description = "ImageSharp Jpeg 4:2:0")]
+        public void JpegCore420()
+        {
+            this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder420);
+            this.destinationStream.Seek(0, SeekOrigin.Begin);
+        }
+
+        [Benchmark(Description = "ImageSharp Jpeg 4:4:4")]
+        public void JpegCore444()
         {
-            this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder);
+            this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder444);
             this.destinationStream.Seek(0, SeekOrigin.Begin);
         }
 

From ab8ed086c0b8c6207e050b97e7c0ca70b11482ae Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Thu, 10 Jun 2021 17:27:02 +0300
Subject: [PATCH 98/99] Updated benchmark results

---
 .../Codecs/Jpeg/EncodeJpeg.cs                 | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index 5e0a5aff3..47c6f2c7d 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -110,12 +110,21 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042
 Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
 .NET Core SDK=6.0.100-preview.3.21202.5
-  [Host]     : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT
+  [Host]     : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT  [AttachedDebugger]
   DefaultJob : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT
 
 
-|                Method |     Mean |    Error |   StdDev | Ratio | RatioSD |
-|---------------------- |---------:|---------:|---------:|------:|--------:|
-| 'System.Drawing Jpeg' | 39.67 ms | 0.774 ms | 0.828 ms |  1.00 |    0.00 |
-|     'ImageSharp Jpeg' | 45.39 ms | 0.415 ms | 0.346 ms |  1.14 |    0.03 |
+|                      Method | Quality |     Mean |    Error |   StdDev | Ratio | RatioSD |
+|---------------------------- |-------- |---------:|---------:|---------:|------:|--------:|
+| 'System.Drawing Jpeg 4:2:0' |      75 | 30.60 ms | 0.496 ms | 0.464 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg 4:2:0' |      75 | 29.86 ms | 0.350 ms | 0.311 ms |  0.98 |    0.02 |
+|     'ImageSharp Jpeg 4:4:4' |      75 | 45.36 ms | 0.899 ms | 1.036 ms |  1.48 |    0.05 |
+|                             |         |          |          |          |       |         |
+| 'System.Drawing Jpeg 4:2:0' |      90 | 34.05 ms | 0.669 ms | 0.687 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg 4:2:0' |      90 | 37.26 ms | 0.706 ms | 0.660 ms |  1.10 |    0.03 |
+|     'ImageSharp Jpeg 4:4:4' |      90 | 52.54 ms | 0.579 ms | 0.514 ms |  1.55 |    0.04 |
+|                             |         |          |          |          |       |         |
+| 'System.Drawing Jpeg 4:2:0' |     100 | 39.36 ms | 0.267 ms | 0.237 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg 4:2:0' |     100 | 42.44 ms | 0.410 ms | 0.383 ms |  1.08 |    0.01 |
+|     'ImageSharp Jpeg 4:4:4' |     100 | 70.88 ms | 0.508 ms | 0.450 ms |  1.80 |    0.02 |
 */

From 87aec89f25fa752727a2396275a50d63df9e1e15 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Fri, 11 Jun 2021 18:43:15 +1000
Subject: [PATCH 99/99] Use GreatestCommonDivisor. Fix #1616

---
 .../Processors/Transforms/Resize/ResizeKernelMap.cs           | 4 ++--
 .../Processing/Processors/Transforms/ResizeKernelMapTests.cs  | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
index ab6040c17..2ab1d8b5a 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
@@ -130,9 +130,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
             int radius = (int)TolerantMath.Ceiling(scale * sampler.Radius);
 
             // 'ratio' is a rational number.
-            // Multiplying it by LCM(sourceSize, destSize)/sourceSize will result in a whole number "again".
+            // Multiplying it by destSize/GCD(sourceSize, destSize) yields a whole number again (same value as LCM(sourceSize, destSize)/sourceSize, but without forming an LCM that can overflow int).
             // This value is determining the length of the periods in repeating kernel map rows.
-            int period = Numerics.LeastCommonMultiple(sourceSize, destinationSize) / sourceSize;
+            int period = destinationSize / Numerics.GreatestCommonDivisor(sourceSize, destinationSize);
 
             // the center position at i == 0:
             double center0 = (ratio - 1) * 0.5;
diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
index f15a6242d..1d4629ccc 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
@@ -80,6 +80,9 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Transforms
             { KnownResamplers.Bicubic, 1680, 1200 },
             { KnownResamplers.Box, 13, 299 },
             { KnownResamplers.Lanczos5, 3032, 600 },
+
+            // Large coprime sizes whose least common multiple exceeds int.MaxValue. https://github.com/SixLabors/ImageSharp/issues/1616
+            { KnownResamplers.Bicubic, 207773, 51943 }
         };
 
         public static TheoryData<string, int, int> GeneratedImageResizeData =