From 477716c5fa88b6b6ee99de6a9fad5433ebcb5ac6 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 25 Jan 2022 14:58:25 +0300
Subject: [PATCH 01/14] Fused transpose/zigzag implementation

---
 .../Formats/Jpeg/Components/Block8x8F.cs      |   8 +-
 .../FastFloatingPointDCT.Intrinsic.cs         |   5 +-
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 133 +-----
 .../Jpeg/Components/ZigZag.Intrinsic.cs       | 426 +++++++++---------
 4 files changed, 229 insertions(+), 343 deletions(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index f252864476..d7511fddac 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -280,7 +280,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         }
 
         /// <summary>
-        /// Quantize input block, apply zig-zag ordering and store result as 16bit integers.
+        /// Quantize input block, transpose, apply zig-zag ordering and store as <see cref="Block8x8"/>.
         /// </summary>
         /// <param name="block">Source block.</param>
         /// <param name="dest">Destination block.</param>
@@ -291,19 +291,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             if (Avx2.IsSupported)
             {
                 MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
-                ZigZag.ApplyZigZagOrderingAvx2(ref dest);
+                ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
             }
             else if (Ssse3.IsSupported)
             {
                 MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
-                ZigZag.ApplyZigZagOrderingSsse3(ref dest);
+                ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest);
             }
             else
 #endif
             {
                 for (int i = 0; i < Size; i++)
                 {
-                    int idx = ZigZag.ZigZagOrder[i];
+                    int idx = ZigZag.TransposingOrder[i];
                     float quantizedVal = block[idx] * qt[idx];
                     quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f;
                     dest[i] = (short)quantizedVal;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index 94864005ec..8acc4b6269 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -29,11 +29,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         {
             DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
 
-            // First pass - process rows
-            block.TransposeInplace();
+            // First pass - process columns
             FDCT8x8_1D_Avx(ref block);
 
-            // Second pass - process columns
+            // Second pass - process rows
             block.TransposeInplace();
             FDCT8x8_1D_Avx(ref block);
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index c27ad5b82b..e1bcff30f3 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -92,6 +92,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i));
                 tableRef = ref Unsafe.Add(ref tableRef, 1);
             }
+
+            // Spectral macroblocks are not transposed before quantization:
+            // the transpose is done after quantization, at the zig-zag stage,
+            // so the quantization table itself must be transposed here.
+            quantTable.TransposeInplace();
         }
 
         /// <summary>
@@ -133,14 +138,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
             else
 #endif
-            if (Vector.IsHardwareAccelerated)
             {
                 FDCT_Vector4(ref block);
             }
-            else
-            {
-                FDCT_Scalar(ref block);
-            }
         }
 
         /// <summary>
@@ -217,136 +217,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
-        /// <summary>
-        /// Apply 2D floating point FDCT inplace using scalar operations.
-        /// </summary>
-        /// <remarks>
-        /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
-        /// </remarks>
-        /// <param name="block">Input block.</param>
-        private static void FDCT_Scalar(ref Block8x8F block)
-        {
-            const int dctSize = 8;
-
-            float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-            float tmp10, tmp11, tmp12, tmp13;
-            float z1, z2, z3, z4, z5, z11, z13;
-
-            // First pass - process rows
-            ref float blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
-            for (int ctr = 7; ctr >= 0; ctr--)
-            {
-                tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7);
-                tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7);
-                tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6);
-                tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6);
-                tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5);
-                tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5);
-                tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4);
-                tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4);
-
-                // Even part
-                tmp10 = tmp0 + tmp3;
-                tmp13 = tmp0 - tmp3;
-                tmp11 = tmp1 + tmp2;
-                tmp12 = tmp1 - tmp2;
-
-                Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
-                Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11;
-
-                z1 = (tmp12 + tmp13) * 0.707106781f;
-                Unsafe.Add(ref blockRef, 2) = tmp13 + z1;
-                Unsafe.Add(ref blockRef, 6) = tmp13 - z1;
-
-                // Odd part
-                tmp10 = tmp4 + tmp5;
-                tmp11 = tmp5 + tmp6;
-                tmp12 = tmp6 + tmp7;
-
-                z5 = (tmp10 - tmp12) * 0.382683433f;
-                z2 = (0.541196100f * tmp10) + z5;
-                z4 = (1.306562965f * tmp12) + z5;
-                z3 = tmp11 * 0.707106781f;
-
-                z11 = tmp7 + z3;
-                z13 = tmp7 - z3;
-
-                Unsafe.Add(ref blockRef, 5) = z13 + z2;
-                Unsafe.Add(ref blockRef, 3) = z13 - z2;
-                Unsafe.Add(ref blockRef, 1) = z11 + z4;
-                Unsafe.Add(ref blockRef, 7) = z11 - z4;
-
-                blockRef = ref Unsafe.Add(ref blockRef, dctSize);
-            }
-
-            // Second pass - process columns
-            blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
-            for (int ctr = 7; ctr >= 0; ctr--)
-            {
-                tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7);
-                tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7);
-                tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6);
-                tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6);
-                tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5);
-                tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5);
-                tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4);
-                tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4);
-
-                // Even part
-                tmp10 = tmp0 + tmp3;
-                tmp13 = tmp0 - tmp3;
-                tmp11 = tmp1 + tmp2;
-                tmp12 = tmp1 - tmp2;
-
-                Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11;
-                Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11;
-
-                z1 = (tmp12 + tmp13) * 0.707106781f;
-                Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1;
-                Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1;
-
-                // Odd part
-                tmp10 = tmp4 + tmp5;
-                tmp11 = tmp5 + tmp6;
-                tmp12 = tmp6 + tmp7;
-
-                z5 = (tmp10 - tmp12) * 0.382683433f;
-                z2 = (0.541196100f * tmp10) + z5;
-                z4 = (1.306562965f * tmp12) + z5;
-                z3 = tmp11 * 0.707106781f;
-
-                z11 = tmp7 + z3;
-                z13 = tmp7 - z3;
-
-                Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2;
-                Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2;
-                Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4;
-                Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4;
-
-                blockRef = ref Unsafe.Add(ref blockRef, 1);
-            }
-        }
-
         /// <summary>
         /// Apply floating point FDCT inplace using <see cref="Vector4"/> API.
         /// </summary>
-        /// <remarks>
-        /// This implementation must be called only if hardware supports 4
-        /// floating point numbers vector. Otherwise explicit scalar
-        /// implementation <see cref="FDCT_Scalar"/> is faster
-        /// because it does not rely on block transposition.
-        /// </remarks>
         /// <param name="block">Input block.</param>
         public static void FDCT_Vector4(ref Block8x8F block)
         {
-            DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
-
-            // First pass - process rows
-            block.TransposeInplace();
+            // First pass - process columns
             FDCT8x4_Vector4(ref block.V0L);
             FDCT8x4_Vector4(ref block.V0R);
 
-            // Second pass - process columns
+            // Second pass - process rows
             block.TransposeInplace();
             FDCT8x4_Vector4(ref block.V0L);
             FDCT8x4_Vector4(ref block.V0R);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index 6577739c1a..e5faf97257 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -18,120 +18,138 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 #pragma warning restore SA1309
 
         /// <summary>
-        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
+        /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3"/>
         /// zig zag implementation.
         /// </summary>
         private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
         {
-            // row0
-            0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
-            _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
-            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
-
-            // row1
-            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
-            2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
-            _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
-
-            // row2
-            _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
-            _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
-
-            // row3
-            _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
-            _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
-            _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
-            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
-
-            // row4
-            _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
-            _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
-            _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
-
-            // row5
-            _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
-            10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
-
-            // row6
-            _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
-            _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
-            4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
-
-            // row7
-            10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
-            _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
+#pragma warning disable SA1515
+            /* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
+            // A
+            0, 1, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
+            // B
+            _, _, 0, 1, _, _, _, _, 2, 3, _, _, _, _, _, _,
+            // C
+            _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3,
+
+            /* row1 - B2 A3 A4 B3 C2 D1 E0 F0 */
+            // A
+            _, _, 6, 7, 8, 9, _, _, _, _, _, _, _, _, _, _,
+            // B
+            4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _, _, _,
+
+            /* row2 - E1 D2 C3 B4 A5 A6 B5 C4 */
+            // A
+            _, _, _, _, _, _, _, _, 10, 11, 12, 13,  _,  _, _, _,
+            // B
+            _, _, _, _, _, _, 8, 9,  _,  _,  _,  _, 10, 11, _, _,
+            // C
+            _, _, _, _, 6, 7, _, _,  _,  _,  _,  _,  _,  _, 8, 9,
+
+            /* row3 - D3 E2 F1 G0 H0 G1 F2 E3 */
+            // E
+            _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, 6, 7,
+            // F
+            _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, _, _,
+            // G
+            _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _,
+
+            /* row4 - D4 C5 B6 A7 B7 C6 D5 E4 */
+            // B
+            _, _,  _,  _, 12, 13, _, _, 14, 15,  _,  _,  _,  _, _, _,
+            // C
+            _, _, 10, 11,  _,  _, _, _,  _,  _, 12, 13,  _,  _, _, _,
+            // D
+            8, 9,  _,  _,  _,  _, _, _,  _,  _,  _,  _, 10, 11, _, _,
+
+            /* row5 - F3 G2 H1 H2 G3 F4 E5 D6 */
+            // F
+            6, 7, _, _, _, _, _, _, _, _, 8, 9, _, _, _, _,
+            // G
+            _, _, 4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _,
+            // H
+            _, _, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
+
+            /* row6 - C7 D7 E6 F5 G4 H3 H4 G5 */
+            // G
+            _, _, _, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11,
+            // H
+            _, _, _, _, _, _, _, _, _, _, 6, 7, 8, 9,  _,  _,
+
+            /* row7 - F6 E7 F7 G6 H5 H6 G7 H7 */
+            // F
+            12, 13, _, _, 14, 15,  _,  _,  _,  _,  _,  _,  _,  _, _, _,
+            // G
+            _,  _, _, _,  _,  _, 12, 13,  _,  _,  _,  _, 14, 15, _, _,
+            // H
+            _,  _, _, _,  _,  _,  _,  _, 10, 11, 12, 13,  _,  _, 14, 15,
+#pragma warning restore SA1515
         };
 
         /// <summary>
-        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
+        /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
         /// zig zag implementation.
         /// </summary>
         private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
         {
-                // 01_AB/01_EF/23_CD - cross-lane
-                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,   0, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   6, 0, 0, 0,
-
-                // 01_AB - inner-lane
-                0, 1, 2, 3,   8, 9, _, _,   10, 11, 4, 5,   6, 7, 12, 13,  _, _, _, _,   _, _, _, _,   _, _, 10, 11,   4, 5, 6, 7,
-
-                // 01_CD/23_GH - cross-lane
-                0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   _, _, _, _,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   _, _, _, _,
-
-                // 01_CD - inner-lane
-                _, _, _, _,   _, _, 0, 1,   _, _, _, _,   _, _, _, _,   2, 3, 8, 9,   _, _, 10, 11,   4, 5, _, _,   _, _, _, _,
-
-                // 01_EF - inner-lane
-                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   0, 1, _, _,   _, _, _, _,   _, _, _, _,
-
-                // 23_AB/45_CD/67_EF - cross-lane
-                3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   _, _, _, _,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   _, _, _, _,
-
-                // 23_AB - inner-lane
-                4, 5, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   6, 7, 0, 1,   2, 3, 8, 9,   _, _, _, _,
-
-                // 23_CD - inner-lane
-                _, _, 6, 7,   12, 13, _, _,   _, _, _, _,   _, _, _, _,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   6, 7, 12, 13,
-
-                // 23_EF - inner-lane
-                _, _, _, _,   _, _, 2, 3,   8, 9, _, _,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
-
-                // 23_GH - inner-lane
-                _, _, _, _,   _, _, _, _,   _, _, 0, 1,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
-
-                // 45_AB - inner-lane
-                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   10, 11, _, _,   _, _, _, _,   _, _, _, _,
-
-                // 45_CD - inner-lane
-                _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   6, 7, 0, 1,   _, _, 2, 3,   8, 9, _, _,   _, _, _, _,
-
-                // 45_EF - cross-lane
-                1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   _, _, _, _,   2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,
-
-                // 45_EF - inner-lane
-                2, 3, 8, 9,   _, _, _, _,   _, _, _, _,   10, 11, 4, 5,  _, _, _, _,   _, _, _, _,   _, _, 2, 3,   8, 9, _, _,
-
-                // 45_GH - inner-lane
-                _, _, _, _,   2, 3, 8, 9,   10, 11, 4, 5,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 6, 7,
-
-                // 67_CD - inner-lane
-                _, _, _, _,   _, _, _, _,   _, _, 10, 11,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
-
-                // 67_EF - inner-lane
-                _, _, _, _,   _, _, 6, 7,   0, 1, _, _,   2, 3, 8, 9,   _, _, _, _,   _, _, _, _,   10, 11, _, _,   _, _, _, _,
-
-                // 67_GH - inner-lane
-                8, 9, 10, 11,   4, 5, _, _,   _, _, _, _,   _, _, _, _,   2, 3, 8, 9,   10, 11, 4, 5,   _, _, 6, 7,   12, 13, 14, 15
+#pragma warning disable SA1515
+            /* 01 */
+            // [cr] crln_01_AB_CD
+            0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   _, _, _, _,   1, 0, 0, 0,   2, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,
+            // (in) AB
+            0, 1, 8, 9,   2, 3, 4, 5,   10, 11, _, _,   _, _, _, _,   12, 13, 2, 3,   4, 5, 14, 15,   _, _, _, _,   _, _, _, _,
+            // (in) CD
+            _, _, _, _,   _, _, _, _,   _, _, 0, 1,   8, 9, 2, 3,   _, _, _, _,   _, _, _, _,   0, 1, 10, 11,   _, _, _, _,
+            // [cr] crln_01_23_EF_23_CD
+            0, 0, 0, 0,   1, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,
+            // (in) EF
+            _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   0, 1, 8, 9,
+
+            /* 23 */
+            // [cr] crln_23_AB_23_45_GH
+            2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   0, 0, 0, 0,   1, 0, 0, 0,   4, 0, 0, 0,   5, 0, 0, 0,
+            // (in) AB
+            _, _, _, _,   _, _, 8, 9,   2, 3, 4, 5,   10, 11, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
+            // (in) CD
+            _, _, 12, 13,   6, 7, _, _,   _, _, _, _,   _, _, 8, 9,   14, 15, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
+            // (in) EF
+            2, 3, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 4, 5,   10, 11, _, _,   _, _, _, _,   12, 13, 6, 7,
+            // (in) GH
+            _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 0, 1,   8, 9, 2, 3,   _, _, _, _,
+
+            /* 45 */
+            // (in) AB
+            _, _, _, _,   12, 13, 6, 7,   14, 15, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
+            // [cr] crln_45_67_CD_45_EF
+            2, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   2, 0, 0, 0,   5, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,
+            // (in) CD
+            8, 9, 2, 3,   _, _, _, _,   _, _, 4, 5,   10, 11, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 12, 13,
+            // (in) EF
+            _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 0, 1,   6, 7, _, _,   _, _, _, _,   _, _, 8, 9,   2, 3, _, _,
+            // (in) GH
+            _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, 4, 5,   10, 11, 12, 13,   6, 7, _, _,   _, _, _, _,
+
+            /* 67 */
+            // (in) CD
+            6, 7, 14, 15,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,   _, _, _, _,
+            // [cr] crln_67_EF_67_GH
+            2, 0, 0, 0,   3, 0, 0, 0,   5, 0, 0, 0,   6, 0, 0, 0,   3, 0, 0, 0,   6, 0, 0, 0,   7, 0, 0, 0,   _, _, _, _,
+            // (in) EF
+            _, _, _, _,   4, 5, 14, 15,   _, _, _, _,   _, _, _, _,   8, 9, 2, 3,   10, 11, _, _,   _, _, _, _,   _, _, _, _,
+            // (in) GH
+            _, _, _, _,   _, _, _, _,   0, 1, 10, 11,   12, 13, 2, 3,   _, _, _, _,   _, _, 0, 1,   6, 7, 8, 9,   2, 3, 10, 11,
+#pragma warning restore SA1515
         };
 
         /// <summary>
         /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
         /// </summary>
         /// <param name="block">Input matrix.</param>
-        public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
+        public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
 
-            fixed (byte* maskPtr = SseShuffleMasks)
+            fixed (byte* shuffleVectorsPtr = SseShuffleMasks)
             {
                 Vector128<byte> rowA = block.V0.AsByte();
                 Vector128<byte> rowB = block.V1.AsByte();
@@ -142,73 +160,69 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 Vector128<byte> rowG = block.V6.AsByte();
                 Vector128<byte> rowH = block.V7.AsByte();
 
-                // row0 - A0  A1  B0  C0  B1  A2  A3  B2
-                Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
-                Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
-                Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
-                Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
-                row0 = Sse2.Or(row0, rowC0);
-
-                // row1 - C1  D0  E0  D1  C2  B3  A4  A5
-                Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
-                Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
-                Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
-                Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
-                row1 = Sse2.Or(row1, rowD1);
-                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
-                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();
-
-                // row2
-                Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
-                Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
-                Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
-                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
-                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
-                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
-                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();
-
-                // row3
-                Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16().AsInt16();
-                Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16().AsInt16();
-                Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
-                Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
-                row3 = Sse2.Or(row3, rowC3);
-                Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
-                Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
-                row3 = Sse2.Or(row3, rowD3);
-
-                // row4
-                Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
-                Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
-                Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
-                Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
-                row4 = Sse2.Or(row4, rowG4);
-                Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
-                row4 = Sse2.Or(row4, rowH4);
-
-                // row5
-                Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
-                Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
-                Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
-                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
-                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
-                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
-                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();
-
-                // row6
-                Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
-                Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
-                Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
-                Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
-                row6 = Sse2.Or(row6, rowH6);
-                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
-                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();
-
-                // row7
-                Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
-                Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
-                Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
-                row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();
+                // row0 - A0 B0 A1 A2 B1 C0 D0 C1
+                Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
+                Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
+                Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
+                Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
+                row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
+
+                // row1 - B2 A3 A4 B3 C2 D1 E0 F0
+                Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
+                Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
+                Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
+                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
+                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
+                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
+                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
+
+                // row2 - E1 D2 C3 B4 A5 A6 B5 C4
+                Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
+                Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
+                Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
+                Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
+                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
+                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
+
+                // row3 - D3 E2 F1 G0 H0 G1 F2 E3
+                Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
+                Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
+                Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
+                Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
+                row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
+                row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
+
+                // row4 - D4 C5 B6 A7 B7 C6 D5 E4
+                Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
+                Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
+                Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
+                Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
+                row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
+                row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
+
+                // row5 - F3 G2 H1 H2 G3 F4 E5 D6
+                Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
+                Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
+                Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
+                Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
+                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
+                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
+
+                // row6 - C7 D7 E6 F5 G4 H3 H4 G5
+                Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
+                Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
+                Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
+                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
+                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
+                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
+                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
+
+                // row7 - F6 E7 F7 G6 H5 H6 G7 H7
+                Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
+                Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
+                Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
+                Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
+                row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
 
                 block.V0 = row0;
                 block.V1 = row1;
@@ -225,69 +239,61 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
         /// </summary>
         /// <param name="block">Input matrix.</param>
-        public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
+        public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
 
             fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
             {
-                Vector256<byte> rowsAB = block.V01.AsByte();
-                Vector256<byte> rowsCD = block.V23.AsByte();
-                Vector256<byte> rowsEF = block.V45.AsByte();
-                Vector256<byte> rowsGH = block.V67.AsByte();
-
-                // rows 0 1
-                Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
-                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
+                Vector256<byte> rowAB = block.V01.AsByte();
+                Vector256<byte> rowCD = block.V23.AsByte();
+                Vector256<byte> rowEF = block.V45.AsByte();
+                Vector256<byte> rowGH = block.V67.AsByte();
+
+                /* row01 - A0 B0 A1 A2 B1 C0 D0 C1 | B2 A3 A4 B3 C2 D1 E0 F0 */
+                Vector256<int> crln_01_AB_CD = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
+                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_01_AB_CD).AsByte();
                 row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
-
-                Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
-                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
-                row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
-
-                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
-                Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
-
-                Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
-
-                // rows 2 3
-                Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
-                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
-                Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
-
-                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
+                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_AB_CD).AsByte();
+                row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (2 * 32))).AsByte();
+                Vector256<int> crln_01_23_EF_23_CD = Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)).AsInt32();
+                Vector256<byte> row01_23_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_01_23_EF_23_CD).AsByte();
+                Vector256<byte> row01_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
+
+                Vector256<byte> row01 = Avx2.Or(row01_AB, Avx2.Or(row01_CD, row01_EF));
+
+                /* row23 - E1 D2 C3 B4 A5 A6 B5 C4 | D3 E2 F1 G0 H0 G1 F2 E3 */
+                Vector256<int> crln_23_AB_23_45_GH = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
+                Vector256<byte> row23_45_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_23_AB_23_45_GH).AsByte();
+                Vector256<byte> row23_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
+                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_23_EF_23_CD).AsByte();
                 row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
-
-                Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
-
-                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
-                Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
+                Vector256<byte> row23_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
+                Vector256<byte> row23_45_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_23_AB_23_45_GH).AsByte();
+                Vector256<byte> row23_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32))).AsByte();
 
                 Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
 
-                // rows 4 5
-                Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
-                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
-                Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
-
-                Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
-                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
-                row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
-
-                Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
+                /* row45 - D4 C5 B6 A7 B7 C6 D5 E4 | F3 G2 H1 H2 G3 F4 E5 D6 */
+                Vector256<byte> row45_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32))).AsByte();
+                Vector256<int> crln_45_67_CD_45_EF = Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsInt32();
+                Vector256<byte> row45_67_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_45_67_CD_45_EF).AsByte();
+                Vector256<byte> row45_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (12 * 32))).AsByte();
+                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_45_67_CD_45_EF).AsByte();
+                row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32))).AsByte();
+                Vector256<byte> row45_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32))).AsByte();
 
                 Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
 
-                // rows 6 7
-                Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
-
-                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
-                row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
-
-                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
-                row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
+                /* row67 - C7 D7 E6 F5 G4 H3 H4 G5 | F6 E7 F7 G6 H5 H6 G7 H7 */
+                Vector256<byte> row67_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32))).AsByte();
+                Vector256<int> crln_67_EF_67_GH = Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsInt32();
+                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_67_EF_67_GH).AsByte();
+                row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32))).AsByte();
+                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_67_EF_67_GH).AsByte();
+                row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (18 * 32))).AsByte();
 
-                Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
+                Vector256<byte> row67 = Avx2.Or(row67_CD, Avx2.Or(row67_EF, row67_GH));
 
                 block.V01 = row01.AsInt16();
                 block.V23 = row23.AsInt16();

From 219119aad48ddaed26f5e870aac2ec2e08c55a3a Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 25 Jan 2022 15:00:31 +0300
Subject: [PATCH 02/14] Tests fix

---
 tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs | 2 +-
 tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs       | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index ae7e81254b..9576cbd3c8 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -220,7 +220,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
                 // Reference implementation quantizes given block via division
                 Block8x8 expected = default;
-                ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
+                ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.TransposingOrder);
 
                 // Actual current implementation quantizes given block via multiplication
                 // With quantization table reciprocal
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 36570ce55a..9c467a1cc9 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -135,10 +135,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
                     srcBlock.MultiplyInPlace(ref dequantMatrix);
 
+                    // testee
                     // IDCT implementation tranforms blocks after transposition
                     srcBlock.TransposeInplace();
-
-                    // IDCT calculation
                     FastFloatingPointDCT.TransformIDCT(ref srcBlock);
 
                     float[] actualDest = srcBlock.ToArray();
@@ -180,7 +179,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
 
                     // testee
+                    // Second transpose call is done by Quantize step
+                    // Do this manually here just to be compliant with the reference implementation
                     FastFloatingPointDCT.TransformFDCT(ref block);
+                    block.TransposeInplace();
 
                     // Part of the IDCT calculations is fused into the quantization step
                     // We must multiply input block with adjusted no-quantization matrix

From 69458f4f8df9e00c1e2e8e0e0166aa3577320688 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 25 Jan 2022 21:56:14 +0300
Subject: [PATCH 03/14] gfoidl len check removal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Günther Foidl <gue@korporal.at>
---
 src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index e5faf97257..ca66519eb4 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -149,7 +149,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         {
             DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
 
-            fixed (byte* shuffleVectorsPtr = SseShuffleMasks)
+            fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks)
             {
                 Vector128<byte> rowA = block.V0.AsByte();
                 Vector128<byte> rowB = block.V1.AsByte();

From b6400c287b7454dfd1d0326e897984841d2b1416 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 26 Jan 2022 13:44:12 +0300
Subject: [PATCH 04/14] Fixed compilation error

---
 src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index ca66519eb4..850de26c30 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -3,6 +3,7 @@
 
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System;
+using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
@@ -149,7 +150,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         {
             DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
 
-            fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks)
+            fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
             {
                 Vector128<byte> rowA = block.V0.AsByte();
                 Vector128<byte> rowB = block.V1.AsByte();
@@ -243,7 +244,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         {
             DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
 
-            fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
+            fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(AvxShuffleMasks))
             {
                 Vector256<byte> rowAB = block.V01.AsByte();
                 Vector256<byte> rowCD = block.V23.AsByte();

From 96c1c725f94dcb8d5882c22af68c3f46a9d5582e Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 30 Jan 2022 18:41:56 +0100
Subject: [PATCH 05/14] Write ALPH chunk (only uncompressed for now)

---
 src/ImageSharp/Formats/Webp/AlphaEncoder.cs   | 42 ++++++++++++++++++
 .../Formats/Webp/BitWriter/BitWriterBase.cs   | 43 ++++++++++++++++++-
 .../Formats/Webp/BitWriter/Vp8BitWriter.cs    | 35 ++++++++++++---
 .../Formats/Webp/Lossy/Vp8Encoder.cs          | 13 ++++--
 .../Formats/Webp/Lossy/YuvConversion.cs       | 11 ++++-
 5 files changed, 133 insertions(+), 11 deletions(-)
 create mode 100644 src/ImageSharp/Formats/Webp/AlphaEncoder.cs

diff --git a/src/ImageSharp/Formats/Webp/AlphaEncoder.cs b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
new file mode 100644
index 0000000000..06c114c71f
--- /dev/null
+++ b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
@@ -0,0 +1,42 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Buffers;
+using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Webp
+{
+    /// <summary>
+    /// Encodes the alpha channel data.
+    /// Data is either compressed as lossless webp image or uncompressed.
+    /// </summary>
+    internal static class AlphaEncoder
+    {
+        public static byte[] EncodeAlpha<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            Buffer2D<TPixel> imageBuffer = image.Frames.RootFrame.PixelBuffer;
+            int height = image.Height;
+            int width = image.Width;
+            byte[] alphaData = new byte[width * height];
+
+            using IMemoryOwner<Rgba32> rowBuffer = memoryAllocator.Allocate<Rgba32>(width);
+            Span<Rgba32> rgbaRow = rowBuffer.GetSpan();
+
+            for (int y = 0; y < height; y++)
+            {
+                Span<TPixel> rowSpan = imageBuffer.DangerousGetRowSpan(y);
+                PixelOperations<TPixel>.Instance.ToRgba32(configuration, rowSpan, rgbaRow);
+                int offset = y * width;
+                for (int x = 0; x < width; x++)
+                {
+                    alphaData[offset + x] = rgbaRow[x].A;
+                }
+            }
+
+            return alphaData;
+        }
+    }
+}
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
index ac039be797..b33f7987c1 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
@@ -94,7 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// Calculates the chunk size of EXIF or XMP metadata.
         /// </summary>
         /// <param name="metadataBytes">The metadata profile bytes.</param>
-        /// <returns>The exif chunk size in bytes.</returns>
+        /// <returns>The metadata chunk size in bytes.</returns>
         protected uint MetadataChunkSize(byte[] metadataBytes)
         {
             uint metaSize = (uint)metadataBytes.Length;
@@ -103,6 +103,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             return metaChunkSize;
         }
 
+        /// <summary>
+        /// Calculates the chunk size of an alpha chunk.
+        /// </summary>
+        /// <param name="alphaBytes">The alpha chunk bytes.</param>
+        /// <returns>The alpha data chunk size in bytes.</returns>
+        protected uint AlphaChunkSize(byte[] alphaBytes)
+        {
+            uint alphaSize = (uint)alphaBytes.Length + 1;
+            uint alphaChunkSize = WebpConstants.ChunkHeaderSize + alphaSize + (alphaSize & 1);
+
+            return alphaChunkSize;
+        }
+
         /// <summary>
         /// Writes a metadata profile (EXIF or XMP) to the stream.
         /// </summary>
@@ -128,6 +141,34 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             }
         }
 
+        /// <summary>
+        /// Writes the alpha chunk to the stream.
+        /// </summary>
+        /// <param name="stream">The stream to write to.</param>
+        /// <param name="dataBytes">The alpha channel data bytes.</param>
+        protected void WriteAlphaChunk(Stream stream, byte[] dataBytes)
+        {
+            DebugGuard.NotNull(dataBytes, nameof(dataBytes));
+
+            uint size = (uint)dataBytes.Length + 1;
+            Span<byte> buf = this.scratchBuffer.AsSpan(0, 4);
+            BinaryPrimitives.WriteUInt32BigEndian(buf, (uint)WebpChunkType.Alpha);
+            stream.Write(buf);
+            BinaryPrimitives.WriteUInt32LittleEndian(buf, size);
+            stream.Write(buf);
+
+            // Write flags, all zero for now.
+            stream.WriteByte(0);
+
+            stream.Write(dataBytes);
+
+            // Add padding byte if needed.
+            if ((size & 1) == 1)
+            {
+                stream.WriteByte(0);
+            }
+        }
+
         /// <summary>
         /// Writes a VP8X header to the stream.
         /// </summary>
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
index 4e91bedb0b..3f16fc89bc 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
@@ -409,7 +409,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// <param name="width">The width of the image.</param>
         /// <param name="height">The height of the image.</param>
         /// <param name="hasAlpha">Flag indicating, if a alpha channel is present.</param>
-        public void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, XmpProfile xmpProfile, uint width, uint height, bool hasAlpha)
+        /// <param name="alphaData">The alpha channel data.</param>
+        public void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, XmpProfile xmpProfile, uint width, uint height, bool hasAlpha, byte[] alphaData)
         {
             bool isVp8X = false;
             byte[] exifBytes = null;
@@ -418,7 +419,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             if (exifProfile != null)
             {
                 isVp8X = true;
-                riffSize += ExtendedFileChunkSize;
                 exifBytes = exifProfile.ToByteArray();
                 riffSize += this.MetadataChunkSize(exifBytes);
             }
@@ -426,11 +426,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             if (xmpProfile != null)
             {
                 isVp8X = true;
-                riffSize += ExtendedFileChunkSize;
                 xmpBytes = xmpProfile.Data;
                 riffSize += this.MetadataChunkSize(xmpBytes);
             }
 
+            if (hasAlpha)
+            {
+                isVp8X = true;
+                riffSize += this.AlphaChunkSize(alphaData);
+            }
+
+            if (isVp8X)
+            {
+                riffSize += ExtendedFileChunkSize;
+            }
+
             this.Finish();
             uint numBytes = (uint)this.NumBytes();
             int mbSize = this.enc.Mbw * this.enc.Mbh;
@@ -451,7 +461,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             riffSize += WebpConstants.TagSize + WebpConstants.ChunkHeaderSize + vp8Size;
 
             // Emit headers and partition #0
-            this.WriteWebpHeaders(stream, size0, vp8Size, riffSize, isVp8X, width, height, exifProfile, xmpProfile, hasAlpha);
+            this.WriteWebpHeaders(stream, size0, vp8Size, riffSize, isVp8X, width, height, exifProfile, xmpProfile, hasAlpha, alphaData);
             bitWriterPartZero.WriteToStream(stream);
 
             // Write the encoded image to the stream.
@@ -639,7 +649,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             while (it.Next());
         }
 
-        private void WriteWebpHeaders(Stream stream, uint size0, uint vp8Size, uint riffSize, bool isVp8X, uint width, uint height, ExifProfile exifProfile, XmpProfile xmpProfile, bool hasAlpha)
+        private void WriteWebpHeaders(
+            Stream stream,
+            uint size0,
+            uint vp8Size,
+            uint riffSize,
+            bool isVp8X,
+            uint width,
+            uint height,
+            ExifProfile exifProfile,
+            XmpProfile xmpProfile,
+            bool hasAlpha,
+            byte[] alphaData)
         {
             this.WriteRiffHeader(stream, riffSize);
 
@@ -647,6 +668,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             if (isVp8X)
             {
                 this.WriteVp8XHeader(stream, exifProfile, xmpProfile, width, height, hasAlpha);
+                if (hasAlpha)
+                {
+                    this.WriteAlphaChunk(stream, alphaData);
+                }
             }
 
             this.WriteVp8Header(stream, vp8Size);
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 0222320502..48af53960c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -300,7 +300,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             Span<byte> y = this.Y.GetSpan();
             Span<byte> u = this.U.GetSpan();
             Span<byte> v = this.V.GetSpan();
-            YuvConversion.ConvertRgbToYuv(image, this.configuration, this.memoryAllocator, y, u, v);
+            bool hasAlpha = YuvConversion.ConvertRgbToYuv(image, this.configuration, this.memoryAllocator, y, u, v);
 
             int yStride = width;
             int uvStride = (yStride + 1) >> 1;
@@ -322,8 +322,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             int expectedSize = this.Mbw * this.Mbh * averageBytesPerMacroBlock;
             this.bitWriter = new Vp8BitWriter(expectedSize, this);
 
-            // TODO: EncodeAlpha();
-            bool hasAlpha = false;
+            // Extract and encode alpha data, if present.
+            byte[] alphaData = null;
+            if (hasAlpha)
+            {
+                // TODO: This can potentially run in a separate task.
+                alphaData = AlphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator);
+            }
 
             // Stats-collection loop.
             this.StatLoop(width, height, yStride, uvStride);
@@ -358,7 +363,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             // Write bytes from the bitwriter buffer to the stream.
             ImageMetadata metadata = image.Metadata;
             metadata.SyncProfiles();
-            this.bitWriter.WriteEncodedImageToStream(stream, metadata.ExifProfile, metadata.XmpProfile, (uint)width, (uint)height, hasAlpha);
+            this.bitWriter.WriteEncodedImageToStream(stream, metadata.ExifProfile, metadata.XmpProfile, (uint)width, (uint)height, hasAlpha, alphaData);
         }
 
         /// <inheritdoc/>
diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
index 7a731f4284..878bebd105 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@@ -318,7 +318,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         /// <param name="y">Span to store the luma component of the image.</param>
         /// <param name="u">Span to store the u component of the image.</param>
         /// <param name="v">Span to store the v component of the image.</param>
-        public static void ConvertRgbToYuv<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator, Span<byte> y, Span<byte> u, Span<byte> v)
+        /// <returns>true, if the image contains alpha data.</returns>
+        public static bool ConvertRgbToYuv<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator, Span<byte> y, Span<byte> u, Span<byte> v)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             Buffer2D<TPixel> imageBuffer = image.Frames.RootFrame.PixelBuffer;
@@ -335,6 +336,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             Span<Bgra32> bgraRow1 = bgraRow1Buffer.GetSpan();
             int uvRowIndex = 0;
             int rowIndex;
+            bool hasAlpha = false;
             for (rowIndex = 0; rowIndex < height - 1; rowIndex += 2)
             {
                 Span<TPixel> rowSpan = imageBuffer.DangerousGetRowSpan(rowIndex);
@@ -343,6 +345,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 PixelOperations<TPixel>.Instance.ToBgra32(configuration, nextRowSpan, bgraRow1);
 
                 bool rowsHaveAlpha = WebpCommonUtils.CheckNonOpaque(bgraRow0) && WebpCommonUtils.CheckNonOpaque(bgraRow1);
+                if (rowsHaveAlpha)
+                {
+                    hasAlpha = true;
+                }
 
                 // Downsample U/V planes, two rows at a time.
                 if (!rowsHaveAlpha)
@@ -375,10 +381,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 else
                 {
                     AccumulateRgba(bgraRow0, bgraRow0, tmpRgbSpan, width);
+                    hasAlpha = true;
                 }
 
                 ConvertRgbaToUv(tmpRgbSpan, u.Slice(uvRowIndex * uvWidth), v.Slice(uvRowIndex * uvWidth), uvWidth);
             }
+
+            return hasAlpha;
         }
 
         /// <summary>

From 6078c0eb925a0ec02905d3bd6bfd370ae7cccbac Mon Sep 17 00:00:00 2001
From: Anton Firszov <antonfir@gmail.com>
Date: Tue, 1 Feb 2022 01:50:10 +0100
Subject: [PATCH 06/14] Revert "attempt to re-enable
 RentReturnRelease_SubsequentRentReturnsDifferentHandles on Mac"

This reverts commit bbbf687477a66843219552b445562688ee843f9b.
---
 .../Memory/Allocators/UniformUnmanagedMemoryPoolTests.cs     | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/Memory/Allocators/UniformUnmanagedMemoryPoolTests.cs b/tests/ImageSharp.Tests/Memory/Allocators/UniformUnmanagedMemoryPoolTests.cs
index 4ab2c93a52..7d98eff611 100644
--- a/tests/ImageSharp.Tests/Memory/Allocators/UniformUnmanagedMemoryPoolTests.cs
+++ b/tests/ImageSharp.Tests/Memory/Allocators/UniformUnmanagedMemoryPoolTests.cs
@@ -245,7 +245,10 @@ namespace SixLabors.ImageSharp.Tests.Memory.Allocators
             cleanup.Register(b1);
         }
 
-        [Theory]
+        public static readonly bool IsNotMacOS = !TestEnvironment.IsOSX;
+
+        // TODO: Investigate MacOS failures
+        [ConditionalTheory(nameof(IsNotMacOS))]
         [InlineData(false)]
         [InlineData(true)]
         public void RentReturnRelease_SubsequentRentReturnsDifferentHandles(bool multiple)

From d1929412289d3c8bb5bc3974cc47e0b811f0e75f Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 1 Feb 2022 10:23:08 +0100
Subject: [PATCH 07/14] Add lossless alpha compression

---
 src/ImageSharp/Formats/Webp/AlphaEncoder.cs   | 86 ++++++++++++++++++-
 .../Formats/Webp/BitWriter/BitWriterBase.cs   | 11 ++-
 .../Formats/Webp/BitWriter/Vp8BitWriter.cs    | 10 ++-
 .../Formats/Webp/Lossless/Vp8LEncoder.cs      | 31 ++++++-
 .../Formats/Webp/Lossy/Vp8Encoder.cs          | 14 +--
 .../Formats/Webp/WebpEncoderCore.cs           |  6 +-
 6 files changed, 134 insertions(+), 24 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/AlphaEncoder.cs b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
index 06c114c71f..571da5bb24 100644
--- a/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
@@ -3,18 +3,98 @@
 
 using System;
 using System.Buffers;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.Formats.Webp.Lossless;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Webp
 {
     /// <summary>
-    /// Encodes the alpha channel data.
-    /// Data is either compressed as lossless webp image or uncompressed.
+    /// Methods for encoding the alpha data of a VP8 image.
     /// </summary>
     internal static class AlphaEncoder
     {
-        public static byte[] EncodeAlpha<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator)
+        /// <summary>
+        /// Encodes the alpha channel data.
+        /// Data is either compressed as lossless webp image or uncompressed.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="image">The <see cref="Image{TPixel}"/> to encode from.</param>
+        /// <param name="configuration">The global configuration.</param>
+        /// <param name="memoryAllocator">The memory manager.</param>
+        /// <param name="compress">Indicates, if the data should be compressed with the lossless webp compression.</param>
+        /// <returns>The alpha data.</returns>
+        public static byte[] EncodeAlpha<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator, bool compress)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            byte[] alphaData = ExtractAlphaChannel(image, configuration, memoryAllocator);
+            int width = image.Width;
+            int height = image.Height;
+            if (compress)
+            {
+                WebpEncodingMethod effort = WebpEncodingMethod.Default;
+                int quality = 8 * (int)effort;
+                using var lossLessEncoder = new Vp8LEncoder(
+                    memoryAllocator,
+                    configuration,
+                    width,
+                    height,
+                    quality,
+                    effort,
+                    WebpTransparentColorMode.Preserve,
+                    false,
+                    0);
+
+                // The transparency information will be stored in the green channel of the ARGB quadruplet.
+                // The green channel is allowed extra transformation steps in the specification -- unlike the other channels,
+                // that can improve compression.
+                using Image<Rgba32> alphaAsImage = DispatchAlphaToGreen(image, alphaData);
+
+                return lossLessEncoder.EncodeAlphaImageData(alphaAsImage);
+            }
+
+            return alphaData;
+        }
+
+        /// <summary>
+        /// Store the transparency in the green channel.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="image">The <see cref="ImageFrame{TPixel}"/> to encode from.</param>
+        /// <param name="alphaData">A byte sequence of length width * height, containing all the 8-bit transparency values in scan order.</param>
+        /// <returns>The transparency image.</returns>
+        private static Image<Rgba32> DispatchAlphaToGreen<TPixel>(Image<TPixel> image, byte[] alphaData)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            int width = image.Width;
+            int height = image.Height;
+            var alphaAsImage = new Image<Rgba32>(width, height);
+
+            for (int y = 0; y < height; y++)
+            {
+                Memory<Rgba32> rowBuffer = alphaAsImage.DangerousGetPixelRowMemory(y);
+                Span<Rgba32> pixelRow = rowBuffer.Span;
+                Span<byte> alphaRow = alphaData.AsSpan(y * width, width);
+                for (int x = 0; x < width; x++)
+                {
+                    // Leave A/R/B channels zero'd.
+                    pixelRow[x] = new Rgba32(0, alphaRow[x], 0, 0);
+                }
+            }
+
+            return alphaAsImage;
+        }
+
+        /// <summary>
+        /// Extract the alpha data of the image.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="image">The <see cref="Image{TPixel}"/> to extract the alpha channel from.</param>
+        /// <param name="configuration">The global configuration.</param>
+        /// <param name="memoryAllocator">The memory manager.</param>
+        /// <returns>A byte sequence of length width * height, containing all the 8-bit transparency values in scan order.</returns>
+        private static byte[] ExtractAlphaChannel<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             Buffer2D<TPixel> imageBuffer = image.Frames.RootFrame.PixelBuffer;
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
index b33f7987c1..84c9d3f133 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
@@ -146,7 +146,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// </summary>
         /// <param name="stream">The stream to write to.</param>
         /// <param name="dataBytes">The alpha channel data bytes.</param>
-        protected void WriteAlphaChunk(Stream stream, byte[] dataBytes)
+        /// <param name="alphaDataIsCompressed">Indicates, if the alpha channel data is compressed.</param>
+        protected void WriteAlphaChunk(Stream stream, byte[] dataBytes, bool alphaDataIsCompressed)
         {
             DebugGuard.NotNull(dataBytes, nameof(dataBytes));
 
@@ -157,9 +158,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             BinaryPrimitives.WriteUInt32LittleEndian(buf, size);
             stream.Write(buf);
 
-            // Write flags, all zero for now.
-            stream.WriteByte(0);
+            byte flags = 0;
+            if (alphaDataIsCompressed)
+            {
+                flags |= 1;
+            }
 
+            stream.WriteByte(flags);
             stream.Write(dataBytes);
 
             // Add padding byte if needed.
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
index 3f16fc89bc..577a87e6a1 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
@@ -410,7 +410,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// <param name="height">The height of the image.</param>
         /// <param name="hasAlpha">Flag indicating, if a alpha channel is present.</param>
         /// <param name="alphaData">The alpha channel data.</param>
-        public void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, XmpProfile xmpProfile, uint width, uint height, bool hasAlpha, byte[] alphaData)
+        /// <param name="alphaDataIsCompressed">Indicates, if the alpha data is compressed.</param>
+        public void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, XmpProfile xmpProfile, uint width, uint height, bool hasAlpha, byte[] alphaData, bool alphaDataIsCompressed)
         {
             bool isVp8X = false;
             byte[] exifBytes = null;
@@ -461,7 +462,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             riffSize += WebpConstants.TagSize + WebpConstants.ChunkHeaderSize + vp8Size;
 
             // Emit headers and partition #0
-            this.WriteWebpHeaders(stream, size0, vp8Size, riffSize, isVp8X, width, height, exifProfile, xmpProfile, hasAlpha, alphaData);
+            this.WriteWebpHeaders(stream, size0, vp8Size, riffSize, isVp8X, width, height, exifProfile, xmpProfile, hasAlpha, alphaData, alphaDataIsCompressed);
             bitWriterPartZero.WriteToStream(stream);
 
             // Write the encoded image to the stream.
@@ -660,7 +661,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             ExifProfile exifProfile,
             XmpProfile xmpProfile,
             bool hasAlpha,
-            byte[] alphaData)
+            byte[] alphaData,
+            bool alphaDataIsCompressed)
         {
             this.WriteRiffHeader(stream, riffSize);
 
@@ -670,7 +672,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
                 this.WriteVp8XHeader(stream, exifProfile, xmpProfile, width, height, hasAlpha);
                 if (hasAlpha)
                 {
-                    this.WriteAlphaChunk(stream, alphaData);
+                    this.WriteAlphaChunk(stream, alphaData, alphaDataIsCompressed);
                 }
             }
 
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index e9dce913a3..797d0794f9 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -228,7 +228,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         public Vp8LHashChain HashChain { get; }
 
         /// <summary>
-        /// Encodes the image to the specified stream from the <see cref="Image{TPixel}"/>.
+        /// Encodes the image as lossless webp to the specified stream.
         /// </summary>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="image">The <see cref="Image{TPixel}"/> to encode from.</param>
@@ -236,10 +236,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         public void Encode<TPixel>(Image<TPixel> image, Stream stream)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            image.Metadata.SyncProfiles();
             int width = image.Width;
             int height = image.Height;
 
+            ImageMetadata metadata = image.Metadata;
+            metadata.SyncProfiles();
+
             // Convert image pixels to bgra array.
             bool hasAlpha = this.ConvertPixelsToBgra(image, width, height);
 
@@ -253,11 +255,32 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
             this.EncodeStream(image);
 
             // Write bytes from the bitwriter buffer to the stream.
-            ImageMetadata metadata = image.Metadata;
-            metadata.SyncProfiles();
             this.bitWriter.WriteEncodedImageToStream(stream, metadata.ExifProfile, metadata.XmpProfile, (uint)width, (uint)height, hasAlpha);
         }
 
+        /// <summary>
+        /// Encodes the alpha image data using the webp lossless compression.
+        /// </summary>
+        /// <typeparam name="TPixel">The type of the pixel.</typeparam>
+        /// <param name="image">The <see cref="Image{TPixel}"/> to encode from.</param>
+        /// <returns>The encoded alpha stream.</returns>
+        public byte[] EncodeAlphaImageData<TPixel>(Image<TPixel> image)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            int width = image.Width;
+            int height = image.Height;
+
+            // Convert image pixels to bgra array.
+            this.ConvertPixelsToBgra(image, width, height);
+
+            // The image-stream does NOT contain any headers describing the image dimension, the dimension is already known.
+            this.EncodeStream(image);
+            this.bitWriter.Finish();
+            using var ms = new MemoryStream();
+            this.bitWriter.WriteToStream(ms);
+            return ms.ToArray();
+        }
+
         /// <summary>
         /// Writes the image size to the bitwriter buffer.
         /// </summary>
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 48af53960c..d8bd8f759c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -71,10 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         /// </summary>
         private int uvAlpha;
 
-        /// <summary>
-        /// Scratch buffer to reduce allocations.
-        /// </summary>
-        private readonly int[] scratch = new int[16];
+        private readonly bool alphaCompression;
 
         private readonly byte[] averageBytesPerMb = { 50, 24, 16, 9, 7, 5, 3, 2 };
 
@@ -105,6 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         /// <param name="entropyPasses">Number of entropy-analysis passes (in [1..10]).</param>
         /// <param name="filterStrength">The filter the strength of the deblocking filter, between 0 (no filtering) and 100 (maximum filtering).</param>
         /// <param name="spatialNoiseShaping">The spatial noise shaping. 0=off, 100=maximum.</param>
+        /// <param name="alphaCompression">If true, the alpha channel will be compressed with the lossless compression.</param>
         public Vp8Encoder(
             MemoryAllocator memoryAllocator,
             Configuration configuration,
@@ -114,7 +112,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             WebpEncodingMethod method,
             int entropyPasses,
             int filterStrength,
-            int spatialNoiseShaping)
+            int spatialNoiseShaping,
+            bool alphaCompression)
         {
             this.memoryAllocator = memoryAllocator;
             this.configuration = configuration;
@@ -125,6 +124,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.entropyPasses = Numerics.Clamp(entropyPasses, 1, 10);
             this.filterStrength = Numerics.Clamp(filterStrength, 0, 100);
             this.spatialNoiseShaping = Numerics.Clamp(spatialNoiseShaping, 0, 100);
+            this.alphaCompression = alphaCompression;
             this.rdOptLevel = method is WebpEncodingMethod.BestQuality ? Vp8RdLevel.RdOptTrellisAll
                 : method >= WebpEncodingMethod.Level5 ? Vp8RdLevel.RdOptTrellis
                 : method >= WebpEncodingMethod.Level3 ? Vp8RdLevel.RdOptBasic
@@ -327,7 +327,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             if (hasAlpha)
             {
                 // TODO: This can potentially run in an separate task.
-                alphaData = AlphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator);
+                alphaData = AlphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator, this.alphaCompression);
             }
 
             // Stats-collection loop.
@@ -363,7 +363,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             // Write bytes from the bitwriter buffer to the stream.
             ImageMetadata metadata = image.Metadata;
             metadata.SyncProfiles();
-            this.bitWriter.WriteEncodedImageToStream(stream, metadata.ExifProfile, metadata.XmpProfile, (uint)width, (uint)height, hasAlpha, alphaData);
+            this.bitWriter.WriteEncodedImageToStream(stream, metadata.ExifProfile, metadata.XmpProfile, (uint)width, (uint)height, hasAlpha, alphaData, this.alphaCompression);
         }
 
         /// <inheritdoc/>
diff --git a/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs b/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
index 195fa62bdc..deed08b729 100644
--- a/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
+++ b/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
@@ -22,7 +22,6 @@ namespace SixLabors.ImageSharp.Formats.Webp
         private readonly MemoryAllocator memoryAllocator;
 
         /// <summary>
-        /// TODO: not used at the moment.
         /// Indicating whether the alpha plane should be compressed with Webp lossless format.
         /// </summary>
         private readonly bool alphaCompression;
@@ -100,7 +99,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
         }
 
         /// <summary>
-        /// Encodes the image to the specified stream from the <see cref="ImageFrame{TPixel}"/>.
+        /// Encodes the image as webp to the specified stream.
         /// </summary>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <param name="image">The <see cref="ImageFrame{TPixel}"/> to encode from.</param>
@@ -149,7 +148,8 @@ namespace SixLabors.ImageSharp.Formats.Webp
                     this.method,
                     this.entropyPasses,
                     this.filterStrength,
-                    this.spatialNoiseShaping);
+                    this.spatialNoiseShaping,
+                    this.alphaCompression);
                 enc.Encode(image, stream);
             }
         }

From 8b8993dadc37a8a7970d4bcf26b1f043b860df60 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 1 Feb 2022 11:19:40 +0100
Subject: [PATCH 08/14] Add encode lossy webp with alpha tests

---
 .../Formats/WebP/WebpEncoderTests.cs          | 40 +++++++++++++------
 tests/ImageSharp.Tests/TestImages.cs          |  1 +
 tests/Images/Input/Png/transparency.png       |  3 ++
 3 files changed, 32 insertions(+), 12 deletions(-)
 create mode 100644 tests/Images/Input/Png/transparency.png

diff --git a/tests/ImageSharp.Tests/Formats/WebP/WebpEncoderTests.cs b/tests/ImageSharp.Tests/Formats/WebP/WebpEncoderTests.cs
index 7043549b22..7c74429edc 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/WebpEncoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/WebpEncoderTests.cs
@@ -167,18 +167,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
             image.VerifyEncoder(provider, "webp", testOutputDetails, encoder);
         }
 
-        [Theory]
-        [WithFile(TestPatternOpaque, PixelTypes.Rgba32)]
-        [WithFile(TestPatternOpaqueSmall, PixelTypes.Rgba32)]
-        public void Encode_Lossless_WorksWithTestPattern<TPixel>(TestImageProvider<TPixel> provider)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            using Image<TPixel> image = provider.GetImage();
-
-            var encoder = new WebpEncoder() { FileFormat = WebpFileFormatType.Lossless };
-            image.VerifyEncoder(provider, "webp", string.Empty, encoder);
-        }
-
         [Fact]
         public void Encode_Lossless_OneByOnePixel_Works()
         {
@@ -279,6 +267,34 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
             image.VerifyEncoder(provider, "webp", testOutputDetails, encoder, customComparer: GetComparer(quality));
         }
 
+        [Theory]
+        [WithFile(TestImages.Png.Transparency, PixelTypes.Rgba32, false)]
+        [WithFile(TestImages.Png.Transparency, PixelTypes.Rgba32, true)]
+        public void Encode_Lossy_WithAlpha_Works<TPixel>(TestImageProvider<TPixel> provider, bool compressed)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            var encoder = new WebpEncoder()
+            {
+                FileFormat = WebpFileFormatType.Lossy,
+                UseAlphaCompression = compressed
+            };
+
+            using Image<TPixel> image = provider.GetImage();
+            image.VerifyEncoder(provider, "webp", $"with_alpha_compressed_{compressed}", encoder, ImageComparer.Tolerant(0.04f));
+        }
+
+        [Theory]
+        [WithFile(TestPatternOpaque, PixelTypes.Rgba32)]
+        [WithFile(TestPatternOpaqueSmall, PixelTypes.Rgba32)]
+        public void Encode_Lossless_WorksWithTestPattern<TPixel>(TestImageProvider<TPixel> provider)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            using Image<TPixel> image = provider.GetImage();
+
+            var encoder = new WebpEncoder() { FileFormat = WebpFileFormatType.Lossless };
+            image.VerifyEncoder(provider, "webp", string.Empty, encoder);
+        }
+
         [Theory]
         [WithFile(TestPatternOpaque, PixelTypes.Rgba32)]
         [WithFile(TestPatternOpaqueSmall, PixelTypes.Rgba32)]
diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs
index bce22799da..a73d262433 100644
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@@ -15,6 +15,7 @@ namespace SixLabors.ImageSharp.Tests
     {
         public static class Png
         {
+            public const string Transparency = "Png/transparency.png";
             public const string P1 = "Png/pl.png";
             public const string Pd = "Png/pd.png";
             public const string Blur = "Png/blur.png";
diff --git a/tests/Images/Input/Png/transparency.png b/tests/Images/Input/Png/transparency.png
new file mode 100644
index 0000000000..26de0f2d1a
--- /dev/null
+++ b/tests/Images/Input/Png/transparency.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:843bea4db378f52935e2f19f60d289df8ebe20ddde3977c63225f1d58a10bd62
+size 48119

From d398ae744555d87cf71da22a02ed8fb50127ff00 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 1 Feb 2022 11:25:03 +0100
Subject: [PATCH 09/14] Default alpha compression to true

---
 src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs | 1 +
 src/ImageSharp/Formats/Webp/WebpEncoder.cs         | 2 +-
 src/ImageSharp/Formats/Webp/WebpEncoderCore.cs     | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs b/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
index d119d3031f..57ec32753d 100644
--- a/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
+++ b/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
@@ -31,6 +31,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
 
         /// <summary>
         /// Gets a value indicating whether the alpha plane should be compressed with Webp lossless format.
+        /// Defaults to true.
         /// </summary>
         bool UseAlphaCompression { get; }
 
diff --git a/src/ImageSharp/Formats/Webp/WebpEncoder.cs b/src/ImageSharp/Formats/Webp/WebpEncoder.cs
index bdcbb194b1..d0b60d18cd 100644
--- a/src/ImageSharp/Formats/Webp/WebpEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/WebpEncoder.cs
@@ -24,7 +24,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
         public WebpEncodingMethod Method { get; set; } = WebpEncodingMethod.Default;
 
         /// <inheritdoc/>
-        public bool UseAlphaCompression { get; set; }
+        public bool UseAlphaCompression { get; set; } = true;
 
         /// <inheritdoc/>
         public int EntropyPasses { get; set; } = 1;
diff --git a/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs b/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
index deed08b729..0fbff81fe4 100644
--- a/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
+++ b/src/ImageSharp/Formats/Webp/WebpEncoderCore.cs
@@ -23,6 +23,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
 
         /// <summary>
         /// Indicating whether the alpha plane should be compressed with Webp lossless format.
+        /// Defaults to true.
         /// </summary>
         private readonly bool alphaCompression;
 

From cf672b99aeaa80d734edfa0a7ca0e2e36b590526 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 1 Feb 2022 12:10:19 +0100
Subject: [PATCH 10/14] Use memory allocator for alpha data

---
 src/ImageSharp/Formats/Webp/AlphaEncoder.cs   | 26 ++++++++++++-------
 .../Formats/Webp/BitWriter/BitWriterBase.cs   | 12 ++++++---
 .../Formats/Webp/BitWriter/Vp8BitWriter.cs    | 12 +++++++--
 .../Formats/Webp/Lossless/Vp8LEncoder.cs      | 13 +++++-----
 .../Formats/Webp/Lossy/Vp8Encoder.cs          | 22 +++++++++++++---
 5 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/AlphaEncoder.cs b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
index 571da5bb24..38497281ff 100644
--- a/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
@@ -24,13 +24,15 @@ namespace SixLabors.ImageSharp.Formats.Webp
         /// <param name="configuration">The global configuration.</param>
         /// <param name="memoryAllocator">The memory manager.</param>
         /// <param name="compress">Indicates, if the data should be compressed with the lossless webp compression.</param>
-        /// <returns>The alpha data.</returns>
-        public static byte[] EncodeAlpha<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator, bool compress)
+        /// <param name="size">The size in bytes of the alpha data.</param>
+        /// <returns>The encoded alpha data.</returns>
+        public static IMemoryOwner<byte> EncodeAlpha<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator, bool compress, out int size)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            byte[] alphaData = ExtractAlphaChannel(image, configuration, memoryAllocator);
             int width = image.Width;
             int height = image.Height;
+            IMemoryOwner<byte> alphaData = ExtractAlphaChannel(image, configuration, memoryAllocator);
+
             if (compress)
             {
                 WebpEncodingMethod effort = WebpEncodingMethod.Default;
@@ -49,11 +51,14 @@ namespace SixLabors.ImageSharp.Formats.Webp
                 // The transparency information will be stored in the green channel of the ARGB quadruplet.
                 // The green channel is allowed extra transformation steps in the specification -- unlike the other channels,
                 // that can improve compression.
-                using Image<Rgba32> alphaAsImage = DispatchAlphaToGreen(image, alphaData);
+                using Image<Rgba32> alphaAsImage = DispatchAlphaToGreen(image, alphaData.GetSpan());
+
+                size = lossLessEncoder.EncodeAlphaImageData(alphaAsImage, alphaData);
 
-                return lossLessEncoder.EncodeAlphaImageData(alphaAsImage);
+                return alphaData;
             }
 
+            size = width * height;
             return alphaData;
         }
 
@@ -64,7 +69,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
         /// <param name="image">The <see cref="ImageFrame{TPixel}"/> to encode from.</param>
         /// <param name="alphaData">A byte sequence of length width * height, containing all the 8-bit transparency values in scan order.</param>
         /// <returns>The transparency image.</returns>
-        private static Image<Rgba32> DispatchAlphaToGreen<TPixel>(Image<TPixel> image, byte[] alphaData)
+        private static Image<Rgba32> DispatchAlphaToGreen<TPixel>(Image<TPixel> image, Span<byte> alphaData)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             int width = image.Width;
@@ -75,7 +80,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
             {
                 Memory<Rgba32> rowBuffer = alphaAsImage.DangerousGetPixelRowMemory(y);
                 Span<Rgba32> pixelRow = rowBuffer.Span;
-                Span<byte> alphaRow = alphaData.AsSpan(y * width, width);
+                Span<byte> alphaRow = alphaData.Slice(y * width, width);
                 for (int x = 0; x < width; x++)
                 {
                     // Leave A/R/B channels zero'd.
@@ -94,13 +99,14 @@ namespace SixLabors.ImageSharp.Formats.Webp
         /// <param name="configuration">The global configuration.</param>
         /// <param name="memoryAllocator">The memory manager.</param>
         /// <returns>A byte sequence of length width * height, containing all the 8-bit transparency values in scan order.</returns>
-        private static byte[] ExtractAlphaChannel<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator)
+        private static IMemoryOwner<byte> ExtractAlphaChannel<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             Buffer2D<TPixel> imageBuffer = image.Frames.RootFrame.PixelBuffer;
             int height = image.Height;
             int width = image.Width;
-            byte[] alphaData = new byte[width * height];
+            IMemoryOwner<byte> alphaDataBuffer = memoryAllocator.Allocate<byte>(width * height);
+            Span<byte> alphaData = alphaDataBuffer.GetSpan();
 
             using IMemoryOwner<Rgba32> rowBuffer = memoryAllocator.Allocate<Rgba32>(width);
             Span<Rgba32> rgbaRow = rowBuffer.GetSpan();
@@ -116,7 +122,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
                 }
             }
 
-            return alphaData;
+            return alphaDataBuffer;
         }
     }
 }
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
index 84c9d3f133..fc1accfdee 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/BitWriterBase.cs
@@ -47,6 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// <param name="stream">The stream to write to.</param>
         public void WriteToStream(Stream stream) => stream.Write(this.Buffer.AsSpan(0, this.NumBytes()));
 
+        /// <summary>
+        /// Writes the encoded bytes of the image to the given buffer. Call Finish() before this.
+        /// </summary>
+        /// <param name="dest">The destination buffer.</param>
+        public void WriteToBuffer(Span<byte> dest) => this.Buffer.AsSpan(0, this.NumBytes()).CopyTo(dest);
+
         /// <summary>
         /// Resizes the buffer to write to.
         /// </summary>
@@ -108,7 +114,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// </summary>
         /// <param name="alphaBytes">The alpha chunk bytes.</param>
         /// <returns>The alpha data chunk size in bytes.</returns>
-        protected uint AlphaChunkSize(byte[] alphaBytes)
+        protected uint AlphaChunkSize(Span<byte> alphaBytes)
         {
             uint alphaSize = (uint)alphaBytes.Length + 1;
             uint alphaChunkSize = WebpConstants.ChunkHeaderSize + alphaSize + (alphaSize & 1);
@@ -147,10 +153,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// <param name="stream">The stream to write to.</param>
         /// <param name="dataBytes">The alpha channel data bytes.</param>
         /// <param name="alphaDataIsCompressed">Indicates, if the alpha channel data is compressed.</param>
-        protected void WriteAlphaChunk(Stream stream, byte[] dataBytes, bool alphaDataIsCompressed)
+        protected void WriteAlphaChunk(Stream stream, Span<byte> dataBytes, bool alphaDataIsCompressed)
         {
-            DebugGuard.NotNull(dataBytes, nameof(dataBytes));
-
             uint size = (uint)dataBytes.Length + 1;
             Span<byte> buf = this.scratchBuffer.AsSpan(0, 4);
             BinaryPrimitives.WriteUInt32BigEndian(buf, (uint)WebpChunkType.Alpha);
diff --git a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
index 577a87e6a1..fa6e09d875 100644
--- a/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
+++ b/src/ImageSharp/Formats/Webp/BitWriter/Vp8BitWriter.cs
@@ -411,7 +411,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
         /// <param name="hasAlpha">Flag indicating, if a alpha channel is present.</param>
         /// <param name="alphaData">The alpha channel data.</param>
         /// <param name="alphaDataIsCompressed">Indicates, if the alpha data is compressed.</param>
-        public void WriteEncodedImageToStream(Stream stream, ExifProfile exifProfile, XmpProfile xmpProfile, uint width, uint height, bool hasAlpha, byte[] alphaData, bool alphaDataIsCompressed)
+        public void WriteEncodedImageToStream(
+            Stream stream,
+            ExifProfile exifProfile,
+            XmpProfile xmpProfile,
+            uint width,
+            uint height,
+            bool hasAlpha,
+            Span<byte> alphaData,
+            bool alphaDataIsCompressed)
         {
             bool isVp8X = false;
             byte[] exifBytes = null;
@@ -661,7 +669,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.BitWriter
             ExifProfile exifProfile,
             XmpProfile xmpProfile,
             bool hasAlpha,
-            byte[] alphaData,
+            Span<byte> alphaData,
             bool alphaDataIsCompressed)
         {
             this.WriteRiffHeader(stream, riffSize);
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index 797d0794f9..ece9aefd0f 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -263,8 +263,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// </summary>
         /// <typeparam name="TPixel">The type of the pixel.</typeparam>
         /// <param name="image">The <see cref="Image{TPixel}"/> to encode from.</param>
-        /// <returns>The encoded alpha stream.</returns>
-        public byte[] EncodeAlphaImageData<TPixel>(Image<TPixel> image)
+        /// <param name="alphaData">The destination buffer to write the encoded alpha data to.</param>
+        /// <returns>The size of the data in bytes.</returns>
+        public int EncodeAlphaImageData<TPixel>(Image<TPixel> image, IMemoryOwner<byte> alphaData)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             int width = image.Width;
@@ -273,12 +274,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
             // Convert image pixels to bgra array.
             this.ConvertPixelsToBgra(image, width, height);
 
-            // The image-stream does NOT contain any headers describing the image dimension, the dimension is already known.
+            // The image-stream will NOT contain any headers describing the image dimension, the dimension is already known.
             this.EncodeStream(image);
             this.bitWriter.Finish();
-            using var ms = new MemoryStream();
-            this.bitWriter.WriteToStream(ms);
-            return ms.ToArray();
+            int size = this.bitWriter.NumBytes();
+            this.bitWriter.WriteToBuffer(alphaData.GetSpan());
+            return size;
         }
 
         /// <summary>
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index d8bd8f759c..4b7f3f5c88 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -241,6 +241,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         public int DqUvDc { get; private set; }
 
+        /// <summary>
+        /// Gets or sets the alpha data.
+        /// </summary>
+        private IMemoryOwner<byte> AlphaData { get; set; }
+
         /// <summary>
         /// Gets the luma component.
         /// </summary>
@@ -322,12 +327,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             int expectedSize = this.Mbw * this.Mbh * averageBytesPerMacroBlock;
             this.bitWriter = new Vp8BitWriter(expectedSize, this);
 
-            // Extract and encode alpha data, if present.
-            byte[] alphaData = null;
+            // Extract and encode alpha channel data, if present.
+            int alphaDataSize = 0;
             if (hasAlpha)
             {
                 // TODO: This can potentially run in an separate task.
-                alphaData = AlphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator, this.alphaCompression);
+                this.AlphaData = AlphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator, this.alphaCompression, out alphaDataSize);
             }
 
             // Stats-collection loop.
@@ -363,7 +368,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             // Write bytes from the bitwriter buffer to the stream.
             ImageMetadata metadata = image.Metadata;
             metadata.SyncProfiles();
-            this.bitWriter.WriteEncodedImageToStream(stream, metadata.ExifProfile, metadata.XmpProfile, (uint)width, (uint)height, hasAlpha, alphaData, this.alphaCompression);
+            this.bitWriter.WriteEncodedImageToStream(
+                stream,
+                metadata.ExifProfile,
+                metadata.XmpProfile,
+                (uint)width,
+                (uint)height,
+                hasAlpha,
+                hasAlpha ? this.AlphaData.GetSpan().Slice(0, alphaDataSize) : Span<byte>.Empty,
+                this.alphaCompression);
         }
 
         /// <inheritdoc/>
@@ -372,6 +385,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.Y.Dispose();
             this.U.Dispose();
             this.V.Dispose();
+            this.AlphaData?.Dispose();
         }
 
         /// <summary>

From b12ad7596e3e1b5c91d87332f167afe340f65226 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 1 Feb 2022 12:50:28 +0100
Subject: [PATCH 11/14] Leave alpha data uncompressed, if compression does not
 yield in smaller data

---
 src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs | 11 ++++++++++-
 src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs     |  9 ++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index ece9aefd0f..30d65562ae 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -264,12 +264,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// <typeparam name="TPixel">The type of the pixel.</typeparam>
         /// <param name="image">The <see cref="Image{TPixel}"/> to encode from.</param>
         /// <param name="alphaData">The destination buffer to write the encoded alpha data to.</param>
-        /// <returns>The size of the data in bytes.</returns>
+        /// <returns>The size of the compressed data in bytes.
+        /// If the returned size equals the pixel count, compression would not yield smaller data and the data is left uncompressed.
+        /// </returns>
         public int EncodeAlphaImageData<TPixel>(Image<TPixel> image, IMemoryOwner<byte> alphaData)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             int width = image.Width;
             int height = image.Height;
+            int pixelCount = width * height;
 
             // Convert image pixels to bgra array.
             this.ConvertPixelsToBgra(image, width, height);
@@ -278,6 +281,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
             this.EncodeStream(image);
             this.bitWriter.Finish();
             int size = this.bitWriter.NumBytes();
+            if (size >= pixelCount)
+            {
+                // Compression would not yield smaller data -> leave the data uncompressed.
+                return pixelCount;
+            }
+
             this.bitWriter.WriteToBuffer(alphaData.GetSpan());
             return size;
         }
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 4b7f3f5c88..60bdee362c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -302,6 +302,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         {
             int width = image.Width;
             int height = image.Height;
+            int pixelCount = width * height;
             Span<byte> y = this.Y.GetSpan();
             Span<byte> u = this.U.GetSpan();
             Span<byte> v = this.V.GetSpan();
@@ -329,10 +330,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
             // Extract and encode alpha channel data, if present.
             int alphaDataSize = 0;
+            bool alphaCompressionSucceeded = false;
             if (hasAlpha)
             {
                 // TODO: This can potentially run in an separate task.
                 this.AlphaData = AlphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator, this.alphaCompression, out alphaDataSize);
+                if (alphaDataSize < pixelCount)
+                {
+                    // Only use compressed data, if the compressed data is actually smaller then the uncompressed data.
+                    alphaCompressionSucceeded = true;
+                }
             }
 
             // Stats-collection loop.
@@ -376,7 +383,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 (uint)height,
                 hasAlpha,
                 hasAlpha ? this.AlphaData.GetSpan().Slice(0, alphaDataSize) : Span<byte>.Empty,
-                this.alphaCompression);
+                this.alphaCompression && alphaCompressionSucceeded);
         }
 
         /// <inheritdoc/>

From 192cfb03f9a35282eb62f99aa42c3bae91869181 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 1 Feb 2022 13:33:04 +0100
Subject: [PATCH 12/14] Move disposing the alpha data to the AlphaEncoder

---
 src/ImageSharp/Formats/Webp/AlphaEncoder.cs   | 19 ++++++++++++-------
 .../Formats/Webp/Lossy/Vp8Encoder.cs          | 13 +++++--------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/AlphaEncoder.cs b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
index 38497281ff..1019073d87 100644
--- a/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/AlphaEncoder.cs
@@ -13,8 +13,10 @@ namespace SixLabors.ImageSharp.Formats.Webp
     /// <summary>
     /// Methods for encoding the alpha data of a VP8 image.
     /// </summary>
-    internal static class AlphaEncoder
+    internal class AlphaEncoder : IDisposable
     {
+        private IMemoryOwner<byte> alphaData;
+
         /// <summary>
         /// Encodes the alpha channel data.
         /// Data is either compressed as lossless webp image or uncompressed.
@@ -26,12 +28,12 @@ namespace SixLabors.ImageSharp.Formats.Webp
         /// <param name="compress">Indicates, if the data should be compressed with the lossless webp compression.</param>
         /// <param name="size">The size in bytes of the alpha data.</param>
         /// <returns>The encoded alpha data.</returns>
-        public static IMemoryOwner<byte> EncodeAlpha<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator, bool compress, out int size)
+        public IMemoryOwner<byte> EncodeAlpha<TPixel>(Image<TPixel> image, Configuration configuration, MemoryAllocator memoryAllocator, bool compress, out int size)
             where TPixel : unmanaged, IPixel<TPixel>
         {
             int width = image.Width;
             int height = image.Height;
-            IMemoryOwner<byte> alphaData = ExtractAlphaChannel(image, configuration, memoryAllocator);
+            this.alphaData = ExtractAlphaChannel(image, configuration, memoryAllocator);
 
             if (compress)
             {
@@ -51,15 +53,15 @@ namespace SixLabors.ImageSharp.Formats.Webp
                 // The transparency information will be stored in the green channel of the ARGB quadruplet.
                 // The green channel is allowed extra transformation steps in the specification -- unlike the other channels,
                 // that can improve compression.
-                using Image<Rgba32> alphaAsImage = DispatchAlphaToGreen(image, alphaData.GetSpan());
+                using Image<Rgba32> alphaAsImage = DispatchAlphaToGreen(image, this.alphaData.GetSpan());
 
-                size = lossLessEncoder.EncodeAlphaImageData(alphaAsImage, alphaData);
+                size = lossLessEncoder.EncodeAlphaImageData(alphaAsImage, this.alphaData);
 
-                return alphaData;
+                return this.alphaData;
             }
 
             size = width * height;
-            return alphaData;
+            return this.alphaData;
         }
 
         /// <summary>
@@ -124,5 +126,8 @@ namespace SixLabors.ImageSharp.Formats.Webp
 
             return alphaDataBuffer;
         }
+
+        /// <inheritdoc/>
+        public void Dispose() => this.alphaData?.Dispose();
     }
 }
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 60bdee362c..927b04c0cf 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -241,11 +241,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         public int DqUvDc { get; private set; }
 
-        /// <summary>
-        /// Gets or sets the alpha data.
-        /// </summary>
-        private IMemoryOwner<byte> AlphaData { get; set; }
-
         /// <summary>
         /// Gets the luma component.
         /// </summary>
@@ -331,10 +326,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             // Extract and encode alpha channel data, if present.
             int alphaDataSize = 0;
             bool alphaCompressionSucceeded = false;
+            using var alphaEncoder = new AlphaEncoder();
+            Span<byte> alphaData = Span<byte>.Empty;
             if (hasAlpha)
             {
                 // TODO: This can potentially run in an separate task.
-                this.AlphaData = AlphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator, this.alphaCompression, out alphaDataSize);
+                IMemoryOwner<byte> encodedAlphaData = alphaEncoder.EncodeAlpha(image, this.configuration, this.memoryAllocator, this.alphaCompression, out alphaDataSize);
+                alphaData = encodedAlphaData.GetSpan();
                 if (alphaDataSize < pixelCount)
                 {
                     // Only use compressed data, if the compressed data is actually smaller then the uncompressed data.
@@ -382,7 +380,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 (uint)width,
                 (uint)height,
                 hasAlpha,
-                hasAlpha ? this.AlphaData.GetSpan().Slice(0, alphaDataSize) : Span<byte>.Empty,
+                alphaData,
                 this.alphaCompression && alphaCompressionSucceeded);
         }
 
@@ -392,7 +390,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.Y.Dispose();
             this.U.Dispose();
             this.V.Dispose();
-            this.AlphaData?.Dispose();
         }
 
         /// <summary>

From 2491b6ab626f4329a40b9363e4058d5857d8567d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 1 Feb 2022 16:16:35 +0100
Subject: [PATCH 13/14] Change AverageBytesPerMb to ReadOnlySpan<byte>

---
 src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 927b04c0cf..695359e5ea 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -73,8 +73,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         private readonly bool alphaCompression;
 
-        private readonly byte[] averageBytesPerMb = { 50, 24, 16, 9, 7, 5, 3, 2 };
-
         private const int NumMbSegments = 4;
 
         private const int MaxItersKMeans = 6;
@@ -174,6 +172,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.ResetBoundaryPredictions();
         }
 
+        // This uses C#'s optimization to refer to the static data segment of the assembly, no allocation occurs.
+        private static ReadOnlySpan<byte> AverageBytesPerMb => new byte[] { 50, 24, 16, 9, 7, 5, 3, 2 };
+
         public int BaseQuant { get; set; }
 
         /// <summary>
@@ -319,7 +320,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.SetLoopParams(this.quality);
 
             // Initialize the bitwriter.
-            int averageBytesPerMacroBlock = this.averageBytesPerMb[this.BaseQuant >> 4];
+            int averageBytesPerMacroBlock = AverageBytesPerMb[this.BaseQuant >> 4];
             int expectedSize = this.Mbw * this.Mbh * averageBytesPerMacroBlock;
             this.bitWriter = new Vp8BitWriter(expectedSize, this);
 

From 85cd83f3a7606de96b0d185c3497143644366758 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Wed, 2 Feb 2022 16:22:58 +0300
Subject: [PATCH 14/14] IDisposable JpegDecoderCore is now properly disposed in
 tests

---
 .../Codecs/Jpeg/DecodeJpegParseStreamOnly.cs                  | 3 +--
 tests/ImageSharp.Tests/Formats/Jpg/SpectralJpegTests.cs       | 4 ++--
 .../Formats/Jpg/SpectralToPixelConversionTests.cs             | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpegParseStreamOnly.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpegParseStreamOnly.cs
index 9db666c374..988c056608 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpegParseStreamOnly.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpegParseStreamOnly.cs
@@ -39,10 +39,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
             using var memoryStream = new MemoryStream(this.jpegBytes);
             using var bufferedStream = new BufferedReadStream(Configuration.Default, memoryStream);
 
-            var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder { IgnoreMetadata = true });
+            using var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder { IgnoreMetadata = true });
             var scanDecoder = new HuffmanScanDecoder(bufferedStream, new NoopSpectralConverter(), cancellationToken: default);
             decoder.ParseStream(bufferedStream, scanDecoder, cancellationToken: default);
-            decoder.Dispose();
         }
 
         // We want to test only stream parsing and scan decoding, we don't need to convert spectral data to actual pixels
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/SpectralJpegTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/SpectralJpegTests.cs
index 35113f14ff..3833b419c4 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/SpectralJpegTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/SpectralJpegTests.cs
@@ -50,7 +50,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             // Calculating data from ImageSharp
             byte[] sourceBytes = TestFile.Create(provider.SourceFileOrDescription).Bytes;
 
-            var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder());
+            using var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder());
             using var ms = new MemoryStream(sourceBytes);
             using var bufferedStream = new BufferedReadStream(Configuration.Default, ms);
 
@@ -79,7 +79,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
             // Calculating data from ImageSharp
             byte[] sourceBytes = TestFile.Create(provider.SourceFileOrDescription).Bytes;
 
-            var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder());
+            using var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder());
             using var ms = new MemoryStream(sourceBytes);
             using var bufferedStream = new BufferedReadStream(Configuration.Default, ms);
 
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/SpectralToPixelConversionTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/SpectralToPixelConversionTests.cs
index 0071c623c6..27240831c3 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/SpectralToPixelConversionTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/SpectralToPixelConversionTests.cs
@@ -45,7 +45,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             // Decoding
             using var converter = new SpectralConverter<TPixel>(Configuration.Default);
-            var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder());
+            using var decoder = new JpegDecoderCore(Configuration.Default, new JpegDecoder());
             var scanDecoder = new HuffmanScanDecoder(bufferedStream, converter, cancellationToken: default);
             decoder.ParseStream(bufferedStream, scanDecoder, cancellationToken: default);