@@ -3,6 +3,7 @@
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@@ -18,120 +19,138 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
#pragma warning restore SA1309
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
/// Gets shuffle vectors for <see cref="ApplyTransposing ZigZagOrderingSsse3"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
// row0
0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
_, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
// row1
_, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
_, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
// row2
_, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
_, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
// row3
_, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
_, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
_, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
// row4
_, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
_, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
_, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
// row5
_, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
// row6
_, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
_, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
// row7
10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
_, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
#pragma warning disable SA1515
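// Each 16-byte group below is one shuffle vector for Ssse3.Shuffle (pshufb)
// over a single source row; the values are byte offsets of the 16-bit
// coefficients within that row. `_` is assumed to be the zero-fill constant
// declared above this hunk (a value with the high bit set, e.g. 0xFF), which
// makes pshufb write zero so the per-row partial results can simply be OR-ed.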
/* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
// A
0, 1, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
// B
_, _, 0, 1, _, _, _, _, 2, 3, _, _, _, _, _, _,
// C
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3,
/* row1 - B2 A3 A4 B3 C2 D1 E0 F0 */
// A
_, _, 6, 7, 8, 9, _, _, _, _, _, _, _, _, _, _,
// B
4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _, _, _,
/* row2 - E1 D2 C3 B4 A5 A6 B5 C4 */
// A
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, _, _,
// B
_, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11, _, _,
// C
_, _, _, _, 6, 7, _, _, _, _, _, _, _, _, 8, 9,
/* row3 - D3 E2 F1 G0 H0 G1 F2 E3 */
// E
_, _, 4, 5, _, _, _, _, _, _, _, _, _, _, 6, 7,
// F
_, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, _, _,
// G
_, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _,
/* row4 - D4 C5 B6 A7 B7 C6 D5 E4 */
// B
_, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _,
// C
_, _, 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _,
// D
8, 9, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _,
/* row5 - F3 G2 H1 H2 G3 F4 E5 D6 */
// F
6, 7, _, _, _, _, _, _, _, _, 8, 9, _, _, _, _,
// G
_, _, 4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _,
// H
_, _, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
/* row6 - C7 D7 E6 F5 G4 H3 H4 G5 */
// G
_, _, _, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11,
// H
_, _, _, _, _, _, _, _, _, _, 6, 7, 8, 9, _, _,
/* row7 - F6 E7 F7 G6 H5 H6 G7 H7 */
// F
12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
// G
_, _, _, _, _, _, 12, 13, _, _, _, _, 14, 15, _, _,
// H
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15,
#pragma warning restore SA1515
};
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
/// Gets shuffle vectors for <see cref="ApplyTransposing ZigZagOrderingAvx2"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
{
// 01_AB/01_EF/23_CD - cross-lane
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,
// 01_AB - inner-lane
0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,
// 01_CD/23_GH - cross-lane
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,
// 01_CD - inner-lane
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,
// 01_EF - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
// 23_AB/45_CD/67_EF - cross-lane
3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
// 23_AB - inner-lane
4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,
// 23_CD - inner-lane
_, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,
// 23_EF - inner-lane
_, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 23_GH - inner-lane
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 45_AB - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,
// 45_CD - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,
// 45_EF - cross-lane
1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
// 45_EF - inner-lane
2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,
// 45_GH - inner-lane
_, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
// 67_CD - inner-lane
_, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 67_EF - inner-lane
_, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,
// 67_GH - inner-lane
8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
#pragma warning disable SA1515
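// Two-step shuffling: masks tagged [cr] hold little-endian 32-bit dword
// indices for the cross-lane Avx2.PermuteVar8x32 (vpermd) step, so `1, 0, 0, 0`
// selects dword 1; masks tagged (in) hold byte indices for the per-128-bit-lane
// Avx2.Shuffle (vpshufb) step, with `_` zeroing the byte. The crln_* names list
// the output row pairs that reuse the same cross-lane mask.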
/* 01 */
// [cr] crln_01_AB_CD
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) AB
0, 1, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 2, 3, 4, 5, 14, 15, _, _, _, _, _, _, _, _,
// (in) CD
_, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _, _, _, _, _, 0, 1, 10, 11, _, _, _, _,
// [cr] crln_01_23_EF_23_CD
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) EF
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9,
/* 23 */
// [cr] crln_23_AB_23_45_GH
2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) AB
_, _, _, _, _, _, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// (in) CD
_, _, 12, 13, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// (in) EF
2, 3, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 6, 7,
// (in) GH
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _,
/* 45 */
// (in) AB
_, _, _, _, 12, 13, 6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// [cr] crln_45_67_CD_45_EF
2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
// (in) CD
8, 9, 2, 3, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 12, 13,
// (in) EF
_, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 2, 3, _, _,
// (in) GH
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, 12, 13, 6, 7, _, _, _, _, _, _,
/* 67 */
// (in) CD
6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// [cr] crln_67_EF_67_GH
2, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
// (in) EF
_, _, _, _, 4, 5, 14, 15, _, _, _, _, _, _, _, _, 8, 9, 2, 3, 10, 11, _, _, _, _, _, _, _, _, _, _,
// (in) GH
_, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11,
#pragma warning restore SA1515
};
/// <summary>
/// Applies zig zag ordering for a given 8x8 matrix using SSE CPU intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
{
DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
fixed (byte* maskPtr = SseShuffleMasks)
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
{
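// Most coefficients of each output row are gathered by per-row pshufb
// shuffles OR-ed together; the few that would otherwise need an extra
// shuffle vector are patched in via Sse2.Extract/Sse2.Insert (pextrw/pinsrw).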
Vector128<byte> rowA = block.V0.AsByte();
Vector128<byte> rowB = block.V1.AsByte();
@@ -142,73 +161,69 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector128<byte> rowG = block.V6.AsByte();
Vector128<byte> rowH = block.V7.AsByte();
// row0 - A0 A1 B0 C0 B1 A2 A3 B2
Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
row0 = Sse2.Or(row0, rowC0);
// row1 - C1 D0 E0 D1 C2 B3 A4 A5
Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
row1 = Sse2.Or(row1, rowD1);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();
// row2
Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();
// row3
Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16();
Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16();
Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
row3 = Sse2.Or(row3, rowC3);
Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
row3 = Sse2.Or(row3, rowD3);
// row4
Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
row4 = Sse2.Or(row4, rowG4);
Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
row4 = Sse2.Or(row4, rowH4);
// row5
Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();
// row6
Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
row6 = Sse2.Or(row6, rowH6);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();
// row7
Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();
// row0 - A0 B0 A1 A2 B1 C0 D0 C1
Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
// row1 - B2 A3 A4 B3 C2 D1 E0 F0
Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
// row2 - E1 D2 C3 B4 A5 A6 B5 C4
Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
// row3 - D3 E2 F1 G0 H0 G1 F2 E3
Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
// row4 - D4 C5 B6 A7 B7 C6 D5 E4
Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
// row5 - F3 G2 H1 H2 G3 F4 E5 D6
Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
// row6 - C7 D7 E6 F5 G4 H3 H4 G5
Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
// row7 - F6 E7 F7 G6 H5 H6 G7 H7
Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
block.V0 = row0;
block.V1 = row1;
@@ -225,69 +240,61 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// Applies zig zag ordering for a given 8x8 matrix using AVX CPU intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(AvxShuffleMasks))
{
Vector256<byte> rowsAB = block.V01.AsByte();
Vector256<byte> rowsCD = block.V23.AsByte();
Vector256<byte> rowsEF = block.V45.AsByte();
Vector256<byte> rowsGH = block.V67.AsByte();
// rows 0 1
Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> rowAB = block.V01.AsByte();
Vector256<byte> rowCD = block.V23.AsByte();
Vector256<byte> rowEF = block.V45.AsByte();
Vector256<byte> rowGH = block.V67.AsByte();
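// Each pair of output rows is assembled from up to four source row pairs:
// vpermd first moves the required dwords across the 128-bit lanes, vpshufb
// then places the bytes within each lane, and the partials are OR-ed together.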
/* row01 - A0 B0 A1 A2 B1 C0 D0 C1 | B2 A3 A4 B3 C2 D1 E0 F0 */
Vector256<int> crln_01_AB_CD = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_01_AB_CD).AsByte();
row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
// rows 2 3
Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_AB_CD).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (2 * 32))).AsByte();
Vector256<int> crln_01_23_EF_23_CD = Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)).AsInt32();
Vector256<byte> row01_23_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_01_23_EF_23_CD).AsByte();
Vector256<byte> row01_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256<byte> row01 = Avx2.Or(row01_AB, Avx2.Or(row01_CD, row01_EF));
/* row23 - E1 D2 C3 B4 A5 A6 B5 C4 | D3 E2 F1 G0 H0 G1 F2 E3 */
Vector256<int> crln_23_AB_23_45_GH = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
Vector256<byte> row23_45_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_23_AB_23_45_GH).AsByte();
Vector256<byte> row23_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_23_EF_23_CD).AsByte();
row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
Vector256<byte> row23_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
Vector256<byte> row23_45_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_23_AB_23_45_GH).AsByte();
Vector256<byte> row23_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32))).AsByte();
Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
// rows 4 5
Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
/* row45 - D4 C5 B6 A7 B7 C6 D5 E4 | F3 G2 H1 H2 G3 F4 E5 D6 */
Vector256<byte> row45_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32))).AsByte();
Vector256<int> crln_45_67_CD_45_EF = Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsInt32();
Vector256<byte> row45_67_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_45_67_CD_45_EF).AsByte();
Vector256<byte> row45_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (12 * 32))).AsByte();
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_45_67_CD_45_EF).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32))).AsByte();
Vector256<byte> row45_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32))).AsByte();
Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
// rows 6 7
Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
/* row67 - C7 D7 E6 F5 G4 H3 H4 G5 | F6 E7 F7 G6 H5 H6 G7 H7 */
Vector256<byte> row67_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32))).AsByte();
Vector256<int> crln_67_EF_67_GH = Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsInt32();
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_67_EF_67_GH).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32))).AsByte();
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_67_EF_67_GH).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (18 * 32))).AsByte();
Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
Vector256<byte> row67 = Avx2.Or(row67_CD, Avx2.Or(row67_EF, row67_GH));
block.V01 = row01.AsInt16();
block.V23 = row23.AsInt16();