From 98f5a428169e9e31b903ba3252bd00223945ab7f Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 25 Nov 2021 22:49:29 +0100
Subject: [PATCH 01/11] Add SSE2 version of FTransform

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 217 +++++++++++++++---
 1 file changed, 185 insertions(+), 32 deletions(-)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index aa4ab5767..143d9f17e 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     /// <summary>
     /// Methods for encoding a VP8 frame.
     /// </summary>
-    internal static class Vp8Encoding
+    internal static unsafe class Vp8Encoding
     {
         private const int KC1 = 20091 + (1 << 16);
 
@@ -382,43 +382,196 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch)
         {
-            int i;
-            Span<int> tmp = scratch.Slice(0, 16);
-
-            int srcIdx = 0;
-            int refIdx = 0;
-            for (i = 0; i < 4; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
             {
-                int d0 = src[srcIdx] - reference[refIdx];   // 9bit dynamic range ([-255,255])
-                int d1 = src[srcIdx + 1] - reference[refIdx + 1];
-                int d2 = src[srcIdx + 2] - reference[refIdx + 2];
-                int d3 = src[srcIdx + 3] - reference[refIdx + 3];
-                int a0 = d0 + d3;         // 10b                      [-510,510]
-                int a1 = d1 + d2;
-                int a2 = d1 - d2;
-                int a3 = d0 - d3;
-                tmp[0 + (i * 4)] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
-                tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9;      // [-7536,7542]
-                tmp[2 + (i * 4)] = (a0 - a1) * 8;
-                tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
-
-                srcIdx += WebpConstants.Bps;
-                refIdx += WebpConstants.Bps;
+#pragma warning disable SA1503 // Braces should not be omitted
+                fixed (byte* srcRef = src)
+                fixed (byte* referenceRef = reference)
+                {
+                    // Load src.
+                    Vector128<ulong> src0 = Sse2.LoadScalarVector128((ulong*)srcRef);
+                    Vector128<ulong> src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps));
+                    Vector128<ulong> src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2)));
+                    Vector128<ulong> src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3)));
+
+                    // Load ref.
+                    Vector128<ulong> ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef);
+                    Vector128<ulong> ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps));
+                    Vector128<ulong> ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2)));
+                    Vector128<ulong> ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3)));
+
+                    // 00 01 02 03 *
+                    // 10 11 12 13 *
+                    // 20 21 22 23 *
+                    // 30 31 32 33 *
+                    // Shuffle.
+                    Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
+                    Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
+                    Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
+                    Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
+
+                    // 00 01 10 11 02 03 12 13 * * ...
+                    // 20 21 30 31 22 23 32 33 * * ...
+
+                    // Convert both to 16 bit.
+                    Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
+
+                    // Compute the difference.
+                    Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
+                    Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
+
+                    // First pass
+                    FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
+
+                    // Second pass
+                    FTransformPass2SSE2(v01, v32, output);
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
             }
-
-            for (i = 0; i < 4; i++)
+            else
+#endif
             {
-                int a0 = tmp[0 + i] + tmp[12 + i];  // 15b
-                int a1 = tmp[4 + i] + tmp[8 + i];
-                int a2 = tmp[4 + i] - tmp[8 + i];
-                int a3 = tmp[0 + i] - tmp[12 + i];
-                output[0 + i] = (short)((a0 + a1 + 7) >> 4);            // 12b
-                output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
-                output[8 + i] = (short)((a0 - a1 + 7) >> 4);
-                output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
+                int i;
+                Span<int> tmp = scratch.Slice(0, 16);
+
+                int srcIdx = 0;
+                int refIdx = 0;
+                for (i = 0; i < 4; i++)
+                {
+                    int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255])
+                    int d1 = src[srcIdx + 1] - reference[refIdx + 1];
+                    int d2 = src[srcIdx + 2] - reference[refIdx + 2];
+                    int d3 = src[srcIdx + 3] - reference[refIdx + 3];
+                    int a0 = d0 + d3; // 10b                      [-510,510]
+                    int a1 = d1 + d2;
+                    int a2 = d1 - d2;
+                    int a3 = d0 - d3;
+                    tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b                      [-8160,8160]
+                    tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
+                    tmp[2 + (i * 4)] = (a0 - a1) * 8;
+                    tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
+
+                    srcIdx += WebpConstants.Bps;
+                    refIdx += WebpConstants.Bps;
+                }
+
+                for (i = 0; i < 4; i++)
+                {
+                    int a0 = tmp[0 + i] + tmp[12 + i]; // 15b
+                    int a1 = tmp[4 + i] + tmp[8 + i];
+                    int a2 = tmp[4 + i] - tmp[8 + i];
+                    int a3 = tmp[0 + i] - tmp[12 + i];
+                    output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
+                    output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
+                    output[8 + i] = (short)((a0 - a1 + 7) >> 4);
+                    output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
+                }
             }
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        public static void FTransformPass1SSE2(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
+        {
+            var k937 = Vector128.Create(937);
+            var k1812 = Vector128.Create(1812);
+            Vector128<short> k88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16();
+            Vector128<short> k88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16();
+            Vector128<short> k5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16();
+            Vector128<short> k5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16();
+
+            // *in01 = 00 01 10 11 02 03 12 13
+            // *in23 = 20 21 30 31 22 23 32 33
+            Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1));
+            Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1));
+
+            // 00 01 10 11 03 02 13 12
+            // 20 21 30 31 23 22 33 32
+            Vector128<long> s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+            Vector128<long> s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+
+            // 00 01 10 11 20 21 30 31
+            // 03 02 13 12 23 22 33 32
+            Vector128<short> a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16());
+            Vector128<short> a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16());
+
+            // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+            // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+            Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, k88p); // [ (a0 + a1) << 3, ... ]
+            Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, k88m); // [ (a0 - a1) << 3, ... ]
+            Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, k5352_2217p);
+            Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, k5352_2217m);
+            Vector128<int> tmp12 = Sse2.Add(tmp11, k1812);
+            Vector128<int> tmp32 = Sse2.Add(tmp31, k937);
+            Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
+            Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
+            Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
+            Vector128<short> s12 = Sse2.PackSignedSaturate(tmp1, tmp3);
+            Vector128<short> slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1...
+            Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
+            Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
+            out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
+            out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2));
+        }
+
+        public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output)
+        {
+            var seven = Vector128.Create((short)7);
+            Vector128<short> k5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16();
+            Vector128<short> k2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16();
+            var k12000PlusOne = Vector128.Create(12000 + (1 << 16));
+            var k51000 = Vector128.Create(51000);
+
+            // Same operations are done on the (0,3) and (1,2) pairs.
+            // a3 = v0 - v3
+            // a2 = v1 - v2
+            Vector128<short> a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16());
+            Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
+
+            Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
+            Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, k5352_2217);
+            Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, k2217_5352);
+            Vector128<int> d1 = Sse2.Add(c1, k12000PlusOne);
+            Vector128<int> d3 = Sse2.Add(c3, k51000);
+            Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16);
+            Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16);
+
+            // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+            // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+            Vector128<short> f1 = Sse2.PackSignedSaturate(e1, e1);
+            Vector128<short> f3 = Sse2.PackSignedSaturate(e3, e3);
+
+            // g1 = f1 + (a3 != 0);
+            // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+            // desired (0, 1), we add one earlier through k12000_plus_one.
+            // -> g1 = f1 + 1 - (a3 == 0)
+            Vector128<short> g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128<short>.Zero));
+
+            // a0 = v0 + v3
+            // a1 = v1 + v2
+            Vector128<int> a01 = Sse2.Add(v01, v32);
+            Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), seven);
+            Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
+            Vector128<short> c0 = Sse2.Add(a01Plus7, a11);
+            Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11);
+
+            // d0 = (a0 + a1 + 7) >> 4;
+            // d2 = (a0 - a1 + 7) >> 4;
+            Vector128<short> d0 = Sse2.ShiftRightArithmetic(c0, 4);
+            Vector128<short> d2 = Sse2.ShiftRightArithmetic(c2, 4);
+
+            Vector128<long> d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64());
+            Vector128<long> d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64());
+
+            ref short outputRef = ref MemoryMarshal.GetReference(output);
+            Unsafe.As<short, Vector128<short>>(ref outputRef) = d0g1.AsInt16();
+            Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref outputRef, 8)) = d2f3.AsInt16();
+        }
+#endif
+
         public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
         {
             Span<int> tmp = scratch.Slice(0, 16);

From 4bb56eea71a6e3e909d3fcd2255f633a1007c643 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 26 Nov 2021 12:50:55 +0100
Subject: [PATCH 02/11] Define mask and shuffle vectors as static readonly

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 75 +++++++++++--------
 1 file changed, 45 insertions(+), 30 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 143d9f17e..a3a9c924c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -66,11 +66,39 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 };
 
 #if SUPPORTS_RUNTIME_INTRINSICS
-        public static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16();
+#pragma warning disable SA1310 // Field names should not contain underscore
+        private static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16();
 
-        public static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16();
+        private static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16();
 
-        public static readonly Vector128<short> Four = Vector128.Create((short)4);
+        private static readonly Vector128<short> Four = Vector128.Create((short)4);
+
+        private static readonly Vector128<short> Seven = Vector128.Create((short)7);
+
+        private static readonly Vector128<short> K88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16();
+
+        private static readonly Vector128<short> K88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16();
+
+        private static readonly Vector128<short> K5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16();
+
+        private static readonly Vector128<short> K5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16();
+
+        private static readonly Vector128<int> K937 = Vector128.Create(937);
+
+        private static readonly Vector128<int> K1812 = Vector128.Create(1812);
+
+        private static readonly Vector128<short> K5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16();
+
+        private static readonly Vector128<short> K2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16();
+
+        private static readonly Vector128<int> K12000PlusOne = Vector128.Create(12000 + (1 << 16));
+
+        private static readonly Vector128<int> K51000 = Vector128.Create(51000);
+
+        private static readonly byte MmShuffle2301 = SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1);
+
+        private static readonly byte MmShuffle1032 = SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2);
+#pragma warning restore SA1310 // Field names should not contain underscore
 #endif
 
         static Vp8Encoding()
@@ -476,17 +504,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 #if SUPPORTS_RUNTIME_INTRINSICS
         public static void FTransformPass1SSE2(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
         {
-            var k937 = Vector128.Create(937);
-            var k1812 = Vector128.Create(1812);
-            Vector128<short> k88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16();
-            Vector128<short> k88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16();
-            Vector128<short> k5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16();
-            Vector128<short> k5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16();
-
             // *in01 = 00 01 10 11 02 03 12 13
             // *in23 = 20 21 30 31 22 23 32 33
-            Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1));
-            Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1));
+            Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, MmShuffle2301);
+            Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, MmShuffle2301);
 
             // 00 01 10 11 03 02 13 12
             // 20 21 30 31 23 22 33 32
@@ -500,12 +521,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
             // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
             // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
-            Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, k88p); // [ (a0 + a1) << 3, ... ]
-            Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, k88m); // [ (a0 - a1) << 3, ... ]
-            Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, k5352_2217p);
-            Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, k5352_2217m);
-            Vector128<int> tmp12 = Sse2.Add(tmp11, k1812);
-            Vector128<int> tmp32 = Sse2.Add(tmp31, k937);
+            Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, K88p); // [ (a0 + a1) << 3, ... ]
+            Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, K88m); // [ (a0 - a1) << 3, ... ]
+            Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, K5352_2217p);
+            Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, K5352_2217m);
+            Vector128<int> tmp12 = Sse2.Add(tmp11, K1812);
+            Vector128<int> tmp32 = Sse2.Add(tmp31, K937);
             Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
             Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
             Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
@@ -514,17 +535,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
             Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
             out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
-            out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2));
+            out32 = Sse2.Shuffle(v23, MmShuffle1032);
         }
 
         public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output)
         {
-            var seven = Vector128.Create((short)7);
-            Vector128<short> k5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16();
-            Vector128<short> k2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16();
-            var k12000PlusOne = Vector128.Create(12000 + (1 << 16));
-            var k51000 = Vector128.Create(51000);
-
             // Same operations are done on the (0,3) and (1,2) pairs.
             // a3 = v0 - v3
             // a2 = v1 - v2
@@ -532,10 +547,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
 
             Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
-            Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, k5352_2217);
-            Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, k2217_5352);
-            Vector128<int> d1 = Sse2.Add(c1, k12000PlusOne);
-            Vector128<int> d3 = Sse2.Add(c3, k51000);
+            Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, K5352_2217);
+            Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, K2217_5352);
+            Vector128<int> d1 = Sse2.Add(c1, K12000PlusOne);
+            Vector128<int> d3 = Sse2.Add(c3, K51000);
             Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16);
             Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16);
 
@@ -553,7 +568,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             // a0 = v0 + v3
             // a1 = v1 + v2
             Vector128<int> a01 = Sse2.Add(v01, v32);
-            Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), seven);
+            Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Seven);
             Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
             Vector128<short> c0 = Sse2.Add(a01Plus7, a11);
             Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11);

From 38fd3a84582afa8e405316cc769c8832f44a35cb Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 26 Nov 2021 12:51:25 +0100
Subject: [PATCH 03/11] Avoid bounds checks in IsFlat

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 2fcea8cee..f3b0e8e3d 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -744,19 +744,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         private static bool IsFlat(Span<short> levels, int numBlocks, int thresh)
         {
             int score = 0;
+            ref short levelsRef = ref MemoryMarshal.GetReference(levels);
+            int offset = 0;
             while (numBlocks-- > 0)
             {
                 for (int i = 1; i < 16; i++)
                 {
                     // omit DC, we're only interested in AC
-                    score += levels[i] != 0 ? 1 : 0;
+                    score += Unsafe.Add(ref levelsRef, offset + i) != 0 ? 1 : 0;
                     if (score > thresh)
                     {
                         return false;
                     }
                 }
 
-                levels = levels.Slice(16);
+                offset += 16;
             }
 
             return true;

From 798e9c3ad6e77e3bda0770a16e2e283c7bc45ff1 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 26 Nov 2021 14:09:31 +0100
Subject: [PATCH 04/11] Add SSE2 version of FTransform2

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 64 ++++++++++++++++++-
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index a3a9c924c..f657d3252 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -404,8 +404,66 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
         {
-            FTransform(src, reference, output, scratch);
-            FTransform(src.Slice(4), reference.Slice(4), output2, scratch);
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+#pragma warning disable SA1503 // Braces should not be omitted
+                fixed (byte* srcRef = src)
+                fixed (byte* referenceRef = reference)
+                {
+                    // Load src.
+                    Vector128<ulong> src0 = Sse2.LoadScalarVector128((ulong*)srcRef);
+                    Vector128<ulong> src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps));
+                    Vector128<ulong> src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2)));
+                    Vector128<ulong> src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3)));
+
+                    // Load ref.
+                    Vector128<ulong> ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef);
+                    Vector128<ulong> ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps));
+                    Vector128<ulong> ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2)));
+                    Vector128<ulong> ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3)));
+
+                    // Convert both to 16 bit.
+                    Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
+                    Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
+
+                    // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
+                    Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
+                    Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
+                    Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
+                    Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
+
+                    // Unpack and shuffle.
+                    // 00 01 02 03   0 0 0 0
+                    // 10 11 12 13   0 0 0 0
+                    // 20 21 22 23   0 0 0 0
+                    // 30 31 32 33   0 0 0 0
+                    Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
+                    Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
+                    Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
+                    Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
+
+                    // First pass.
+                    FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
+                    FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
+
+                    // Second pass.
+                    FTransformPass2SSE2(v01l, v32l, output);
+                    FTransformPass2SSE2(v01h, v32h, output2);
+                }
+            }
+            else
+#endif
+            {
+                FTransform(src, reference, output, scratch);
+                FTransform(src.Slice(4), reference.Slice(4), output2, scratch);
+            }
         }
 
         public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch)
@@ -567,7 +625,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
             // a0 = v0 + v3
             // a1 = v1 + v2
-            Vector128<int> a01 = Sse2.Add(v01, v32);
+            Vector128<short> a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16());
             Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Seven);
             Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
             Vector128<short> c0 = Sse2.Add(a01Plus7, a11);

From 0880c586521f0d87616ae579df35b068c186ecad Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 26 Nov 2021 15:18:38 +0100
Subject: [PATCH 05/11] Add FTransform tests

---
 .../Formats/WebP/Vp8EncodingTests.cs          | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
index 6bcb4f21f..245e1cdc1 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
@@ -11,6 +11,57 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
     [Trait("Format", "Webp")]
     public class Vp8EncodingTests
     {
+        private static void RunFTransform2Test()
+        {
+            // arrange: 128 source bytes, a 128/129 reference pattern and one 16-coefficient buffer per transformed block
+            byte[] src = { 154, 154, 151, 151, 149, 148, 151, 157, 163, 163, 154, 132, 102, 98, 104, 108, 107, 104, 104, 103, 101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, 147, 147, 146, 159, 164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, 172, 172, 172, 168, 170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, 93, 90, 102, 107, 104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, 150, 149, 152, 151, 148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100, 102, 102, 121, 117, 170, 170, 169, 171, 171, 179, 173, 175 };
+            byte[] reference = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129 };
+            short[] actualOutput1 = new short[16];
+            short[] actualOutput2 = new short[16];
+            short[] expectedOutput1 = { 182, 4, 1, 1, 6, 7, -1, -4, 5, 0, -2, 1, 2, 1, 1, 1 };
+            short[] expectedOutput2 = { 192, -34, 10, 1, -11, 8, 10, -7, 6, 3, -8, 4, 5, -3, -2, 6 };
+
+            // act: the first output receives the block at offset 0, the second the block 4 bytes further in
+            Vp8Encoding.FTransform2(src, reference, actualOutput1, actualOutput2, new int[16]);
+
+            // assert: compare the coefficient arrays directly so a failure reports the mismatch instead of a bare "False"
+            Assert.Equal(expectedOutput1, actualOutput1);
+            Assert.Equal(expectedOutput2, actualOutput2);
+        }
+
+        private static void RunFTransformTest()
+        {
+            // arrange: same 128-byte source and 128/129 reference pattern as the FTransform2 test
+            byte[] src =
+            {
+                154, 154, 151, 151, 149, 148, 151, 157, 163, 163, 154, 132, 102, 98, 104, 108, 107, 104, 104, 103,
+                101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, 147, 147, 146, 159,
+                164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, 172, 172, 172, 168,
+                170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, 93, 90, 102, 107,
+                104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, 150, 149, 152, 151,
+                148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100, 102, 102, 121, 117,
+                170, 170, 169, 171, 171, 179, 173, 175
+            };
+            byte[] reference =
+            {
+                128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129,
+                129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128,
+                128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
+                129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+                129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128,
+                128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129,
+                129, 129, 129, 129, 129, 129, 129, 129
+            };
+            short[] actualOutput = new short[16];
+            short[] expectedOutput = { 182, 4, 1, 1, 6, 7, -1, -4, 5, 0, -2, 1, 2, 1, 1, 1 };
+
+            // act: transform a single block; the last argument is a 16-int scratch buffer
+            Vp8Encoding.FTransform(src, reference, actualOutput, new int[16]);
+
+            // assert: compare the coefficient arrays directly so a failure reports the mismatch instead of a bare "False"
+            Assert.Equal(expectedOutput, actualOutput);
+        }
+
         private static void RunOneInverseTransformTest()
         {
             // arrange
@@ -75,6 +126,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
             Assert.True(dst.SequenceEqual(expected));
         }
 
+        [Fact]
+        public void FTransform2_Works() => RunFTransform2Test();
+
+        [Fact]
+        public void FTransform_Works() => RunFTransformTest();
+
         [Fact]
         public void OneInverseTransform_Works() => RunOneInverseTransformTest();
 
@@ -82,6 +139,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
         public void TwoInverseTransform_Works() => RunTwoInverseTransformTest();
 
 #if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void FTransform2_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransform2Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void FTransform2_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransform2Test, HwIntrinsics.DisableHWIntrinsic);
+
+        [Fact]
+        public void FTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransformTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void FTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransformTest, HwIntrinsics.DisableHWIntrinsic);
+
         [Fact]
         public void OneInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.AllowAll);
 

From 81070c4e61060d019f043b012641ebe7dd02a388 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 26 Nov 2021 15:30:36 +0100
Subject: [PATCH 06/11] Add missing #pragma warning restore SA1503

---
 src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index f657d3252..d2b9704ab 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -459,6 +459,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 }
             }
             else
+#pragma warning restore SA1503 // Braces should not be omitted
 #endif
             {
                 FTransform(src, reference, output, scratch);

From cb084077281d30a20218ecb1e7f29009d91c191c Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 26 Nov 2021 15:58:15 +0100
Subject: [PATCH 07/11] Use nint in for loop

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index f3b0e8e3d..de6f807da 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -726,7 +726,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         {
             uint v = src[0] * 0x01010101u;
             Span<byte> vSpan = BitConverter.GetBytes(v).AsSpan();
-            for (int i = 0; i < 16; i++)
+            for (nint i = 0; i < 16; i++)
             {
                 if (!src.Slice(0, 4).SequenceEqual(vSpan) || !src.Slice(4, 4).SequenceEqual(vSpan) ||
                     !src.Slice(8, 4).SequenceEqual(vSpan) || !src.Slice(12, 4).SequenceEqual(vSpan))
@@ -748,7 +748,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             int offset = 0;
             while (numBlocks-- > 0)
             {
-                for (int i = 1; i < 16; i++)
+                for (nint i = 1; i < 16; i++)
                 {
                     // omit DC, we're only interested in AC
                     score += Unsafe.Add(ref levelsRef, offset) != 0 ? 1 : 0;

From 0215e99696d0e11295c5ce7506dbf33c5274174c Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 26 Nov 2021 16:19:12 +0100
Subject: [PATCH 08/11] Avoid pinning, avoid using LoadScalarVector128

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 188 +++++++++---------
 1 file changed, 91 insertions(+), 97 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index d2b9704ab..9fe526dbf 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -407,59 +407,56 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
             {
-#pragma warning disable SA1503 // Braces should not be omitted
-                fixed (byte* srcRef = src)
-                fixed (byte* referenceRef = reference)
-                {
-                    // Load src.
-                    Vector128<ulong> src0 = Sse2.LoadScalarVector128((ulong*)srcRef);
-                    Vector128<ulong> src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps));
-                    Vector128<ulong> src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2)));
-                    Vector128<ulong> src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3)));
-
-                    // Load ref.
-                    Vector128<ulong> ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef);
-                    Vector128<ulong> ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps));
-                    Vector128<ulong> ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2)));
-                    Vector128<ulong> ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3)));
-
-                    // Convert both to 16 bit.
-                    Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
-
-                    // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
-                    Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
-                    Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
-                    Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
-                    Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
-
-                    // Unpack and shuffle.
-                    // 00 01 02 03   0 0 0 0
-                    // 10 11 12 13   0 0 0 0
-                    // 20 21 22 23   0 0 0 0
-                    // 30 31 32 33   0 0 0 0
-                    Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
-                    Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
-                    Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
-                    Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
-
-                    // First pass.
-                    FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
-                    FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
-
-                    // Second pass.
-                    FTransformPass2SSE2(v01l, v32l, output);
-                    FTransformPass2SSE2(v01h, v32h, output2);
-                }
+                ref byte srcRef = ref MemoryMarshal.GetReference(src);
+                ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
+
+                // Load src.
+                var src0 = Vector128.Create(Unsafe.As<byte, long>(ref srcRef), 0);
+                var src1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0);
+                var src2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0);
+                var src3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0);
+
+                // Load ref.
+                var ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0);
+                var ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0);
+                var ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0);
+                var ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);
+
+                // Convert both to 16 bit.
+                Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
+
+                // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
+                Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
+                Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
+                Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
+                Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
+
+                // Unpack and shuffle.
+                // 00 01 02 03   0 0 0 0
+                // 10 11 12 13   0 0 0 0
+                // 20 21 22 23   0 0 0 0
+                // 30 31 32 33   0 0 0 0
+                Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
+                Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
+                Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
+                Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
+
+                // First pass.
+                FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
+                FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
+
+                // Second pass.
+                FTransformPass2SSE2(v01l, v32l, output);
+                FTransformPass2SSE2(v01h, v32h, output2);
             }
             else
-#pragma warning restore SA1503 // Braces should not be omitted
 #endif
             {
                 FTransform(src, reference, output, scratch);
@@ -472,52 +469,49 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
             {
-#pragma warning disable SA1503 // Braces should not be omitted
-                fixed (byte* srcRef = src)
-                fixed (byte* referenceRef = reference)
-                {
-                    // Load src.
-                    Vector128<ulong> src0 = Sse2.LoadScalarVector128((ulong*)srcRef);
-                    Vector128<ulong> src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps));
-                    Vector128<ulong> src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2)));
-                    Vector128<ulong> src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3)));
-
-                    // Load ref.
-                    Vector128<ulong> ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef);
-                    Vector128<ulong> ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps));
-                    Vector128<ulong> ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2)));
-                    Vector128<ulong> ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3)));
-
-                    // 00 01 02 03 *
-                    // 10 11 12 13 *
-                    // 20 21 22 23 *
-                    // 30 31 32 33 *
-                    // Shuffle.
-                    Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
-                    Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
-                    Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
-                    Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
-
-                    // 00 01 10 11 02 03 12 13 * * ...
-                    // 20 21 30 31 22 22 32 33 * * ...
-
-                    // Convert both to 16 bit.
-                    Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
-                    Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
-
-                    // Compute the difference.
-                    Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
-                    Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
-
-                    // First pass
-                    FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
-
-                    // Second pass
-                    FTransformPass2SSE2(v01, v32, output);
-                }
-#pragma warning restore SA1503 // Braces should not be omitted
+                ref byte srcRef = ref MemoryMarshal.GetReference(src);
+                ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
+
+                // Load src.
+                var src0 = Vector128.Create(Unsafe.As<byte, long>(ref srcRef), 0);
+                var src1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0);
+                var src2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0);
+                var src3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0);
+
+                // Load ref.
+                var ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0);
+                var ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0);
+                var ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0);
+                var ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);
+
+                // 00 01 02 03 *
+                // 10 11 12 13 *
+                // 20 21 22 23 *
+                // 30 31 32 33 *
+                // Shuffle.
+                Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
+                Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
+                Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
+                Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
+
+                // 00 01 10 11 02 03 12 13 * * ...
+                // 20 21 30 31 22 23 32 33 * * ...
+
+                // Convert both to 16 bit.
+                Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
+
+                // Compute the difference.
+                Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
+                Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
+
+                // First pass.
+                FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
+
+                // Second pass.
+                FTransformPass2SSE2(v01, v32, output);
             }
             else
 #endif

From 83da0e069459d716bd3df4fcd7d53282419d295d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sat, 27 Nov 2021 15:46:13 +0100
Subject: [PATCH 09/11] Reverse array access order to avoid bounds checks

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs          | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 9fe526dbf..ab64a8ddb 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -523,18 +523,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 int refIdx = 0;
                 for (i = 0; i < 4; i++)
                 {
-                    int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255])
-                    int d1 = src[srcIdx + 1] - reference[refIdx + 1];
-                    int d2 = src[srcIdx + 2] - reference[refIdx + 2];
                     int d3 = src[srcIdx + 3] - reference[refIdx + 3];
+                    int d2 = src[srcIdx + 2] - reference[refIdx + 2];
+                    int d1 = src[srcIdx + 1] - reference[refIdx + 1];
+                    int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255])
                     int a0 = d0 + d3; // 10b                      [-510,510]
                     int a1 = d1 + d2;
                     int a2 = d1 - d2;
                     int a3 = d0 - d3;
-                    tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b                      [-8160,8160]
-                    tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
-                    tmp[2 + (i * 4)] = (a0 - a1) * 8;
                     tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
+                    tmp[2 + (i * 4)] = (a0 - a1) * 8;
+                    tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
+                    tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b                      [-8160,8160]
 
                     srcIdx += WebpConstants.Bps;
                     refIdx += WebpConstants.Bps;
@@ -652,10 +652,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 int a1 = input[inputIdx + (1 * 16)] + input[inputIdx + (3 * 16)];
                 int a2 = input[inputIdx + (1 * 16)] - input[inputIdx + (3 * 16)];
                 int a3 = input[inputIdx + (0 * 16)] - input[inputIdx + (2 * 16)];
-                tmp[0 + (i * 4)] = a0 + a1;   // 14b
-                tmp[1 + (i * 4)] = a3 + a2;
-                tmp[2 + (i * 4)] = a3 - a2;
                 tmp[3 + (i * 4)] = a0 - a1;
+                tmp[2 + (i * 4)] = a3 - a2;
+                tmp[1 + (i * 4)] = a3 + a2;
+                tmp[0 + (i * 4)] = a0 + a1;   // 14b
 
                 inputIdx += 64;
             }

From c0ee67b5b2b51eb51684b0c1fe3ae725331b9874 Mon Sep 17 00:00:00 2001
From: Justin Hopper <jhopper@remarkablehealth.com>
Date: Sun, 28 Nov 2021 16:32:02 -0600
Subject: [PATCH 10/11] Added missing CancellationToken parameters to Image

---
 src/ImageSharp/Image.FromFile.cs   | 15 +++++++-----
 src/ImageSharp/Image.FromStream.cs | 39 ++++++++++++++++++------------
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/src/ImageSharp/Image.FromFile.cs b/src/ImageSharp/Image.FromFile.cs
index 3a4b459c5..fce0835fb 100644
--- a/src/ImageSharp/Image.FromFile.cs
+++ b/src/ImageSharp/Image.FromFile.cs
@@ -255,6 +255,7 @@ namespace SixLabors.ImageSharp
         /// </summary>
         /// <param name="path">The file path to the image.</param>
         /// <param name="decoder">The decoder.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The configuration is null.</exception>
         /// <exception cref="ArgumentNullException">The path is null.</exception>
         /// <exception cref="ArgumentNullException">The decoder is null.</exception>
@@ -262,14 +263,15 @@ namespace SixLabors.ImageSharp
         /// <exception cref="NotSupportedException">Image format is not supported.</exception>
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
         /// <returns>A <see cref="Task{Image}"/> representing the asynchronous operation.</returns>
-        public static Task<Image> LoadAsync(string path, IImageDecoder decoder)
-            => LoadAsync(Configuration.Default, path, decoder, default);
+        public static Task<Image> LoadAsync(string path, IImageDecoder decoder, CancellationToken cancellationToken = default)
+            => LoadAsync(Configuration.Default, path, decoder, cancellationToken);
 
         /// <summary>
         /// Create a new instance of the <see cref="Image"/> class from the given file.
         /// </summary>
         /// <param name="path">The file path to the image.</param>
         /// <param name="decoder">The decoder.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The configuration is null.</exception>
         /// <exception cref="ArgumentNullException">The path is null.</exception>
         /// <exception cref="ArgumentNullException">The decoder is null.</exception>
@@ -278,9 +280,9 @@ namespace SixLabors.ImageSharp
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <returns>A <see cref="Task{Image}"/> representing the asynchronous operation.</returns>
-        public static Task<Image<TPixel>> LoadAsync<TPixel>(string path, IImageDecoder decoder)
+        public static Task<Image<TPixel>> LoadAsync<TPixel>(string path, IImageDecoder decoder, CancellationToken cancellationToken = default)
             where TPixel : unmanaged, IPixel<TPixel>
-            => LoadAsync<TPixel>(Configuration.Default, path, decoder, default);
+            => LoadAsync<TPixel>(Configuration.Default, path, decoder, cancellationToken);
 
         /// <summary>
         /// Create a new instance of the <see cref="Image"/> class from the given file.
@@ -342,6 +344,7 @@ namespace SixLabors.ImageSharp
         /// Create a new instance of the <see cref="Image"/> class from the given file.
         /// </summary>
         /// <param name="path">The file path to the image.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The configuration is null.</exception>
         /// <exception cref="ArgumentNullException">The path is null.</exception>
         /// <exception cref="UnknownImageFormatException">Image format not recognised.</exception>
@@ -349,9 +352,9 @@ namespace SixLabors.ImageSharp
         /// <exception cref="NotSupportedException">Image format is not supported.</exception>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <returns>A <see cref="Task{Image}"/> representing the asynchronous operation.</returns>
-        public static Task<Image<TPixel>> LoadAsync<TPixel>(string path)
+        public static Task<Image<TPixel>> LoadAsync<TPixel>(string path, CancellationToken cancellationToken = default)
             where TPixel : unmanaged, IPixel<TPixel>
-            => LoadAsync<TPixel>(Configuration.Default, path, default(CancellationToken));
+            => LoadAsync<TPixel>(Configuration.Default, path, cancellationToken);
 
         /// <summary>
         /// Create a new instance of the <see cref="Image"/> class from the given file.
diff --git a/src/ImageSharp/Image.FromStream.cs b/src/ImageSharp/Image.FromStream.cs
index 291d6f7ca..f5e32d8ce 100644
--- a/src/ImageSharp/Image.FromStream.cs
+++ b/src/ImageSharp/Image.FromStream.cs
@@ -44,27 +44,29 @@ namespace SixLabors.ImageSharp
         /// By reading the header on the provided stream this calculates the images format type.
         /// </summary>
         /// <param name="stream">The image stream to read the header from.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable.</exception>
         /// <returns>A <see cref="Task{IImageFormat}"/> representing the asynchronous operation or null if none is found.</returns>
-        public static Task<IImageFormat> DetectFormatAsync(Stream stream)
-            => DetectFormatAsync(Configuration.Default, stream);
+        public static Task<IImageFormat> DetectFormatAsync(Stream stream, CancellationToken cancellationToken = default)
+            => DetectFormatAsync(Configuration.Default, stream, cancellationToken);
 
         /// <summary>
         /// By reading the header on the provided stream this calculates the images format type.
         /// </summary>
         /// <param name="configuration">The configuration.</param>
         /// <param name="stream">The image stream to read the header from.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The configuration is null.</exception>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable.</exception>
         /// <returns>A <see cref="Task{IImageFormat}"/> representing the asynchronous operation.</returns>
-        public static Task<IImageFormat> DetectFormatAsync(Configuration configuration, Stream stream)
+        public static Task<IImageFormat> DetectFormatAsync(Configuration configuration, Stream stream, CancellationToken cancellationToken = default)
             => WithSeekableStreamAsync(
                 configuration,
                 stream,
                 (s, _) => InternalDetectFormatAsync(s, configuration),
-                default);
+                cancellationToken);
 
         /// <summary>
         /// Reads the raw image information from the specified stream without fully decoding it.
@@ -83,6 +85,7 @@ namespace SixLabors.ImageSharp
         /// Reads the raw image information from the specified stream without fully decoding it.
         /// </summary>
         /// <param name="stream">The image stream to read the header from.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable.</exception>
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
@@ -90,8 +93,8 @@ namespace SixLabors.ImageSharp
         /// A <see cref="Task{IImageInfo}"/> representing the asynchronous operation or null if
         /// a suitable detector is not found.
         /// </returns>
-        public static Task<IImageInfo> IdentifyAsync(Stream stream)
-            => IdentifyAsync(Configuration.Default, stream);
+        public static Task<IImageInfo> IdentifyAsync(Stream stream, CancellationToken cancellationToken = default)
+            => IdentifyAsync(Configuration.Default, stream, cancellationToken);
 
         /// <summary>
         /// Reads the raw image information from the specified stream without fully decoding it.
@@ -227,13 +230,14 @@ namespace SixLabors.ImageSharp
         /// The pixel format is selected by the decoder.
         /// </summary>
         /// <param name="stream">The stream containing image information.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable or the image format is not supported.</exception>
         /// <exception cref="UnknownImageFormatException">Image format not recognised.</exception>
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
         /// <returns>A <see cref="Task{ValueTuple}"/> representing the asynchronous operation.</returns>
-        public static Task<(Image Image, IImageFormat Format)> LoadWithFormatAsync(Stream stream)
-            => LoadWithFormatAsync(Configuration.Default, stream);
+        public static Task<(Image Image, IImageFormat Format)> LoadWithFormatAsync(Stream stream, CancellationToken cancellationToken = default)
+            => LoadWithFormatAsync(Configuration.Default, stream, cancellationToken);
 
         /// <summary>
         /// Decode a new instance of the <see cref="Image"/> class from the given stream.
@@ -252,12 +256,14 @@ namespace SixLabors.ImageSharp
         /// The pixel format is selected by the decoder.
         /// </summary>
         /// <param name="stream">The stream containing image information.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable or the image format is not supported.</exception>
         /// <exception cref="UnknownImageFormatException">Image format not recognised.</exception>
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
         /// <returns>A <see cref="Task{Image}"/> representing the asynchronous operation.</returns>
-        public static Task<Image> LoadAsync(Stream stream) => LoadAsync(Configuration.Default, stream);
+        public static Task<Image> LoadAsync(Stream stream, CancellationToken cancellationToken = default)
+            => LoadAsync(Configuration.Default, stream, cancellationToken);
 
         /// <summary>
         /// Decode a new instance of the <see cref="Image"/> class from the given stream.
@@ -280,14 +286,15 @@ namespace SixLabors.ImageSharp
         /// </summary>
         /// <param name="stream">The stream containing image information.</param>
         /// <param name="decoder">The decoder.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="ArgumentNullException">The decoder is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable or the image format is not supported.</exception>
         /// <exception cref="UnknownImageFormatException">Image format not recognised.</exception>
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
         /// <returns>A <see cref="Task{Image}"/> representing the asynchronous operation.</returns>
-        public static Task<Image> LoadAsync(Stream stream, IImageDecoder decoder)
-            => LoadAsync(Configuration.Default, stream, decoder);
+        public static Task<Image> LoadAsync(Stream stream, IImageDecoder decoder, CancellationToken cancellationToken = default)
+            => LoadAsync(Configuration.Default, stream, decoder, cancellationToken);
 
         /// <summary>
         /// Decode a new instance of the <see cref="Image"/> class from the given stream.
@@ -388,15 +395,16 @@ namespace SixLabors.ImageSharp
         /// Create a new instance of the <see cref="Image{TPixel}"/> class from the given stream.
         /// </summary>
         /// <param name="stream">The stream containing image information.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable or the image format is not supported.</exception>
         /// <exception cref="UnknownImageFormatException">Image format not recognised.</exception>
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <returns>A <see cref="Task{Image}"/> representing the asynchronous operation.</returns>
-        public static Task<Image<TPixel>> LoadAsync<TPixel>(Stream stream)
+        public static Task<Image<TPixel>> LoadAsync<TPixel>(Stream stream, CancellationToken cancellationToken = default)
             where TPixel : unmanaged, IPixel<TPixel>
-            => LoadAsync<TPixel>(Configuration.Default, stream);
+            => LoadAsync<TPixel>(Configuration.Default, stream, cancellationToken);
 
         /// <summary>
         /// Create a new instance of the <see cref="Image{TPixel}"/> class from the given stream.
@@ -417,15 +425,16 @@ namespace SixLabors.ImageSharp
         /// Create a new instance of the <see cref="Image{TPixel}"/> class from the given stream.
         /// </summary>
         /// <param name="stream">The stream containing image information.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
         /// <exception cref="ArgumentNullException">The stream is null.</exception>
         /// <exception cref="NotSupportedException">The stream is not readable or the image format is not supported.</exception>
         /// <exception cref="UnknownImageFormatException">Image format not recognised.</exception>
         /// <exception cref="InvalidImageContentException">Image contains invalid content.</exception>
         /// <typeparam name="TPixel">The pixel format.</typeparam>
         /// <returns>A <see cref="Task{ValueTuple}"/> representing the asynchronous operation.</returns>
-        public static async Task<(Image<TPixel> Image, IImageFormat Format)> LoadWithFormatAsync<TPixel>(Stream stream)
+        public static async Task<(Image<TPixel> Image, IImageFormat Format)> LoadWithFormatAsync<TPixel>(Stream stream, CancellationToken cancellationToken = default)
             where TPixel : unmanaged, IPixel<TPixel>
-            => await LoadWithFormatAsync<TPixel>(Configuration.Default, stream).ConfigureAwait(false);
+            => await LoadWithFormatAsync<TPixel>(Configuration.Default, stream, cancellationToken).ConfigureAwait(false);
 
         /// <summary>
         /// Create a new instance of the <see cref="Image{TPixel}"/> class from the given stream.

From 81433c2f5254b9eb6e55b96c8898f6b036c4d99f Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 29 Nov 2021 17:52:53 +1100
Subject: [PATCH 11/11] Remove more scalar bounds checks

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 39 ++++++++++++-------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index ab64a8ddb..f12a1a785 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -542,14 +542,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
                 for (i = 0; i < 4; i++)
                 {
-                    int a0 = tmp[0 + i] + tmp[12 + i]; // 15b
-                    int a1 = tmp[4 + i] + tmp[8 + i];
-                    int a2 = tmp[4 + i] - tmp[8 + i];
-                    int a3 = tmp[0 + i] - tmp[12 + i];
-                    output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
-                    output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
-                    output[8 + i] = (short)((a0 - a1 + 7) >> 4);
+                    int t12 = tmp[12 + i]; // NOTE(review): first-pass tmp entries are annotated 14b; the 15b bound belongs to the a0 sum below
+                    int t8 = tmp[8 + i]; // highest tmp indices are loaded first and reused for both the sum and the difference
+
+                    int a1 = tmp[4 + i] + t8;
+                    int a2 = tmp[4 + i] - t8;
+                    int a0 = tmp[0 + i] + t12; // 15b
+                    int a3 = tmp[0 + i] - t12;
+
                     output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
+                    output[8 + i] = (short)((a0 - a1 + 7) >> 4); // stores run from the highest index down; presumably lets the JIT skip the lower bounds checks (see commit subject)
+                    output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
+                    output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
                 }
             }
         }
@@ -648,9 +652,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             int inputIdx = 0;
             for (i = 0; i < 4; i++)
             {
-                int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)];  // 13b
                 int a1 = input[inputIdx + (1 * 16)] + input[inputIdx + (3 * 16)];
                 int a2 = input[inputIdx + (1 * 16)] - input[inputIdx + (3 * 16)];
+                int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)];  // 13b
                 int a3 = input[inputIdx + (0 * 16)] - input[inputIdx + (2 * 16)];
                 tmp[3 + (i * 4)] = a0 - a1;
                 tmp[2 + (i * 4)] = a3 - a2;
@@ -662,18 +666,23 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
             for (i = 0; i < 4; i++)
             {
-                int a0 = tmp[0 + i] + tmp[8 + i];  // 15b
-                int a1 = tmp[4 + i] + tmp[12 + i];
-                int a2 = tmp[4 + i] - tmp[12 + i];
-                int a3 = tmp[0 + i] - tmp[8 + i];
+                int t12 = tmp[12 + i];
+                int t8 = tmp[8 + i];
+
+                int a1 = tmp[4 + i] + t12;
+                int a2 = tmp[4 + i] - t12;
+                int a0 = tmp[0 + i] + t8;  // 15b
+                int a3 = tmp[0 + i] - t8;
+
                 int b0 = a0 + a1;    // 16b
                 int b1 = a3 + a2;
                 int b2 = a3 - a2;
                 int b3 = a0 - a1;
-                output[0 + i] = (short)(b0 >> 1);     // 15b
-                output[4 + i] = (short)(b1 >> 1);
-                output[8 + i] = (short)(b2 >> 1);
+
                 output[12 + i] = (short)(b3 >> 1);
+                output[8 + i] = (short)(b2 >> 1);
+                output[4 + i] = (short)(b1 >> 1);
+                output[0 + i] = (short)(b0 >> 1);     // 15b
             }
         }