From 257ff1929e341e5b1af94d9adf557e5296ece957 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Fri, 29 Oct 2021 23:32:13 +1100
Subject: [PATCH 01/36] Use RgbaVector for color backing

---
 src/ImageSharp/Color/Color.Conversions.cs     | 87 ++++++++++++++++---
 src/ImageSharp/Color/Color.cs                 | 74 ++++++++--------
 .../Color/ColorTests.CastFrom.cs              | 17 +++-
 .../Color/ColorTests.ConstructFrom.cs         |  4 +-
 4 files changed, 125 insertions(+), 57 deletions(-)
diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 0455fd26a..abcb54b80 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -17,56 +17,90 @@ namespace SixLabors.ImageSharp
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba64"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba64 pixel) => this.data = pixel;
+        public Color(Rgba64 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromRgba64(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgba32 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromRgba32(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Argb32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Argb32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Argb32 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromArgb32(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgra32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgra32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgra32 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromBgra32(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgb24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgb24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgb24 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromRgb24(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgr24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgr24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgr24 pixel)
+        {
+            RgbaVector vector = default;
+            vector.FromBgr24(pixel);
+            this.data = vector;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="vector">The <see cref="Vector4"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Vector4 vector) => this.data = new Rgba64(vector);
+        public Color(Vector4 vector)
+        {
+            vector = Numerics.Clamp(vector, Vector4.Zero, Vector4.One);
+            this.data = new RgbaVector(vector.X, vector.Y, vector.Z, vector.W);
+        }
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Vector4"/>.
         /// </summary>
         /// <param name="color">The <see cref="Color"/>.</param>
         /// <returns>The <see cref="Vector4"/>.</returns>
-        public static explicit operator Vector4(Color color) => color.data.ToVector4();
+        public static explicit operator Vector4(Color color) => color.data.ToScaledVector4();
 
         /// <summary>
         /// Converts an <see cref="Vector4"/> to <see cref="Color"/>.
@@ -74,22 +108,47 @@ namespace SixLabors.ImageSharp
         /// <param name="source">The <see cref="Vector4"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static explicit operator Color(Vector4 source) => new Color(source);
+        public static explicit operator Color(Vector4 source) => new(source);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgba32 ToRgba32() => this.data.ToRgba32();
+        internal Rgba32 ToRgba32()
+        {
+            Rgba32 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgra32 ToBgra32() => this.data.ToBgra32();
+        internal Bgra32 ToBgra32()
+        {
+            Bgra32 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Argb32 ToArgb32() => this.data.ToArgb32();
+        internal Argb32 ToArgb32()
+        {
+            Argb32 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgb24 ToRgb24() => this.data.ToRgb24();
+        internal Rgb24 ToRgb24()
+        {
+            Rgb24 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgr24 ToBgr24() => this.data.ToBgr24();
+        internal Bgr24 ToBgr24()
+        {
+            Bgr24 result = default;
+            result.FromScaledVector4(this.data.ToScaledVector4());
+            return result;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         internal Vector4 ToVector4() => this.data.ToVector4();
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index d5eedc160..9a4df4e62 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -20,26 +20,22 @@ namespace SixLabors.ImageSharp
     /// </remarks>
     public readonly partial struct Color : IEquatable<Color>
     {
-        private readonly Rgba64 data;
+        private readonly RgbaVector data;
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b, byte a)
         {
-            this.data = new Rgba64(
-                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(a));
+            RgbaVector vector = default;
+            vector.FromRgba32(new(r, g, b, a));
+            this.data = vector;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b)
         {
-            this.data = new Rgba64(
-                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
-                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
-                ushort.MaxValue);
+            RgbaVector vector = default;
+            vector.FromRgba32(new(r, g, b));
+            this.data = vector;
         }
 
         /// <summary>
@@ -52,10 +48,7 @@ namespace SixLabors.ImageSharp
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator ==(Color left, Color right)
-        {
-            return left.Equals(right);
-        }
+        public static bool operator ==(Color left, Color right) => left.Equals(right);
 
         /// <summary>
         /// Checks whether two <see cref="Color"/> structures are equal.
@@ -67,10 +60,7 @@ namespace SixLabors.ImageSharp
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator !=(Color left, Color right)
-        {
-            return !left.Equals(right);
-        }
+        public static bool operator !=(Color left, Color right) => !left.Equals(right);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGBA bytes.
@@ -81,7 +71,7 @@ namespace SixLabors.ImageSharp
         /// <param name="a">The alpha component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgba(byte r, byte g, byte b, byte a) => new Color(r, g, b, a);
+        public static Color FromRgba(byte r, byte g, byte b, byte a) => new(r, g, b, a);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGB bytes.
@@ -91,7 +81,17 @@ namespace SixLabors.ImageSharp
         /// <param name="b">The blue component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgb(byte r, byte g, byte b) => new Color(r, g, b);
+        public static Color FromRgb(byte r, byte g, byte b) => new(r, g, b);
+
+        /// <summary>
+        /// Creates a <see cref="Color"/> from the given <typeparamref name="TPixel"/>.
+        /// </summary>
+        /// <param name="pixel">The pixel to convert from.</param>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <returns>The <see cref="Color"/>.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static Color FromPixel<TPixel>(TPixel pixel)
+            where TPixel : unmanaged, IPixel<TPixel> => new(pixel.ToScaledVector4());
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct
@@ -207,13 +207,18 @@ namespace SixLabors.ImageSharp
         /// </summary>
         /// <returns>A hexadecimal string representation of the value.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public string ToHex() => this.data.ToRgba32().ToHex();
+        public string ToHex()
+        {
+            Rgba32 rgba = default;
+            this.data.ToRgba32(ref rgba);
+            return rgba.ToHex();
+        }
 
         /// <inheritdoc />
         public override string ToString() => this.ToHex();
 
         /// <summary>
-        /// Converts the color instance to a specified <see cref="IPixel{TSelf}"/> type.
+        /// Converts the color instance to a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <returns>The pixel value.</returns>
@@ -222,12 +227,12 @@ namespace SixLabors.ImageSharp
             where TPixel : unmanaged, IPixel<TPixel>
         {
             TPixel pixel = default;
-            pixel.FromRgba64(this.data);
+            pixel.FromScaledVector4(this.data.ToScaledVector4());
             return pixel;
         }
 
         /// <summary>
-        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <see cref="IPixel{TSelf}"/> type.
+        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <param name="configuration">The configuration.</param>
@@ -240,28 +245,19 @@ namespace SixLabors.ImageSharp
             Span<TPixel> destination)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            ReadOnlySpan<Rgba64> rgba64Span = MemoryMarshal.Cast<Color, Rgba64>(source);
-            PixelOperations<TPixel>.Instance.FromRgba64(configuration, rgba64Span, destination);
+            ReadOnlySpan<RgbaVector> rgbaSpan = MemoryMarshal.Cast<Color, RgbaVector>(source);
+            PixelOperations<TPixel>.Instance.From(configuration, rgbaSpan, destination);
         }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public bool Equals(Color other)
-        {
-            return this.data.PackedValue == other.data.PackedValue;
-        }
+        public bool Equals(Color other) => this.data.Equals(other.data);
 
         /// <inheritdoc />
-        public override bool Equals(object obj)
-        {
-            return obj is Color other && this.Equals(other);
-        }
+        public override bool Equals(object obj) => obj is Color other && this.Equals(other);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public override int GetHashCode()
-        {
-            return this.data.PackedValue.GetHashCode();
-        }
+        public override int GetHashCode() => this.data.GetHashCode();
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
index 38b94f486..356ef7351 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
@@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22,  231);
+                var source = new Rgb24(1, 22, 231);
 
                 // Act:
                 Color color = source;
@@ -79,7 +79,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22,  231);
+                var source = new Bgr24(1, 22, 231);
 
                 // Act:
                 Color color = source;
@@ -88,6 +88,19 @@ namespace SixLabors.ImageSharp.Tests
                 Bgr24 data = color.ToPixel<Bgr24>();
                 Assert.Equal(source, data);
             }
+
+            [Fact]
+            public void TPixel()
+            {
+                var source = new RgbaVector(1, .1F, .133F, .864F);
+
+                // Act:
+                var color = Color.FromPixel(source);
+
+                // Assert:
+                RgbaVector data = color.ToPixel<RgbaVector>();
+                Assert.Equal(source, data);
+            }
         }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
index 89276014b..dd51f3a6c 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
@@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22,  231);
+                var source = new Rgb24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);
@@ -79,7 +79,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22,  231);
+                var source = new Bgr24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);

From ef90575a119335314ea69c4cbd556469d91f032f Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 1 Nov 2021 21:42:32 +1100
Subject: [PATCH 02/36] Revert "Use RgbaVector for color backing"

This reverts commit 257ff1929e341e5b1af94d9adf557e5296ece957.
---
 src/ImageSharp/Color/Color.Conversions.cs     | 87 +++----------------
 src/ImageSharp/Color/Color.cs                 | 74 ++++++++--------
 .../Color/ColorTests.CastFrom.cs              | 17 +---
 .../Color/ColorTests.ConstructFrom.cs         |  4 +-
 4 files changed, 57 insertions(+), 125 deletions(-)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index abcb54b80..0455fd26a 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -17,90 +17,56 @@ namespace SixLabors.ImageSharp
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba64"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba64 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromRgba64(pixel);
-            this.data = vector;
-        }
+        public Color(Rgba64 pixel) => this.data = pixel;
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba32 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromRgba32(pixel);
-            this.data = vector;
-        }
+        public Color(Rgba32 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Argb32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Argb32 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromArgb32(pixel);
-            this.data = vector;
-        }
+        public Color(Argb32 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgra32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgra32 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromBgra32(pixel);
-            this.data = vector;
-        }
+        public Color(Bgra32 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgb24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgb24 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromRgb24(pixel);
-            this.data = vector;
-        }
+        public Color(Rgb24 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgr24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgr24 pixel)
-        {
-            RgbaVector vector = default;
-            vector.FromBgr24(pixel);
-            this.data = vector;
-        }
+        public Color(Bgr24 pixel) => this.data = new Rgba64(pixel);
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="vector">The <see cref="Vector4"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Vector4 vector)
-        {
-            vector = Numerics.Clamp(vector, Vector4.Zero, Vector4.One);
-            this.data = new RgbaVector(vector.X, vector.Y, vector.Z, vector.W);
-        }
+        public Color(Vector4 vector) => this.data = new Rgba64(vector);
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Vector4"/>.
         /// </summary>
         /// <param name="color">The <see cref="Color"/>.</param>
         /// <returns>The <see cref="Vector4"/>.</returns>
-        public static explicit operator Vector4(Color color) => color.data.ToScaledVector4();
+        public static explicit operator Vector4(Color color) => color.data.ToVector4();
 
         /// <summary>
         /// Converts an <see cref="Vector4"/> to <see cref="Color"/>.
@@ -108,47 +74,22 @@ namespace SixLabors.ImageSharp
         /// <param name="source">The <see cref="Vector4"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static explicit operator Color(Vector4 source) => new(source);
+        public static explicit operator Color(Vector4 source) => new Color(source);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgba32 ToRgba32()
-        {
-            Rgba32 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Rgba32 ToRgba32() => this.data.ToRgba32();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgra32 ToBgra32()
-        {
-            Bgra32 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Bgra32 ToBgra32() => this.data.ToBgra32();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Argb32 ToArgb32()
-        {
-            Argb32 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Argb32 ToArgb32() => this.data.ToArgb32();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgb24 ToRgb24()
-        {
-            Rgb24 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Rgb24 ToRgb24() => this.data.ToRgb24();
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgr24 ToBgr24()
-        {
-            Bgr24 result = default;
-            result.FromScaledVector4(this.data.ToScaledVector4());
-            return result;
-        }
+        internal Bgr24 ToBgr24() => this.data.ToBgr24();
 
         [MethodImpl(InliningOptions.ShortMethod)]
         internal Vector4 ToVector4() => this.data.ToVector4();
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index 9a4df4e62..d5eedc160 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -20,22 +20,26 @@ namespace SixLabors.ImageSharp
     /// </remarks>
     public readonly partial struct Color : IEquatable<Color>
     {
-        private readonly RgbaVector data;
+        private readonly Rgba64 data;
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b, byte a)
         {
-            RgbaVector vector = default;
-            vector.FromRgba32(new(r, g, b, a));
-            this.data = vector;
+            this.data = new Rgba64(
+                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(a));
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b)
         {
-            RgbaVector vector = default;
-            vector.FromRgba32(new(r, g, b));
-            this.data = vector;
+            this.data = new Rgba64(
+                ColorNumerics.UpscaleFrom8BitTo16Bit(r),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(g),
+                ColorNumerics.UpscaleFrom8BitTo16Bit(b),
+                ushort.MaxValue);
         }
 
         /// <summary>
@@ -48,7 +52,10 @@ namespace SixLabors.ImageSharp
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator ==(Color left, Color right) => left.Equals(right);
+        public static bool operator ==(Color left, Color right)
+        {
+            return left.Equals(right);
+        }
 
         /// <summary>
         /// Checks whether two <see cref="Color"/> structures are equal.
@@ -60,7 +67,10 @@ namespace SixLabors.ImageSharp
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator !=(Color left, Color right) => !left.Equals(right);
+        public static bool operator !=(Color left, Color right)
+        {
+            return !left.Equals(right);
+        }
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGBA bytes.
@@ -71,7 +81,7 @@ namespace SixLabors.ImageSharp
         /// <param name="a">The alpha component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgba(byte r, byte g, byte b, byte a) => new(r, g, b, a);
+        public static Color FromRgba(byte r, byte g, byte b, byte a) => new Color(r, g, b, a);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGB bytes.
@@ -81,17 +91,7 @@ namespace SixLabors.ImageSharp
         /// <param name="b">The blue component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgb(byte r, byte g, byte b) => new(r, g, b);
-
-        /// <summary>
-        /// Creates a <see cref="Color"/> from the given <typeparamref name="TPixel"/>.
-        /// </summary>
-        /// <param name="pixel">The pixel to convert from.</param>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <returns>The <see cref="Color"/>.</returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromPixel<TPixel>(TPixel pixel)
-            where TPixel : unmanaged, IPixel<TPixel> => new(pixel.ToScaledVector4());
+        public static Color FromRgb(byte r, byte g, byte b) => new Color(r, g, b);
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct
@@ -207,18 +207,13 @@ namespace SixLabors.ImageSharp
         /// </summary>
         /// <returns>A hexadecimal string representation of the value.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public string ToHex()
-        {
-            Rgba32 rgba = default;
-            this.data.ToRgba32(ref rgba);
-            return rgba.ToHex();
-        }
+        public string ToHex() => this.data.ToRgba32().ToHex();
 
         /// <inheritdoc />
         public override string ToString() => this.ToHex();
 
         /// <summary>
-        /// Converts the color instance to a specified <typeparamref name="TPixel"/> type.
+        /// Converts the color instance to a specified <see cref="IPixel{TSelf}"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <returns>The pixel value.</returns>
@@ -227,12 +222,12 @@ namespace SixLabors.ImageSharp
             where TPixel : unmanaged, IPixel<TPixel>
         {
             TPixel pixel = default;
-            pixel.FromScaledVector4(this.data.ToScaledVector4());
+            pixel.FromRgba64(this.data);
             return pixel;
         }
 
         /// <summary>
-        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <typeparamref name="TPixel"/> type.
+        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <see cref="IPixel{TSelf}"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <param name="configuration">The configuration.</param>
@@ -245,19 +240,28 @@ namespace SixLabors.ImageSharp
             Span<TPixel> destination)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            ReadOnlySpan<RgbaVector> rgbaSpan = MemoryMarshal.Cast<Color, RgbaVector>(source);
-            PixelOperations<TPixel>.Instance.From(configuration, rgbaSpan, destination);
+            ReadOnlySpan<Rgba64> rgba64Span = MemoryMarshal.Cast<Color, Rgba64>(source);
+            PixelOperations<TPixel>.Instance.FromRgba64(configuration, rgba64Span, destination);
         }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public bool Equals(Color other) => this.data.Equals(other.data);
+        public bool Equals(Color other)
+        {
+            return this.data.PackedValue == other.data.PackedValue;
+        }
 
         /// <inheritdoc />
-        public override bool Equals(object obj) => obj is Color other && this.Equals(other);
+        public override bool Equals(object obj)
+        {
+            return obj is Color other && this.Equals(other);
+        }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
-        public override int GetHashCode() => this.data.GetHashCode();
+        public override int GetHashCode()
+        {
+            return this.data.PackedValue.GetHashCode();
+        }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
index 356ef7351..38b94f486 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastFrom.cs
@@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22, 231);
+                var source = new Rgb24(1, 22,  231);
 
                 // Act:
                 Color color = source;
@@ -79,7 +79,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22, 231);
+                var source = new Bgr24(1, 22,  231);
 
                 // Act:
                 Color color = source;
@@ -88,19 +88,6 @@ namespace SixLabors.ImageSharp.Tests
                 Bgr24 data = color.ToPixel<Bgr24>();
                 Assert.Equal(source, data);
             }
-
-            [Fact]
-            public void TPixel()
-            {
-                var source = new RgbaVector(1, .1F, .133F, .864F);
-
-                // Act:
-                var color = Color.FromPixel(source);
-
-                // Assert:
-                RgbaVector data = color.ToPixel<RgbaVector>();
-                Assert.Equal(source, data);
-            }
         }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
index dd51f3a6c..89276014b 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.ConstructFrom.cs
@@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22, 231);
+                var source = new Rgb24(1, 22,  231);
 
                 // Act:
                 var color = new Color(source);
@@ -79,7 +79,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22, 231);
+                var source = new Bgr24(1, 22,  231);
 
                 // Act:
                 var color = new Color(source);

From 2ec17e7c6a31b31fafb75cfd85613681fa4125d6 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 1 Nov 2021 22:39:20 +1100
Subject: [PATCH 03/36] Use boxed pixel for high precision

---
 src/ImageSharp/Color/Color.Conversions.cs     | 117 +++++++++++++++---
 src/ImageSharp/Color/Color.cs                 |  77 ++++++++----
 .../Color/ColorTests.CastTo.cs                |  17 ++-
 3 files changed, 171 insertions(+), 40 deletions(-)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 0455fd26a..424b7dcdf 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -17,56 +17,85 @@ namespace SixLabors.ImageSharp
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba64"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba64 pixel) => this.data = pixel;
+        public Color(Rgba64 pixel)
+        {
+            this.data = pixel;
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgba32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgba32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgba32 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Argb32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Argb32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Argb32 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgra32"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgra32 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgra32 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Rgb24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Rgb24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Rgb24 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="pixel">The <see cref="Bgr24"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Bgr24 pixel) => this.data = new Rgba64(pixel);
+        public Color(Bgr24 pixel)
+        {
+            this.data = new Rgba64(pixel);
+            this.boxedHighPrecisionPixel = null;
+        }
 
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
         /// <param name="vector">The <see cref="Vector4"/> containing the color information.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public Color(Vector4 vector) => this.data = new Rgba64(vector);
+        public Color(Vector4 vector)
+        {
+            vector = Numerics.Clamp(vector, Vector4.Zero, Vector4.One);
+            this.boxedHighPrecisionPixel = new RgbaVector(vector.X, vector.Y, vector.Z, vector.W);
+            this.data = default;
+        }
 
         /// <summary>
         /// Converts a <see cref="Color"/> to <see cref="Vector4"/>.
         /// </summary>
         /// <param name="color">The <see cref="Color"/>.</param>
         /// <returns>The <see cref="Vector4"/>.</returns>
-        public static explicit operator Vector4(Color color) => color.data.ToVector4();
+        public static explicit operator Vector4(Color color) => color.ToVector4();
 
         /// <summary>
         /// Converts an <see cref="Vector4"/> to <see cref="Color"/>.
@@ -74,24 +103,82 @@ namespace SixLabors.ImageSharp
         /// <param name="source">The <see cref="Vector4"/>.</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static explicit operator Color(Vector4 source) => new Color(source);
+        public static explicit operator Color(Vector4 source) => new(source);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgba32 ToRgba32() => this.data.ToRgba32();
+        internal Rgba32 ToRgba32()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToRgba32();
+            }
+
+            Rgba32 value = default;
+            this.boxedHighPrecisionPixel.ToRgba32(ref value);
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgra32 ToBgra32() => this.data.ToBgra32();
+        internal Bgra32 ToBgra32()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToBgra32();
+            }
+
+            Bgra32 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Argb32 ToArgb32() => this.data.ToArgb32();
+        internal Argb32 ToArgb32()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToArgb32();
+            }
+
+            Argb32 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Rgb24 ToRgb24() => this.data.ToRgb24();
+        internal Rgb24 ToRgb24()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToRgb24();
+            }
+
+            Rgb24 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Bgr24 ToBgr24() => this.data.ToBgr24();
+        internal Bgr24 ToBgr24()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToBgr24();
+            }
+
+            Bgr24 value = default;
+            value.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+            return value;
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        internal Vector4 ToVector4() => this.data.ToVector4();
+        internal Vector4 ToVector4()
+        {
+            if (this.boxedHighPrecisionPixel is null)
+            {
+                return this.data.ToScaledVector4();
+            }
+
+            return this.boxedHighPrecisionPixel.ToScaledVector4();
+        }
     }
 }
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index d5eedc160..fe66efcfb 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -4,7 +4,6 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp
@@ -21,6 +20,7 @@ namespace SixLabors.ImageSharp
     public readonly partial struct Color : IEquatable<Color>
     {
         private readonly Rgba64 data;
+        private readonly IPixel boxedHighPrecisionPixel;
 
         [MethodImpl(InliningOptions.ShortMethod)]
         private Color(byte r, byte g, byte b, byte a)
@@ -30,6 +30,8 @@ namespace SixLabors.ImageSharp
                 ColorNumerics.UpscaleFrom8BitTo16Bit(g),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(b),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(a));
+
+            this.boxedHighPrecisionPixel = null;
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -40,6 +42,15 @@ namespace SixLabors.ImageSharp
                 ColorNumerics.UpscaleFrom8BitTo16Bit(g),
                 ColorNumerics.UpscaleFrom8BitTo16Bit(b),
                 ushort.MaxValue);
+
+            this.boxedHighPrecisionPixel = null;
+        }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private Color(IPixel pixel)
+        {
+            this.boxedHighPrecisionPixel = pixel;
+            this.data = default;
         }
 
         /// <summary>
@@ -52,13 +63,10 @@ namespace SixLabors.ImageSharp
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator ==(Color left, Color right)
-        {
-            return left.Equals(right);
-        }
+        public static bool operator ==(Color left, Color right) => left.Equals(right);
 
         /// <summary>
-        /// Checks whether two <see cref="Color"/> structures are equal.
+        /// Checks whether two <see cref="Color"/> structures are not equal.
         /// </summary>
         /// <param name="left">The left hand <see cref="Color"/> operand.</param>
         /// <param name="right">The right hand <see cref="Color"/> operand.</param>
@@ -67,10 +75,7 @@ namespace SixLabors.ImageSharp
         /// otherwise, false.
         /// </returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static bool operator !=(Color left, Color right)
-        {
-            return !left.Equals(right);
-        }
+        public static bool operator !=(Color left, Color right) => !left.Equals(right);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGBA bytes.
@@ -81,7 +86,7 @@ namespace SixLabors.ImageSharp
         /// <param name="a">The alpha component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgba(byte r, byte g, byte b, byte a) => new Color(r, g, b, a);
+        public static Color FromRgba(byte r, byte g, byte b, byte a) => new(r, g, b, a);
 
         /// <summary>
         /// Creates a <see cref="Color"/> from RGB bytes.
@@ -91,7 +96,18 @@ namespace SixLabors.ImageSharp
         /// <param name="b">The blue component (0-255).</param>
         /// <returns>The <see cref="Color"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Color FromRgb(byte r, byte g, byte b) => new Color(r, g, b);
+        public static Color FromRgb(byte r, byte g, byte b) => new(r, g, b);
+
+        /// <summary>
+        /// Creates a <see cref="Color"/> from the given <typeparamref name="TPixel"/>.
+        /// </summary>
+        /// <param name="pixel">The pixel to convert from.</param>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <returns>The <see cref="Color"/>.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static Color FromPixel<TPixel>(TPixel pixel)
+            where TPixel : unmanaged, IPixel<TPixel>
+            => new(pixel);
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct
@@ -213,7 +229,7 @@ namespace SixLabors.ImageSharp
         public override string ToString() => this.ToHex();
 
         /// <summary>
-        /// Converts the color instance to a specified <see cref="IPixel{TSelf}"/> type.
+        /// Converts the color instance to a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <returns>The pixel value.</returns>
@@ -221,13 +237,26 @@ namespace SixLabors.ImageSharp
         public TPixel ToPixel<TPixel>()
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            TPixel pixel = default;
+            if (this.boxedHighPrecisionPixel is TPixel pixel)
+            {
+                return pixel;
+            }
+
+            pixel = default;
+            if (this.boxedHighPrecisionPixel is not null)
+            {
+                // The color holds a boxed pixel of another type, so `data` was left at its
+                // default value; convert through the scaled vector to keep the stored color.
+                pixel.FromScaledVector4(this.boxedHighPrecisionPixel.ToScaledVector4());
+                return pixel;
+            }
+
             pixel.FromRgba64(this.data);
             return pixel;
         }
 
         /// <summary>
-        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <see cref="IPixel{TSelf}"/> type.
+        /// Bulk converts a span of <see cref="Color"/> to a span of a specified <typeparamref name="TPixel"/> type.
         /// </summary>
         /// <typeparam name="TPixel">The pixel type to convert to.</typeparam>
         /// <param name="configuration">The configuration.</param>
@@ -240,28 +261,38 @@ namespace SixLabors.ImageSharp
             Span<TPixel> destination)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            ReadOnlySpan<Rgba64> rgba64Span = MemoryMarshal.Cast<Color, Rgba64>(source);
-            PixelOperations<TPixel>.Instance.FromRgba64(configuration, rgba64Span, destination);
+            Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
+            for (int i = 0; i < source.Length; i++)
+            {
+                destination[i] = source[i].ToPixel<TPixel>();
+            }
         }
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
         public bool Equals(Color other)
         {
-            return this.data.PackedValue == other.data.PackedValue;
+            if (this.boxedHighPrecisionPixel is null && other.boxedHighPrecisionPixel is null)
+            {
+                return this.data.PackedValue == other.data.PackedValue;
+            }
+
+            return this.ToVector4().Equals(other.ToVector4());
         }
 
         /// <inheritdoc />
-        public override bool Equals(object obj)
-        {
-            return obj is Color other && this.Equals(other);
-        }
+        public override bool Equals(object obj) => obj is Color other && this.Equals(other);
 
         /// <inheritdoc />
         [MethodImpl(InliningOptions.ShortMethod)]
         public override int GetHashCode()
         {
-            return this.data.PackedValue.GetHashCode();
+            // Equals(Color) compares ToVector4() as soon as either operand holds a boxed
+            // pixel, so every instance has to hash that same representation; hashing the
+            // packed value or the boxed pixel directly lets equal colors hash differently.
+            Vector4 vector = this.ToVector4();
+
+            return vector.GetHashCode();
         }
     }
 }
diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
index ee1820de7..d3f3cf126 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
@@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Rgb24()
             {
-                var source = new Rgb24(1, 22,  231);
+                var source = new Rgb24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);
@@ -79,7 +79,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void Bgr24()
             {
-                var source = new Bgr24(1, 22,  231);
+                var source = new Bgr24(1, 22, 231);
 
                 // Act:
                 var color = new Color(source);
@@ -88,6 +88,19 @@ namespace SixLabors.ImageSharp.Tests
                 Bgr24 data = color;
                 Assert.Equal(source, data);
             }
+
+            [Fact]
+            public void TPixel()
+            {
+                var source = new RgbaVector(1, .1F, .133F, .864F);
+
+                // Act:
+                var color = Color.FromPixel(source);
+
+                // Assert:
+                RgbaVector data = color.ToPixel<RgbaVector>();
+                Assert.Equal(source, data);
+            }
         }
     }
 }

From fd07436736d721bedfbafc308d902aa1e7765778 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 12:40:04 +0100
Subject: [PATCH 04/36] Replace Guard with DebugGuard in FastSLog2Slow

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 22c233360..ebebe7954 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -780,7 +780,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 
         private static float FastSLog2Slow(uint v)
         {
-            Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v));
+            DebugGuard.MustBeGreaterThanOrEqualTo<uint>(v, LogLookupIdxMax, nameof(v));
             if (v < ApproxLogWithCorrectionMax)
             {
                 int logCnt = 0;

From 2bf16bcb58556d6f3cbee5298472db42af60bd02 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 12:41:43 +0100
Subject: [PATCH 05/36] Reverse access to output array to remove bounds checks

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index ebebe7954..b278b12bc 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1262,11 +1262,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                     Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
                     Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
                     Sse2.Store((ushort*)p, diff);
+                    int paMinusPb = output[3] + output[2] + output[1] + output[0];
+                    return (paMinusPb <= 0) ? a : b;
                 }
-
-                int paMinusPb = output[0] + output[1] + output[2] + output[3];
-
-                return (paMinusPb <= 0) ? a : b;
             }
             else
 #endif

From a7ed1884e0f9439c03d913f4d4a5f2b36d38071e Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 14:15:13 +0100
Subject: [PATCH 06/36] Add sse2 version of ClampedAddSubtractHalf

---
 .../Formats/Webp/Lossless/LosslessUtils.cs    | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index b278b12bc..0dda5a79a 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1219,12 +1219,32 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 
         private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
         {
-            uint ave = Average2(c0, c1);
-            int a = AddSubtractComponentHalf((int)(ave >> 24), (int)(c2 >> 24));
-            int r = AddSubtractComponentHalf((int)((ave >> 16) & 0xff), (int)((c2 >> 16) & 0xff));
-            int g = AddSubtractComponentHalf((int)((ave >> 8) & 0xff), (int)((c2 >> 8) & 0xff));
-            int b = AddSubtractComponentHalf((int)(ave & 0xff), (int)(c2 & 0xff));
-            return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+                Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
+                Vector128<short> avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16());
+                Vector128<short> a0 = Sse2.ShiftRightLogical(avg, 1);
+                Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
+                Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
+                Vector128<short> a2 = Sse2.Subtract(a1, bgta);
+                Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2.AsInt16(), 1);
+                Vector128<short> a4 = Sse2.Add(a0.AsInt16(), a3).AsInt16();
+                Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
+                uint output = Sse2.ConvertToUInt32(a5.AsUInt32());
+                return output;
+            }
+#endif
+            {
+                uint ave = Average2(c0, c1);
+                int a = AddSubtractComponentHalf((int)(ave >> 24), (int)(c2 >> 24));
+                int r = AddSubtractComponentHalf((int)((ave >> 16) & 0xff), (int)((c2 >> 16) & 0xff));
+                int g = AddSubtractComponentHalf((int)((ave >> 8) & 0xff), (int)((c2 >> 8) & 0xff));
+                int b = AddSubtractComponentHalf((int)(ave & 0xff), (int)(c2 & 0xff));
+                return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+            }
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]

From 28053739a9beeed006fd256a0ea8016631660841 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 14:20:33 +0100
Subject: [PATCH 07/36] Add sse2 version of ClampedAddSubtractFull

---
 .../Formats/Webp/Lossless/LosslessUtils.cs    | 42 ++++++++++++-------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 0dda5a79a..7740dc051 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1201,20 +1201,34 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 
         private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
         {
-            int a = AddSubtractComponentFull(
-                (int)(c0 >> 24),
-                (int)(c1 >> 24),
-                (int)(c2 >> 24));
-            int r = AddSubtractComponentFull(
-                (int)((c0 >> 16) & 0xff),
-                (int)((c1 >> 16) & 0xff),
-                (int)((c2 >> 16) & 0xff));
-            int g = AddSubtractComponentFull(
-                (int)((c0 >> 8) & 0xff),
-                (int)((c1 >> 8) & 0xff),
-                (int)((c2 >> 8) & 0xff));
-            int b = AddSubtractComponentFull((int)(c0 & 0xff), (int)(c1 & 0xff), (int)(c2 & 0xff));
-            return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+                Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
+                Vector128<byte> v1 = Sse2.Add(c0Vec, c1Vec);
+                Vector128<byte> v2 = Sse2.Subtract(v1, c2Vec);
+                Vector128<byte> b = Sse2.PackUnsignedSaturate(v2.AsInt16(), v2.AsInt16());
+                uint output = Sse2.ConvertToUInt32(b.AsUInt32());
+            }
+#endif
+            {
+                int a = AddSubtractComponentFull(
+                    (int)(c0 >> 24),
+                    (int)(c1 >> 24),
+                    (int)(c2 >> 24));
+                int r = AddSubtractComponentFull(
+                    (int)((c0 >> 16) & 0xff),
+                    (int)((c1 >> 16) & 0xff),
+                    (int)((c2 >> 16) & 0xff));
+                int g = AddSubtractComponentFull(
+                    (int)((c0 >> 8) & 0xff),
+                    (int)((c1 >> 8) & 0xff),
+                    (int)((c2 >> 8) & 0xff));
+                int b = AddSubtractComponentFull((int)(c0 & 0xff), (int)(c1 & 0xff), (int)(c2 & 0xff));
+                return ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;
+            }
         }
 
         private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)

From f6dbc7dd8ee95115315805dab2b9b38684e505b2 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 14:40:59 +0100
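Subject: [PATCH 08/36] Fix SSE2 path of ClampedAddSubtractFull

The SSE2 branch added and subtracted in byte lanes, so sums wrapped before
the saturating pack, and it never returned its result, so execution always
fell through to the scalar code. Do the arithmetic in 16-bit lanes and
return the packed value.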

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 7740dc051..65b39bd2d 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1207,10 +1207,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                 Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
                 Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
                 Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-                Vector128<byte> v1 = Sse2.Add(c0Vec, c1Vec);
-                Vector128<byte> v2 = Sse2.Subtract(v1, c2Vec);
+                Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
+                Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
                 Vector128<byte> b = Sse2.PackUnsignedSaturate(v2.AsInt16(), v2.AsInt16());
                 uint output = Sse2.ConvertToUInt32(b.AsUInt32());
+                return output;
             }
 #endif
             {

From 8fe280e9918e14ca2abb7ffd21ae35c969429447 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 16:04:29 +0100
Subject: [PATCH 09/36] Add predictor 12 and 13 tests

---
 .../Formats/WebP/LosslessUtilsTests.cs        | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
index bf381ebda..c70f332ef 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
@@ -153,9 +153,55 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
             }
         }
 
+        private static void RunPredictor12Test()
+        {
+            // arrange
+            uint[] topData = { 4294844413, 4294779388 };
+            uint left = 4294844413;
+            uint expectedResult = 4294779388;
+
+            // act
+            unsafe
+            {
+                fixed (uint* top = &topData[1])
+                {
+                    uint actual = LosslessUtils.Predictor12(left, top);
+
+                    // assert
+                    Assert.Equal(expectedResult, actual);
+                }
+            }
+        }
+
+        private static void RunPredictor13Test()
+        {
+            // arrange
+            uint[] topData = { 4278193922, 4278193666 };
+            uint left = 4278193410;
+            uint expectedResult = 4278193154;
+
+            // act
+            unsafe
+            {
+                fixed (uint* top = &topData[1])
+                {
+                    uint actual = LosslessUtils.Predictor13(left, top);
+
+                    // assert
+                    Assert.Equal(expectedResult, actual);
+                }
+            }
+        }
+
         [Fact]
         public void Predictor11_Works() => RunPredictor11Test();
 
+        [Fact]
+        public void Predictor12_Works() => RunPredictor12Test();
+
+        [Fact]
+        public void Predictor13_Works() => RunPredictor13Test();
+
         [Fact]
         public void SubtractGreen_Works() => RunSubtractGreenTest();
 
@@ -175,6 +221,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
         [Fact]
         public void Predictor11_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.DisableSSE2);
 
+        [Fact]
+        public void Predictor12_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor12Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void Predictor12_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor12Test, HwIntrinsics.DisableSSE2);
+
+        [Fact]
+        public void Predictor13_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor13Test, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void Predictor13_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor13Test, HwIntrinsics.DisableSSE2);
+
         [Fact]
         public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll);
 

From ffdf99bad2d8f4fb9d52a3938f3c64d750f09957 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 16:29:52 +0100
Subject: [PATCH 10/36] Add aggressive inlining

---
 src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs    | 8 ++++++++
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 1 +
 2 files changed, 9 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs
index 8596d8555..02bbc38fc 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/ColorCache.cs
@@ -1,6 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System.Runtime.CompilerServices;
+
 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
     /// <summary>
@@ -41,6 +43,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// Inserts a new color into the cache.
         /// </summary>
         /// <param name="bgra">The color to insert.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public void Insert(uint bgra)
         {
             int key = HashPix(bgra, this.HashShift);
@@ -52,6 +55,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// </summary>
         /// <param name="key">The key to lookup.</param>
         /// <returns>The color for the key.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public uint Lookup(int key) => this.Colors[key];
 
         /// <summary>
@@ -59,6 +63,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// </summary>
         /// <param name="bgra">The color to check.</param>
         /// <returns>The index of the color in the cache or -1 if its not present.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public int Contains(uint bgra)
         {
             int key = HashPix(bgra, this.HashShift);
@@ -70,6 +75,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// </summary>
         /// <param name="bgra">The color.</param>
         /// <returns>The index for the color.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public int GetIndex(uint bgra) => HashPix(bgra, this.HashShift);
 
         /// <summary>
@@ -77,8 +83,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// </summary>
         /// <param name="key">The key.</param>
         /// <param name="bgra">The color to add.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public void Set(uint key, uint bgra) => this.Colors[key] = bgra;
 
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static int HashPix(uint argb, int shift) => (int)((argb * HashMul) >> shift);
     }
 }
diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 65b39bd2d..9baa6c3c3 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -752,6 +752,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
         /// <summary>
         /// Fast calculation of log2(v) for integer input.
         /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static float FastLog2(uint v) => v < LogLookupIdxMax ? WebpLookupTables.Log2Table[v] : FastLog2Slow(v);
 
         /// <summary>

From fc8d8b81d98201955655595fe682a0c5533eb6ea Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 2 Nov 2021 21:56:19 +0100
Subject: [PATCH 11/36] Remove unnecessary cast AsInt16()

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 9baa6c3c3..8bd3163cc 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1210,7 +1210,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                 Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
                 Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
                 Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
-                Vector128<byte> b = Sse2.PackUnsignedSaturate(v2.AsInt16(), v2.AsInt16());
+                Vector128<byte> b = Sse2.PackUnsignedSaturate(v2, v2);
                 uint output = Sse2.ConvertToUInt32(b.AsUInt32());
                 return output;
             }

From f9212f7adca384b1147af10a38e3ec0d8dcc12d2 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Wed, 3 Nov 2021 22:38:52 +1100
Subject: [PATCH 12/36] Update
 tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs

Co-authored-by: Anton Firszov <antonfir@gmail.com>
---
 tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
index d3f3cf126..af35d1f89 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
@@ -92,7 +92,7 @@ namespace SixLabors.ImageSharp.Tests
             [Fact]
             public void TPixel()
             {
-                var source = new RgbaVector(1, .1F, .133F, .864F);
+                var source = new RgbaVector(float.Epsilon, 2 * float.Epsilon, float.MaxValue, float.MinValue);
 
                 // Act:
                 var color = Color.FromPixel(source);

From 425600459e96cc5d34857fd9e0de45952fa8e6ae Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Wed, 3 Nov 2021 23:49:32 +1100
Subject: [PATCH 13/36] Update Color.Equals

---
 src/ImageSharp/Color/Color.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index fe66efcfb..61d6c8e6d 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -277,7 +277,7 @@ namespace SixLabors.ImageSharp
                 return this.data.PackedValue == other.data.PackedValue;
             }
 
-            return this.ToVector4().Equals(other.ToVector4());
+            return this.boxedHighPrecisionPixel?.Equals(other.boxedHighPrecisionPixel) == true;
         }
 
         /// <inheritdoc />

From 08785103e350266f626b3519b22e3966b4450caa Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 12:39:42 +0100
Subject: [PATCH 14/36] Add EntropyPasses default value explicit to 1

---
 src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs | 1 +
 src/ImageSharp/Formats/Webp/WebpEncoder.cs         | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs b/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
index 7dbf49d45..000de4f88 100644
--- a/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
+++ b/src/ImageSharp/Formats/Webp/IWebpEncoderOptions.cs
@@ -35,6 +35,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
 
         /// <summary>
         /// Gets the number of entropy-analysis passes (in [1..10]).
+        /// Defaults to 1.
         /// </summary>
         int EntropyPasses { get; }
 
diff --git a/src/ImageSharp/Formats/Webp/WebpEncoder.cs b/src/ImageSharp/Formats/Webp/WebpEncoder.cs
index f85f65b63..bdcbb194b 100644
--- a/src/ImageSharp/Formats/Webp/WebpEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/WebpEncoder.cs
@@ -27,7 +27,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
         public bool UseAlphaCompression { get; set; }
 
         /// <inheritdoc/>
-        public int EntropyPasses { get; set; }
+        public int EntropyPasses { get; set; } = 1;
 
         /// <inheritdoc/>
         public int SpatialNoiseShaping { get; set; } = 50;

From 947dc8d5ecff64414247ede191452cf8c7a77c26 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 12:40:39 +0100
Subject: [PATCH 15/36] Make sure magick.net and imagesharp use the same
 configuration

---
 .../Codecs/EncodeWebp.cs                      | 45 ++++++++++++++++---
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
index 7d3dfe693..59814f465 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
@@ -4,6 +4,7 @@
 using System.IO;
 using BenchmarkDotNet.Attributes;
 using ImageMagick;
+using ImageMagick.Formats;
 using SixLabors.ImageSharp.Formats.Webp;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Tests;
@@ -44,8 +45,22 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
         public void MagickWebpLossy()
         {
             using var memoryStream = new MemoryStream();
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "lossless", false);
-            this.webpMagick.Write(memoryStream, MagickFormat.WebP);
+
+            var defines = new WebPWriteDefines
+            {
+                Lossless = false,
+                Method = 4,
+                AlphaCompression = WebPAlphaCompression.None,
+                FilterStrength = 60,
+                SnsStrength = 50,
+                Pass = 1,
+
+                // 100 means off.
+                NearLossless = 100
+            };
+
+            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Write(memoryStream, defines);
         }
 
         [Benchmark(Description = "ImageSharp Webp Lossy")]
@@ -54,7 +69,12 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
             using var memoryStream = new MemoryStream();
             this.webp.Save(memoryStream, new WebpEncoder()
             {
-                FileFormat = WebpFileFormatType.Lossy
+                FileFormat = WebpFileFormatType.Lossy,
+                Method = WebpEncodingMethod.Level4,
+                UseAlphaCompression = false,
+                FilterStrength = 60,
+                SpatialNoiseShaping = 50,
+                EntropyPasses = 1
             });
         }
 
@@ -62,8 +82,18 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
         public void MagickWebpLossless()
         {
             using var memoryStream = new MemoryStream();
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "lossless", true);
-            this.webpMagick.Write(memoryStream, MagickFormat.WebP);
+            var defines = new WebPWriteDefines
+            {
+                Lossless = true,
+                Method = 4,
+
+                // 100 means off.
+                NearLossless = 100
+            };
+
+            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "exact", false);
+            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Write(memoryStream, defines);
         }
 
         [Benchmark(Description = "ImageSharp Webp Lossless")]
@@ -72,7 +102,10 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
             using var memoryStream = new MemoryStream();
             this.webp.Save(memoryStream, new WebpEncoder()
             {
-                FileFormat = WebpFileFormatType.Lossless
+                FileFormat = WebpFileFormatType.Lossless,
+                Method = WebpEncodingMethod.Level4,
+                NearLossless = false,
+                TransparentColorMode = WebpTransparentColorMode.Clear
             });
         }
 

From 55b67ada2f659463f438303e77d0f1b1de4c47bc Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 21:40:02 +0100
Subject: [PATCH 16/36] Use webpMagick.Quality for the quality parameter

---
 tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
index 59814f465..222984992 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
@@ -59,7 +59,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
                 NearLossless = 100
             };
 
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Quality = 75;
             this.webpMagick.Write(memoryStream, defines);
         }
 
@@ -91,8 +91,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
                 NearLossless = 100
             };
 
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "exact", false);
-            this.webpMagick.Settings.SetDefine(MagickFormat.WebP, "quality", 75);
+            this.webpMagick.Quality = 75;
             this.webpMagick.Write(memoryStream, defines);
         }
 
@@ -105,6 +104,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
                 FileFormat = WebpFileFormatType.Lossless,
                 Method = WebpEncodingMethod.Level4,
                 NearLossless = false,
+
+                // This is equal to exact = false in libwebp, which is the default.
                 TransparentColorMode = WebpTransparentColorMode.Clear
             });
         }

From d6d952e477b0653b2750210ad4cd2d3fc14bbaec Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 4 Nov 2021 23:12:01 +0100
Subject: [PATCH 17/36] Remove another unnecessary cast AsInt16()

---
 src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 8bd3163cc..ee9ea5123 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -1246,8 +1246,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                 Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
                 Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
                 Vector128<short> a2 = Sse2.Subtract(a1, bgta);
-                Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2.AsInt16(), 1);
-                Vector128<short> a4 = Sse2.Add(a0.AsInt16(), a3).AsInt16();
+                Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2, 1);
+                Vector128<short> a4 = Sse2.Add(a0, a3).AsInt16();
                 Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
                 uint output = Sse2.ConvertToUInt32(a5.AsUInt32());
                 return output;

From 2b6dbbce6fb6561a7fbddb0bd08afe69b9349382 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 5 Nov 2021 12:46:53 +0100
Subject: [PATCH 18/36] Update benchmark results

---
 .../Codecs/DecodeWebp.cs                      | 49 ++++++++---------
 .../Codecs/EncodeWebp.cs                      | 55 +++++++++----------
 2 files changed, 48 insertions(+), 56 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs
index 407a4ef3b..878929823 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs
@@ -76,34 +76,29 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
             return image.Height;
         }
 
-        /* Results 17.06.2021
-         *  BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
+        /* Results 04.11.2021
+         *  BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19043.1320 (21H1/May2021Update)
             Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
-            .NET Core SDK=3.1.202
-              [Host]     : .NET Core 3.1.4 (CoreCLR 4.700.20.20201, CoreFX 4.700.20.22101), X64 RyuJIT
-              Job-AQFZAV : .NET Framework 4.8 (4.8.4180.0), X64 RyuJIT
-              Job-YCDAPQ : .NET Core 2.1.18 (CoreCLR 4.6.28801.04, CoreFX 4.6.28802.05), X64 RyuJIT
-              Job-WMTYOZ : .NET Core 3.1.4 (CoreCLR 4.700.20.20201, CoreFX 4.700.20.22101), X64 RyuJIT
-
-            IterationCount=3  LaunchCount=1  WarmupCount=3
-            |                     Method |        Job |       Runtime |        TestImageLossy |        TestImageLossless |       Mean |     Error |   StdDev |     Gen 0 |     Gen 1 | Gen 2 |   Allocated |
-            |--------------------------- |----------- |-------------- |---------------------- |------------------------- |-----------:|----------:|---------:|----------:|----------:|------:|------------:|
-            |        'Magick Lossy Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.8 ms |   6.28 ms |  0.34 ms |         - |         - |     - |    17.65 KB |
-            |    'ImageSharp Lossy Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp | 1,145.0 ms | 110.82 ms |  6.07 ms |         - |         - |     - |  2779.53 KB |
-            |     'Magick Lossless Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   145.9 ms |   8.55 ms |  0.47 ms |         - |         - |     - |    18.05 KB |
-            | 'ImageSharp Lossless Webp' | Job-IERNAB |    .NET 4.7.2 | Webp/earth_lossy.webp | Webp/earth_lossless.webp | 1,694.1 ms |  55.09 ms |  3.02 ms | 4000.0000 | 1000.0000 |     - | 30556.87 KB |
-            |        'Magick Lossy Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.7 ms |   1.89 ms |  0.10 ms |         - |         - |     - |    15.75 KB |
-            |    'ImageSharp Lossy Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   741.6 ms |  21.45 ms |  1.18 ms |         - |         - |     - |  2767.85 KB |
-            |     'Magick Lossless Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   146.1 ms |   9.52 ms |  0.52 ms |         - |         - |     - |    16.54 KB |
-            | 'ImageSharp Lossless Webp' | Job-IMRAGJ | .NET Core 2.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   522.5 ms |  21.15 ms |  1.16 ms | 4000.0000 | 1000.0000 |     - | 22860.02 KB |
-            |        'Magick Lossy Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.9 ms |   5.34 ms |  0.29 ms |         - |         - |     - |    15.45 KB |
-            |    'ImageSharp Lossy Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   748.8 ms | 290.47 ms | 15.92 ms |         - |         - |     - |  2767.84 KB |
-            |     'Magick Lossless Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   146.1 ms |   1.14 ms |  0.06 ms |         - |         - |     - |     15.9 KB |
-            | 'ImageSharp Lossless Webp' | Job-NAASQX | .NET Core 3.1 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   480.7 ms |  25.25 ms |  1.38 ms | 4000.0000 | 1000.0000 |     - |  22859.7 KB |
-            |        'Magick Lossy Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   105.7 ms |   4.71 ms |  0.26 ms |         - |         - |     - |    15.48 KB |
-            |    'ImageSharp Lossy Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   645.7 ms |  61.00 ms |  3.34 ms |         - |         - |     - |  2768.13 KB |
-            |     'Magick Lossless Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   146.5 ms |  18.63 ms |  1.02 ms |         - |         - |     - |     15.8 KB |
-            | 'ImageSharp Lossless Webp' | Job-GLNACU | .NET Core 5.0 | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   306.7 ms |  32.31 ms |  1.77 ms | 4000.0000 | 1000.0000 |     - | 22860.02 KB |
+            .NET SDK=6.0.100-rc.2.21505.57
+              [Host]     : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+              Job-WQLXJO : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+              Job-OJJAMD : .NET Core 3.1.20 (CoreCLR 4.700.21.47003, CoreFX 4.700.21.47101), X64 RyuJIT
+              Job-OMFOAS : .NET Framework 4.8 (4.8.4420.0), X64 RyuJIT
+
+            |                     Method |        Job |              Runtime |             Arguments |        TestImageLossy |        TestImageLossless |       Mean |     Error |  StdDev |    Gen 0 | Gen 1 | Gen 2 | Allocated |
+            |--------------------------- |----------- |--------------------- |---------------------- |---------------------- |------------------------- |-----------:|----------:|--------:|---------:|------:|------:|----------:|
+            |        'Magick Lossy Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   107.9 ms |  28.91 ms | 1.58 ms |        - |     - |     - |     25 KB |
+            |    'ImageSharp Lossy Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   282.3 ms |  25.40 ms | 1.39 ms | 500.0000 |     - |     - |  2,428 KB |
+            |     'Magick Lossless Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.3 ms |  11.99 ms | 0.66 ms |        - |     - |     - |     16 KB |
+            | 'ImageSharp Lossless Webp' | Job-HLWZLL |             .NET 5.0 | /p:DebugType=portable | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   280.2 ms |   6.21 ms | 0.34 ms |        - |     - |     - |  2,092 KB |
+            |        'Magick Lossy Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.2 ms |   9.32 ms | 0.51 ms |        - |     - |     - |     15 KB |
+            |    'ImageSharp Lossy Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   295.8 ms |  21.25 ms | 1.16 ms | 500.0000 |     - |     - |  2,427 KB |
+            |     'Magick Lossless Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.5 ms |   4.07 ms | 0.22 ms |        - |     - |     - |     15 KB |
+            | 'ImageSharp Lossless Webp' | Job-ALQPDS |        .NET Core 3.1 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   464.0 ms |  55.70 ms | 3.05 ms |        - |     - |     - |  2,090 KB |
+            |        'Magick Lossy Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   108.0 ms |  29.60 ms | 1.62 ms |        - |     - |     - |     32 KB |
+            |    'ImageSharp Lossy Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   564.9 ms |  29.69 ms | 1.63 ms |        - |     - |     - |  2,436 KB |
+            |     'Magick Lossless Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp |   106.2 ms |   4.74 ms | 0.26 ms |        - |     - |     - |     18 KB |
+            | 'ImageSharp Lossless Webp' | Job-RYVVNN | .NET Framework 4.7.2 |               Default | Webp/earth_lossy.webp | Webp/earth_lossless.webp | 1,767.5 ms | 106.33 ms | 5.83 ms |        - |     - |     - |  9,729 KB |
          */
     }
 }
diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
index 222984992..43d8c464c 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs
@@ -110,37 +110,34 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs
             });
         }
 
-        /* Results 17.06.2021
+        /* Results 04.11.2021
          * Summary *
-        BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.630 (2004/?/20H1)
+        BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19043.1320 (21H1/May2021Update)
         Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
-        .NET Core SDK=5.0.100
-          [Host]     : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
-          Job-OUUGWL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT
-          Job-GAIITM : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT
-          Job-HWOBSO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
-
-        |                     Method |        Job |       Runtime |    TestImage |      Mean |      Error |    StdDev | Ratio | RatioSD |      Gen 0 |     Gen 1 |     Gen 2 |    Allocated |
-        |--------------------------- |----------- |-------------- |------------- |----------:|-----------:|----------:|------:|--------:|-----------:|----------:|----------:|-------------:|
-        |        'Magick Webp Lossy' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png |  23.30 ms |   0.869 ms |  0.048 ms |  0.14 |    0.00 |          - |         - |         - |     68.19 KB |
-        |    'ImageSharp Webp Lossy' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png |  68.22 ms |  16.454 ms |  0.902 ms |  0.42 |    0.01 |  6125.0000 |  125.0000 |         - |  26359.49 KB |
-        |     'Magick Webp Lossless' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png | 161.96 ms |   9.879 ms |  0.541 ms |  1.00 |    0.00 |          - |         - |         - |    520.28 KB |
-        | 'ImageSharp Webp Lossless' | Job-RYVNHD |    .NET 4.7.2 | Png/Bike.png | 370.88 ms |  58.875 ms |  3.227 ms |  2.29 |    0.02 | 34000.0000 | 5000.0000 | 2000.0000 | 163177.15 KB |
-        |                            |            |               |              |           |            |           |       |         |            |           |           |              |
-        |        'Magick Webp Lossy' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png |  23.35 ms |   0.428 ms |  0.023 ms |  0.14 |    0.00 |          - |         - |         - |     67.76 KB |
-        |    'ImageSharp Webp Lossy' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png |  43.95 ms |   2.850 ms |  0.156 ms |  0.27 |    0.00 |  6250.0000 |  250.0000 |   83.3333 |  26284.72 KB |
-        |     'Magick Webp Lossless' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png | 161.44 ms |   3.749 ms |  0.206 ms |  1.00 |    0.00 |          - |         - |         - |    519.26 KB |
-        | 'ImageSharp Webp Lossless' | Job-GOZXWU | .NET Core 2.1 | Png/Bike.png | 335.78 ms |  78.666 ms |  4.312 ms |  2.08 |    0.03 | 34000.0000 | 5000.0000 | 2000.0000 | 162727.56 KB |
-        |                            |            |               |              |           |            |           |       |         |            |           |           |              |
-        |        'Magick Webp Lossy' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png |  23.48 ms |   4.325 ms |  0.237 ms |  0.15 |    0.00 |          - |         - |         - |     67.66 KB |
-        |    'ImageSharp Webp Lossy' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png |  43.29 ms |  16.503 ms |  0.905 ms |  0.27 |    0.01 |  6272.7273 |  272.7273 |   90.9091 |  26284.86 KB |
-        |     'Magick Webp Lossless' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png | 161.81 ms |  10.693 ms |  0.586 ms |  1.00 |    0.00 |          - |         - |         - |    523.25 KB |
-        | 'ImageSharp Webp Lossless' | Job-VRDVKW | .NET Core 3.1 | Png/Bike.png | 323.97 ms | 235.468 ms | 12.907 ms |  2.00 |    0.08 | 34000.0000 | 5000.0000 | 2000.0000 | 162724.84 KB |
-        |                            |            |               |              |           |            |           |       |         |            |           |           |              |
-        |        'Magick Webp Lossy' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png |  23.36 ms |   0.448 ms |  0.025 ms |  0.14 |    0.00 |          - |         - |         - |     67.66 KB |
-        |    'ImageSharp Webp Lossy' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png |  40.11 ms |   2.465 ms |  0.135 ms |  0.25 |    0.00 |  6307.6923 |  230.7692 |   76.9231 |  26284.71 KB |
-        |     'Magick Webp Lossless' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png | 161.55 ms |   6.662 ms |  0.365 ms |  1.00 |    0.00 |          - |         - |         - |    518.84 KB |
-        | 'ImageSharp Webp Lossless' | Job-ZJRLRB | .NET Core 5.0 | Png/Bike.png | 298.73 ms |  17.953 ms |  0.984 ms |  1.85 |    0.01 | 34000.0000 | 5000.0000 | 2000.0000 | 162725.13 KB |
+        .NET SDK=6.0.100-rc.2.21505.57
+          [Host]     : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+          Job-WQLXJO : .NET 5.0.11 (5.0.1121.47308), X64 RyuJIT
+          Job-OJJAMD : .NET Core 3.1.20 (CoreCLR 4.700.21.47003, CoreFX 4.700.21.47101), X64 RyuJIT
+          Job-OMFOAS : .NET Framework 4.8 (4.8.4420.0), X64 RyuJIT
+
+        IterationCount=3  LaunchCount=1  WarmupCount=3
+
+        |                     Method |        Job |              Runtime |             Arguments |    TestImage |      Mean |     Error |   StdDev | Ratio | RatioSD |       Gen 0 |     Gen 1 |     Gen 2 |  Allocated |
+        |--------------------------- |----------- |--------------------- |---------------------- |------------- |----------:|----------:|---------:|------:|--------:|------------:|----------:|----------:|-----------:|
+        |        'Magick Webp Lossy' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png |  23.33 ms |  1.491 ms | 0.082 ms |  0.15 |    0.00 |           - |         - |         - |      67 KB |
+        |    'ImageSharp Webp Lossy' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png | 245.80 ms | 24.288 ms | 1.331 ms |  1.53 |    0.01 | 135000.0000 |         - |         - | 552,713 KB |
+        |     'Magick Webp Lossless' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png | 160.36 ms | 11.131 ms | 0.610 ms |  1.00 |    0.00 |           - |         - |         - |     518 KB |
+        | 'ImageSharp Webp Lossless' | Job-WQLXJO |             .NET 5.0 | /p:DebugType=portable | Png/Bike.png | 313.93 ms | 45.605 ms | 2.500 ms |  1.96 |    0.01 |  34000.0000 | 5000.0000 | 2000.0000 | 161,670 KB |
+        |                            |            |                      |                       |              |           |           |          |       |         |             |           |           |            |
+        |        'Magick Webp Lossy' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png |  23.36 ms |  2.289 ms | 0.125 ms |  0.15 |    0.00 |           - |         - |         - |      67 KB |
+        |    'ImageSharp Webp Lossy' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png | 254.64 ms | 19.620 ms | 1.075 ms |  1.59 |    0.00 | 135000.0000 |         - |         - | 552,713 KB |
+        |     'Magick Webp Lossless' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png | 160.30 ms |  9.549 ms | 0.523 ms |  1.00 |    0.00 |           - |         - |         - |     518 KB |
+        | 'ImageSharp Webp Lossless' | Job-OJJAMD |        .NET Core 3.1 |               Default | Png/Bike.png | 320.35 ms | 22.924 ms | 1.257 ms |  2.00 |    0.01 |  34000.0000 | 5000.0000 | 2000.0000 | 161,669 KB |
+        |                            |            |                      |                       |              |           |           |          |       |         |             |           |           |            |
+        |        'Magick Webp Lossy' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png |  23.37 ms |  0.908 ms | 0.050 ms |  0.15 |    0.00 |           - |         - |         - |      68 KB |
+        |    'ImageSharp Webp Lossy' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png | 378.67 ms | 25.540 ms | 1.400 ms |  2.36 |    0.01 | 135000.0000 |         - |         - | 554,351 KB |
+        |     'Magick Webp Lossless' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png | 160.13 ms |  5.115 ms | 0.280 ms |  1.00 |    0.00 |           - |         - |         - |     520 KB |
+        | 'ImageSharp Webp Lossless' | Job-OMFOAS | .NET Framework 4.7.2 |               Default | Png/Bike.png | 379.01 ms | 71.192 ms | 3.902 ms |  2.37 |    0.02 |  34000.0000 | 5000.0000 | 2000.0000 | 162,119 KB |
         */
     }
 }

From b9e8f76990206843b485006bac8b9ff2cceb05ed Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Sun, 7 Nov 2021 18:07:43 +1100
Subject: [PATCH 19/36] Update FromPixel

---
 src/ImageSharp/Color/Color.Conversions.cs | 11 +++++++++++
 src/ImageSharp/Color/Color.cs             | 22 +++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 424b7dcdf..96aa05c96 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -23,6 +23,17 @@ namespace SixLabors.ImageSharp
             this.boxedHighPrecisionPixel = null;
         }
 
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Color"/> struct.
+        /// </summary>
+        /// <param name="pixel">The <see cref="Rgb48"/> containing the color information.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public Color(Rgb48 pixel)
+        {
+            this.data = new Rgba64(pixel.R, pixel.G, pixel.B, ushort.MaxValue);
+            this.boxedHighPrecisionPixel = null;
+        }
+
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index 61d6c8e6d..c461d034e 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -107,7 +107,27 @@ namespace SixLabors.ImageSharp
         [MethodImpl(InliningOptions.ShortMethod)]
         public static Color FromPixel<TPixel>(TPixel pixel)
             where TPixel : unmanaged, IPixel<TPixel>
-            => new(pixel);
+        {
+            // Avoid boxing in case we can convert to Rgba64 safely and efficiently
+            if (typeof(TPixel) == typeof(Rgba64))
+            {
+                return new((Rgba64)(object)pixel);
+            }
+            else if (typeof(TPixel) == typeof(Rgb48))
+            {
+                return new((Rgb48)(object)pixel);
+            }
+            else if (Unsafe.SizeOf<TPixel>() <= Unsafe.SizeOf<Rgba32>())
+            {
+                Rgba32 p = default;
+                pixel.ToRgba32(ref p);
+                return new(p);
+            }
+            else
+            {
+                return new(pixel);
+            }
+        }
 
         /// <summary>
         /// Creates a new instance of the <see cref="Color"/> struct

From 5b1720eb8deccd3ea37248111a68df73ce632c3a Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 13:27:08 +0100
Subject: [PATCH 20/36] Add sse41 version of Hadamard transform

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 151 +++++++++++++++++-
 1 file changed, 146 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 04ff80b2d..0993e2a66 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,11 +4,15 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static class LossyUtils
+    internal static unsafe class LossyUtils
     {
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
@@ -61,11 +65,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w)
         {
             int d = 0;
+            int dataSize = (4 * WebpConstants.Bps) - 16;
             for (int y = 0; y < 16 * WebpConstants.Bps; y += 4 * WebpConstants.Bps)
             {
                 for (int x = 0; x < 16; x += 4)
                 {
-                    d += Vp8Disto4X4(a.Slice(x + y), b.Slice(x + y), w);
+                    d += Vp8Disto4X4(a.Slice(x + y, dataSize), b.Slice(x + y, dataSize), w);
                 }
             }
 
@@ -75,9 +80,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w)
         {
-            int sum1 = TTransform(a, w);
-            int sum2 = TTransform(b, w);
-            return Math.Abs(sum2 - sum1) >> 5;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
+            {
+                int diffSum = TTransformSse41(a, b, w);
+                return Math.Abs(diffSum) >> 5;
+            }
+            else
+#endif
+            {
+                int sum1 = TTransform(a, w);
+                int sum2 = TTransform(b, w);
+                return Math.Abs(sum2 - sum1) >> 5;
+            }
         }
 
         public static void DC16(Span<byte> dst, Span<byte> yuv, int offset)
@@ -591,6 +606,132 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             return sum;
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// Hadamard transform
+        /// Returns the weighted sum of the absolute value of transformed coefficients.
+        /// w[] contains a row-major 4 by 4 symmetric matrix.
+        /// </summary>
+        public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
+        {
+            Span<int> sum = stackalloc int[4];
+#pragma warning disable SA1503 // Braces should not be omitted
+            fixed (byte* inputAPtr = inputA)
+            fixed (byte* inputBPtr = inputB)
+            fixed (ushort* wPtr = w)
+            fixed (int* outputPtr = sum)
+            {
+                // Load and combine inputs.
+                Vector128<byte> ina0 = Sse2.LoadVector128(inputAPtr);
+                Vector128<byte> ina1 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 1));
+                Vector128<byte> ina2 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 2));
+                Vector128<long> ina3 = Sse2.LoadVector128((long*)(inputAPtr + (WebpConstants.Bps * 3)));
+                Vector128<byte> inb0 = Sse2.LoadVector128(inputBPtr);
+                Vector128<byte> inb1 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 1));
+                Vector128<byte> inb2 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 2));
+                Vector128<long> inb3 = Sse2.LoadVector128((long*)(inputBPtr + (WebpConstants.Bps * 3)));
+
+                // Combine inA and inB (we'll do two transforms in parallel).
+                Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
+                Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
+                Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
+                Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
+                Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
+                Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
+                Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
+                Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
+
+                // a00 a01 a02 a03   b00 b01 b02 b03
+                // a10 a11 a12 a13   b10 b11 b12 b13
+                // a20 a21 a22 a23   b20 b21 b22 b23
+                // a30 a31 a32 a33   b30 b31 b32 b33
+                // Vertical pass first to avoid a transpose (vertical and horizontal passes
+                // are commutative because w/kWeightY is symmetric) and subsequent transpose.
+                // Calculate a and b (two 4x4 at once).
+                Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
+                Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
+                Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
+                Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
+                Vector128<short> b0 = Sse2.Add(a0, a1);
+                Vector128<short> b1 = Sse2.Add(a3, a2);
+                Vector128<short> b2 = Sse2.Subtract(a3, a2);
+                Vector128<short> b3 = Sse2.Subtract(a0, a1);
+
+                // a00 a01 a02 a03   b00 b01 b02 b03
+                // a10 a11 a12 a13   b10 b11 b12 b13
+                // a20 a21 a22 a23   b20 b21 b22 b23
+                // a30 a31 a32 a33   b30 b31 b32 b33
+                // Transpose the two 4x4.
+                Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
+                Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
+                Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
+                Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
+
+                // a00 a10 a01 a11   a02 a12 a03 a13
+                // a20 a30 a21 a31   a22 a32 a23 a33
+                // b00 b10 b01 b11   b02 b12 b03 b13
+                // b20 b30 b21 b31   b22 b32 b23 b33
+                Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
+                Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
+                Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
+                Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
+
+                // a00 a10 a20 a30 a01 a11 a21 a31
+                // b00 b10 b20 b30 b01 b11 b21 b31
+                // a02 a12 a22 a32 a03 a13 a23 a33
+                // b02 b12 b22 b32 b03 b13 b23 b33
+                Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
+                Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
+                Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
+                Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+
+                // a00 a10 a20 a30   b00 b10 b20 b30
+                // a01 a11 a21 a31   b01 b11 b21 b31
+                // a02 a12 a22 a32   b02 b12 b22 b32
+                // a03 a13 a23 a33   b03 b13 b23 b33
+                // Horizontal pass and difference of weighted sums.
+                Vector128<ushort> w0 = Sse2.LoadVector128(wPtr);
+                Vector128<ushort> w8 = Sse2.LoadVector128(wPtr + 8);
+
+                // Calculate a and b (two 4x4 at once).
+                a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
+                a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
+                a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
+                a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
+                b0 = Sse2.Add(a0, a1);
+                b1 = Sse2.Add(a3, a2);
+                b2 = Sse2.Subtract(a3, a2);
+                b3 = Sse2.Subtract(a0, a1);
+
+                // Separate the transforms of inA and inB.
+                Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
+                Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
+                Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
+                Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
+
+                Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
+                Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
+                Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
+                Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
+
+                // weighted sums.
+                Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
+                Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
+                Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
+                Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
+                Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
+                Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
+
+                // difference of weighted sums.
+                Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
+                Sse2.Store(outputPtr, result.AsInt32());
+            }
+
+            return sum[3] + sum[2] + sum[1] + sum[0];
+#pragma warning restore SA1503 // Braces should not be omitted
+        }
+#endif
+
         public static void TransformTwo(Span<short> src, Span<byte> dst)
         {
             TransformOne(src, dst);

From d2017933d7042d3757062cfe3134206652ce7b27 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 13:31:11 +0100
Subject: [PATCH 21/36] Add HadamardTransform sse tests

---
 .../Formats/WebP/LossyUtilsTests.cs           | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
new file mode 100644
index 000000000..6a9a078d7
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -0,0 +1,58 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using SixLabors.ImageSharp.Formats.Webp.Lossy;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.WebP
+{
+    [Trait("Format", "Webp")]
+    public class LossyUtilsTests
+    {
+        private static void RunHadamardTransformTest()
+        {
+            byte[] a =
+            {
+                27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
+                129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
+                28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
+                26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
+                128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27
+            };
+
+            byte[] b =
+            {
+                28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+                28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28,
+                28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28
+            };
+
+            ushort[] w = { 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2 };
+            int expected = 2;
+
+            int actual = LossyUtils.Vp8Disto4X4(a, b, w);
+            Assert.Equal(expected, actual);
+        }
+
+        [Fact]
+        public void HadamardTransform_Works() => RunHadamardTransformTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void HadamardTransform_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE2);
+
+        [Fact]
+        public void HadamardTransform_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41);
+
+        [Fact]
+        public void HadamardTransform_WithoutSSE2AndSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41 | HwIntrinsics.DisableSSE2);
+#endif
+
+    }
+}

From 3a03fad75eaa8464d1bd84cccd307014f9417497 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 14:51:51 +0100
Subject: [PATCH 22/36] Add sse41 version of quantize block

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 176 ++++++++++++++----
 1 file changed, 144 insertions(+), 32 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 2ed438166..02087ceda 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -3,13 +3,17 @@
 
 using System;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
     /// <summary>
     /// Quantization methods.
     /// </summary>
-    internal static class QuantEnc
+    internal static unsafe class QuantEnc
     {
         private static readonly byte[] Zigzag = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
 
@@ -17,6 +21,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         private const int MaxLevel = 2047;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<short> MaxCoeff2047 = Vector128.Create((short)MaxLevel);
+
+        private static readonly Vector128<byte> CstLo = Vector128.Create(0, 1, 2, 3, 8, 9, 254, 255, 10, 11, 4, 5, 6, 7, 12, 13);
+
+        private static readonly Vector128<byte> Cst7 = Vector128.Create(254, 255, 254, 255, 254, 255, 254, 255, 14, 15, 254, 255, 254, 255, 254, 255);
+
+        private static readonly Vector128<byte> CstHi = Vector128.Create(2, 3, 8, 9, 10, 11, 4, 5, 254, 255, 6, 7, 12, 13, 14, 15);
+
+        private static readonly Vector128<byte> Cst8 = Vector128.Create(254, 255, 254, 255, 254, 255, 0, 1, 254, 255, 254, 255, 254, 255, 254, 255);
+#endif
+
         // Diffusion weights. We under-correct a bit (15/16th of the error is actually
         // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
         private const int C1 = 7;    // fraction of error sent to the 4x4 block below
@@ -486,51 +502,147 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Quantize2Blocks(Span<short> input, Span<short> output, Vp8Matrix mtx)
         {
-            int nz = QuantizeBlock(input, output, mtx) << 0;
-            nz |= QuantizeBlock(input.Slice(1 * 16), output.Slice(1 * 16), mtx) << 1;
+            int nz = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), mtx) << 0;
+            nz |= QuantizeBlock(input.Slice(1 * 16, 16), output.Slice(1 * 16, 16), mtx) << 1;
             return nz;
         }
 
         public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix mtx)
         {
-            int last = -1;
-            int n;
-            for (n = 0; n < 16; ++n)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
             {
-                int j = Zigzag[n];
-                bool sign = input[j] < 0;
-                uint coeff = (uint)((sign ? -input[j] : input[j]) + mtx.Sharpen[j]);
-                if (coeff > mtx.ZThresh[j])
+#pragma warning disable SA1503 // Braces should not be omitted
+                fixed (ushort* mtxIqPtr = mtx.IQ)
+                fixed (ushort* mtxQPtr = mtx.Q)
+                fixed (uint* biasQPtr = mtx.Bias)
+                fixed (short* inputPtr = input)
+                fixed (short* outputPtr = output)
                 {
-                    uint q = mtx.Q[j];
-                    uint iQ = mtx.IQ[j];
-                    uint b = mtx.Bias[j];
-                    int level = QuantDiv(coeff, iQ, b);
-                    if (level > MaxLevel)
+                    // Load all inputs.
+                    Vector128<short> input0 = Sse2.LoadVector128(inputPtr);
+                    Vector128<short> input8 = Sse2.LoadVector128(inputPtr + 8);
+                    Vector128<ushort> iq0 = Sse2.LoadVector128(mtxIqPtr);
+                    Vector128<ushort> iq8 = Sse2.LoadVector128(mtxIqPtr + 8);
+                    Vector128<ushort> q0 = Sse2.LoadVector128(mtxQPtr);
+                    Vector128<ushort> q8 = Sse2.LoadVector128(mtxQPtr + 8);
+
+                    // coeff = abs(in)
+                    Vector128<ushort> coeff0 = Ssse3.Abs(input0);
+                    Vector128<ushort> coeff8 = Ssse3.Abs(input8);
+
+                    // out = (coeff * iQ + B) >> QFIX
+                    // doing calculations with 32b precision (QFIX=17)
+                    // out = (coeff * iQ)
+                    Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
+                    Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
+                    Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
+                    Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);
+                    Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
+                    Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
+                    Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
+                    Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
+
+                    // out = (coeff * iQ + B)
+                    Vector128<uint> bias00 = Sse2.LoadVector128(biasQPtr);
+                    Vector128<uint> bias04 = Sse2.LoadVector128(biasQPtr + 4);
+                    Vector128<uint> bias08 = Sse2.LoadVector128(biasQPtr + 8);
+                    Vector128<uint> bias12 = Sse2.LoadVector128(biasQPtr + 12);
+                    out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
+                    out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
+                    out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
+                    out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();
+
+                    // out = QUANTDIV(coeff, iQ, B, QFIX)
+                    out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
+                    out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
+                    out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
+                    out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();
+
+                    // pack result as 16b
+                    Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
+                    Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());
+
+                    // if (coeff > 2047) coeff = 2047
+                    out0 = Sse2.Min(out0, MaxCoeff2047);
+                    out8 = Sse2.Min(out8, MaxCoeff2047);
+
+                    // put sign back
+                    out0 = Ssse3.Sign(out0, input0);
+                    out8 = Ssse3.Sign(out8, input8);
+
+                    // in = out * Q
+                    input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
+                    input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
+
+                    // in = out * Q
+                    Sse2.Store(inputPtr, input0);
+                    Sse2.Store(inputPtr + 8, input8);
+
+                    // zigzag the output before storing it. The re-ordering is:
+                    //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
+                    // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+                    // There's only two misplaced entries ([8] and [7]) that are crossing the
+                    // reg's boundaries.
+                    // We use pshufb instead of pshuflo/pshufhi.
+                    Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
+                    Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7);  // extract #7
+                    Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
+                    Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8);  // extract #8
+                    Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
+                    Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
+                    Sse2.Store(outputPtr, outZ0.AsInt16());
+                    Sse2.Store(outputPtr + 8, outZ8.AsInt16());
+                    Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
+
+                    // Detect if all 'out' values are zeroes or not.
+                    Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
+                    return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
+            }
+            else
+#endif
+            {
+                int last = -1;
+                int n;
+                for (n = 0; n < 16; ++n)
+                {
+                    int j = Zigzag[n];
+                    bool sign = input[j] < 0;
+                    uint coeff = (uint)((sign ? -input[j] : input[j]) + mtx.Sharpen[j]);
+                    if (coeff > mtx.ZThresh[j])
                     {
-                        level = MaxLevel;
-                    }
+                        uint q = mtx.Q[j];
+                        uint iQ = mtx.IQ[j];
+                        uint b = mtx.Bias[j];
+                        int level = QuantDiv(coeff, iQ, b);
+                        if (level > MaxLevel)
+                        {
+                            level = MaxLevel;
+                        }
 
-                    if (sign)
-                    {
-                        level = -level;
-                    }
+                        if (sign)
+                        {
+                            level = -level;
+                        }
 
-                    input[j] = (short)(level * (int)q);
-                    output[n] = (short)level;
-                    if (level != 0)
+                        input[j] = (short)(level * (int)q);
+                        output[n] = (short)level;
+                        if (level != 0)
+                        {
+                            last = n;
+                        }
+                    }
+                    else
                     {
-                        last = n;
+                        output[n] = 0;
+                        input[j] = 0;
                     }
                 }
-                else
-                {
-                    output[n] = 0;
-                    input[j] = 0;
-                }
-            }
 
-            return last >= 0 ? 1 : 0;
+                return last >= 0 ? 1 : 0;
+            }
         }
 
         // Quantize as usual, but also compute and return the quantization error.

From 020134ad8c15e58621635d4ca4b5fb4c6acdbe89 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 14:52:11 +0100
Subject: [PATCH 23/36] Add QuantizeBlock sse tests

---
 .../Formats/Webp/Lossy/Vp8Matrix.cs           |  9 +++
 .../Formats/WebP/QuantEncTests.cs             | 56 +++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
index 4276b887f..e525e388b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
@@ -34,6 +34,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.Sharpen = new short[16];
         }
 
+        public Vp8Matrix(ushort[] q, ushort[] iq, uint[] bias, uint[] zThresh, short[] sharpen)
+        {
+            this.Q = q;
+            this.IQ = iq;
+            this.Bias = bias;
+            this.ZThresh = zThresh;
+            this.Sharpen = sharpen;
+        }
+
         /// <summary>
         /// Gets the quantizer steps.
         /// </summary>
diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
new file mode 100644
index 000000000..280a7902a
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -0,0 +1,56 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.Linq;
+using SixLabors.ImageSharp.Formats.Webp.Lossy;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.WebP
+{
+    [Trait("Format", "Webp")]
+    public class QuantEncTests
+    {
+        private static void RunQuantizeBlockTest()
+        {
+            // arrange
+            short[] input = { 378, 777, -851, 888, 259, 148, 0, -111, -185, -185, -74, -37, 148, 74, 111, 74 };
+            short[] output = new short[16];
+            ushort[] q = { 42, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37 };
+            ushort[] iq = { 3120, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542 };
+            uint[] bias =
+            {
+                49152, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296,
+                55296, 55296
+            };
+            uint[] zthresh = { 26, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 };
+            short[] expectedOutput = { 9, 21, 7, -5, 4, -23, 24, 0, -5, 4, 2, -2, -3, -1, 3, 2 };
+            int expectedResult = 1;
+            var vp8Matrix = new Vp8Matrix(q, iq, bias, zthresh, new short[16]);
+
+            // act
+            int actualResult = QuantEnc.QuantizeBlock(input, output, vp8Matrix);
+
+            // assert
+            Assert.True(output.SequenceEqual(expectedOutput));
+            Assert.Equal(expectedResult, actualResult);
+        }
+
+        [Fact]
+        public void QuantizeBlock_Works() => RunQuantizeBlockTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void QuantizeBlock_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void QuantizeBlock_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2);
+
+        [Fact]
+        public void QuantizeBlock_WithoutSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSSE3);
+
+        [Fact]
+        public void QuantizeBlock_WithoutSSE2AndSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3);
+#endif
+    }
+}

From a628909b8da58e9dbd10bfa3b70e9c8ce66ddc1d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Sun, 7 Nov 2021 15:02:08 +0100
Subject: [PATCH 24/36] Add coeff = abs(in) + sharpen

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 02087ceda..b812909b2 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -516,6 +516,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 fixed (ushort* mtxIqPtr = mtx.IQ)
                 fixed (ushort* mtxQPtr = mtx.Q)
                 fixed (uint* biasQPtr = mtx.Bias)
+                fixed (short* sharpenPtr = mtx.Sharpen)
                 fixed (short* inputPtr = input)
                 fixed (short* outputPtr = output)
                 {
@@ -531,6 +532,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                     Vector128<ushort> coeff0 = Ssse3.Abs(input0);
                     Vector128<ushort> coeff8 = Ssse3.Abs(input8);
 
+                    // coeff = abs(in) + sharpen (Sse2.Add returns the sum, so it must be assigned back to coeff).
+                    Vector128<short> sharpen0 = Sse2.LoadVector128(sharpenPtr);
+                    Vector128<short> sharpen8 = Sse2.LoadVector128(sharpenPtr + 8);
+                    coeff0 = Sse2.Add(coeff0.AsInt16(), sharpen0).AsUInt16();
+                    coeff8 = Sse2.Add(coeff8.AsInt16(), sharpen8).AsUInt16();
+
                     // out = (coeff * iQ + B) >> QFIX
                     // doing calculations with 32b precision (QFIX=17)
                     // out = (coeff * iQ)

From af90336173a1ee20a6c894c113e5f799b139bf9f Mon Sep 17 00:00:00 2001
From: Anton Firszov <antonfir@gmail.com>
Date: Sun, 7 Nov 2021 15:25:47 +0100
Subject: [PATCH 25/36] stackalloc header buffer in InternalDetectFormat

---
 src/ImageSharp/Image.Decode.cs | 51 +++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/src/ImageSharp/Image.Decode.cs b/src/ImageSharp/Image.Decode.cs
index 94da2c995..ee340bf86 100644
--- a/src/ImageSharp/Image.Decode.cs
+++ b/src/ImageSharp/Image.Decode.cs
@@ -58,31 +58,42 @@ namespace SixLabors.ImageSharp
                 return null;
             }
 
-            using (IMemoryOwner<byte> buffer = config.MemoryAllocator.Allocate<byte>(headerSize, AllocationOptions.Clean))
+            // Header sizes are so small that headersBuffer will always be stackalloc-ed in practice
+            // and heap allocation will never happen, so there is no need for the usual try-finally ArrayPool dance.
+            // The array case is only a safety mechanism following stackalloc best practices.
+            Span<byte> headersBuffer = headerSize > 512 ? new byte[headerSize] : stackalloc byte[headerSize];
+            long startPosition = stream.Position;
+
+            // Read doesn't always guarantee the full requested length so keep reading the
+            // remaining bytes until we get either our count or hit the end of the stream.
+            int n = 0;
+            int i;
+            do
             {
-                Span<byte> bufferSpan = buffer.GetSpan();
-                long startPosition = stream.Position;
+                i = stream.Read(headersBuffer, n, headerSize - n);
+                n += i;
+            }
+            while (n < headerSize && i > 0);
 
-                // Read doesn't always guarantee the full returned length so read a byte
-                // at a time until we get either our count or hit the end of the stream.
-                int n = 0;
-                int i;
-                do
+            stream.Position = startPosition;
+
+            // Does the given stream contain enough data to fit in the header for the format
+            // and does that data match the format specification?
+            // Individual formats should still check since they are public.
+            IImageFormat format = null;
+            foreach (IImageFormatDetector formatDetector in config.ImageFormatsManager.FormatDetectors)
+            {
+                if (formatDetector.HeaderSize <= headerSize)
                 {
-                    i = stream.Read(bufferSpan, n, headerSize - n);
-                    n += i;
+                    IImageFormat attemptFormat = formatDetector.DetectFormat(headersBuffer);
+                    if (attemptFormat != null)
+                    {
+                        format = attemptFormat;
+                    }
                 }
-                while (n < headerSize && i > 0);
-
-                stream.Position = startPosition;
-
-                // Does the given stream contain enough data to fit in the header for the format
-                // and does that data match the format specification?
-                // Individual formats should still check since they are public.
-                return config.ImageFormatsManager.FormatDetectors
-                    .Where(x => x.HeaderSize <= headerSize)
-                    .Select(x => x.DetectFormat(buffer.GetSpan())).LastOrDefault(x => x != null);
             }
+
+            return format;
         }
 
         /// <summary>

From 90bab3939770a028a45e3d824dc6949fa124c492 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 8 Nov 2021 16:56:38 +1100
Subject: [PATCH 26/36] Special case La32 and L16

---
 src/ImageSharp/Color/Color.Conversions.cs | 22 ++++++++++++++++++++++
 src/ImageSharp/Color/Color.cs             |  8 ++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/ImageSharp/Color/Color.Conversions.cs b/src/ImageSharp/Color/Color.Conversions.cs
index 96aa05c96..bf7869e53 100644
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@@ -34,6 +34,28 @@ namespace SixLabors.ImageSharp
             this.boxedHighPrecisionPixel = null;
         }
 
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Color"/> struct.
+        /// </summary>
+        /// <param name="pixel">The <see cref="La32"/> containing the color information.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public Color(La32 pixel)
+        {
+            this.data = new Rgba64(pixel.L, pixel.L, pixel.L, pixel.A);
+            this.boxedHighPrecisionPixel = null;
+        }
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Color"/> struct.
+        /// </summary>
+        /// <param name="pixel">The <see cref="L16"/> containing the color information.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public Color(L16 pixel)
+        {
+            this.data = new Rgba64(pixel.PackedValue, pixel.PackedValue, pixel.PackedValue, ushort.MaxValue);
+            this.boxedHighPrecisionPixel = null;
+        }
+
         /// <summary>
         /// Initializes a new instance of the <see cref="Color"/> struct.
         /// </summary>
diff --git a/src/ImageSharp/Color/Color.cs b/src/ImageSharp/Color/Color.cs
index c461d034e..7c21d62dd 100644
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@@ -117,6 +117,14 @@ namespace SixLabors.ImageSharp
             {
                 return new((Rgb48)(object)pixel);
             }
+            else if (typeof(TPixel) == typeof(La32))
+            {
+                return new((La32)(object)pixel);
+            }
+            else if (typeof(TPixel) == typeof(L16))
+            {
+                return new((L16)(object)pixel);
+            }
             else if (Unsafe.SizeOf<TPixel>() <= Unsafe.SizeOf<Rgba32>())
             {
                 Rgba32 p = default;

From 5c6e08b80c39f3cd4e24774ee66b5b011c41aa00 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 8 Nov 2021 16:02:06 +0100
Subject: [PATCH 27/36] Avoid pinning of vp8 matrix data

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 169 +++++++++---------
 1 file changed, 85 insertions(+), 84 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index f935bd3ee..b300b7b5c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -3,6 +3,7 @@
 
 using System;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -537,99 +538,99 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             if (Sse41.IsSupported)
             {
 #pragma warning disable SA1503 // Braces should not be omitted
-                fixed (ushort* mtxIqPtr = mtx.IQ)
-                fixed (ushort* mtxQPtr = mtx.Q)
-                fixed (uint* biasQPtr = mtx.Bias)
-                fixed (short* sharpenPtr = mtx.Sharpen)
+                // Load all inputs.
+                Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
+                Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
+                Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(0, 8)));
+                Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(8, 8)));
+                Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(0, 8)));
+                Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(8, 8)));
+
+                // coeff = abs(in)
+                Vector128<ushort> coeff0 = Ssse3.Abs(input0);
+                Vector128<ushort> coeff8 = Ssse3.Abs(input8);
+
+                // coeff = abs(in) + sharpen
+                Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(0, 8)));
+                Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(8, 8)));
+                Sse2.Add(coeff0.AsInt16(), sharpen0);
+                Sse2.Add(coeff8.AsInt16(), sharpen8);
+
+                // out = (coeff * iQ + B) >> QFIX
+                // doing calculations with 32b precision (QFIX=17)
+                // out = (coeff * iQ)
+                Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
+                Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
+                Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
+                Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);
+                Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
+                Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
+                Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
+                Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
+
+                // out = (coeff * iQ + B)
+                Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(0, 4)));
+                Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(4, 4)));
+                Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(8, 4)));
+                Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(12, 4)));
+                out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
+                out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
+                out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
+                out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();
+
+                // out = QUANTDIV(coeff, iQ, B, QFIX)
+                out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
+                out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
+                out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
+                out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();
+
+                // pack result as 16b
+                Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
+                Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());
+
+                // if (coeff > 2047) coeff = 2047
+                out0 = Sse2.Min(out0, MaxCoeff2047);
+                out8 = Sse2.Min(out8, MaxCoeff2047);
+
+                // put sign back
+                out0 = Ssse3.Sign(out0, input0);
+                out8 = Ssse3.Sign(out8, input8);
+
+                // in = out * Q
+                input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
+                input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
+
                 fixed (short* inputPtr = input)
-                fixed (short* outputPtr = output)
                 {
-                    // Load all inputs.
-                    Vector128<short> input0 = Sse2.LoadVector128(inputPtr);
-                    Vector128<short> input8 = Sse2.LoadVector128(inputPtr + 8);
-                    Vector128<ushort> iq0 = Sse2.LoadVector128(mtxIqPtr);
-                    Vector128<ushort> iq8 = Sse2.LoadVector128(mtxIqPtr + 8);
-                    Vector128<ushort> q0 = Sse2.LoadVector128(mtxQPtr);
-                    Vector128<ushort> q8 = Sse2.LoadVector128(mtxQPtr + 8);
-
-                    // coeff = abs(in)
-                    Vector128<ushort> coeff0 = Ssse3.Abs(input0);
-                    Vector128<ushort> coeff8 = Ssse3.Abs(input8);
-
-                    // coeff = abs(in) + sharpen
-                    Vector128<short> sharpen0 = Sse2.LoadVector128(sharpenPtr);
-                    Vector128<short> sharpen8 = Sse2.LoadVector128(sharpenPtr + 8);
-                    Sse2.Add(coeff0.AsInt16(), sharpen0);
-                    Sse2.Add(coeff8.AsInt16(), sharpen8);
-
-                    // out = (coeff * iQ + B) >> QFIX
-                    // doing calculations with 32b precision (QFIX=17)
-                    // out = (coeff * iQ)
-                    Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
-                    Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
-                    Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
-                    Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);
-                    Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
-                    Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
-                    Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
-                    Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
-
-                    // out = (coeff * iQ + B)
-                    Vector128<uint> bias00 = Sse2.LoadVector128(biasQPtr);
-                    Vector128<uint> bias04 = Sse2.LoadVector128(biasQPtr + 4);
-                    Vector128<uint> bias08 = Sse2.LoadVector128(biasQPtr + 8);
-                    Vector128<uint> bias12 = Sse2.LoadVector128(biasQPtr + 12);
-                    out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
-                    out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
-                    out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
-                    out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();
-
-                    // out = QUANTDIV(coeff, iQ, B, QFIX)
-                    out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
-                    out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
-                    out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
-                    out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();
-
-                    // pack result as 16b
-                    Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
-                    Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());
-
-                    // if (coeff > 2047) coeff = 2047
-                    out0 = Sse2.Min(out0, MaxCoeff2047);
-                    out8 = Sse2.Min(out8, MaxCoeff2047);
-
-                    // put sign back
-                    out0 = Ssse3.Sign(out0, input0);
-                    out8 = Ssse3.Sign(out8, input8);
-
-                    // in = out * Q
-                    input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
-                    input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
-
                     // in = out * Q
                     Sse2.Store(inputPtr, input0);
                     Sse2.Store(inputPtr + 8, input8);
+                }
 
-                    // zigzag the output before storing it. The re-ordering is:
-                    //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
-                    // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
-                    // There's only two misplaced entries ([8] and [7]) that are crossing the
-                    // reg's boundaries.
-                    // We use pshufb instead of pshuflo/pshufhi.
-                    Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
-                    Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7);  // extract #7
-                    Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
-                    Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8);  // extract #8
-                    Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
-                    Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
+                // zigzag the output before storing it. The re-ordering is:
+                //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
+                // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+                // There's only two misplaced entries ([8] and [7]) that are crossing the
+                // reg's boundaries.
+                // We use pshufb instead of pshuflo/pshufhi.
+                Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
+                Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7);  // extract #7
+                Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
+                Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8);  // extract #8
+                Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
+                Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
+
+                fixed (short* outputPtr = output)
+                {
                     Sse2.Store(outputPtr, outZ0.AsInt16());
                     Sse2.Store(outputPtr + 8, outZ8.AsInt16());
-                    Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
-
-                    // Detect if all 'out' values are zeroes or not.
-                    Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
-                    return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
                 }
+
+                Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
+
+                // Detect if all 'out' values are zeroes or not.
+                Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
+                return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
 #pragma warning restore SA1503 // Braces should not be omitted
             }
             else

From 0c0812de82648be40a35dc63a9b6c914bdcbbbf7 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 8 Nov 2021 16:58:40 +0100
Subject: [PATCH 28/36] Avoid pinning input and output data

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index b300b7b5c..6e25dc003 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -600,12 +600,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
                 input8 = Sse2.MultiplyLow(out8, q8.AsInt16());
 
-                fixed (short* inputPtr = input)
-                {
-                    // in = out * Q
-                    Sse2.Store(inputPtr, input0);
-                    Sse2.Store(inputPtr + 8, input8);
-                }
+                // in = out * Q
+                ref short inputRef = ref MemoryMarshal.GetReference(input);
+                Unsafe.As<short, Vector128<short>>(ref inputRef) = input0;
+                Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref inputRef, 8)) = input8;
 
                 // zigzag the output before storing it. The re-ordering is:
                 //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
@@ -620,11 +618,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
                 Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);
 
-                fixed (short* outputPtr = output)
-                {
-                    Sse2.Store(outputPtr, outZ0.AsInt16());
-                    Sse2.Store(outputPtr + 8, outZ8.AsInt16());
-                }
+                ref short outputRef = ref MemoryMarshal.GetReference(output);
+                Unsafe.As<short, Vector128<short>>(ref outputRef) = outZ0.AsInt16();
+                Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref outputRef, 8)) = outZ8.AsInt16();
 
                 Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
 

From cffa4b0c366a3d80b7e5c315127ae0a27f1ddb8d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Mon, 8 Nov 2021 17:00:18 +0100
Subject: [PATCH 29/36] Only test with and without HardwareIntrinsics

---
 tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
index 280a7902a..d0cdfc1de 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -44,13 +44,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
         public void QuantizeBlock_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.AllowAll);
 
         [Fact]
-        public void QuantizeBlock_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2);
-
-        [Fact]
-        public void QuantizeBlock_WithoutSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSSE3);
-
-        [Fact]
-        public void QuantizeBlock_WithoutSSE2AndSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3);
+        public void QuantizeBlock_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableHWIntrinsic);
 #endif
     }
 }

From c9fc5cdb56a21deaf78ae4eb73a6e8270c951841 Mon Sep 17 00:00:00 2001
From: Berkan Diler <b.diler@gmx.de>
Date: Mon, 8 Nov 2021 18:33:24 +0100
Subject: [PATCH 30/36] Collapse AsSpan().Slice(..) calls into AsSpan(..)

---
 src/ImageSharp/Formats/Png/PngDecoderCore.cs                  | 2 +-
 src/ImageSharp/Formats/Webp/WebpDecoderCore.cs                | 2 +-
 src/ImageSharp/IO/ChunkedMemoryStream.cs                      | 4 ++--
 .../Processors/Transforms/Resize/ResizeKernelMap.cs           | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ImageSharp/Formats/Png/PngDecoderCore.cs b/src/ImageSharp/Formats/Png/PngDecoderCore.cs
index 987dc150c..cf3cd7eb1 100644
--- a/src/ImageSharp/Formats/Png/PngDecoderCore.cs
+++ b/src/ImageSharp/Formats/Png/PngDecoderCore.cs
@@ -1071,7 +1071,7 @@ namespace SixLabors.ImageSharp.Formats.Png
                 int bytesRead = inflateStream.CompressedStream.Read(this.buffer, 0, this.buffer.Length);
                 while (bytesRead != 0)
                 {
-                    uncompressedBytes.AddRange(this.buffer.AsSpan().Slice(0, bytesRead).ToArray());
+                    uncompressedBytes.AddRange(this.buffer.AsSpan(0, bytesRead).ToArray());
                     bytesRead = inflateStream.CompressedStream.Read(this.buffer, 0, this.buffer.Length);
                 }
 
diff --git a/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs b/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
index 44a55a4c6..09071406c 100644
--- a/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
+++ b/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
@@ -306,7 +306,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
 
             // Check for VP8 magic bytes.
             this.currentStream.Read(this.buffer, 0, 3);
-            if (!this.buffer.AsSpan().Slice(0, 3).SequenceEqual(WebpConstants.Vp8HeaderMagicBytes))
+            if (!this.buffer.AsSpan(0, 3).SequenceEqual(WebpConstants.Vp8HeaderMagicBytes))
             {
                 WebpThrowHelper.ThrowImageFormatException("VP8 magic bytes not found");
             }
diff --git a/src/ImageSharp/IO/ChunkedMemoryStream.cs b/src/ImageSharp/IO/ChunkedMemoryStream.cs
index b9220c56a..e28baf879 100644
--- a/src/ImageSharp/IO/ChunkedMemoryStream.cs
+++ b/src/ImageSharp/IO/ChunkedMemoryStream.cs
@@ -243,7 +243,7 @@ namespace SixLabors.ImageSharp.IO
             const string bufferMessage = "Offset subtracted from the buffer length is less than count.";
             Guard.IsFalse(buffer.Length - offset < count, nameof(buffer), bufferMessage);
 
-            return this.ReadImpl(buffer.AsSpan().Slice(offset, count));
+            return this.ReadImpl(buffer.AsSpan(offset, count));
         }
 
 #if SUPPORTS_SPAN_STREAM
@@ -359,7 +359,7 @@ namespace SixLabors.ImageSharp.IO
             const string bufferMessage = "Offset subtracted from the buffer length is less than count.";
             Guard.IsFalse(buffer.Length - offset < count, nameof(buffer), bufferMessage);
 
-            this.WriteImpl(buffer.AsSpan().Slice(offset, count));
+            this.WriteImpl(buffer.AsSpan(offset, count));
         }
 
 #if SUPPORTS_SPAN_STREAM
diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
index a58c20f68..9cc468060 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
@@ -216,7 +216,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
 
             ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right);
 
-            Span<double> kernelValues = this.tempValues.AsSpan().Slice(0, kernel.Length);
+            Span<double> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
             double sum = 0;
 
             for (int j = left; j <= right; j++)

From 670e2eeafc14b7c16757f1b909eb552a9e61b1ca Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Tue, 9 Nov 2021 11:43:19 +1100
Subject: [PATCH 31/36] Update ColorTests.CastTo.cs

---
 .../ImageSharp.Tests/Color/ColorTests.CastTo.cs | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
index af35d1f89..3003265ca 100644
--- a/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
+++ b/tests/ImageSharp.Tests/Color/ColorTests.CastTo.cs
@@ -90,16 +90,25 @@ namespace SixLabors.ImageSharp.Tests
             }
 
             [Fact]
-            public void TPixel()
+            public void GenericPixel()
             {
-                var source = new RgbaVector(float.Epsilon, 2 * float.Epsilon, float.MaxValue, float.MinValue);
+                AssertGenericPixel(new RgbaVector(float.Epsilon, 2 * float.Epsilon, float.MaxValue, float.MinValue));
+                AssertGenericPixel(new Rgba64(1, 2, ushort.MaxValue, ushort.MaxValue - 1));
+                AssertGenericPixel(new Rgb48(1, 2, ushort.MaxValue - 1));
+                AssertGenericPixel(new La32(1, ushort.MaxValue - 1));
+                AssertGenericPixel(new L16(ushort.MaxValue - 1));
+                AssertGenericPixel(new Rgba32(1, 2, 255, 254));
+            }
 
+            private static void AssertGenericPixel<TPixel>(TPixel source)
+                where TPixel : unmanaged, IPixel<TPixel>
+            {
                 // Act:
                 var color = Color.FromPixel(source);
 
                 // Assert:
-                RgbaVector data = color.ToPixel<RgbaVector>();
-                Assert.Equal(source, data);
+                TPixel actual = color.ToPixel<TPixel>();
+                Assert.Equal(source, actual);
             }
         }
     }

From cb513a905c52e843440f14c70e40fe9192737e91 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 11:05:18 +0100
Subject: [PATCH 32/36] Use fixed sized arrays in Vp8Matrix

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 20 ++++----
 .../Formats/Webp/Lossy/Vp8Encoder.cs          |  8 +---
 .../Formats/Webp/Lossy/Vp8Matrix.cs           | 47 +++++--------------
 .../Formats/Webp/Lossy/Vp8SegmentInfo.cs      | 12 ++---
 .../Formats/WebP/QuantEncTests.cs             | 17 ++++---
 5 files changed, 41 insertions(+), 63 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 6e25dc003..4c3a2ff5e 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -541,18 +541,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 // Load all inputs.
                 Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
                 Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
-                Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(0, 8)));
-                Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(8, 8)));
-                Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(0, 8)));
-                Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(8, 8)));
+                Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.IQ[0]);
+                Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.IQ[8]);
+                Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.Q[0]);
+                Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.Q[8]);
 
                 // coeff = abs(in)
                 Vector128<ushort> coeff0 = Ssse3.Abs(input0);
                 Vector128<ushort> coeff8 = Ssse3.Abs(input8);
 
                 // coeff = abs(in) + sharpen
-                Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(0, 8)));
-                Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(8, 8)));
-                Sse2.Add(coeff0.AsInt16(), sharpen0);
-                Sse2.Add(coeff8.AsInt16(), sharpen8);
+                Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref mtx.Sharpen[0]);
+                Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref mtx.Sharpen[8]);
+                coeff0 = Sse2.Add(coeff0.AsInt16(), sharpen0).AsUInt16();
+                coeff8 = Sse2.Add(coeff8.AsInt16(), sharpen8).AsUInt16();
 
@@ -569,10 +569,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);
 
                 // out = (coeff * iQ + B)
-                Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(0, 4)));
-                Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(4, 4)));
-                Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(8, 4)));
-                Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(12, 4)));
+                Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[0]);
+                Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[4]);
+                Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[8]);
+                Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[12]);
                 out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
                 out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
                 out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 728574682..8a4115d21 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -502,7 +502,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.ResetStats();
         }
 
-        private void AdjustFilterStrength()
+        private unsafe void AdjustFilterStrength()
         {
             if (this.filterStrength > 0)
             {
@@ -806,7 +806,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             proba.NbSkip = 0;
         }
 
-        private void SetupMatrices(Vp8SegmentInfo[] dqm)
+        private unsafe void SetupMatrices(Vp8SegmentInfo[] dqm)
         {
             int tlambdaScale = this.method >= WebpEncodingMethod.Default ? this.spatialNoiseShaping : 0;
             for (int i = 0; i < dqm.Length; i++)
@@ -814,10 +814,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 Vp8SegmentInfo m = dqm[i];
                 int q = m.Quant;
 
-                m.Y1 = new Vp8Matrix();
-                m.Y2 = new Vp8Matrix();
-                m.Uv = new Vp8Matrix();
-
                 m.Y1.Q[0] = WebpLookupTables.DcTable[Numerics.Clamp(q + this.DqY1Dc, 0, 127)];
                 m.Y1.Q[1] = WebpLookupTables.AcTable[Numerics.Clamp(q, 0, 127)];
 
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
index e525e388b..66c91e44a 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
@@ -3,7 +3,7 @@
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal class Vp8Matrix
+    internal unsafe struct Vp8Matrix
     {
         private static readonly int[][] BiasMatrices =
         {
@@ -23,50 +23,29 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         private const int SharpenBits = 11;
 
         /// <summary>
-        /// Initializes a new instance of the <see cref="Vp8Matrix"/> class.
+        /// The quantizer steps.
         /// </summary>
-        public Vp8Matrix()
-        {
-            this.Q = new ushort[16];
-            this.IQ = new ushort[16];
-            this.Bias = new uint[16];
-            this.ZThresh = new uint[16];
-            this.Sharpen = new short[16];
-        }
-
-        public Vp8Matrix(ushort[] q, ushort[] iq, uint[] bias, uint[] zThresh, short[] sharpen)
-        {
-            this.Q = q;
-            this.IQ = iq;
-            this.Bias = bias;
-            this.ZThresh = zThresh;
-            this.Sharpen = sharpen;
-        }
-
-        /// <summary>
-        /// Gets the quantizer steps.
-        /// </summary>
-        public ushort[] Q { get; }
+        public fixed ushort Q[16];
 
         /// <summary>
-        /// Gets the reciprocals, fixed point.
+        /// The reciprocals, fixed point.
         /// </summary>
-        public ushort[] IQ { get; }
+        public fixed ushort IQ[16];
 
         /// <summary>
-        /// Gets the rounding bias.
+        /// The rounding bias.
         /// </summary>
-        public uint[] Bias { get; }
+        public fixed uint Bias[16];
 
         /// <summary>
-        /// Gets the value below which a coefficient is zeroed.
+        /// The value below which a coefficient is zeroed.
         /// </summary>
-        public uint[] ZThresh { get; }
+        public fixed uint ZThresh[16];
 
         /// <summary>
-        /// Gets the frequency boosters for slight sharpening.
+        /// The frequency boosters for slight sharpening.
         /// </summary>
-        public short[] Sharpen { get; }
+        public fixed short Sharpen[16];
 
         /// <summary>
         /// Returns the average quantizer.
@@ -81,7 +60,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 int isAcCoeff = i > 0 ? 1 : 0;
                 int bias = BiasMatrices[type][isAcCoeff];
                 this.IQ[i] = (ushort)((1 << WebpConstants.QFix) / this.Q[i]);
-                this.Bias[i] = (uint)this.BIAS(bias);
+                this.Bias[i] = (uint)BIAS(bias);
 
                 // zthresh is the exact value such that QUANTDIV(coeff, iQ, B) is:
                 //   * zero if coeff <= zthresh
@@ -115,6 +94,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             return (sum + 8) >> 4;
         }
 
-        private int BIAS(int b) => b << (WebpConstants.QFix - 8);
+        private static int BIAS(int b) => b << (WebpConstants.QFix - 8);
     }
 }
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
index cf2a5c177..71983055c 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
@@ -8,19 +8,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     internal class Vp8SegmentInfo
     {
         /// <summary>
-        /// Gets or sets the quantization matrix y1.
+        /// Gets the quantization matrix y1.
         /// </summary>
-        public Vp8Matrix Y1 { get; set; }
+        public Vp8Matrix Y1;
 
         /// <summary>
-        /// Gets or sets the quantization matrix y2.
+        /// Gets the quantization matrix y2.
         /// </summary>
-        public Vp8Matrix Y2 { get; set; }
+        public Vp8Matrix Y2;
 
         /// <summary>
-        /// Gets or sets the quantization matrix uv.
+        /// Gets the quantization matrix uv.
         /// </summary>
-        public Vp8Matrix Uv { get; set; }
+        public Vp8Matrix Uv;
 
         /// <summary>
         /// Gets or sets the quant-susceptibility, range [-127,127]. Zero is neutral. Lower values indicate a lower risk of blurriness.
diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
index d0cdfc1de..7465c42ce 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -11,22 +11,25 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
     [Trait("Format", "Webp")]
     public class QuantEncTests
     {
-        private static void RunQuantizeBlockTest()
+        private static unsafe void RunQuantizeBlockTest()
         {
             // arrange
             short[] input = { 378, 777, -851, 888, 259, 148, 0, -111, -185, -185, -74, -37, 148, 74, 111, 74 };
             short[] output = new short[16];
             ushort[] q = { 42, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37 };
             ushort[] iq = { 3120, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542 };
-            uint[] bias =
-            {
-                49152, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296,
-                55296, 55296
-            };
+            uint[] bias = { 49152, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296 };
             uint[] zthresh = { 26, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 };
             short[] expectedOutput = { 9, 21, 7, -5, 4, -23, 24, 0, -5, 4, 2, -2, -3, -1, 3, 2 };
             int expectedResult = 1;
-            var vp8Matrix = new Vp8Matrix(q, iq, bias, zthresh, new short[16]);
+            Vp8Matrix vp8Matrix = default;
+            for (int i = 0; i < 16; i++)
+            {
+                vp8Matrix.Q[i] = q[i];
+                vp8Matrix.IQ[i] = iq[i];
+                vp8Matrix.Bias[i] = bias[i];
+                vp8Matrix.ZThresh[i] = zthresh[i];
+            }
 
             // act
             int actualResult = QuantEnc.QuantizeBlock(input, output, vp8Matrix);

From 6e135cbd79f391f56ee69df0da2b8be505631491 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 12:38:41 +0100
Subject: [PATCH 33/36] Avoid pinning

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 219 +++++++++---------
 1 file changed, 107 insertions(+), 112 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index b8f232a43..ee224e0b0 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -614,120 +615,114 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         {
             Span<int> sum = scratch.Slice(0, 4);
             sum.Clear();
-#pragma warning disable SA1503 // Braces should not be omitted
-            fixed (byte* inputAPtr = inputA)
-            fixed (byte* inputBPtr = inputB)
-            fixed (ushort* wPtr = w)
-            fixed (int* outputPtr = sum)
-            {
-                // Load and combine inputs.
-                Vector128<byte> ina0 = Sse2.LoadVector128(inputAPtr);
-                Vector128<byte> ina1 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 1));
-                Vector128<byte> ina2 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 2));
-                Vector128<long> ina3 = Sse2.LoadVector128((long*)(inputAPtr + (WebpConstants.Bps * 3)));
-                Vector128<byte> inb0 = Sse2.LoadVector128(inputBPtr);
-                Vector128<byte> inb1 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 1));
-                Vector128<byte> inb2 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 2));
-                Vector128<long> inb3 = Sse2.LoadVector128((long*)(inputBPtr + (WebpConstants.Bps * 3)));
-
-                // Combine inA and inB (we'll do two transforms in parallel).
-                Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
-                Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
-                Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
-                Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
-                Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
-                Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
-                Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
-                Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
-
-                // a00 a01 a02 a03   b00 b01 b02 b03
-                // a10 a11 a12 a13   b10 b11 b12 b13
-                // a20 a21 a22 a23   b20 b21 b22 b23
-                // a30 a31 a32 a33   b30 b31 b32 b33
-                // Vertical pass first to avoid a transpose (vertical and horizontal passes
-                // are commutative because w/kWeightY is symmetric) and subsequent transpose.
-                // Calculate a and b (two 4x4 at once).
-                Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
-                Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
-                Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
-                Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
-                Vector128<short> b0 = Sse2.Add(a0, a1);
-                Vector128<short> b1 = Sse2.Add(a3, a2);
-                Vector128<short> b2 = Sse2.Subtract(a3, a2);
-                Vector128<short> b3 = Sse2.Subtract(a0, a1);
-
-                // a00 a01 a02 a03   b00 b01 b02 b03
-                // a10 a11 a12 a13   b10 b11 b12 b13
-                // a20 a21 a22 a23   b20 b21 b22 b23
-                // a30 a31 a32 a33   b30 b31 b32 b33
-                // Transpose the two 4x4.
-                Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
-                Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
-                Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
-                Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
-
-                // a00 a10 a01 a11   a02 a12 a03 a13
-                // a20 a30 a21 a31   a22 a32 a23 a33
-                // b00 b10 b01 b11   b02 b12 b03 b13
-                // b20 b30 b21 b31   b22 b32 b23 b33
-                Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
-                Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
-                Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
-                Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
-
-                // a00 a10 a20 a30 a01 a11 a21 a31
-                // b00 b10 b20 b30 b01 b11 b21 b31
-                // a02 a12 a22 a32 a03 a13 a23 a33
-                // b02 b12 a22 b32 b03 b13 b23 b33
-                Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
-                Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
-                Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
-                Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
-
-                // a00 a10 a20 a30   b00 b10 b20 b30
-                // a01 a11 a21 a31   b01 b11 b21 b31
-                // a02 a12 a22 a32   b02 b12 b22 b32
-                // a03 a13 a23 a33   b03 b13 b23 b33
-                // Horizontal pass and difference of weighted sums.
-                Vector128<ushort> w0 = Sse2.LoadVector128(wPtr);
-                Vector128<ushort> w8 = Sse2.LoadVector128(wPtr + 8);
-
-                // Calculate a and b (two 4x4 at once).
-                a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
-                a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
-                a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
-                a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
-                b0 = Sse2.Add(a0, a1);
-                b1 = Sse2.Add(a3, a2);
-                b2 = Sse2.Subtract(a3, a2);
-                b3 = Sse2.Subtract(a0, a1);
-
-                // Separate the transforms of inA and inB.
-                Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
-                Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
-                Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
-                Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
-
-                Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
-                Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
-                Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
-                Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
-
-                // weighted sums.
-                Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
-                Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
-                Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
-                Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
-                Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
-                Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
-
-                // difference of weighted sums.
-                Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
-                Sse2.Store(outputPtr, result.AsInt32());
-            }
 
+            // Load and combine inputs.
+            Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
+            Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
+            Vector128<byte> ina2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 2, 16)));
+            Vector128<long> ina3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
+            Vector128<byte> inb0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB));
+            Vector128<byte> inb1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps, 16)));
+            Vector128<byte> inb2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 2, 16)));
+            Vector128<long> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
+
+            // Combine inA and inB (we'll do two transforms in parallel).
+            Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
+            Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
+            Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
+            Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
+            Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
+            Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
+            Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
+            Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
+
+            // a00 a01 a02 a03   b00 b01 b02 b03
+            // a10 a11 a12 a13   b10 b11 b12 b13
+            // a20 a21 a22 a23   b20 b21 b22 b23
+            // a30 a31 a32 a33   b30 b31 b32 b33
+            // Vertical pass first to avoid a transpose (vertical and horizontal passes
+            // are commutative because w/kWeightY is symmetric) and subsequent transpose.
+            // Calculate a and b (two 4x4 at once).
+            Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
+            Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
+            Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
+            Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
+            Vector128<short> b0 = Sse2.Add(a0, a1);
+            Vector128<short> b1 = Sse2.Add(a3, a2);
+            Vector128<short> b2 = Sse2.Subtract(a3, a2);
+            Vector128<short> b3 = Sse2.Subtract(a0, a1);
+
+            // a00 a01 a02 a03   b00 b01 b02 b03
+            // a10 a11 a12 a13   b10 b11 b12 b13
+            // a20 a21 a22 a23   b20 b21 b22 b23
+            // a30 a31 a32 a33   b30 b31 b32 b33
+            // Transpose the two 4x4.
+            Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
+            Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
+            Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
+            Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
+
+            // a00 a10 a01 a11   a02 a12 a03 a13
+            // a20 a30 a21 a31   a22 a32 a23 a33
+            // b00 b10 b01 b11   b02 b12 b03 b13
+            // b20 b30 b21 b31   b22 b32 b23 b33
+            Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
+            Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
+
+            // a00 a10 a20 a30 a01 a11 a21 a31
+            // b00 b10 b20 b30 b01 b11 b21 b31
+            // a02 a12 a22 a32 a03 a13 a23 a33
+            // b02 b12 b22 b32 b03 b13 b23 b33
+            Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
+            Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
+            Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
+            Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+
+            // a00 a10 a20 a30   b00 b10 b20 b30
+            // a01 a11 a21 a31   b01 b11 b21 b31
+            // a02 a12 a22 a32   b02 b12 b22 b32
+            // a03 a13 a23 a33   b03 b13 b23 b33
+            // Horizontal pass and difference of weighted sums.
+            Vector128<ushort> w0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w));
+            Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));
+
+            // Calculate a and b (two 4x4 at once).
+            a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
+            a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
+            a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
+            a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
+            b0 = Sse2.Add(a0, a1);
+            b1 = Sse2.Add(a3, a2);
+            b2 = Sse2.Subtract(a3, a2);
+            b3 = Sse2.Subtract(a0, a1);
+
+            // Separate the transforms of inA and inB.
+            Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
+            Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
+            Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
+            Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
+
+            Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
+            Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
+            Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
+            Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
+
+            // weighted sums.
+            Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
+            Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
+            Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
+            Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
+            Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
+            Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
+
+            // difference of weighted sums.
+            Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
+
+            ref int outputRef = ref MemoryMarshal.GetReference(sum);
+            Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
             return sum[3] + sum[2] + sum[1] + sum[0];
-#pragma warning restore SA1503 // Braces should not be omitted
         }
 #endif
 

From d6d1868343831184d94482895e5f4d3837e643cf Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 12:40:27 +0100
Subject: [PATCH 34/36] Test Hadamard transform only with and without
 HardwareIntrinsics

---
 tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
index 349a0c8fc..f8b488fde 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -45,13 +45,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
         public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
 
         [Fact]
-        public void HadamardTransform_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE2);
-
-        [Fact]
-        public void HadamardTransform_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41);
-
-        [Fact]
-        public void HadamardTransform_WithoutSSE2AndSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableSSE41 | HwIntrinsics.DisableSSE2);
+        public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
 #endif
 
     }

From 42c2cf7a799af7c5a6b504ec6233fc6a7308c030 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 13:40:40 +0100
Subject: [PATCH 35/36] Disable SA1401 in file: Fields should be private

---
 src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
index 71983055c..2ce383d9e 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8SegmentInfo.cs
@@ -10,6 +10,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         /// <summary>
         /// Gets the quantization matrix y1.
         /// </summary>
+#pragma warning disable SA1401 // Fields should be private
         public Vp8Matrix Y1;
 
         /// <summary>
@@ -21,6 +22,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         /// Gets the quantization matrix uv.
         /// </summary>
         public Vp8Matrix Uv;
+#pragma warning restore SA1401 // Fields should be private
 
         /// <summary>
         /// Gets or sets the quant-susceptibility, range [-127,127]. Zero is neutral. Lower values indicate a lower risk of blurriness.

From 8160a0eeb6a7bb5e8dc65ca1827a754d5a0e1e81 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Tue, 9 Nov 2021 13:40:54 +0100
Subject: [PATCH 36/36] Pass Vp8Matrix as ref

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 34 +++++++++----------
 .../Formats/WebP/QuantEncTests.cs             |  2 +-
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 4c3a2ff5e..97ef27d25 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -315,14 +315,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             }
 
             Vp8Encoding.FTransformWht(tmp, dcTmp, scratch);
-            nz |= QuantizeBlock(dcTmp, rd.YDcLevels, dqm.Y2) << 24;
+            nz |= QuantizeBlock(dcTmp, rd.YDcLevels, ref dqm.Y2) << 24;
 
             for (n = 0; n < 16; n += 2)
             {
                 // Zero-out the first coeff, so that: a) nz is correct below, and
                 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
                 tmp[n * 16] = tmp[(n + 1) * 16] = 0;
-                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.YAcLevels.AsSpan(n * 16, 32), dqm.Y1) << n;
+                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.YAcLevels.AsSpan(n * 16, 32), ref dqm.Y1) << n;
             }
 
             // Transform back.
@@ -343,7 +343,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             tmp.Clear();
             scratch.Clear();
             Vp8Encoding.FTransform(src, reference, tmp, scratch);
-            int nz = QuantizeBlock(tmp, levels, dqm.Y1);
+            int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
             Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
 
             return nz;
@@ -370,11 +370,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                     scratch);
             }
 
-            CorrectDcValues(it, dqm.Uv, tmp, rd);
+            CorrectDcValues(it, ref dqm.Uv, tmp, rd);
 
             for (n = 0; n < 8; n += 2)
             {
-                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.UvLevels.AsSpan(n * 16, 32), dqm.Uv) << n;
+                nz |= Quantize2Blocks(tmp.Slice(n * 16, 32), rd.UvLevels.AsSpan(n * 16, 32), ref dqm.Uv) << n;
             }
 
             for (n = 0; n < 8; n += 2)
@@ -525,19 +525,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Quantize2Blocks(Span<short> input, Span<short> output, Vp8Matrix mtx)
+        public static int Quantize2Blocks(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
         {
-            int nz = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), mtx) << 0;
-            nz |= QuantizeBlock(input.Slice(1 * 16, 16), output.Slice(1 * 16, 16), mtx) << 1;
+            int nz = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), ref mtx) << 0;
+            nz |= QuantizeBlock(input.Slice(1 * 16, 16), output.Slice(1 * 16, 16), ref mtx) << 1;
             return nz;
         }
 
-        public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix mtx)
+        public static int QuantizeBlock(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse41.IsSupported)
             {
-#pragma warning disable SA1503 // Braces should not be omitted
                 // Load all inputs.
                 Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
                 Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
@@ -624,10 +623,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
                 Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());
 
-                // Detect if all 'out' values are zeroes or not.
+                // Detect if all 'out' values are zeros or not.
                 Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
                 return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
-#pragma warning restore SA1503 // Braces should not be omitted
             }
             else
 #endif
@@ -675,7 +673,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         // Quantize as usual, but also compute and return the quantization error.
         // Error is already divided by DSHIFT.
-        public static int QuantizeSingle(Span<short> v, Vp8Matrix mtx)
+        public static int QuantizeSingle(Span<short> v, ref Vp8Matrix mtx)
         {
             int v0 = v[0];
             bool sign = v0 < 0;
@@ -696,7 +694,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             return (sign ? -v0 : v0) >> DSCALE;
         }
 
-        public static void CorrectDcValues(Vp8EncIterator it, Vp8Matrix mtx, Span<short> tmp, Vp8ModeScore rd)
+        public static void CorrectDcValues(Vp8EncIterator it, ref Vp8Matrix mtx, Span<short> tmp, Vp8ModeScore rd)
         {
 #pragma warning disable SA1005 // Single line comments should begin with single space
             //         | top[0] | top[1]
@@ -713,13 +711,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                 Span<sbyte> left = it.LeftDerr.AsSpan(ch, 2);
                 Span<short> c = tmp.Slice(ch * 4 * 16, 4 * 16);
                 c[0] += (short)(((C1 * top[0]) + (C2 * left[0])) >> (DSHIFT - DSCALE));
-                int err0 = QuantizeSingle(c, mtx);
+                int err0 = QuantizeSingle(c, ref mtx);
                 c[1 * 16] += (short)(((C1 * top[1]) + (C2 * err0)) >> (DSHIFT - DSCALE));
-                int err1 = QuantizeSingle(c.Slice(1 * 16), mtx);
+                int err1 = QuantizeSingle(c.Slice(1 * 16), ref mtx);
                 c[2 * 16] += (short)(((C1 * err0) + (C2 * left[1])) >> (DSHIFT - DSCALE));
-                int err2 = QuantizeSingle(c.Slice(2 * 16), mtx);
+                int err2 = QuantizeSingle(c.Slice(2 * 16), ref mtx);
                 c[3 * 16] += (short)(((C1 * err1) + (C2 * err2)) >> (DSHIFT - DSCALE));
-                int err3 = QuantizeSingle(c.Slice(3 * 16), mtx);
+                int err3 = QuantizeSingle(c.Slice(3 * 16), ref mtx);
 
                 rd.Derr[ch, 0] = err1;
                 rd.Derr[ch, 1] = err2;
diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
index 7465c42ce..55738199b 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@@ -32,7 +32,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
             }
 
             // act
-            int actualResult = QuantEnc.QuantizeBlock(input, output, vp8Matrix);
+            int actualResult = QuantEnc.QuantizeBlock(input, output, ref vp8Matrix);
 
             // assert
             Assert.True(output.SequenceEqual(expectedOutput));