@@ -17,8 +17,13 @@ internal static partial class SimdUtils
 {
     public static class HwIntrinsics
     {
-#pragma warning disable SA1117 // Parameters should be on same line or separate lines
-#pragma warning disable SA1137 // Elements should have the same indentation
         [MethodImpl(MethodImplOptions.AggressiveInlining)] // too much IL for JIT to inline, so give a hint
-        public static Vector256<int> PermuteMaskDeinterleave8x32() => Vector256.Create(0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsInt32();
+        public static Vector256<int> PermuteMaskDeinterleave8x32() => Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7);
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector512<int> PermuteMaskDeinterleave16x32() => Vector512.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector256<uint> PermuteMaskEvenOdd8x32() => Vector256.Create(0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0).AsUInt32();
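+
+        // Note: the Pack* instructions used with these masks operate per 128-bit lane, so packed results
+        // come out lane-interleaved; feeding the deinterleave masks above to PermuteVar8x32 / PermuteVar16x32
+        // restores linear element order.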
@@ -38,17 +43,15 @@ internal static partial class SimdUtils
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector128<byte> ShuffleMaskSlice4Nx16() => Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80);
 
-#pragma warning disable SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<byte> ShuffleMaskShiftAlpha() => Vector256.Create((byte)
-            0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15,
-            0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
+        private static Vector256<byte> ShuffleMaskShiftAlpha() => Vector256.Create(
+            (byte)0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15,
+            0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static Vector256<uint> PermuteMaskShiftAlpha8x32() => Vector256.Create(
-            0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0,
-            5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsUInt32();
-#pragma warning restore SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines
-#pragma warning restore SA1137 // Elements should have the same indentation
-#pragma warning restore SA1117 // Parameters should be on same line or separate lines
+        public static Vector256<uint> PermuteMaskShiftAlpha8x32() => Vector256.Create(0u, 1, 2, 4, 5, 6, 3, 7);
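+
+        // ShuffleMaskShiftAlpha packs the RGB bytes of four RGBA pixels to the front of each 128-bit lane
+        // and the four alpha bytes (offsets 3, 7, 11, 15) to the lane tail; PermuteMaskShiftAlpha8x32 then
+        // rearranges the dwords so all alpha bytes land in the top quarter of the 256-bit vector.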
 
         /// <summary>
         /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
@@ -749,17 +752,23 @@ internal static partial class SimdUtils
         /// <summary>
         /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
         /// </summary>
+        /// <param name="source">The source buffer.</param>
+        /// <param name="destination">The destination buffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         internal static void ByteToNormalizedFloatReduce(
             ref ReadOnlySpan<byte> source,
-            ref Span<float> dest)
+            ref Span<float> destination)
         {
-            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+            DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
 
-            if (Avx2.IsSupported || Sse2.IsSupported)
+            if (Vector128.IsHardwareAccelerated)
             {
                 int remainder;
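+                // Size the remainder against the widest vector the bulk path may use, so
+                // ByteToNormalizedFloat below only ever sees whole vectors.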
-                if (Avx2.IsSupported)
+                if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported)
+                {
+                    remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count);
+                }
+                else if (Avx2.IsSupported)
                 {
                     remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count);
                 }
@@ -772,10 +781,10 @@ internal static partial class SimdUtils
                 if (adjustedCount > 0)
                 {
-                    ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
+                    ByteToNormalizedFloat(source[..adjustedCount], destination[..adjustedCount]);
 
                     source = source[adjustedCount..];
-                    dest = dest[adjustedCount..];
+                    destination = destination[adjustedCount..];
                 }
             }
         }
@@ -783,97 +792,126 @@ internal static partial class SimdUtils
         /// <summary>
         /// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
         /// </summary>
+        /// <param name="source">The source buffer.</param>
+        /// <param name="destination">The destination buffer.</param>
         /// <remarks>
         /// Implementation is based on MagicScaler code:
         /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
         /// </remarks>
         internal static unsafe void ByteToNormalizedFloat(
             ReadOnlySpan<byte> source,
-            Span<float> dest)
+            Span<float> destination)
         {
-            fixed (byte* sourceBase = source)
-            {
-                if (Avx2.IsSupported)
-                {
-                    VerifySpanInput(source, dest, Vector256<byte>.Count);
-
-                    nuint n = dest.Vector256Count<byte>();
-
-                    ref Vector256<float> destBase =
-                        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
-
-                    Vector256<float> scale = Vector256.Create(1 / (float)byte.MaxValue);
-
-                    for (nuint i = 0; i < n; i++)
-                    {
-                        nuint si = (uint)Vector256<byte>.Count * i;
-                        Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
-                        Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
-                        Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
-                        Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
-
-                        Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
-                        Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
-                        Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
-                        Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
-
-                        ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
-                        d = f0;
-                        Unsafe.Add(ref d, 1) = f1;
-                        Unsafe.Add(ref d, 2) = f2;
-                        Unsafe.Add(ref d, 3) = f3;
-                    }
-                }
-                else
-                {
-                    // Sse
-                    VerifySpanInput(source, dest, Vector128<byte>.Count);
-
-                    nuint n = dest.Vector128Count<byte>();
-
-                    ref Vector128<float> destBase =
-                        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
-
-                    Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
-                    Vector128<byte> zero = Vector128<byte>.Zero;
-
-                    for (nuint i = 0; i < n; i++)
-                    {
-                        nuint si = (uint)Vector128<byte>.Count * i;
-
-                        Vector128<int> i0, i1, i2, i3;
-                        if (Sse41.IsSupported)
-                        {
-                            i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
-                            i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
-                            i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
-                            i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
-                        }
-                        else
-                        {
-                            Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
-                            Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
-                            Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
-
-                            i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
-                            i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
-                            i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
-                            i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
-                        }
-
-                        Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
-                        Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
-                        Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
-                        Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
-
-                        ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
-                        d = f0;
-                        Unsafe.Add(ref d, 1) = f1;
-                        Unsafe.Add(ref d, 2) = f2;
-                        Unsafe.Add(ref d, 3) = f3;
-                    }
-                }
-            }
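+            // Three width-specialized paths follow: AVX-512 (64 source bytes per iteration), AVX2 (32 bytes),
+            // and cross-platform Vector128 (16 bytes). Each widens bytes to ints, converts to float and
+            // multiplies by 1/255.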
+            if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported)
+            {
+                DebugVerifySpanInput(source, destination, Vector512<byte>.Count);
+
+                nuint n = destination.Vector512Count<byte>();
+
+                ref byte sourceBase = ref MemoryMarshal.GetReference(source);
+                ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
+
+                for (nuint i = 0; i < n; i++)
+                {
+                    nuint si = (uint)Vector512<byte>.Count * i;
+                    Vector512<int> i0 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si));
+                    Vector512<int> i1 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector512<int>.Count));
+                    Vector512<int> i2 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 2)));
+                    Vector512<int> i3 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 3)));
+
+                    // Declare multiplier on each line. Codegen is better.
+                    Vector512<float> f0 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i0);
+                    Vector512<float> f1 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i1);
+                    Vector512<float> f2 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i2);
+                    Vector512<float> f3 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i3);
+
+                    ref Vector512<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
+                    d = f0;
+                    Unsafe.Add(ref d, 1) = f1;
+                    Unsafe.Add(ref d, 2) = f2;
+                    Unsafe.Add(ref d, 3) = f3;
+                }
+            }
+            else if (Avx2.IsSupported)
+            {
+                DebugVerifySpanInput(source, destination, Vector256<byte>.Count);
+
+                nuint n = destination.Vector256Count<byte>();
+
+                ref byte sourceBase = ref MemoryMarshal.GetReference(source);
+                ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+
+                for (nuint i = 0; i < n; i++)
+                {
+                    nuint si = (uint)Vector256<byte>.Count * i;
+                    Vector256<int> i0 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si));
+                    Vector256<int> i1 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector256<int>.Count));
+                    Vector256<int> i2 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector256<int>.Count * 2)));
+
+                    // Ensure overreads past 16 byte boundary do not happen in debug due to lack of containment.
+                    ref ulong refULong = ref Unsafe.As<byte, ulong>(ref Unsafe.Add(ref sourceBase, si));
+                    Vector256<int> i3 = Avx2.ConvertToVector256Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refULong, 3)).AsByte());
+
+                    // Declare multiplier on each line. Codegen is better.
+                    Vector256<float> f0 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i0);
+                    Vector256<float> f1 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i1);
+                    Vector256<float> f2 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i2);
+                    Vector256<float> f3 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i3);
+
+                    ref Vector256<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
+                    d = f0;
+                    Unsafe.Add(ref d, 1) = f1;
+                    Unsafe.Add(ref d, 2) = f2;
+                    Unsafe.Add(ref d, 3) = f3;
+                }
+            }
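+            // Portable fallback: written against the cross-platform Vector128 APIs so one path serves both
+            // SSE and AdvSimd; Sse41-capable hardware still gets the widening (pmovzxbd-style) loads below.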
+            else if (Vector128.IsHardwareAccelerated)
+            {
+                DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
+
+                nuint n = destination.Vector128Count<byte>();
+
+                ref byte sourceBase = ref MemoryMarshal.GetReference(source);
+                ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
+
+                Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
+
+                for (nuint i = 0; i < n; i++)
+                {
+                    nuint si = (uint)Vector128<byte>.Count * i;
+
+                    Vector128<int> i0, i1, i2, i3;
+                    if (Sse41.IsSupported)
+                    {
+                        ref int refInt = ref Unsafe.As<byte, int>(ref Unsafe.Add(ref sourceBase, si));
+                        i0 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(refInt).AsByte());
+                        i1 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 1)).AsByte());
+                        i2 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 2)).AsByte());
+                        i3 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 3)).AsByte());
+                    }
+                    else
+                    {
+                        // Sse2, AdvSimd, etc
+                        Vector128<byte> b = Vector128.LoadUnsafe(ref sourceBase, si);
+                        (Vector128<ushort> s0, Vector128<ushort> s1) = Vector128.Widen(b);
+                        (i0, i1) = Vector128.Widen(s0.AsInt16());
+                        (i2, i3) = Vector128.Widen(s1.AsInt16());
+                    }
+
+                    Vector128<float> f0 = scale * Vector128.ConvertToSingle(i0);
+                    Vector128<float> f1 = scale * Vector128.ConvertToSingle(i1);
+                    Vector128<float> f2 = scale * Vector128.ConvertToSingle(i2);
+                    Vector128<float> f3 = scale * Vector128.ConvertToSingle(i3);
+
+                    ref Vector128<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
+                    d = f0;
+                    Unsafe.Add(ref d, 1) = f1;
+                    Unsafe.Add(ref d, 2) = f2;
+                    Unsafe.Add(ref d, 3) = f3;
+                }
+            }
         }
@@ -881,17 +919,24 @@ internal static partial class SimdUtils
         /// <summary>
         /// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
         /// </summary>
+        /// <param name="source">The source buffer.</param>
+        /// <param name="destination">The destination buffer.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         internal static void NormalizedFloatToByteSaturateReduce(
             ref ReadOnlySpan<float> source,
-            ref Span<byte> dest)
+            ref Span<byte> destination)
         {
-            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+            DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
 
-            if (Avx2.IsSupported || Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 int remainder;
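+                // Size the remainder against the widest vector the bulk path may use, so
+                // NormalizedFloatToByteSaturate below only ever sees whole vectors.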
-                if (Avx2.IsSupported)
+                if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported)
+                {
+                    remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count);
+                }
+                else if (Avx2.IsSupported)
                 {
                     remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count);
                 }
@@ -906,10 +951,10 @@ internal static partial class SimdUtils
                 {
                     NormalizedFloatToByteSaturate(
                         source[..adjustedCount],
-                        dest[..adjustedCount]);
+                        destination[..adjustedCount]);
 
                     source = source[adjustedCount..];
-                    dest = dest[adjustedCount..];
+                    destination = destination[adjustedCount..];
                 }
             }
         }
@@ -917,25 +962,58 @@ internal static partial class SimdUtils
         /// <summary>
         /// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
         /// </summary>
+        /// <param name="source">The source buffer.</param>
+        /// <param name="destination">The destination buffer.</param>
         /// <remarks>
         /// Implementation is based on MagicScaler code:
         /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
         /// </remarks>
         internal static void NormalizedFloatToByteSaturate(
             ReadOnlySpan<float> source,
-            Span<byte> dest)
+            Span<byte> destination)
         {
-            if (Avx2.IsSupported)
+            if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported)
             {
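+                // 64 floats per iteration: scale to [0, 255], round to even, pack int -> short -> byte
+                // with saturation, then permute dwords to undo the packs' per-128-bit-lane interleaving.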
-                VerifySpanInput(source, dest, Vector256<byte>.Count);
-
-                nuint n = dest.Vector256Count<byte>();
-
-                ref Vector256<float> sourceBase =
-                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
-
-                ref Vector256<byte> destBase =
-                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
+                DebugVerifySpanInput(source, destination, Vector512<byte>.Count);
+
+                nuint n = destination.Vector512Count<byte>();
+
+                ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
+                ref Vector512<byte> destinationBase = ref Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(destination));
+
+                Vector512<float> scale = Vector512.Create((float)byte.MaxValue);
+                Vector512<int> mask = PermuteMaskDeinterleave16x32();
+
+                for (nuint i = 0; i < n; i++)
+                {
+                    ref Vector512<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
+
+                    Vector512<float> f0 = scale * s;
+                    Vector512<float> f1 = scale * Unsafe.Add(ref s, 1);
+                    Vector512<float> f2 = scale * Unsafe.Add(ref s, 2);
+                    Vector512<float> f3 = scale * Unsafe.Add(ref s, 3);
+
+                    Vector512<int> w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0);
+                    Vector512<int> w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1);
+                    Vector512<int> w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2);
+                    Vector512<int> w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3);
+
+                    Vector512<short> u0 = Avx512BW.PackSignedSaturate(w0, w1);
+                    Vector512<short> u1 = Avx512BW.PackSignedSaturate(w2, w3);
+                    Vector512<byte> b = Avx512BW.PackUnsignedSaturate(u0, u1);
+                    b = Avx512F.PermuteVar16x32(b.AsInt32(), mask).AsByte();
+
+                    Unsafe.Add(ref destinationBase, i) = b;
+                }
+            }
+            else if (Avx2.IsSupported)
+            {
+                DebugVerifySpanInput(source, destination, Vector256<byte>.Count);
+
+                nuint n = destination.Vector256Count<byte>();
+
+                ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+                ref Vector256<byte> destinationBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(destination));
 
                 Vector256<float> scale = Vector256.Create((float)byte.MaxValue);
                 Vector256<int> mask = PermuteMaskDeinterleave8x32();
@@ -944,36 +1022,33 @@
 
                 for (nuint i = 0; i < n; i++)
                 {
                     ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
 
-                    Vector256<float> f0 = Avx.Multiply(scale, s);
-                    Vector256<float> f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1));
-                    Vector256<float> f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2));
-                    Vector256<float> f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3));
+                    Vector256<float> f0 = scale * s;
+                    Vector256<float> f1 = scale * Unsafe.Add(ref s, 1);
+                    Vector256<float> f2 = scale * Unsafe.Add(ref s, 2);
+                    Vector256<float> f3 = scale * Unsafe.Add(ref s, 3);
 
-                    Vector256<int> w0 = Avx.ConvertToVector256Int32(f0);
-                    Vector256<int> w1 = Avx.ConvertToVector256Int32(f1);
-                    Vector256<int> w2 = Avx.ConvertToVector256Int32(f2);
-                    Vector256<int> w3 = Avx.ConvertToVector256Int32(f3);
+                    Vector256<int> w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0);
+                    Vector256<int> w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1);
+                    Vector256<int> w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2);
+                    Vector256<int> w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3);
 
                     Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
                     Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
                     Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);
                     b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
 
-                    Unsafe.Add(ref destBase, i) = b;
+                    Unsafe.Add(ref destinationBase, i) = b;
                 }
             }
-            else
+            else if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
-                // Sse
-                VerifySpanInput(source, dest, Vector128<byte>.Count);
+                // Sse, AdvSimd
+                DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
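+
+                // The Vector128Utilities.ConvertToInt32RoundToEven / Pack*Saturate helpers used below
+                // presumably map to cvtps2dq / packssdw / packuswb on SSE and to the equivalent rounding
+                // and saturating-narrow instructions on AdvSimd, keeping both paths consistent.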
 
-                nuint n = dest.Vector128Count<byte>();
+                nuint n = destination.Vector128Count<byte>();
 
-                ref Vector128<float> sourceBase =
-                    ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
-
-                ref Vector128<byte> destBase =
-                    ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));
+                ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
+                ref Vector128<byte> destinationBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination));
 
                 Vector128<float> scale = Vector128.Create((float)byte.MaxValue);
@@ -981,20 +1056,20 @@
 
                 for (nuint i = 0; i < n; i++)
                 {
                     ref Vector128<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
 
-                    Vector128<float> f0 = Sse.Multiply(scale, s);
-                    Vector128<float> f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1));
-                    Vector128<float> f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2));
-                    Vector128<float> f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3));
+                    Vector128<float> f0 = scale * s;
+                    Vector128<float> f1 = scale * Unsafe.Add(ref s, 1);
+                    Vector128<float> f2 = scale * Unsafe.Add(ref s, 2);
+                    Vector128<float> f3 = scale * Unsafe.Add(ref s, 3);
 
-                    Vector128<int> w0 = Sse2.ConvertToVector128Int32(f0);
-                    Vector128<int> w1 = Sse2.ConvertToVector128Int32(f1);
-                    Vector128<int> w2 = Sse2.ConvertToVector128Int32(f2);
-                    Vector128<int> w3 = Sse2.ConvertToVector128Int32(f3);
+                    Vector128<int> w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0);
+                    Vector128<int> w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1);
+                    Vector128<int> w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2);
+                    Vector128<int> w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3);
 
-                    Vector128<short> u0 = Sse2.PackSignedSaturate(w0, w1);
-                    Vector128<short> u1 = Sse2.PackSignedSaturate(w2, w3);
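+                    // int -> short (signed saturate) -> byte (unsigned saturate) clamps the rounded
+                    // values to [0, 255] without explicit min/max.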
+                    Vector128<short> u0 = Vector128Utilities.PackSignedSaturate(w0, w1);
+                    Vector128<short> u1 = Vector128Utilities.PackSignedSaturate(w2, w3);
 
-                    Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1);
+                    Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1);
                 }
             }
         }