From 69caa490e04e1bb29858dfc941175e8bc5391047 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 6 May 2025 20:21:40 +1000 Subject: [PATCH 01/12] Add Vector128 rounding --- .../Jpeg/Components/Block8x8F.Generated.cs | 231 ++++++++++-------- .../Jpeg/Components/Block8x8F.Generated.tt | 103 -------- .../Formats/Jpeg/Components/Block8x8F.cs | 37 ++- src/ImageSharp/ImageSharp.csproj | 18 -- .../Formats/Jpg/Block8x8FTests.cs | 54 +++- 5 files changed, 199 insertions(+), 244 deletions(-) delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs index 93bb7be36..5954ad325 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs @@ -3,13 +3,14 @@ using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; // namespace SixLabors.ImageSharp.Formats.Jpeg.Components; internal partial struct Block8x8F { - /// + /// /// Level shift by +maximum/2, clip to [0, maximum] /// public void NormalizeColorsInPlace(float maximum) @@ -37,38 +38,66 @@ internal partial struct Block8x8F } /// - /// AVX2-only variant for executing and in one step. + /// version of and . /// + /// The maximum value to normalize to. [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector8(float maximum) + public void NormalizeColorsAndRoundInPlaceVector256(float maximum) { - var off = new Vector(MathF.Ceiling(maximum * 0.5F)); - var max = new Vector(maximum); - - ref Vector row0 = ref Unsafe.As>(ref this.V0L); - row0 = NormalizeAndRound(row0, off, max); - - ref Vector row1 = ref Unsafe.As>(ref this.V1L); - row1 = NormalizeAndRound(row1, off, max); - - ref Vector row2 = ref Unsafe.As>(ref this.V2L); - row2 = NormalizeAndRound(row2, off, max); - - ref Vector row3 = ref Unsafe.As>(ref this.V3L); - row3 = NormalizeAndRound(row3, off, max); - - ref Vector row4 = ref Unsafe.As>(ref this.V4L); - row4 = NormalizeAndRound(row4, off, max); - - ref Vector row5 = ref Unsafe.As>(ref this.V5L); - row5 = NormalizeAndRound(row5, off, max); - - ref Vector row6 = ref Unsafe.As>(ref this.V6L); - row6 = NormalizeAndRound(row6, off, max); - - ref Vector row7 = ref Unsafe.As>(ref this.V7L); - row7 = NormalizeAndRound(row7, off, max); - + Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); + Vector256 max = Vector256.Create(maximum); + + ref Vector256 row0 = ref Unsafe.As>(ref this.V0L); + row0 = NormalizeAndRoundVector256(row0, off, max); + + ref Vector256 row1 = ref Unsafe.As>(ref this.V1L); + row1 = NormalizeAndRoundVector256(row1, off, max); + + ref Vector256 row2 = ref Unsafe.As>(ref this.V2L); + row2 = NormalizeAndRoundVector256(row2, off, max); + + ref Vector256 row3 = ref Unsafe.As>(ref this.V3L); + row3 = NormalizeAndRoundVector256(row3, off, max); + + ref Vector256 row4 = ref Unsafe.As>(ref this.V4L); + row4 = NormalizeAndRoundVector256(row4, off, max); + + ref Vector256 row5 = ref Unsafe.As>(ref this.V5L); + row5 = NormalizeAndRoundVector256(row5, off, max); + + ref Vector256 row6 = ref Unsafe.As>(ref this.V6L); + row6 = NormalizeAndRoundVector256(row6, off, max); + + ref Vector256 row7 = ref Unsafe.As>(ref this.V7L); + row7 = NormalizeAndRoundVector256(row7, off, max); + } + + /// + /// version of and . + /// + /// The maximum value to normalize to. 
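The normalize-and-round step performed by both new methods is the same per row: level shift by +maximum/2, clamp to [0, maximum], then round each lane to the nearest integer. Below is a minimal standalone sketch of that math for a single Vector128<float> row; the method name is illustrative only, and the 2^23 bit-trick fallback (valid here because the lanes are already clamped to a small range) stands in for the Vector128 rounding helper the patch actually calls.

    // Illustrative sketch, not part of the patch.
    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;
    using System.Runtime.Intrinsics.X86;

    static Vector128<float> NormalizeAndRoundRow(Vector128<float> row, float maximum)
    {
        Vector128<float> off = Vector128.Create(MathF.Ceiling(maximum * 0.5F));
        Vector128<float> max = Vector128.Create(maximum);

        // Level shift, then clamp to [0, maximum].
        row += off;
        row = Vector128.Max(row, Vector128<float>.Zero);
        row = Vector128.Min(row, max);

        // Round each lane to the nearest integer.
        if (Sse41.IsSupported)
        {
            return Sse41.RoundToNearestInteger(row);
        }

        if (AdvSimd.IsSupported)
        {
            return AdvSimd.RoundToNearest(row);
        }

        // Portable fallback: adding and subtracting 2^23 (copy-signed) forces
        // round-to-nearest for lanes that are well inside +/-2^23, which holds
        // after the clamp above.
        Vector128<float> sign = row & Vector128.Create(-0.0F);
        Vector128<float> magic = sign | Vector128.Create(8388608F);
        return (row + magic) - magic;
    }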
+ [MethodImpl(InliningOptions.ShortMethod)] + public void NormalizeColorsAndRoundInPlaceVector128(float maximum) + { + Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); + Vector128 max = Vector128.Create(maximum); + + this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); + this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); + this.V1L = NormalizeAndRoundVector128(this.V1L.AsVector128(), off, max).AsVector4(); + this.V1R = NormalizeAndRoundVector128(this.V1R.AsVector128(), off, max).AsVector4(); + this.V2L = NormalizeAndRoundVector128(this.V2L.AsVector128(), off, max).AsVector4(); + this.V2R = NormalizeAndRoundVector128(this.V2R.AsVector128(), off, max).AsVector4(); + this.V3L = NormalizeAndRoundVector128(this.V3L.AsVector128(), off, max).AsVector4(); + this.V3R = NormalizeAndRoundVector128(this.V3R.AsVector128(), off, max).AsVector4(); + this.V4L = NormalizeAndRoundVector128(this.V4L.AsVector128(), off, max).AsVector4(); + this.V4R = NormalizeAndRoundVector128(this.V4R.AsVector128(), off, max).AsVector4(); + this.V5L = NormalizeAndRoundVector128(this.V5L.AsVector128(), off, max).AsVector4(); + this.V5R = NormalizeAndRoundVector128(this.V5R.AsVector128(), off, max).AsVector4(); + this.V6L = NormalizeAndRoundVector128(this.V6L.AsVector128(), off, max).AsVector4(); + this.V6R = NormalizeAndRoundVector128(this.V6R.AsVector128(), off, max).AsVector4(); + this.V7L = NormalizeAndRoundVector128(this.V7L.AsVector128(), off, max).AsVector4(); + this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); } /// @@ -78,76 +107,76 @@ internal partial struct Block8x8F { ref short selfRef = ref Unsafe.As(ref source); - this.V0L.X = Unsafe.Add(ref selfRef, 0); - this.V0L.Y = Unsafe.Add(ref selfRef, 1); - this.V0L.Z = Unsafe.Add(ref selfRef, 2); - this.V0L.W = Unsafe.Add(ref selfRef, 3); - this.V0R.X = Unsafe.Add(ref selfRef, 4); - this.V0R.Y = Unsafe.Add(ref selfRef, 5); - this.V0R.Z = Unsafe.Add(ref selfRef, 6); - this.V0R.W = Unsafe.Add(ref selfRef, 7); - - this.V1L.X = Unsafe.Add(ref selfRef, 8); - this.V1L.Y = Unsafe.Add(ref selfRef, 9); - this.V1L.Z = Unsafe.Add(ref selfRef, 10); - this.V1L.W = Unsafe.Add(ref selfRef, 11); - this.V1R.X = Unsafe.Add(ref selfRef, 12); - this.V1R.Y = Unsafe.Add(ref selfRef, 13); - this.V1R.Z = Unsafe.Add(ref selfRef, 14); - this.V1R.W = Unsafe.Add(ref selfRef, 15); - - this.V2L.X = Unsafe.Add(ref selfRef, 16); - this.V2L.Y = Unsafe.Add(ref selfRef, 17); - this.V2L.Z = Unsafe.Add(ref selfRef, 18); - this.V2L.W = Unsafe.Add(ref selfRef, 19); - this.V2R.X = Unsafe.Add(ref selfRef, 20); - this.V2R.Y = Unsafe.Add(ref selfRef, 21); - this.V2R.Z = Unsafe.Add(ref selfRef, 22); - this.V2R.W = Unsafe.Add(ref selfRef, 23); - - this.V3L.X = Unsafe.Add(ref selfRef, 24); - this.V3L.Y = Unsafe.Add(ref selfRef, 25); - this.V3L.Z = Unsafe.Add(ref selfRef, 26); - this.V3L.W = Unsafe.Add(ref selfRef, 27); - this.V3R.X = Unsafe.Add(ref selfRef, 28); - this.V3R.Y = Unsafe.Add(ref selfRef, 29); - this.V3R.Z = Unsafe.Add(ref selfRef, 30); - this.V3R.W = Unsafe.Add(ref selfRef, 31); - - this.V4L.X = Unsafe.Add(ref selfRef, 32); - this.V4L.Y = Unsafe.Add(ref selfRef, 33); - this.V4L.Z = Unsafe.Add(ref selfRef, 34); - this.V4L.W = Unsafe.Add(ref selfRef, 35); - this.V4R.X = Unsafe.Add(ref selfRef, 36); - this.V4R.Y = Unsafe.Add(ref selfRef, 37); - this.V4R.Z = Unsafe.Add(ref selfRef, 38); - this.V4R.W = Unsafe.Add(ref selfRef, 39); - - this.V5L.X = Unsafe.Add(ref selfRef, 40); - this.V5L.Y = 
Unsafe.Add(ref selfRef, 41); - this.V5L.Z = Unsafe.Add(ref selfRef, 42); - this.V5L.W = Unsafe.Add(ref selfRef, 43); - this.V5R.X = Unsafe.Add(ref selfRef, 44); - this.V5R.Y = Unsafe.Add(ref selfRef, 45); - this.V5R.Z = Unsafe.Add(ref selfRef, 46); - this.V5R.W = Unsafe.Add(ref selfRef, 47); - - this.V6L.X = Unsafe.Add(ref selfRef, 48); - this.V6L.Y = Unsafe.Add(ref selfRef, 49); - this.V6L.Z = Unsafe.Add(ref selfRef, 50); - this.V6L.W = Unsafe.Add(ref selfRef, 51); - this.V6R.X = Unsafe.Add(ref selfRef, 52); - this.V6R.Y = Unsafe.Add(ref selfRef, 53); - this.V6R.Z = Unsafe.Add(ref selfRef, 54); - this.V6R.W = Unsafe.Add(ref selfRef, 55); - - this.V7L.X = Unsafe.Add(ref selfRef, 56); - this.V7L.Y = Unsafe.Add(ref selfRef, 57); - this.V7L.Z = Unsafe.Add(ref selfRef, 58); - this.V7L.W = Unsafe.Add(ref selfRef, 59); - this.V7R.X = Unsafe.Add(ref selfRef, 60); - this.V7R.Y = Unsafe.Add(ref selfRef, 61); - this.V7R.Z = Unsafe.Add(ref selfRef, 62); - this.V7R.W = Unsafe.Add(ref selfRef, 63); + this.V0L.X = Unsafe.Add(ref selfRef, 0); + this.V0L.Y = Unsafe.Add(ref selfRef, 1); + this.V0L.Z = Unsafe.Add(ref selfRef, 2); + this.V0L.W = Unsafe.Add(ref selfRef, 3); + this.V0R.X = Unsafe.Add(ref selfRef, 4); + this.V0R.Y = Unsafe.Add(ref selfRef, 5); + this.V0R.Z = Unsafe.Add(ref selfRef, 6); + this.V0R.W = Unsafe.Add(ref selfRef, 7); + + this.V1L.X = Unsafe.Add(ref selfRef, 8); + this.V1L.Y = Unsafe.Add(ref selfRef, 9); + this.V1L.Z = Unsafe.Add(ref selfRef, 10); + this.V1L.W = Unsafe.Add(ref selfRef, 11); + this.V1R.X = Unsafe.Add(ref selfRef, 12); + this.V1R.Y = Unsafe.Add(ref selfRef, 13); + this.V1R.Z = Unsafe.Add(ref selfRef, 14); + this.V1R.W = Unsafe.Add(ref selfRef, 15); + + this.V2L.X = Unsafe.Add(ref selfRef, 16); + this.V2L.Y = Unsafe.Add(ref selfRef, 17); + this.V2L.Z = Unsafe.Add(ref selfRef, 18); + this.V2L.W = Unsafe.Add(ref selfRef, 19); + this.V2R.X = Unsafe.Add(ref selfRef, 20); + this.V2R.Y = Unsafe.Add(ref selfRef, 21); + this.V2R.Z = Unsafe.Add(ref selfRef, 22); + this.V2R.W = Unsafe.Add(ref selfRef, 23); + + this.V3L.X = Unsafe.Add(ref selfRef, 24); + this.V3L.Y = Unsafe.Add(ref selfRef, 25); + this.V3L.Z = Unsafe.Add(ref selfRef, 26); + this.V3L.W = Unsafe.Add(ref selfRef, 27); + this.V3R.X = Unsafe.Add(ref selfRef, 28); + this.V3R.Y = Unsafe.Add(ref selfRef, 29); + this.V3R.Z = Unsafe.Add(ref selfRef, 30); + this.V3R.W = Unsafe.Add(ref selfRef, 31); + + this.V4L.X = Unsafe.Add(ref selfRef, 32); + this.V4L.Y = Unsafe.Add(ref selfRef, 33); + this.V4L.Z = Unsafe.Add(ref selfRef, 34); + this.V4L.W = Unsafe.Add(ref selfRef, 35); + this.V4R.X = Unsafe.Add(ref selfRef, 36); + this.V4R.Y = Unsafe.Add(ref selfRef, 37); + this.V4R.Z = Unsafe.Add(ref selfRef, 38); + this.V4R.W = Unsafe.Add(ref selfRef, 39); + + this.V5L.X = Unsafe.Add(ref selfRef, 40); + this.V5L.Y = Unsafe.Add(ref selfRef, 41); + this.V5L.Z = Unsafe.Add(ref selfRef, 42); + this.V5L.W = Unsafe.Add(ref selfRef, 43); + this.V5R.X = Unsafe.Add(ref selfRef, 44); + this.V5R.Y = Unsafe.Add(ref selfRef, 45); + this.V5R.Z = Unsafe.Add(ref selfRef, 46); + this.V5R.W = Unsafe.Add(ref selfRef, 47); + + this.V6L.X = Unsafe.Add(ref selfRef, 48); + this.V6L.Y = Unsafe.Add(ref selfRef, 49); + this.V6L.Z = Unsafe.Add(ref selfRef, 50); + this.V6L.W = Unsafe.Add(ref selfRef, 51); + this.V6R.X = Unsafe.Add(ref selfRef, 52); + this.V6R.Y = Unsafe.Add(ref selfRef, 53); + this.V6R.Z = Unsafe.Add(ref selfRef, 54); + this.V6R.W = Unsafe.Add(ref selfRef, 55); + + this.V7L.X = Unsafe.Add(ref selfRef, 56); + this.V7L.Y = Unsafe.Add(ref selfRef, 
57); + this.V7L.Z = Unsafe.Add(ref selfRef, 58); + this.V7L.W = Unsafe.Add(ref selfRef, 59); + this.V7R.X = Unsafe.Add(ref selfRef, 60); + this.V7R.Y = Unsafe.Add(ref selfRef, 61); + this.V7R.Z = Unsafe.Add(ref selfRef, 62); + this.V7R.W = Unsafe.Add(ref selfRef, 63); } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt deleted file mode 100644 index 19b795c23..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt +++ /dev/null @@ -1,103 +0,0 @@ -<# -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. -#> -<#@ template debug="false" hostspecific="false" language="C#" #> -<#@ assembly name="System.Core" #> -<#@ import namespace="System.Linq" #> -<#@ import namespace="System.Text" #> -<#@ import namespace="System.Collections.Generic" #> -<#@ output extension=".cs" #> -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; - -// -<# -char[] coordz = {'X', 'Y', 'Z', 'W'}; -#> -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal partial struct Block8x8F -{ - /// - /// Level shift by +maximum/2, clip to [0, maximum] - /// - public void NormalizeColorsInPlace(float maximum) - { - var CMin4 = new Vector4(0F); - var CMax4 = new Vector4(maximum); - var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F)); - - <# - - PushIndent(" "); - - for (int i = 0; i < 8; i++) - { - for (int j = 0; j < 2; j++) - { - char side = j == 0 ? 'L' : 'R'; - Write($"this.V{i}{side} = Numerics.Clamp(this.V{i}{side} + COff4, CMin4, CMax4);\r\n"); - } - } - PopIndent(); - #> - } - - /// - /// AVX2-only variant for executing and in one step. - /// - [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector8(float maximum) - { - var off = new Vector(MathF.Ceiling(maximum * 0.5F)); - var max = new Vector(maximum); - <# - - for (int i = 0; i < 8; i++) - { - #> - - ref Vector row<#=i#> = ref Unsafe.As>(ref this.V<#=i#>L); - row<#=i#> = NormalizeAndRound(row<#=i#>, off, max); - <# - } - #> - - } - - /// - /// Fill the block from 'source' doing short -> float conversion. - /// - public void LoadFromInt16Scalar(ref Block8x8 source) - { - ref short selfRef = ref Unsafe.As(ref source); - - <# - PushIndent(" "); - for (int j = 0; j < 8; j++) - { - for (int i = 0; i < 8; i++) - { - char destCoord = coordz[i % 4]; - char destSide = (i / 4) % 2 == 0 ? 'L' : 'R'; - - if(j > 0 && i == 0){ - WriteLine(""); - } - - char srcCoord = coordz[j % 4]; - char srcSide = (j / 4) % 2 == 0 ? 
'L' : 'R'; - - var expression = $"this.V{j}{destSide}.{destCoord} = Unsafe.Add(ref selfRef, {j*8+i});\r\n"; - Write(expression); - - } - } - PopIndent(); - #> - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 018df5f9f..7aa1fb296 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -8,6 +8,8 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; +using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; +using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -332,22 +334,13 @@ internal partial struct Block8x8F : IEquatable /// The maximum value. public void NormalizeColorsAndRoundInPlace(float maximum) { - if (SimdUtils.HasVector8) + if (Vector256.IsHardwareAccelerated) { - this.NormalizeColorsAndRoundInPlaceVector8(maximum); - } - else - { - this.NormalizeColorsInPlace(maximum); - this.RoundInPlace(); + this.NormalizeColorsAndRoundInPlaceVector256(maximum); } - } - - public void DE_NormalizeColors(float maximum) - { - if (SimdUtils.HasVector8) + else if (Vector128.IsHardwareAccelerated) { - this.NormalizeColorsAndRoundInPlaceVector8(maximum); + this.NormalizeColorsAndRoundInPlaceVector128(maximum); } else { @@ -590,4 +583,22 @@ internal partial struct Block8x8F : IEquatable row = Vector.Min(row, max); return row.FastRound(); } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 NormalizeAndRoundVector256(Vector256 row, Vector256 off, Vector256 max) + { + row += off; + row = Vector256.Max(row, Vector256.Zero); + row = Vector256.Min(row, max); + return Vector256_.RoundToNearestInteger(row); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 NormalizeAndRoundVector128(Vector128 row, Vector128 off, Vector128 max) + { + row += off; + row = Vector128.Max(row, Vector128.Zero); + row = Vector128.Min(row, max); + return Vector128_.RoundToNearestInteger(row); + } } diff --git a/src/ImageSharp/ImageSharp.csproj b/src/ImageSharp/ImageSharp.csproj index 0d36340bf..fde3e94e9 100644 --- a/src/ImageSharp/ImageSharp.csproj +++ b/src/ImageSharp/ImageSharp.csproj @@ -56,16 +56,6 @@ True ImageMetadataExtensions.tt - - True - True - Block8x8F.Generated.tt - - - True - True - Block8x8F.Generated.tt - True True @@ -158,14 +148,6 @@ ImageMetadataExtensions.cs TextTemplatingFileGenerator - - TextTemplatingFileGenerator - Block8x8F.Generated.cs - - - TextTemplatingFileGenerator - Block8x8F.Generated.cs - TextTemplatingFileGenerator Abgr32.PixelOperations.Generated.cs diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index cde9e776b..4d804f646 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -3,6 +3,7 @@ // Uncomment this to turn unit tests into benchmarks: // #define BENCHMARKING +using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; using SixLabors.ImageSharp.Tests.TestUtilities; @@ -24,11 +25,22 @@ public partial class Block8x8FTests : JpegFixture { } - private bool SkipOnNonAvx2Runner() + private bool SkipOnNonVector256Runner() { - if (!SimdUtils.HasVector8) + if 
(!Vector256.IsHardwareAccelerated) { - this.Output.WriteLine("AVX2 not supported, skipping!"); + this.Output.WriteLine("Vector256 not supported, skipping!"); + return true; + } + + return false; + } + + private bool SkipOnNonVector128Runner() + { + if (!Vector128.IsHardwareAccelerated) + { + this.Output.WriteLine("Vector128 not supported, skipping!"); return true; } @@ -172,9 +184,33 @@ public partial class Block8x8FTests : JpegFixture [Theory] [InlineData(1)] [InlineData(2)] - public void NormalizeColorsAndRoundAvx2(int seed) + public void NormalizeColorsAndRoundVector256(int seed) + { + if (this.SkipOnNonVector256Runner()) + { + return; + } + + Block8x8F source = CreateRandomFloatBlock(-200, 200, seed); + + Block8x8F expected = source; + expected.NormalizeColorsInPlace(255); + expected.RoundInPlace(); + + Block8x8F actual = source; + actual.NormalizeColorsAndRoundInPlaceVector256(255); + + this.Output.WriteLine(expected.ToString()); + this.Output.WriteLine(actual.ToString()); + this.CompareBlocks(expected, actual, 0); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void NormalizeColorsAndRoundVector128(int seed) { - if (this.SkipOnNonAvx2Runner()) + if (this.SkipOnNonVector128Runner()) { return; } @@ -186,7 +222,7 @@ public partial class Block8x8FTests : JpegFixture expected.RoundInPlace(); Block8x8F actual = source; - actual.NormalizeColorsAndRoundInPlaceVector8(255); + actual.NormalizeColorsAndRoundInPlaceVector128(255); this.Output.WriteLine(expected.ToString()); this.Output.WriteLine(actual.ToString()); @@ -206,7 +242,7 @@ public partial class Block8x8FTests : JpegFixture Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); // Quantization code is used only in jpeg where it's guaranteed that - // qunatization valus are greater than 1 + // quantization values are greater than 1 // Quantize method supports negative numbers by very small numbers can cause troubles Block8x8F quant = CreateRandomFloatBlock(1, 2000, qtSeed); @@ -345,7 +381,7 @@ public partial class Block8x8FTests : JpegFixture [Fact] public void LoadFromUInt16Scalar() { - if (this.SkipOnNonAvx2Runner()) + if (this.SkipOnNonVector256Runner()) { return; } @@ -366,7 +402,7 @@ public partial class Block8x8FTests : JpegFixture [Fact] public void LoadFromUInt16ExtendedAvx2() { - if (this.SkipOnNonAvx2Runner()) + if (this.SkipOnNonVector256Runner()) { return; } From 29a56350ce6b4e0e0ac623fb58ecfeaa5513ad68 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 10:20:48 +1000 Subject: [PATCH 02/12] Clean up and prep for Vector512 multiply --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 4 +- .../Common/Helpers/Vector128Utilities.cs | 37 ++++++++-- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 1 + ...ck8x8F.Generated.cs => Block8x8F.Round.cs} | 45 +++++++------ .../Formats/Jpeg/Components/Block8x8F.cs | 67 ++++++++++--------- .../Components/FloatingPointDCT.Intrinsic.cs | 4 +- .../Jpeg/Components/FloatingPointDCT.cs | 8 +-- .../Jpeg/Components/ScaledFloatingPointDCT.cs | 2 +- .../BlockOperations/Block8x8F_Transpose.cs | 2 +- .../Formats/Jpg/Block8x8FTests.cs | 2 +- .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 12 ++-- 11 files changed, 109 insertions(+), 75 deletions(-) rename src/ImageSharp/Formats/Jpeg/Components/{Block8x8F.Generated.cs => Block8x8F.Round.cs} (85%) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a0733b660..372fff08c 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs 
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -1012,9 +1012,9 @@ internal static partial class SimdUtils Unsafe.Add(ref destinationBase, i) = b; } } - else if (Sse2.IsSupported || AdvSimd.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - // Sse, AdvSimd + // Sse, AdvSimd, etc. DebugVerifySpanInput(source, destination, Vector128.Count); nuint n = destination.Vector128Count(); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 765737906..e99eecc42 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -6,6 +6,7 @@ using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; namespace SixLabors.ImageSharp.Common.Helpers; @@ -270,8 +271,16 @@ internal static class Vector128Utilities return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); } - ThrowUnreachableException(); - return default; + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right); + } + + Vector128 min = Vector128.Create((short)byte.MinValue); + Vector128 max = Vector128.Create((short)byte.MaxValue); + Vector128 lefClamped = Clamp(left, min, max).AsUInt16(); + Vector128 rightClamped = Clamp(right, min, max).AsUInt16(); + return Vector128.Narrow(lefClamped, rightClamped); } /// @@ -293,10 +302,30 @@ internal static class Vector128Utilities return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); } - ThrowUnreachableException(); - return default; + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateSigned(left, right); + } + + Vector128 min = Vector128.Create((int)short.MinValue); + Vector128 max = Vector128.Create((int)short.MaxValue); + Vector128 lefClamped = Clamp(left, min, max); + Vector128 rightClamped = Clamp(right, min, max); + return Vector128.Narrow(lefClamped, rightClamped); } + /// Restricts a vector between a minimum and a maximum value. + /// + /// The type of the elements in the vector. + /// The vector to restrict. + /// The minimum value. + /// The maximum value. + /// The restricted . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) + => Vector128.Min(Vector128.Max(value, min), max); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 63be76f00..3921eccb7 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -64,6 +64,7 @@ internal partial struct Block8x8F ref Vector128 destBase = ref Unsafe.As>(ref dest); + // TODO: We can use the v128 utilities for this. 
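What that TODO points at, concretely: the SSE-specific loop below could be expressed with the cross-platform helpers this change set already uses elsewhere, Vector128_.ConvertToInt32RoundToEven and Vector128_.PackSignedSaturate. The loop that follows is a sketch only, reusing the aBase/bBase/destBase references declared above, and is not part of the patch.

    // Sketch only (not part of the patch): the same multiply, round-to-int,
    // and saturating narrow steps written against the portable Vector128 surface.
    for (nuint i = 0; i < 16; i += 2)
    {
        Vector128<int> left = Vector128_.ConvertToInt32RoundToEven(
            Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
        Vector128<int> right = Vector128_.ConvertToInt32RoundToEven(
            Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));

        Unsafe.Add(ref destBase, i / 2) = Vector128_.PackSignedSaturate(left, right);
    }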
for (nuint i = 0; i < 16; i += 2) { Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs similarity index 85% rename from src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs rename to src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs index 5954ad325..899a883e4 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs @@ -5,7 +5,6 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; -// namespace SixLabors.ImageSharp.Formats.Jpeg.Components; internal partial struct Block8x8F @@ -13,28 +12,29 @@ internal partial struct Block8x8F /// /// Level shift by +maximum/2, clip to [0, maximum] /// + /// The maximum value to normalize to. public void NormalizeColorsInPlace(float maximum) { - var CMin4 = new Vector4(0F); - var CMax4 = new Vector4(maximum); - var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F)); - - this.V0L = Numerics.Clamp(this.V0L + COff4, CMin4, CMax4); - this.V0R = Numerics.Clamp(this.V0R + COff4, CMin4, CMax4); - this.V1L = Numerics.Clamp(this.V1L + COff4, CMin4, CMax4); - this.V1R = Numerics.Clamp(this.V1R + COff4, CMin4, CMax4); - this.V2L = Numerics.Clamp(this.V2L + COff4, CMin4, CMax4); - this.V2R = Numerics.Clamp(this.V2R + COff4, CMin4, CMax4); - this.V3L = Numerics.Clamp(this.V3L + COff4, CMin4, CMax4); - this.V3R = Numerics.Clamp(this.V3R + COff4, CMin4, CMax4); - this.V4L = Numerics.Clamp(this.V4L + COff4, CMin4, CMax4); - this.V4R = Numerics.Clamp(this.V4R + COff4, CMin4, CMax4); - this.V5L = Numerics.Clamp(this.V5L + COff4, CMin4, CMax4); - this.V5R = Numerics.Clamp(this.V5R + COff4, CMin4, CMax4); - this.V6L = Numerics.Clamp(this.V6L + COff4, CMin4, CMax4); - this.V6R = Numerics.Clamp(this.V6R + COff4, CMin4, CMax4); - this.V7L = Numerics.Clamp(this.V7L + COff4, CMin4, CMax4); - this.V7R = Numerics.Clamp(this.V7R + COff4, CMin4, CMax4); + Vector4 min = Vector4.Zero; + Vector4 max = new(maximum); + Vector4 off = new(MathF.Ceiling(maximum * 0.5F)); + + this.V0L = Vector4.Clamp(this.V0L + off, min, max); + this.V0R = Vector4.Clamp(this.V0R + off, min, max); + this.V1L = Vector4.Clamp(this.V1L + off, min, max); + this.V1R = Vector4.Clamp(this.V1R + off, min, max); + this.V2L = Vector4.Clamp(this.V2L + off, min, max); + this.V2R = Vector4.Clamp(this.V2R + off, min, max); + this.V3L = Vector4.Clamp(this.V3L + off, min, max); + this.V3R = Vector4.Clamp(this.V3R + off, min, max); + this.V4L = Vector4.Clamp(this.V4L + off, min, max); + this.V4R = Vector4.Clamp(this.V4R + off, min, max); + this.V5L = Vector4.Clamp(this.V5L + off, min, max); + this.V5R = Vector4.Clamp(this.V5R + off, min, max); + this.V6L = Vector4.Clamp(this.V6L + off, min, max); + this.V6R = Vector4.Clamp(this.V6R + off, min, max); + this.V7L = Vector4.Clamp(this.V7L + off, min, max); + this.V7R = Vector4.Clamp(this.V7R + off, min, max); } /// @@ -44,7 +44,7 @@ internal partial struct Block8x8F [MethodImpl(InliningOptions.ShortMethod)] public void NormalizeColorsAndRoundInPlaceVector256(float maximum) { - Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); + Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); Vector256 max = Vector256.Create(maximum); ref Vector256 row0 = ref Unsafe.As>(ref this.V0L); @@ -103,6 +103,7 @@ internal partial struct 
Block8x8F /// /// Fill the block from 'source' doing short -> float conversion. /// + /// The source block public void LoadFromInt16Scalar(ref Block8x8 source) { ref short selfRef = ref Unsafe.As(ref source); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 7aa1fb296..2eecafc13 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -159,17 +159,18 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void MultiplyInPlace(float value) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 = Avx.Multiply(this.V0, valueVec); - this.V1 = Avx.Multiply(this.V1, valueVec); - this.V2 = Avx.Multiply(this.V2, valueVec); - this.V3 = Avx.Multiply(this.V3, valueVec); - this.V4 = Avx.Multiply(this.V4, valueVec); - this.V5 = Avx.Multiply(this.V5, valueVec); - this.V6 = Avx.Multiply(this.V6, valueVec); - this.V7 = Avx.Multiply(this.V7, valueVec); + this.V0 *= valueVec; + this.V1 *= valueVec; + this.V2 *= valueVec; + this.V3 *= valueVec; + this.V4 *= valueVec; + this.V5 *= valueVec; + this.V6 *= valueVec; + this.V7 *= valueVec; } else { @@ -200,16 +201,17 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public unsafe void MultiplyInPlace(ref Block8x8F other) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { - this.V0 = Avx.Multiply(this.V0, other.V0); - this.V1 = Avx.Multiply(this.V1, other.V1); - this.V2 = Avx.Multiply(this.V2, other.V2); - this.V3 = Avx.Multiply(this.V3, other.V3); - this.V4 = Avx.Multiply(this.V4, other.V4); - this.V5 = Avx.Multiply(this.V5, other.V5); - this.V6 = Avx.Multiply(this.V6, other.V6); - this.V7 = Avx.Multiply(this.V7, other.V7); + this.V0 *= other.V0; + this.V1 *= other.V1; + this.V2 *= other.V2; + this.V3 *= other.V3; + this.V4 *= other.V4; + this.V5 *= other.V5; + this.V6 *= other.V6; + this.V7 *= other.V7; } else { @@ -239,17 +241,18 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void AddInPlace(float value) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 = Avx.Add(this.V0, valueVec); - this.V1 = Avx.Add(this.V1, valueVec); - this.V2 = Avx.Add(this.V2, valueVec); - this.V3 = Avx.Add(this.V3, valueVec); - this.V4 = Avx.Add(this.V4, valueVec); - this.V5 = Avx.Add(this.V5, valueVec); - this.V6 = Avx.Add(this.V6, valueVec); - this.V7 = Avx.Add(this.V7, valueVec); + this.V0 += valueVec; + this.V1 += valueVec; + this.V2 += valueVec; + this.V3 += valueVec; + this.V4 += valueVec; + this.V5 += valueVec; + this.V6 += valueVec; + this.V7 += valueVec; } else { @@ -509,10 +512,10 @@ internal partial struct Block8x8F : IEquatable } /// - /// Transpose the block inplace. + /// Transpose the block in-place. 
/// [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInplace() + public void TransposeInPlace() { if (Avx.IsSupported) { @@ -520,15 +523,15 @@ internal partial struct Block8x8F : IEquatable } else { - this.TransposeInplace_Scalar(); + this.TransposeInPlace_Scalar(); } } /// - /// Scalar inplace transpose implementation for + /// Scalar in-place transpose implementation for /// [MethodImpl(InliningOptions.ShortMethod)] - private void TransposeInplace_Scalar() + private void TransposeInPlace_Scalar() { ref float elemRef = ref Unsafe.As(ref this); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs index 7e102f696..b11d834a8 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs @@ -20,7 +20,7 @@ internal static partial class FloatingPointDCT FDCT8x8_1D_Avx(ref block); // Second pass - process rows - block.TransposeInplace(); + block.TransposeInPlace(); FDCT8x8_1D_Avx(ref block); // Applies 1D floating point FDCT inplace @@ -81,7 +81,7 @@ internal static partial class FloatingPointDCT IDCT8x8_1D_Avx(ref transposedBlock); // Second pass - process rows - transposedBlock.TransposeInplace(); + transposedBlock.TransposeInPlace(); IDCT8x8_1D_Avx(ref transposedBlock); // Applies 1D floating point FDCT inplace diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs index 0aca33b4c..4c22307cf 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs @@ -77,7 +77,7 @@ internal static partial class FloatingPointDCT // Spectral macroblocks are transposed before quantization // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// @@ -97,7 +97,7 @@ internal static partial class FloatingPointDCT // Spectral macroblocks are not transposed before quantization // Transpose is done after quantization at zig-zag stage // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// @@ -155,7 +155,7 @@ internal static partial class FloatingPointDCT IDCT8x4_Vector4(ref transposedBlock.V0R); // Second pass - process rows - transposedBlock.TransposeInplace(); + transposedBlock.TransposeInPlace(); IDCT8x4_Vector4(ref transposedBlock.V0L); IDCT8x4_Vector4(ref transposedBlock.V0R); @@ -225,7 +225,7 @@ internal static partial class FloatingPointDCT FDCT8x4_Vector4(ref block.V0R); // Second pass - process rows - block.TransposeInplace(); + block.TransposeInPlace(); FDCT8x4_Vector4(ref block.V0L); FDCT8x4_Vector4(ref block.V0R); diff --git a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs index 98e385797..b8234ff3e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs @@ -48,7 +48,7 @@ internal static class ScaledFloatingPointDCT // Spectral macroblocks are transposed before quantization // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs 
index 07907f21d..caca630bc 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -14,7 +14,7 @@ public class Block8x8F_Transpose [Benchmark] public float TransposeInplace() { - this.source.TransposeInplace(); + this.source.TransposeInPlace(); return this.source[0]; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 4d804f646..7b73c0c52 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -130,7 +130,7 @@ public partial class Block8x8FTests : JpegFixture Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData()); - block8x8.TransposeInplace(); + block8x8.TransposeInPlace(); float[] actual = new float[64]; block8x8.ScaledCopyTo(actual); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 5a1488c41..7b411a28f 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -62,7 +62,7 @@ public static class DCTTests FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation @@ -95,7 +95,7 @@ public static class DCTTests FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation @@ -136,7 +136,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); FloatingPointDCT.TransformIDCT(ref srcBlock); float[] actualDest = srcBlock.ToArray(); @@ -182,7 +182,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); ScaledFloatingPointDCT.TransformIDCT_4x4(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); Span expectedSpan = expectedDest.AsSpan(); @@ -243,7 +243,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); ScaledFloatingPointDCT.TransformIDCT_2x2(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); Span expectedSpan = expectedDest.AsSpan(); @@ -338,7 +338,7 @@ public static class DCTTests // Second transpose call is done by Quantize step // Do this manually here just to be complient to the reference implementation FloatingPointDCT.TransformFDCT(ref block); - block.TransposeInplace(); + block.TransposeInPlace(); // Part of the IDCT calculations is fused into the quantization step // We must multiply input block with adjusted no-quantization matrix From 5125a0480fe2f1249cc578da1336c38359b6430a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 13:38:19 +1000 Subject: [PATCH 03/12] Rename utils, organize BlockF8x8 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 194 +++++++-------- .../Common/Helpers/Vector128Utilities.cs | 8 +- .../Common/Helpers/Vector256Utilities.cs | 16 +- .../Common/Helpers/Vector512Utilities.cs | 30 ++- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 145 
------------ .../Jpeg/Components/Block8x8F.Round.cs | 183 --------------- .../Jpeg/Components/Block8x8F.Vector128.cs | 66 ++++++ .../Jpeg/Components/Block8x8F.Vector256.cs | 191 +++++++++++++++ .../Formats/Jpeg/Components/Block8x8F.cs | 221 ++++++++++-------- .../JpegColorConverter.GrayScaleVector128.cs | 4 +- .../JpegColorConverter.GrayScaleVector256.cs | 4 +- .../JpegColorConverter.GrayScaleVector512.cs | 4 +- .../JpegColorConverter.YCbCrVector128.cs | 4 +- .../JpegColorConverter.YCbCrVector256.cs | 4 +- .../JpegColorConverter.YCbCrVector512.cs | 4 +- .../JpegColorConverter.YccKVector128.cs | 4 +- .../JpegColorConverter.YccKVector256.cs | 4 +- .../JpegColorConverter.YccKVector512.cs | 2 +- .../Components/FloatingPointDCT.Intrinsic.cs | 64 ++--- src/ImageSharp/Formats/Webp/AlphaDecoder.cs | 6 +- .../Codecs/Jpeg/DecodeJpeg.cs | 11 +- .../Config.HwIntrinsics.cs | 33 +++ 22 files changed, 624 insertions(+), 578 deletions(-) delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 372fff08c..449dc37d0 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -66,9 +66,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) || - (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) || - (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat)) + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) || + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -112,9 +112,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) || - (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) || - (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte)) + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) || + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) { int remainder = source.Length % (Vector128.Count * 3); @@ -190,7 +190,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && 
Vector128_.SupportsShiftByte) { int remainder = source.Length % (Vector128.Count * 3); @@ -223,7 +223,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) { int remainder = source.Length & ((Vector128.Count * 4) - 1); // bit-hack for modulo @@ -249,7 +249,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) { ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -263,21 +263,21 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512Utilities.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector512_.Shuffle(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), control); } } } - else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) { ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -291,21 +291,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256Utilities.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector256_.Shuffle(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), control); } } } - else if 
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat) + else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat) { ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -319,17 +319,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector128Utilities.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector128_.Shuffle(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), control); } } } @@ -341,7 +341,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) { Span temp = stackalloc byte[Vector512.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -359,21 +359,21 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512Utilities.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector512_.Shuffle(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); } } } - else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) { Span temp = stackalloc byte[Vector256.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -391,21 +391,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256Utilities.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, 
(nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector256_.Shuffle(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); } } } - else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte) + else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte) { Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -423,17 +423,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector128Utilities.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector128_.Shuffle(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); } } } @@ -445,11 +445,11 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); - Vector128 maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); + Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref bytes, control); @@ -467,28 +467,28 @@ internal static partial class SimdUtils Vector128 v0 = vs; Vector128 v1 = Unsafe.Add(ref vs, (nuint)1); Vector128 v2 = Unsafe.Add(ref vs, (nuint)2); - Vector128 v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + Vector128 v3 = Vector128_.ShiftRightBytesInVector(v2, 4); - v2 = Vector128Utilities.AlignRight(v2, v1, 8); - v1 = Vector128Utilities.AlignRight(v1, v0, 12); + v2 = Vector128_.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v1, v0, 12); - v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16), mask); - v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16), mask); - v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16), mask); - v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16), mask); + 
v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16), mask); + v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16), mask); + v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16), mask); + v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16), mask); - v0 = Vector128Utilities.Shuffle(v0, maskE); - v1 = Vector128Utilities.Shuffle(v1, maskSlice4Nx16); - v2 = Vector128Utilities.Shuffle(v2, maskE); - v3 = Vector128Utilities.Shuffle(v3, maskSlice4Nx16); + v0 = Vector128_.Shuffle(v0, maskE); + v1 = Vector128_.Shuffle(v1, maskSlice4Nx16); + v2 = Vector128_.Shuffle(v2, maskE); + v3 = Vector128_.Shuffle(v3, maskSlice4Nx16); - v0 = Vector128Utilities.AlignRight(v1, v0, 4); - v3 = Vector128Utilities.AlignRight(v3, v2, 12); + v0 = Vector128_.AlignRight(v1, v0, 4); + v3 = Vector128_.AlignRight(v3, v2, 12); - v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4); - v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + v1 = Vector128_.ShiftLeftBytesInVector(v1, 4); + v2 = Vector128_.ShiftRightBytesInVector(v2, 4); - v1 = Vector128Utilities.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v2, v1, 8); ref Vector128 vd = ref Unsafe.Add(ref destinationBase, i); @@ -505,7 +505,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -527,17 +527,17 @@ internal static partial class SimdUtils ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); Vector128 v1 = Unsafe.Add(ref v0, 1); Vector128 v2 = Unsafe.Add(ref v0, 2); - Vector128 v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + Vector128 v3 = Vector128_.ShiftRightBytesInVector(v2, 4); - v2 = Vector128Utilities.AlignRight(v2, v1, 8); - v1 = Vector128Utilities.AlignRight(v1, v0, 12); + v2 = Vector128_.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v1, v0, 12); ref Vector128 vd = ref Unsafe.Add(ref destinationBase, j); - vd = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 1) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 2) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 3) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16) | fill, mask); + vd = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 1) = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 2) = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 3) = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16) | fill, mask); } } } @@ -548,10 +548,10 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); - Vector128 maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); + Vector128 maskE = 
Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -574,18 +574,18 @@ internal static partial class SimdUtils Vector128 v2 = Unsafe.Add(ref vs, 2); Vector128 v3 = Unsafe.Add(ref vs, 3); - v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, mask), maskE); - v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, mask), maskSlice4Nx16); - v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, mask), maskE); - v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, mask), maskSlice4Nx16); + v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, mask), maskE); + v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, mask), maskSlice4Nx16); + v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, mask), maskE); + v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, mask), maskSlice4Nx16); - v0 = Vector128Utilities.AlignRight(v1, v0, 4); - v3 = Vector128Utilities.AlignRight(v3, v2, 12); + v0 = Vector128_.AlignRight(v1, v0, 4); + v3 = Vector128_.AlignRight(v3, v2, 12); - v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4); - v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + v1 = Vector128_.ShiftLeftBytesInVector(v1, 4); + v2 = Vector128_.ShiftRightBytesInVector(v2, 4); - v1 = Vector128Utilities.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v2, v1, 8); ref Vector128 vd = ref Unsafe.Add(ref destinationBase, j); @@ -965,10 +965,10 @@ internal static partial class SimdUtils Vector512 f2 = scale * Unsafe.Add(ref s, 2); Vector512 f3 = scale * Unsafe.Add(ref s, 3); - Vector512 w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0); - Vector512 w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1); - Vector512 w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2); - Vector512 w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3); + Vector512 w0 = Vector512_.ConvertToInt32RoundToEven(f0); + Vector512 w1 = Vector512_.ConvertToInt32RoundToEven(f1); + Vector512 w2 = Vector512_.ConvertToInt32RoundToEven(f2); + Vector512 w3 = Vector512_.ConvertToInt32RoundToEven(f3); Vector512 u0 = Avx512BW.PackSignedSaturate(w0, w1); Vector512 u1 = Avx512BW.PackSignedSaturate(w2, w3); @@ -999,10 +999,10 @@ internal static partial class SimdUtils Vector256 f2 = scale * Unsafe.Add(ref s, 2); Vector256 f3 = scale * Unsafe.Add(ref s, 3); - Vector256 w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0); - Vector256 w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1); - Vector256 w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2); - Vector256 w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3); + Vector256 w0 = Vector256_.ConvertToInt32RoundToEven(f0); + Vector256 w1 = Vector256_.ConvertToInt32RoundToEven(f1); + Vector256 w2 = Vector256_.ConvertToInt32RoundToEven(f2); + Vector256 w3 = Vector256_.ConvertToInt32RoundToEven(f3); Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); @@ -1033,15 +1033,15 @@ internal static partial class SimdUtils Vector128 f2 = scale * Unsafe.Add(ref s, 2); Vector128 f3 = scale * Unsafe.Add(ref s, 3); - Vector128 w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0); - Vector128 w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1); - Vector128 w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2); - Vector128 w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3); + Vector128 w0 = Vector128_.ConvertToInt32RoundToEven(f0); + Vector128 w1 = Vector128_.ConvertToInt32RoundToEven(f1); + Vector128 w2 = Vector128_.ConvertToInt32RoundToEven(f2); + 
Vector128 w3 = Vector128_.ConvertToInt32RoundToEven(f3); - Vector128 u0 = Vector128Utilities.PackSignedSaturate(w0, w1); - Vector128 u1 = Vector128Utilities.PackSignedSaturate(w2, w3); + Vector128 u0 = Vector128_.PackSignedSaturate(w0, w1); + Vector128 u1 = Vector128_.PackSignedSaturate(w2, w3); - Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1); + Unsafe.Add(ref destinationBase, i) = Vector128_.PackUnsignedSaturate(u0, u1); } } } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index e99eecc42..85b09b351 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -19,7 +19,9 @@ namespace SixLabors.ImageSharp.Common.Helpers; /// /// Should only be used if the intrinsics are available. /// -internal static class Vector128Utilities +#pragma warning disable SA1649 // File name should match first type name +internal static class Vector128_ +#pragma warning restore SA1649 // File name should match first type name { /// /// Gets a value indicating whether shuffle operations are supported. @@ -314,8 +316,8 @@ internal static class Vector128Utilities return Vector128.Narrow(lefClamped, rightClamped); } - /// Restricts a vector between a minimum and a maximum value. + /// + /// Restricts a vector between a minimum and a maximum value. /// /// The type of the elements in the vector. /// The vector to restrict. diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 4c12cb272..893b6240d 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -17,7 +17,9 @@ namespace SixLabors.ImageSharp.Common.Helpers; /// /// Should only be used if the intrinsics are available. /// -internal static class Vector256Utilities +#pragma warning disable SA1649 // File name should match first type name +internal static class Vector256_ +#pragma warning restore SA1649 // File name should match first type name { /// /// Gets a value indicating whether shuffle byte operations are supported. @@ -152,6 +154,18 @@ internal static class Vector256Utilities return va + (vm0 * vm1); } + /// + /// Restricts a vector between a minimum and a maximum value. + /// + /// The type of the elements in the vector. + /// The vector to restrict. + /// The minimum value. + /// The maximum value. + /// The restricted . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 Clamp(Vector256 value, Vector256 min, Vector256 max) + => Vector256.Min(Vector256.Max(value, min), max); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 40e8ac344..3c773bc52 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -17,7 +17,9 @@ namespace SixLabors.ImageSharp.Common.Helpers; /// /// Should only be used if the intrinsics are available. /// -internal static class Vector512Utilities +#pragma warning disable SA1649 // File name should match first type name +internal static class Vector512_ +#pragma warning restore SA1649 // File name should match first type name { /// /// Gets a value indicating whether shuffle float operations are supported. 
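The Clamp helper introduced above is the usual lane-wise min-of-max composition. A quick illustration (input values invented for the example):

using System.Runtime.Intrinsics;

// Clamp(value, min, max) == Min(Max(value, min), max), applied per lane.
Vector256<float> value = Vector256.Create(-12F, 0F, 63.5F, 255F, 300F, 17F, -1F, 128F);
Vector256<float> clamped = Vector256.Min(Vector256.Max(value, Vector256<float>.Zero), Vector256.Create(255F));
// clamped == <0, 0, 63.5, 255, 255, 17, 0, 128>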
@@ -126,6 +128,13 @@ internal static class Vector512Utilities return Avx512F.RoundScale(vector, 0b0000_1000); } + if (Avx.IsSupported) + { + Vector256 lower = Avx.RoundToNearestInteger(vector.GetLower()); + Vector256 upper = Avx.RoundToNearestInteger(vector.GetUpper()); + return Vector512.Create(lower, upper); + } + Vector512 sign = vector & Vector512.Create(-0F); Vector512 val_2p23_f32 = sign | Vector512.Create(8388608F); @@ -152,9 +161,28 @@ internal static class Vector512Utilities return Avx512F.FusedMultiplyAdd(vm0, vm1, va); } + if (Fma.IsSupported) + { + Vector256 lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower()); + Vector256 upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper()); + return Vector512.Create(lower, upper); + } + return va + (vm0 * vm1); } + /// + /// Restricts a vector between a minimum and a maximum value. + /// + /// The type of the elements in the vector. + /// The vector to restrict. + /// The minimum value. + /// The maximum value. + /// The restricted . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 Clamp(Vector512 value, Vector512 min, Vector512 max) + => Vector512.Min(Vector512.Max(value, min), max); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs deleted file mode 100644 index 3921eccb7..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal partial struct Block8x8F -{ - /// - /// A number of rows of 8 scalar coefficients each in - /// - public const int RowCount = 8; - - [FieldOffset(0)] - public Vector256 V0; - [FieldOffset(32)] - public Vector256 V1; - [FieldOffset(64)] - public Vector256 V2; - [FieldOffset(96)] - public Vector256 V3; - [FieldOffset(128)] - public Vector256 V4; - [FieldOffset(160)] - public Vector256 V5; - [FieldOffset(192)] - public Vector256 V6; - [FieldOffset(224)] - public Vector256 V7; - - private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) - { - DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); - - ref Vector256 aBase = ref a.V0; - ref Vector256 bBase = ref b.V0; - - ref Vector256 destRef = ref dest.V01; - Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); - - for (nuint i = 0; i < 8; i += 2) - { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); - - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); - - Unsafe.Add(ref destRef, i / 2) = row; - } - } - - private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) - { - DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); - - ref Vector128 aBase = ref Unsafe.As>(ref a); - ref Vector128 
bBase = ref Unsafe.As>(ref b); - - ref Vector128 destBase = ref Unsafe.As>(ref dest); - - // TODO: We can use the v128 utilities for this. - for (nuint i = 0; i < 16; i += 2) - { - Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); - - Vector128 row = Sse2.PackSignedSaturate(left, right); - Unsafe.Add(ref destBase, i / 2) = row; - } - } - - private void TransposeInplace_Avx() - { - // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - this.V0, - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - this.V1, - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - this.V2, - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - this.V3, - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); - - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackLow(r2, r3); - Vector256 v = Avx.Shuffle(t0, t2, 0x4E); - this.V0 = Avx.Blend(t0, v, 0xCC); - this.V1 = Avx.Blend(t2, v, 0x33); - - Vector256 t4 = Avx.UnpackLow(r4, r5); - Vector256 t6 = Avx.UnpackLow(r6, r7); - v = Avx.Shuffle(t4, t6, 0x4E); - this.V4 = Avx.Blend(t4, v, 0xCC); - this.V5 = Avx.Blend(t6, v, 0x33); - - Vector256 t1 = Avx.UnpackHigh(r0, r1); - Vector256 t3 = Avx.UnpackHigh(r2, r3); - v = Avx.Shuffle(t1, t3, 0x4E); - this.V2 = Avx.Blend(t1, v, 0xCC); - this.V3 = Avx.Blend(t3, v, 0x33); - - Vector256 t5 = Avx.UnpackHigh(r4, r5); - Vector256 t7 = Avx.UnpackHigh(r6, r7); - v = Avx.Shuffle(t5, t7, 0x4E); - this.V6 = Avx.Blend(t5, v, 0xCC); - this.V7 = Avx.Blend(t7, v, 0x33); - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs deleted file mode 100644 index 899a883e4..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; - -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal partial struct Block8x8F -{ - /// - /// Level shift by +maximum/2, clip to [0, maximum] - /// - /// The maximum value to normalize to. 
- public void NormalizeColorsInPlace(float maximum) - { - Vector4 min = Vector4.Zero; - Vector4 max = new(maximum); - Vector4 off = new(MathF.Ceiling(maximum * 0.5F)); - - this.V0L = Vector4.Clamp(this.V0L + off, min, max); - this.V0R = Vector4.Clamp(this.V0R + off, min, max); - this.V1L = Vector4.Clamp(this.V1L + off, min, max); - this.V1R = Vector4.Clamp(this.V1R + off, min, max); - this.V2L = Vector4.Clamp(this.V2L + off, min, max); - this.V2R = Vector4.Clamp(this.V2R + off, min, max); - this.V3L = Vector4.Clamp(this.V3L + off, min, max); - this.V3R = Vector4.Clamp(this.V3R + off, min, max); - this.V4L = Vector4.Clamp(this.V4L + off, min, max); - this.V4R = Vector4.Clamp(this.V4R + off, min, max); - this.V5L = Vector4.Clamp(this.V5L + off, min, max); - this.V5R = Vector4.Clamp(this.V5R + off, min, max); - this.V6L = Vector4.Clamp(this.V6L + off, min, max); - this.V6R = Vector4.Clamp(this.V6R + off, min, max); - this.V7L = Vector4.Clamp(this.V7L + off, min, max); - this.V7R = Vector4.Clamp(this.V7R + off, min, max); - } - - /// - /// version of and . - /// - /// The maximum value to normalize to. - [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector256(float maximum) - { - Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); - Vector256 max = Vector256.Create(maximum); - - ref Vector256 row0 = ref Unsafe.As>(ref this.V0L); - row0 = NormalizeAndRoundVector256(row0, off, max); - - ref Vector256 row1 = ref Unsafe.As>(ref this.V1L); - row1 = NormalizeAndRoundVector256(row1, off, max); - - ref Vector256 row2 = ref Unsafe.As>(ref this.V2L); - row2 = NormalizeAndRoundVector256(row2, off, max); - - ref Vector256 row3 = ref Unsafe.As>(ref this.V3L); - row3 = NormalizeAndRoundVector256(row3, off, max); - - ref Vector256 row4 = ref Unsafe.As>(ref this.V4L); - row4 = NormalizeAndRoundVector256(row4, off, max); - - ref Vector256 row5 = ref Unsafe.As>(ref this.V5L); - row5 = NormalizeAndRoundVector256(row5, off, max); - - ref Vector256 row6 = ref Unsafe.As>(ref this.V6L); - row6 = NormalizeAndRoundVector256(row6, off, max); - - ref Vector256 row7 = ref Unsafe.As>(ref this.V7L); - row7 = NormalizeAndRoundVector256(row7, off, max); - } - - /// - /// version of and . - /// - /// The maximum value to normalize to. 
- [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector128(float maximum) - { - Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); - Vector128 max = Vector128.Create(maximum); - - this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); - this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); - this.V1L = NormalizeAndRoundVector128(this.V1L.AsVector128(), off, max).AsVector4(); - this.V1R = NormalizeAndRoundVector128(this.V1R.AsVector128(), off, max).AsVector4(); - this.V2L = NormalizeAndRoundVector128(this.V2L.AsVector128(), off, max).AsVector4(); - this.V2R = NormalizeAndRoundVector128(this.V2R.AsVector128(), off, max).AsVector4(); - this.V3L = NormalizeAndRoundVector128(this.V3L.AsVector128(), off, max).AsVector4(); - this.V3R = NormalizeAndRoundVector128(this.V3R.AsVector128(), off, max).AsVector4(); - this.V4L = NormalizeAndRoundVector128(this.V4L.AsVector128(), off, max).AsVector4(); - this.V4R = NormalizeAndRoundVector128(this.V4R.AsVector128(), off, max).AsVector4(); - this.V5L = NormalizeAndRoundVector128(this.V5L.AsVector128(), off, max).AsVector4(); - this.V5R = NormalizeAndRoundVector128(this.V5R.AsVector128(), off, max).AsVector4(); - this.V6L = NormalizeAndRoundVector128(this.V6L.AsVector128(), off, max).AsVector4(); - this.V6R = NormalizeAndRoundVector128(this.V6R.AsVector128(), off, max).AsVector4(); - this.V7L = NormalizeAndRoundVector128(this.V7L.AsVector128(), off, max).AsVector4(); - this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); - } - - /// - /// Fill the block from 'source' doing short -> float conversion. - /// - /// The source block - public void LoadFromInt16Scalar(ref Block8x8 source) - { - ref short selfRef = ref Unsafe.As(ref source); - - this.V0L.X = Unsafe.Add(ref selfRef, 0); - this.V0L.Y = Unsafe.Add(ref selfRef, 1); - this.V0L.Z = Unsafe.Add(ref selfRef, 2); - this.V0L.W = Unsafe.Add(ref selfRef, 3); - this.V0R.X = Unsafe.Add(ref selfRef, 4); - this.V0R.Y = Unsafe.Add(ref selfRef, 5); - this.V0R.Z = Unsafe.Add(ref selfRef, 6); - this.V0R.W = Unsafe.Add(ref selfRef, 7); - - this.V1L.X = Unsafe.Add(ref selfRef, 8); - this.V1L.Y = Unsafe.Add(ref selfRef, 9); - this.V1L.Z = Unsafe.Add(ref selfRef, 10); - this.V1L.W = Unsafe.Add(ref selfRef, 11); - this.V1R.X = Unsafe.Add(ref selfRef, 12); - this.V1R.Y = Unsafe.Add(ref selfRef, 13); - this.V1R.Z = Unsafe.Add(ref selfRef, 14); - this.V1R.W = Unsafe.Add(ref selfRef, 15); - - this.V2L.X = Unsafe.Add(ref selfRef, 16); - this.V2L.Y = Unsafe.Add(ref selfRef, 17); - this.V2L.Z = Unsafe.Add(ref selfRef, 18); - this.V2L.W = Unsafe.Add(ref selfRef, 19); - this.V2R.X = Unsafe.Add(ref selfRef, 20); - this.V2R.Y = Unsafe.Add(ref selfRef, 21); - this.V2R.Z = Unsafe.Add(ref selfRef, 22); - this.V2R.W = Unsafe.Add(ref selfRef, 23); - - this.V3L.X = Unsafe.Add(ref selfRef, 24); - this.V3L.Y = Unsafe.Add(ref selfRef, 25); - this.V3L.Z = Unsafe.Add(ref selfRef, 26); - this.V3L.W = Unsafe.Add(ref selfRef, 27); - this.V3R.X = Unsafe.Add(ref selfRef, 28); - this.V3R.Y = Unsafe.Add(ref selfRef, 29); - this.V3R.Z = Unsafe.Add(ref selfRef, 30); - this.V3R.W = Unsafe.Add(ref selfRef, 31); - - this.V4L.X = Unsafe.Add(ref selfRef, 32); - this.V4L.Y = Unsafe.Add(ref selfRef, 33); - this.V4L.Z = Unsafe.Add(ref selfRef, 34); - this.V4L.W = Unsafe.Add(ref selfRef, 35); - this.V4R.X = Unsafe.Add(ref selfRef, 36); - this.V4R.Y = Unsafe.Add(ref selfRef, 37); - this.V4R.Z = Unsafe.Add(ref selfRef, 
38); - this.V4R.W = Unsafe.Add(ref selfRef, 39); - - this.V5L.X = Unsafe.Add(ref selfRef, 40); - this.V5L.Y = Unsafe.Add(ref selfRef, 41); - this.V5L.Z = Unsafe.Add(ref selfRef, 42); - this.V5L.W = Unsafe.Add(ref selfRef, 43); - this.V5R.X = Unsafe.Add(ref selfRef, 44); - this.V5R.Y = Unsafe.Add(ref selfRef, 45); - this.V5R.Z = Unsafe.Add(ref selfRef, 46); - this.V5R.W = Unsafe.Add(ref selfRef, 47); - - this.V6L.X = Unsafe.Add(ref selfRef, 48); - this.V6L.Y = Unsafe.Add(ref selfRef, 49); - this.V6L.Z = Unsafe.Add(ref selfRef, 50); - this.V6L.W = Unsafe.Add(ref selfRef, 51); - this.V6R.X = Unsafe.Add(ref selfRef, 52); - this.V6R.Y = Unsafe.Add(ref selfRef, 53); - this.V6R.Z = Unsafe.Add(ref selfRef, 54); - this.V6R.W = Unsafe.Add(ref selfRef, 55); - - this.V7L.X = Unsafe.Add(ref selfRef, 56); - this.V7L.Y = Unsafe.Add(ref selfRef, 57); - this.V7L.Z = Unsafe.Add(ref selfRef, 58); - this.V7L.W = Unsafe.Add(ref selfRef, 59); - this.V7R.X = Unsafe.Add(ref selfRef, 60); - this.V7R.Y = Unsafe.Add(ref selfRef, 61); - this.V7R.Z = Unsafe.Add(ref selfRef, 62); - this.V7R.W = Unsafe.Add(ref selfRef, 63); - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs new file mode 100644 index 000000000..37332db62 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -0,0 +1,66 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +/// +/// version of . +/// +internal partial struct Block8x8F +{ + /// + /// version of and . + /// + /// The maximum value to normalize to. 
+ [MethodImpl(InliningOptions.ShortMethod)] + public void NormalizeColorsAndRoundInPlaceVector128(float maximum) + { + Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); + Vector128 max = Vector128.Create(maximum); + + this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); + this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); + this.V1L = NormalizeAndRoundVector128(this.V1L.AsVector128(), off, max).AsVector4(); + this.V1R = NormalizeAndRoundVector128(this.V1R.AsVector128(), off, max).AsVector4(); + this.V2L = NormalizeAndRoundVector128(this.V2L.AsVector128(), off, max).AsVector4(); + this.V2R = NormalizeAndRoundVector128(this.V2R.AsVector128(), off, max).AsVector4(); + this.V3L = NormalizeAndRoundVector128(this.V3L.AsVector128(), off, max).AsVector4(); + this.V3R = NormalizeAndRoundVector128(this.V3R.AsVector128(), off, max).AsVector4(); + this.V4L = NormalizeAndRoundVector128(this.V4L.AsVector128(), off, max).AsVector4(); + this.V4R = NormalizeAndRoundVector128(this.V4R.AsVector128(), off, max).AsVector4(); + this.V5L = NormalizeAndRoundVector128(this.V5L.AsVector128(), off, max).AsVector4(); + this.V5R = NormalizeAndRoundVector128(this.V5R.AsVector128(), off, max).AsVector4(); + this.V6L = NormalizeAndRoundVector128(this.V6L.AsVector128(), off, max).AsVector4(); + this.V6R = NormalizeAndRoundVector128(this.V6R.AsVector128(), off, max).AsVector4(); + this.V7L = NormalizeAndRoundVector128(this.V7L.AsVector128(), off, max).AsVector4(); + this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 NormalizeAndRoundVector128(Vector128 value, Vector128 off, Vector128 max) + => Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128.Zero, max)); + + private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); + + ref Vector128 aBase = ref Unsafe.As>(ref a); + ref Vector128 bBase = ref Unsafe.As>(ref b); + + ref Vector128 destBase = ref Unsafe.As>(ref dest); + + // TODO: We can use the v128 utilities for this. + for (nuint i = 0; i < 16; i += 2) + { + Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Unsafe.Add(ref destBase, i / 2) = Sse2.PackSignedSaturate(left, right); + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs new file mode 100644 index 000000000..a7d5c89b3 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -0,0 +1,191 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +/// +/// version of . 
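To make the level shift concrete: with maximum = 255 the offset is MathF.Ceiling(255 * 0.5F) = 128, so a post-IDCT value of -70.3F becomes round(clamp(-70.3 + 128, 0, 255)) = 58, while 200.6F saturates to 255. A scalar restatement of one lane (the NormalizeSketch class and method names are illustrative only):

using System;

internal static class NormalizeSketch
{
    // One lane of NormalizeAndRoundVector128: level shift, clamp, round to nearest even.
    public static float NormalizeAndRound(float value, float maximum)
    {
        float off = MathF.Ceiling(maximum * 0.5F);            // 128 for an 8-bit range
        float shifted = Math.Clamp(value + off, 0F, maximum);
        return MathF.Round(shifted, MidpointRounding.ToEven); // matches RoundToNearestInteger
    }

    // NormalizeAndRound(-70.3F, 255F) == 58
    // NormalizeAndRound(200.6F, 255F) == 255
}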
+/// +internal partial struct Block8x8F +{ + /// + /// A number of rows of 8 scalar coefficients each in + /// + public const int RowCount = 8; + +#pragma warning disable SA1310 // Field names should not contain underscore + [FieldOffset(0)] + public Vector256 V256_0; + [FieldOffset(32)] + public Vector256 V256_1; + [FieldOffset(64)] + public Vector256 V256_2; + [FieldOffset(96)] + public Vector256 V256_3; + [FieldOffset(128)] + public Vector256 V256_4; + [FieldOffset(160)] + public Vector256 V256_5; + [FieldOffset(192)] + public Vector256 V256_6; + [FieldOffset(224)] + public Vector256 V256_7; +#pragma warning restore SA1310 // Field names should not contain underscore + + /// + /// version of and . + /// + /// The maximum value to normalize to. + [MethodImpl(InliningOptions.ShortMethod)] + public void NormalizeColorsAndRoundInPlaceVector256(float maximum) + { + Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); + Vector256 max = Vector256.Create(maximum); + + this.V256_0 = NormalizeAndRoundVector256(this.V256_0, off, max); + this.V256_1 = NormalizeAndRoundVector256(this.V256_1, off, max); + this.V256_2 = NormalizeAndRoundVector256(this.V256_2, off, max); + this.V256_3 = NormalizeAndRoundVector256(this.V256_3, off, max); + this.V256_4 = NormalizeAndRoundVector256(this.V256_4, off, max); + this.V256_5 = NormalizeAndRoundVector256(this.V256_5, off, max); + this.V256_6 = NormalizeAndRoundVector256(this.V256_6, off, max); + this.V256_7 = NormalizeAndRoundVector256(this.V256_7, off, max); + } + + /// + /// Loads values from using extended AVX2 intrinsics. + /// + /// The source + public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) + { + DebugGuard.IsTrue( + Avx2.IsSupported, + "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!"); + + ref short sRef = ref Unsafe.As(ref source); + ref Vector256 dRef = ref Unsafe.As>(ref this); + + // Vector256.Count == 16 on AVX2 + // We can process 2 block rows in a single step + Vector256 top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef)); + Vector256 bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); + dRef = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); + Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); + Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); + Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 NormalizeAndRoundVector256(Vector256 value, Vector256 off, Vector256 max) + => Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256.Zero, max)); + + private static unsafe void 
MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); + + ref Vector256 aBase = ref a.V256_0; + ref Vector256 bBase = ref b.V256_0; + + ref Vector256 destRef = ref dest.V01; + Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); + + for (nuint i = 0; i < 8; i += 2) + { + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); + + Unsafe.Add(ref destRef, i / 2) = row; + } + } + + private void TransposeInplace_Avx() + { + // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 + Vector256 r0 = Avx.InsertVector128( + this.V256_0, + Unsafe.As>(ref this.V4L), + 1); + + Vector256 r1 = Avx.InsertVector128( + this.V256_1, + Unsafe.As>(ref this.V5L), + 1); + + Vector256 r2 = Avx.InsertVector128( + this.V256_2, + Unsafe.As>(ref this.V6L), + 1); + + Vector256 r3 = Avx.InsertVector128( + this.V256_3, + Unsafe.As>(ref this.V7L), + 1); + + Vector256 r4 = Avx.InsertVector128( + Unsafe.As>(ref this.V0R).ToVector256(), + Unsafe.As>(ref this.V4R), + 1); + + Vector256 r5 = Avx.InsertVector128( + Unsafe.As>(ref this.V1R).ToVector256(), + Unsafe.As>(ref this.V5R), + 1); + + Vector256 r6 = Avx.InsertVector128( + Unsafe.As>(ref this.V2R).ToVector256(), + Unsafe.As>(ref this.V6R), + 1); + + Vector256 r7 = Avx.InsertVector128( + Unsafe.As>(ref this.V3R).ToVector256(), + Unsafe.As>(ref this.V7R), + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackLow(r2, r3); + Vector256 v = Avx.Shuffle(t0, t2, 0x4E); + this.V256_0 = Avx.Blend(t0, v, 0xCC); + this.V256_1 = Avx.Blend(t2, v, 0x33); + + Vector256 t4 = Avx.UnpackLow(r4, r5); + Vector256 t6 = Avx.UnpackLow(r6, r7); + v = Avx.Shuffle(t4, t6, 0x4E); + this.V256_4 = Avx.Blend(t4, v, 0xCC); + this.V256_5 = Avx.Blend(t6, v, 0x33); + + Vector256 t1 = Avx.UnpackHigh(r0, r1); + Vector256 t3 = Avx.UnpackHigh(r2, r3); + v = Avx.Shuffle(t1, t3, 0x4E); + this.V256_2 = Avx.Blend(t1, v, 0xCC); + this.V256_3 = Avx.Blend(t3, v, 0x33); + + Vector256 t5 = Avx.UnpackHigh(r4, r5); + Vector256 t7 = Avx.UnpackHigh(r6, r7); + v = Avx.Shuffle(t5, t7, 0x4E); + this.V256_6 = Avx.Blend(t5, v, 0xCC); + this.V256_7 = Avx.Blend(t7, v, 0x33); + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 2eecafc13..ec563897d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -8,8 +8,6 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; -using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -25,7 +23,6 @@ internal partial struct Block8x8F : IEquatable /// public const int Size = 64; -#pragma warning disable SA1600 // ElementsMustBeDocumented [FieldOffset(0)] public Vector4 V0L; [FieldOffset(16)] @@ -65,7 +62,6 @@ internal partial struct Block8x8F : 
IEquatable public Vector4 V7L; [FieldOffset(240)] public Vector4 V7R; -#pragma warning restore SA1600 // ElementsMustBeDocumented /// /// Get/Set scalar elements at a given index @@ -159,18 +155,17 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void MultiplyInPlace(float value) { - // TODO: Vector512 if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 *= valueVec; - this.V1 *= valueVec; - this.V2 *= valueVec; - this.V3 *= valueVec; - this.V4 *= valueVec; - this.V5 *= valueVec; - this.V6 *= valueVec; - this.V7 *= valueVec; + this.V256_0 *= valueVec; + this.V256_1 *= valueVec; + this.V256_2 *= valueVec; + this.V256_3 *= valueVec; + this.V256_4 *= valueVec; + this.V256_5 *= valueVec; + this.V256_6 *= valueVec; + this.V256_7 *= valueVec; } else { @@ -201,17 +196,16 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public unsafe void MultiplyInPlace(ref Block8x8F other) { - // TODO: Vector512 if (Vector256.IsHardwareAccelerated) { - this.V0 *= other.V0; - this.V1 *= other.V1; - this.V2 *= other.V2; - this.V3 *= other.V3; - this.V4 *= other.V4; - this.V5 *= other.V5; - this.V6 *= other.V6; - this.V7 *= other.V7; + this.V256_0 *= other.V256_0; + this.V256_1 *= other.V256_1; + this.V256_2 *= other.V256_2; + this.V256_3 *= other.V256_3; + this.V256_4 *= other.V256_4; + this.V256_5 *= other.V256_5; + this.V256_6 *= other.V256_6; + this.V256_7 *= other.V256_7; } else { @@ -241,18 +235,17 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void AddInPlace(float value) { - // TODO: Vector512 if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 += valueVec; - this.V1 += valueVec; - this.V2 += valueVec; - this.V3 += valueVec; - this.V4 += valueVec; - this.V5 += valueVec; - this.V6 += valueVec; - this.V7 += valueVec; + this.V256_0 += valueVec; + this.V256_1 += valueVec; + this.V256_2 += valueVec; + this.V256_3 += valueVec; + this.V256_4 += valueVec; + this.V256_5 += valueVec; + this.V256_6 += valueVec; + this.V256_7 += valueVec; } else { @@ -352,6 +345,34 @@ internal partial struct Block8x8F : IEquatable } } + /// + /// Level shift by +maximum/2, clip to [0, maximum] + /// + /// The maximum value to normalize to. + public void NormalizeColorsInPlace(float maximum) + { + Vector4 min = Vector4.Zero; + Vector4 max = new(maximum); + Vector4 off = new(MathF.Ceiling(maximum * 0.5F)); + + this.V0L = Vector4.Clamp(this.V0L + off, min, max); + this.V0R = Vector4.Clamp(this.V0R + off, min, max); + this.V1L = Vector4.Clamp(this.V1L + off, min, max); + this.V1R = Vector4.Clamp(this.V1R + off, min, max); + this.V2L = Vector4.Clamp(this.V2L + off, min, max); + this.V2R = Vector4.Clamp(this.V2R + off, min, max); + this.V3L = Vector4.Clamp(this.V3L + off, min, max); + this.V3R = Vector4.Clamp(this.V3R + off, min, max); + this.V4L = Vector4.Clamp(this.V4L + off, min, max); + this.V4R = Vector4.Clamp(this.V4R + off, min, max); + this.V5L = Vector4.Clamp(this.V5L + off, min, max); + this.V5R = Vector4.Clamp(this.V5R + off, min, max); + this.V6L = Vector4.Clamp(this.V6L + off, min, max); + this.V6R = Vector4.Clamp(this.V6R + off, min, max); + this.V7L = Vector4.Clamp(this.V7L + off, min, max); + this.V7R = Vector4.Clamp(this.V7R + off, min, max); + } + /// /// Rounds all values in the block. 
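The V256_0..V256_7 fields used above overlap the existing V0L/V0R..V7L/V7R pairs through the struct's explicit layout, so the 256-bit paths read and write the same storage without Unsafe.As reinterpretation. A stripped-down sketch of that overlay (RowSketch is illustrative, not the real Block8x8F):

using System.Numerics;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

// The same 32 bytes viewed two ways: writes through Wide are visible via Left/Right.
[StructLayout(LayoutKind.Explicit)]
internal struct RowSketch
{
    [FieldOffset(0)]
    public Vector4 Left;           // elements 0..3

    [FieldOffset(16)]
    public Vector4 Right;          // elements 4..7

    [FieldOffset(0)]
    public Vector256<float> Wide;  // SIMD view of elements 0..7
}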
/// @@ -376,39 +397,84 @@ internal partial struct Block8x8F : IEquatable } /// - /// Loads values from using extended AVX2 intrinsics. + /// Fill the block from doing short -> float conversion. /// - /// The source - public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) + /// The source block + public void LoadFromInt16Scalar(ref Block8x8 source) { - DebugGuard.IsTrue( - Avx2.IsSupported, - "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!"); - - ref short sRef = ref Unsafe.As(ref source); - ref Vector256 dRef = ref Unsafe.As>(ref this); - - // Vector256.Count == 16 on AVX2 - // We can process 2 block rows in a single step - Vector256 top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef)); - Vector256 bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); - dRef = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); - Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); - Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); - Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom); + ref short selfRef = ref Unsafe.As(ref source); + + this.V0L.X = Unsafe.Add(ref selfRef, 0); + this.V0L.Y = Unsafe.Add(ref selfRef, 1); + this.V0L.Z = Unsafe.Add(ref selfRef, 2); + this.V0L.W = Unsafe.Add(ref selfRef, 3); + this.V0R.X = Unsafe.Add(ref selfRef, 4); + this.V0R.Y = Unsafe.Add(ref selfRef, 5); + this.V0R.Z = Unsafe.Add(ref selfRef, 6); + this.V0R.W = Unsafe.Add(ref selfRef, 7); + + this.V1L.X = Unsafe.Add(ref selfRef, 8); + this.V1L.Y = Unsafe.Add(ref selfRef, 9); + this.V1L.Z = Unsafe.Add(ref selfRef, 10); + this.V1L.W = Unsafe.Add(ref selfRef, 11); + this.V1R.X = Unsafe.Add(ref selfRef, 12); + this.V1R.Y = Unsafe.Add(ref selfRef, 13); + this.V1R.Z = Unsafe.Add(ref selfRef, 14); + this.V1R.W = Unsafe.Add(ref selfRef, 15); + + this.V2L.X = Unsafe.Add(ref selfRef, 16); + this.V2L.Y = Unsafe.Add(ref selfRef, 17); + this.V2L.Z = Unsafe.Add(ref selfRef, 18); + this.V2L.W = Unsafe.Add(ref selfRef, 19); + this.V2R.X = Unsafe.Add(ref selfRef, 20); + this.V2R.Y = Unsafe.Add(ref selfRef, 21); + this.V2R.Z = Unsafe.Add(ref selfRef, 22); + this.V2R.W = Unsafe.Add(ref selfRef, 23); + + this.V3L.X = Unsafe.Add(ref selfRef, 24); + this.V3L.Y = Unsafe.Add(ref selfRef, 25); + this.V3L.Z = Unsafe.Add(ref selfRef, 26); + this.V3L.W = Unsafe.Add(ref selfRef, 27); + this.V3R.X = Unsafe.Add(ref selfRef, 28); + this.V3R.Y = Unsafe.Add(ref selfRef, 29); + this.V3R.Z = Unsafe.Add(ref selfRef, 30); + this.V3R.W = Unsafe.Add(ref selfRef, 31); + + this.V4L.X = Unsafe.Add(ref selfRef, 32); + this.V4L.Y = Unsafe.Add(ref selfRef, 33); + this.V4L.Z = Unsafe.Add(ref selfRef, 34); + 
this.V4L.W = Unsafe.Add(ref selfRef, 35); + this.V4R.X = Unsafe.Add(ref selfRef, 36); + this.V4R.Y = Unsafe.Add(ref selfRef, 37); + this.V4R.Z = Unsafe.Add(ref selfRef, 38); + this.V4R.W = Unsafe.Add(ref selfRef, 39); + + this.V5L.X = Unsafe.Add(ref selfRef, 40); + this.V5L.Y = Unsafe.Add(ref selfRef, 41); + this.V5L.Z = Unsafe.Add(ref selfRef, 42); + this.V5L.W = Unsafe.Add(ref selfRef, 43); + this.V5R.X = Unsafe.Add(ref selfRef, 44); + this.V5R.Y = Unsafe.Add(ref selfRef, 45); + this.V5R.Z = Unsafe.Add(ref selfRef, 46); + this.V5R.W = Unsafe.Add(ref selfRef, 47); + + this.V6L.X = Unsafe.Add(ref selfRef, 48); + this.V6L.Y = Unsafe.Add(ref selfRef, 49); + this.V6L.Z = Unsafe.Add(ref selfRef, 50); + this.V6L.W = Unsafe.Add(ref selfRef, 51); + this.V6R.X = Unsafe.Add(ref selfRef, 52); + this.V6R.Y = Unsafe.Add(ref selfRef, 53); + this.V6R.Z = Unsafe.Add(ref selfRef, 54); + this.V6R.W = Unsafe.Add(ref selfRef, 55); + + this.V7L.X = Unsafe.Add(ref selfRef, 56); + this.V7L.Y = Unsafe.Add(ref selfRef, 57); + this.V7L.Z = Unsafe.Add(ref selfRef, 58); + this.V7L.W = Unsafe.Add(ref selfRef, 59); + this.V7R.X = Unsafe.Add(ref selfRef, 60); + this.V7R.Y = Unsafe.Add(ref selfRef, 61); + this.V7R.Z = Unsafe.Add(ref selfRef, 62); + this.V7R.W = Unsafe.Add(ref selfRef, 63); } /// @@ -422,11 +488,11 @@ internal partial struct Block8x8F : IEquatable const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); Vector256 targetVector = Vector256.Create(value); - ref Vector256 blockStride = ref this.V0; + ref Vector256 blockStride = ref this.V256_0; for (nuint i = 0; i < RowCount; i++) { - Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); + Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V256_0, i)), targetVector); if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) { return false; @@ -577,31 +643,4 @@ internal partial struct Block8x8F : IEquatable // row #6 RuntimeUtility.Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62)); } - - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) - { - row += off; - row = Vector.Max(row, Vector.Zero); - row = Vector.Min(row, max); - return row.FastRound(); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector256 NormalizeAndRoundVector256(Vector256 row, Vector256 off, Vector256 max) - { - row += off; - row = Vector256.Max(row, Vector256.Zero); - row = Vector256.Min(row, max); - return Vector256_.RoundToNearestInteger(row); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 NormalizeAndRoundVector128(Vector128 row, Vector128 off, Vector128 max) - { - row += off; - row = Vector128.Max(row, Vector128.Zero); - row = Vector128.Min(row, max); - return Vector128_.RoundToNearestInteger(row); - } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs index f3a6f7d37..4b350f6f3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
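In the EqualsToScalar hunk above, the equality test relies on Avx2.MoveMask packing the top bit of each of the 32 byte lanes into an int, so "every lane equal" is exactly mask == -1 (the unchecked all-ones constant). A small check (values invented):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

Vector256<int> a = Vector256.Create(7);
Vector256<int> b = Vector256.Create(7);
int mask = Avx2.MoveMask(Avx2.CompareEqual(a, b).AsByte());
// mask == -1 (all 32 bits set) when every lane matches; a differing lane clears its four bits.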
using System.Runtime.CompilerServices; @@ -60,7 +60,7 @@ internal abstract partial class JpegColorConverterBase ref Vector128 b = ref Unsafe.Add(ref srcBlue, i); // luminosity = (0.299 * r) + (0.587 * g) + (0.114 * b) - Unsafe.Add(ref destLuminance, i) = Vector128Utilities.MultiplyAdd(Vector128Utilities.MultiplyAdd(f0114 * b, f0587, g), f0299, r); + Unsafe.Add(ref destLuminance, i) = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f0114 * b, f0587, g), f0299, r); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs index 139ffc549..94b897e07 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs index 21d5eaa6f..638f4278b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs index 8cecd3956..6eabb3ee0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs index f8517e086..233437da9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. 
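In the grayscale hunk above, the nested MultiplyAdd calls expand to (0.114 * b + 0.587 * g) + 0.299 * r, the BT.601 luma weights quoted in the comment. A scalar check (the Luminance helper is illustrative only):

// Scalar restatement of the Vector128_ MultiplyAdd nesting.
static float Luminance(float r, float g, float b)
    => ((0.114F * b) + (0.587F * g)) + (0.299F * r);

// Luminance(1F, 1F, 1F) is ~1F (the weights sum to one)
// Luminance(1F, 0F, 0F) == 0.299F (pure red)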
// Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs index 7598a64b2..44c0bcf2b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs index 5bb2c5e5b..e36683dee 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs index 27f2ce035..b1228ba01 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs index 42d89a231..0db081c6f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs @@ -4,7 +4,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs index b11d834a8..862c77469 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs @@ -26,14 +26,14 @@ internal static partial class FloatingPointDCT // Applies 1D floating point FDCT inplace static void FDCT8x8_1D_Avx(ref Block8x8F block) { - Vector256 tmp0 = Avx.Add(block.V0, block.V7); - Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); - Vector256 tmp1 = Avx.Add(block.V1, block.V6); - Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); - Vector256 tmp2 = Avx.Add(block.V2, block.V5); - Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); - Vector256 tmp3 = Avx.Add(block.V3, block.V4); - Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); + Vector256 tmp0 = Avx.Add(block.V256_0, block.V256_7); + Vector256 tmp7 = Avx.Subtract(block.V256_0, block.V256_7); + Vector256 tmp1 = Avx.Add(block.V256_1, block.V256_6); + Vector256 tmp6 = Avx.Subtract(block.V256_1, block.V256_6); + Vector256 tmp2 = Avx.Add(block.V256_2, block.V256_5); + Vector256 tmp5 = Avx.Subtract(block.V256_2, block.V256_5); + Vector256 tmp3 = Avx.Add(block.V256_3, block.V256_4); + Vector256 tmp4 = Avx.Subtract(block.V256_3, block.V256_4); // Even part Vector256 tmp10 = Avx.Add(tmp0, tmp3); @@ -41,13 +41,13 @@ internal static partial class FloatingPointDCT Vector256 tmp11 = Avx.Add(tmp1, tmp2); Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); - block.V0 = Avx.Add(tmp10, tmp11); - block.V4 = Avx.Subtract(tmp10, tmp11); + block.V256_0 = Avx.Add(tmp10, tmp11); + block.V256_4 = Avx.Subtract(tmp10, tmp11); var mm256_F_0_7071 = Vector256.Create(0.707106781f); Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); - block.V2 = Avx.Add(tmp13, z1); - block.V6 = Avx.Subtract(tmp13, z1); + block.V256_2 = Avx.Add(tmp13, z1); + block.V256_6 = Avx.Subtract(tmp13, z1); // Odd part tmp10 = Avx.Add(tmp4, tmp5); @@ -62,10 +62,10 @@ internal static partial class FloatingPointDCT Vector256 z11 = Avx.Add(tmp7, z3); Vector256 z13 = Avx.Subtract(tmp7, z3); - block.V5 = Avx.Add(z13, z2); - block.V3 = Avx.Subtract(z13, z2); - block.V1 = Avx.Add(z11, z4); - block.V7 = Avx.Subtract(z11, z4); + block.V256_5 = Avx.Add(z13, z2); + block.V256_3 = Avx.Subtract(z13, z2); + block.V256_1 = Avx.Add(z11, z4); + block.V256_7 = Avx.Subtract(z11, z4); } } @@ -88,10 +88,10 @@ internal static partial class FloatingPointDCT 
static void IDCT8x8_1D_Avx(ref Block8x8F block) { // Even part - Vector256 tmp0 = block.V0; - Vector256 tmp1 = block.V2; - Vector256 tmp2 = block.V4; - Vector256 tmp3 = block.V6; + Vector256 tmp0 = block.V256_0; + Vector256 tmp1 = block.V256_2; + Vector256 tmp2 = block.V256_4; + Vector256 tmp3 = block.V256_6; Vector256 z5 = tmp0; Vector256 tmp10 = Avx.Add(z5, tmp2); @@ -107,10 +107,10 @@ internal static partial class FloatingPointDCT tmp2 = Avx.Subtract(tmp11, tmp12); // Odd part - Vector256 tmp4 = block.V1; - Vector256 tmp5 = block.V3; - Vector256 tmp6 = block.V5; - Vector256 tmp7 = block.V7; + Vector256 tmp4 = block.V256_1; + Vector256 tmp5 = block.V256_3; + Vector256 tmp6 = block.V256_5; + Vector256 tmp7 = block.V256_7; Vector256 z13 = Avx.Add(tmp6, tmp5); Vector256 z10 = Avx.Subtract(tmp6, tmp5); @@ -129,14 +129,14 @@ internal static partial class FloatingPointDCT tmp5 = Avx.Subtract(tmp11, tmp6); tmp4 = Avx.Subtract(tmp10, tmp5); - block.V0 = Avx.Add(tmp0, tmp7); - block.V7 = Avx.Subtract(tmp0, tmp7); - block.V1 = Avx.Add(tmp1, tmp6); - block.V6 = Avx.Subtract(tmp1, tmp6); - block.V2 = Avx.Add(tmp2, tmp5); - block.V5 = Avx.Subtract(tmp2, tmp5); - block.V3 = Avx.Add(tmp3, tmp4); - block.V4 = Avx.Subtract(tmp3, tmp4); + block.V256_0 = Avx.Add(tmp0, tmp7); + block.V256_7 = Avx.Subtract(tmp0, tmp7); + block.V256_1 = Avx.Add(tmp1, tmp6); + block.V256_6 = Avx.Subtract(tmp1, tmp6); + block.V256_2 = Avx.Add(tmp2, tmp5); + block.V256_5 = Avx.Subtract(tmp2, tmp5); + block.V256_3 = Avx.Add(tmp3, tmp4); + block.V256_4 = Avx.Subtract(tmp3, tmp4); } } } diff --git a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs index a9e63a3d0..43dab1ffc 100644 --- a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs +++ b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs @@ -326,11 +326,11 @@ internal class AlphaDecoder : IDisposable { Vector128 a0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, i)), 0); Vector128 a1 = a0.AsByte() + last.AsByte(); - Vector128 a2 = Vector128Utilities.ShiftLeftBytesInVector(a1, 1); + Vector128 a2 = Vector128_.ShiftLeftBytesInVector(a1, 1); Vector128 a3 = a1 + a2; - Vector128 a4 = Vector128Utilities.ShiftLeftBytesInVector(a3, 2); + Vector128 a4 = Vector128_.ShiftLeftBytesInVector(a3, 2); Vector128 a5 = a3 + a4; - Vector128 a6 = Vector128Utilities.ShiftLeftBytesInVector(a5, 4); + Vector128 a6 = Vector128_.ShiftLeftBytesInVector(a5, 4); Vector128 a7 = a5 + a6; ref byte outputRef = ref Unsafe.Add(ref dstRef, i); diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs index 0dc6d26bc..dbd255722 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs @@ -8,6 +8,7 @@ using SixLabors.ImageSharp.Tests; namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg; +[Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class DecodeJpeg { private JpegDecoder decoder; @@ -21,7 +22,7 @@ public class DecodeJpeg this.preloadedImageStream = new MemoryStream(bytes); } - private void GenericBechmark() + private void GenericBenchmark() { this.preloadedImageStream.Position = 0; using Image img = this.decoder.Decode(DecoderOptions.Default, this.preloadedImageStream); @@ -51,16 +52,16 @@ public class DecodeJpeg } [Benchmark(Description = "Baseline 4:4:4 Interleaved")] - public void JpegBaselineInterleaved444() => this.GenericBechmark(); + public void JpegBaselineInterleaved444() => this.GenericBenchmark(); [Benchmark(Description = 
"Baseline 4:2:0 Interleaved")] - public void JpegBaselineInterleaved420() => this.GenericBechmark(); + public void JpegBaselineInterleaved420() => this.GenericBenchmark(); [Benchmark(Description = "Baseline 4:0:0 (grayscale)")] - public void JpegBaseline400() => this.GenericBechmark(); + public void JpegBaseline400() => this.GenericBenchmark(); [Benchmark(Description = "Progressive 4:2:0 Non-Interleaved")] - public void JpegProgressiveNonInterleaved420() => this.GenericBechmark(); + public void JpegProgressiveNonInterleaved420() => this.GenericBenchmark(); } /* diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index e21d0c76d..9fd48301e 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -34,6 +34,7 @@ public partial class Config // like `LZCNT`, `BMI1`, or `BMI2` // `EnableSSE3_4` is a legacy switch that exists for compat and is basically the same as `EnableSSE3` private const string EnableAES = "DOTNET_EnableAES"; + private const string EnableAVX512F = "DOTNET_EnableAVX512F"; private const string EnableAVX = "DOTNET_EnableAVX"; private const string EnableAVX2 = "DOTNET_EnableAVX2"; private const string EnableBMI1 = "DOTNET_EnableBMI1"; @@ -76,4 +77,36 @@ public partial class Config } } } + + public class HwIntrinsics_SSE_AVX_AVX512F : Config + { + public HwIntrinsics_SSE_AVX_AVX512F() + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithEnvironmentVariables( + new EnvironmentVariable(EnableHWIntrinsic, Off), + new EnvironmentVariable(FeatureSIMD, Off)) + .WithId("1. No HwIntrinsics").AsBaseline()); + + if (Sse.IsSupported) + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) + .WithId("2. SSE")); + } + + if (Avx.IsSupported) + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX512F, Off)) + .WithId("3. AVX")); + } + + if (Avx512F.IsSupported) + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithId("3. 
AVX512F")); + } + } + } } From 30bdc29e4060bea18832c9e6e905398ed1d8c02a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 15:14:58 +1000 Subject: [PATCH 04/12] Migrate from Sse to general Vector128 for ZigZag --- .../Jpeg/Components/Block8x8F.Vector128.cs | 13 +- .../Jpeg/Components/Block8x8F.Vector256.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 12 +- .../Jpeg/Components/ZigZag.Intrinsic.cs | 135 ++++++++++-------- 4 files changed, 90 insertions(+), 72 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index 37332db62..8e0d526e5 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -3,7 +3,6 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -45,22 +44,20 @@ internal partial struct Block8x8F private static Vector128 NormalizeAndRoundVector128(Vector128 value, Vector128 off, Vector128 max) => Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128.Zero, max)); - private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static void MultiplyIntoInt16Vector128(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { - DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); + DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!"); ref Vector128 aBase = ref Unsafe.As>(ref a); ref Vector128 bBase = ref Unsafe.As>(ref b); - ref Vector128 destBase = ref Unsafe.As>(ref dest); - // TODO: We can use the v128 utilities for this. 
for (nuint i = 0; i < 16; i += 2) { - Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector128 left = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0)); + Vector128 right = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1)); - Unsafe.Add(ref destBase, i / 2) = Sse2.PackSignedSaturate(left, right); + Unsafe.Add(ref destBase, i / 2) = Vector128_.PackSignedSaturate(left, right); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs index a7d5c89b3..3aab547e0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -121,7 +121,7 @@ internal partial struct Block8x8F } } - private void TransposeInplace_Avx() + private void TransposeInPlace_Avx() { // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 Vector256 r0 = Avx.InsertVector128( diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index ec563897d..284c5bfe5 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -282,10 +282,10 @@ internal partial struct Block8x8F : IEquatable MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest); } - else if (Ssse3.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest); - ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest); + MultiplyIntoInt16Vector128(ref block, ref qt, ref dest); + ZigZag.ApplyTransposingZigZagOrderingVector128(ref dest); } else { @@ -387,7 +387,7 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void LoadFrom(ref Block8x8 source) { - if (SimdUtils.HasVector8) + if (Avx2.IsSupported) { this.LoadFromInt16ExtendedAvx2(ref source); return; @@ -483,6 +483,7 @@ internal partial struct Block8x8F : IEquatable /// Value to compare to. public bool EqualsToScalar(int value) { + // TODO: Can we provide a Vector128 implementation for this? if (Avx2.IsSupported) { const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); @@ -585,10 +586,11 @@ internal partial struct Block8x8F : IEquatable { if (Avx.IsSupported) { - this.TransposeInplace_Avx(); + this.TransposeInPlace_Avx(); } else { + // TODO: Can we provide a Vector128 implementation for this? this.TransposeInPlace_Scalar(); } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index f6239ad1e..941edb5c0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -1,6 +1,9 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
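The Block8x8F.cs hunk above settles on the dispatch shape the rest of the patch follows: the widest x86 intrinsic first, then the portable Vector128 path (which also lights up on Arm AdvSimd and WASM), then scalar. Because IsSupported and IsHardwareAccelerated are JIT-time constants, untaken branches are removed entirely. A compilable sketch of that shape (DispatchSketch is illustrative only):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class DispatchSketch
{
    // Only the first true branch survives JIT compilation; the rest are dead code.
    public static string Describe()
    {
        if (Avx2.IsSupported)
        {
            return "256-bit x86 path";
        }

        if (Vector128.IsHardwareAccelerated)
        {
            return "portable 128-bit path (SSE, AdvSimd, WASM)";
        }

        return "scalar fallback";
    }
}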
+using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -17,11 +20,11 @@ internal static partial class ZigZag #pragma warning restore SA1309 /// - /// Gets shuffle vectors for + /// Gets shuffle vectors for /// zig zag implementation. /// - private static ReadOnlySpan SseShuffleMasks => new byte[] - { + private static ReadOnlySpan SseShuffleMasks => + [ #pragma warning disable SA1515 /* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */ // A @@ -83,14 +86,14 @@ internal static partial class ZigZag // H _, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15, #pragma warning restore SA1515 - }; + ]; /// /// Gets shuffle vectors for /// zig zag implementation. /// - private static ReadOnlySpan AvxShuffleMasks => new byte[] - { + private static ReadOnlySpan AvxShuffleMasks => + [ #pragma warning disable SA1515 /* 01 */ // [cr] crln_01_AB_CD @@ -138,15 +141,15 @@ internal static partial class ZigZag // (in) GH _, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11, #pragma warning restore SA1515 - }; + ]; /// - /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics. + /// Applies zig zag ordering for given 8x8 matrix using cpu intrinsics. /// /// Input matrix. - public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block) + public static unsafe void ApplyTransposingZigZagOrderingVector128(ref Block8x8 block) { - DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); + DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!"); fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks)) { @@ -160,68 +163,68 @@ internal static partial class ZigZag Vector128 rowH = block.V7.AsByte(); // row0 - A0 B0 A1 A2 B1 C0 D0 C1 - Vector128 row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16(); - Vector128 row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16(); - Vector128 row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16(); - Vector128 row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C); - row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16(); + Vector128 row0_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 0))).AsInt16(); + Vector128 row0_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 1))).AsInt16(); + Vector128 row0_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 2))).AsInt16(); + Vector128 row0 = row0_A | row0_B | row0_C; + row0 = row0.AsUInt16().WithElement(6, rowD.AsUInt16().GetElement(0)).AsInt16(); // row1 - B2 A3 A4 B3 C2 D1 E0 F0 - Vector128 row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16(); - Vector128 row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16(); - Vector128 row1 = Sse2.Or(row1_A, row1_B); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16(); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16(); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16(); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16(); + Vector128 row1_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 3))).AsInt16(); + 
Vector128 row1_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 4))).AsInt16(); + Vector128 row1 = row1_A | row1_B; + row1 = row1.AsUInt16().WithElement(4, rowC.AsUInt16().GetElement(2)).AsInt16(); + row1 = row1.AsUInt16().WithElement(5, rowD.AsUInt16().GetElement(1)).AsInt16(); + row1 = row1.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(0)).AsInt16(); + row1 = row1.AsUInt16().WithElement(7, rowF.AsUInt16().GetElement(0)).AsInt16(); // row2 - E1 D2 C3 B4 A5 A6 B5 C4 - Vector128 row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16(); - Vector128 row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16(); - Vector128 row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16(); - Vector128 row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C); - row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16(); - row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16(); + Vector128 row2_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 5))).AsInt16(); + Vector128 row2_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 6))).AsInt16(); + Vector128 row2_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 7))).AsInt16(); + Vector128 row2 = row2_A | row2_B | row2_C; + row2 = row2.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(2)).AsInt16(); + row2 = row2.AsUInt16().WithElement(0, rowE.AsUInt16().GetElement(1)).AsInt16(); // row3 - D3 E2 F1 G0 H0 G1 F2 E3 - Vector128 row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16(); - Vector128 row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16(); - Vector128 row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16(); - Vector128 row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G); - row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16(); - row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16(); + Vector128 row3_E = ZShuffle(rowE, Vector128.Load(shuffleVectorsPtr + (16 * 8))).AsInt16(); + Vector128 row3_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 9))).AsInt16(); + Vector128 row3_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 10))).AsInt16(); + Vector128 row3 = row3_E | row3_F | row3_G; + row3 = row3.AsUInt16().WithElement(0, rowD.AsUInt16().GetElement(3)).AsInt16(); + row3 = row3.AsUInt16().WithElement(4, rowH.AsUInt16().GetElement(0)).AsInt16(); // row4 - D4 C5 B6 A7 B7 C6 D5 E4 - Vector128 row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16(); - Vector128 row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16(); - Vector128 row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16(); - Vector128 row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D); - row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16(); - row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16(); + Vector128 row4_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 11))).AsInt16(); + Vector128 row4_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 12))).AsInt16(); + Vector128 row4_D = ZShuffle(rowD, Vector128.Load(shuffleVectorsPtr + (16 * 13))).AsInt16(); + Vector128 row4 = row4_B | row4_C | row4_D; + row4 = row4.AsUInt16().WithElement(3, 
rowA.AsUInt16().GetElement(7)).AsInt16(); + row4 = row4.AsUInt16().WithElement(7, rowE.AsUInt16().GetElement(4)).AsInt16(); // row5 - F3 G2 H1 H2 G3 F4 E5 D6 - Vector128 row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16(); - Vector128 row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16(); - Vector128 row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16(); - Vector128 row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H); - row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16(); - row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16(); + Vector128 row5_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 14))).AsInt16(); + Vector128 row5_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 15))).AsInt16(); + Vector128 row5_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 16))).AsInt16(); + Vector128 row5 = row5_F | row5_G | row5_H; + row5 = row5.AsUInt16().WithElement(7, rowD.AsUInt16().GetElement(6)).AsInt16(); + row5 = row5.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(5)).AsInt16(); // row6 - C7 D7 E6 F5 G4 H3 H4 G5 - Vector128 row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16(); - Vector128 row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16(); - Vector128 row6 = Sse2.Or(row6_G, row6_H); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16(); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16(); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16(); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16(); + Vector128 row6_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 17))).AsInt16(); + Vector128 row6_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 18))).AsInt16(); + Vector128 row6 = row6_G | row6_H; + row6 = row6.AsUInt16().WithElement(0, rowC.AsUInt16().GetElement(7)).AsInt16(); + row6 = row6.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(7)).AsInt16(); + row6 = row6.AsUInt16().WithElement(2, rowE.AsUInt16().GetElement(6)).AsInt16(); + row6 = row6.AsUInt16().WithElement(3, rowF.AsUInt16().GetElement(5)).AsInt16(); // row7 - F6 E7 F7 G6 H5 H6 G7 H7 - Vector128 row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16(); - Vector128 row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16(); - Vector128 row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16(); - Vector128 row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H); - row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16(); + Vector128 row7_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 19))).AsInt16(); + Vector128 row7_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 20))).AsInt16(); + Vector128 row7_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 21))).AsInt16(); + Vector128 row7 = row7_F | row7_G | row7_H; + row7 = row7.AsUInt16().WithElement(1, rowE.AsUInt16().GetElement(7)).AsInt16(); block.V0 = row0; block.V1 = row1; @@ -300,4 +303,20 @@ internal static partial class ZigZag block.V67 = row67.AsInt16(); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 ZShuffle(Vector128 source, Vector128 mask) + { + // For x64 we use the SSSE3 
shuffle intrinsic to avoid additional instructions. 3 vs 1. + if (Ssse3.IsSupported) + { + return Ssse3.Shuffle(source, mask); + } + + // For ARM and WASM, codegen will be optimal. + return Vector128.Shuffle(source, mask); + } + + [DoesNotReturn] + private static void ThrowUnreachableException() => throw new UnreachableException(); } From 041e59dbce345157af491dda7afd03c7a60016bf Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 16:20:01 +1000 Subject: [PATCH 05/12] All Vector128 Load --- .../Jpeg/Components/Block8x8F.Vector128.cs | 30 ++++++++++++++++ .../Formats/Jpeg/Components/Block8x8F.cs | 5 +++ .../Formats/Jpg/Block8x8FTests.cs | 35 +++++++++++++++---- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index 8e0d526e5..ffd405714 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -40,6 +40,36 @@ internal partial struct Block8x8F this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); } + /// + /// Loads values from using extended AVX2 intrinsics. + /// + /// The source + public void LoadFromInt16ExtendedVector128(ref Block8x8 source) + { + DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!"); + + ref Vector128 srcBase = ref Unsafe.As>(ref source); + ref Vector128 destBase = ref Unsafe.As>(ref this); + + // Only 8 iterations, one per 128b short block + for (nuint i = 0; i < 8; i++) + { + Vector128 src = Unsafe.Add(ref srcBase, i); + + // Step 1: Widen short -> int + Vector128 lower = Vector128.WidenLower(src); // lower 4 shorts -> 4 ints + Vector128 upper = Vector128.WidenUpper(src); // upper 4 shorts -> 4 ints + + // Step 2: Convert int -> float + Vector128 lowerF = Vector128.ConvertToSingle(lower); + Vector128 upperF = Vector128.ConvertToSingle(upper); + + // Step 3: Store to destination (this is 16 lanes -> two Vector128 blocks) + Unsafe.Add(ref destBase, (i * 2) + 0) = lowerF; + Unsafe.Add(ref destBase, (i * 2) + 1) = upperF; + } + } + [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 NormalizeAndRoundVector128(Vector128 value, Vector128 off, Vector128 max) => Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128.Zero, max)); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 284c5bfe5..f7ef44384 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -392,6 +392,11 @@ internal partial struct Block8x8F : IEquatable this.LoadFromInt16ExtendedAvx2(ref source); return; } + else if (Vector128.IsHardwareAccelerated) + { + this.LoadFromInt16ExtendedVector128(ref source); + return; + } this.LoadFromInt16Scalar(ref source); } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 7b73c0c52..1c5d15dc2 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -55,7 +55,7 @@ public partial class Block8x8FTests : JpegFixture Times, () => { - var block = default(Block8x8F); + Block8x8F block = default; for (int i = 0; i < Block8x8F.Size; i++) { @@ -68,7 +68,7 @@ public partial class Block8x8FTests : JpegFixture sum += block[i]; } }); - 
Assert.Equal(sum, 64f * 63f * 0.5f); + Assert.Equal(64f * 63f * 0.5f, sum); } [Fact] @@ -93,7 +93,7 @@ public partial class Block8x8FTests : JpegFixture sum += block[i]; } }); - Assert.Equal(sum, 64f * 63f * 0.5f); + Assert.Equal(64f * 63f * 0.5f, sum); } [Fact] @@ -121,7 +121,7 @@ public partial class Block8x8FTests : JpegFixture } [Fact] - public void TransposeInplace() + public void TransposeInPlace() { static void RunTest() { @@ -276,7 +276,7 @@ public partial class Block8x8FTests : JpegFixture float[] data = Create8x8RandomFloatData(-1000, 1000); Block8x8F source = Block8x8F.Load(data); - var dest = default(Block8x8); + Block8x8 dest = default; source.RoundInto(ref dest); @@ -388,7 +388,7 @@ public partial class Block8x8FTests : JpegFixture short[] data = Create8x8ShortData(); - var source = Block8x8.Load(data); + Block8x8 source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16Scalar(ref source); @@ -399,6 +399,27 @@ public partial class Block8x8FTests : JpegFixture } } + [Fact] + public void LoadFromUInt16ExtendedVector128() + { + if (this.SkipOnNonVector128Runner()) + { + return; + } + + short[] data = Create8x8ShortData(); + + Block8x8 source = Block8x8.Load(data); + + Block8x8F dest = default; + dest.LoadFromInt16ExtendedVector128(ref source); + + for (int i = 0; i < Block8x8F.Size; i++) + { + Assert.Equal(data[i], dest[i]); + } + } + [Fact] public void LoadFromUInt16ExtendedAvx2() { @@ -409,7 +430,7 @@ public partial class Block8x8FTests : JpegFixture short[] data = Create8x8ShortData(); - var source = Block8x8.Load(data); + Block8x8 source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16ExtendedAvx2(ref source); From 038f047a1c6839383acb47ed1f4b8f242d53812b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 8 May 2025 15:39:16 +1000 Subject: [PATCH 06/12] Initial fixes based on feedback --- .../Jpeg/Components/Block8x8F.Vector128.cs | 3 ++- .../Jpeg/Components/Block8x8F.Vector256.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 24 ++++++++++++++----- .../Formats/Jpg/Block8x8FTests.cs | 2 +- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index ffd405714..3daa47693 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
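// The change below computes the +maximum/2 level-shift offset on the vector unit with
// Vector128.Ceiling rather than scalar MathF.Ceiling. For the usual 8-bit maximum the two agree;
// a small check (illustration only, assuming maximum == 255):
//
//   Vector128<float> max = Vector128.Create(255F);
//   Vector128<float> off = Vector128.Ceiling(max * 0.5F); // every lane is 128F, matching MathF.Ceiling(255F * 0.5F)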
+using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Common.Helpers; @@ -19,8 +20,8 @@ internal partial struct Block8x8F [MethodImpl(InliningOptions.ShortMethod)] public void NormalizeColorsAndRoundInPlaceVector128(float maximum) { - Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); Vector128 max = Vector128.Create(maximum); + Vector128 off = Vector128.Ceiling(max * .5F); this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs index 3aab547e0..4e4133496 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -46,8 +46,8 @@ internal partial struct Block8x8F [MethodImpl(InliningOptions.ShortMethod)] public void NormalizeColorsAndRoundInPlaceVector256(float maximum) { - Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); Vector256 max = Vector256.Create(maximum); + Vector256 off = Vector256.Ceiling(max * .5F); this.V256_0 = NormalizeAndRoundVector256(this.V256_0, off, max); this.V256_1 = NormalizeAndRoundVector256(this.V256_1, off, max); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index f7ef44384..6f9b4fd16 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -488,18 +488,30 @@ internal partial struct Block8x8F : IEquatable /// Value to compare to. public bool EqualsToScalar(int value) { - // TODO: Can we provide a Vector128 implementation for this? - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - Vector256 targetVector = Vector256.Create(value); ref Vector256 blockStride = ref this.V256_0; for (nuint i = 0; i < RowCount; i++) { - Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V256_0, i)), targetVector); - if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) + if (!Vector256.EqualsAll(Vector256.ConvertToInt32(Unsafe.Add(ref this.V256_0, i)), targetVector)) + { + return false; + } + } + + return true; + } + + if (Vector128.IsHardwareAccelerated) + { + Vector128 targetVector = Vector128.Create(value); + ref Vector4 blockStride = ref this.V0L; + + for (nuint i = 0; i < RowCount * 2; i++) + { + if (!Vector128.EqualsAll(Vector128.ConvertToInt32(Unsafe.Add(ref this.V0L, i).AsVector128()), targetVector)) { return false; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 1c5d15dc2..d1ade761c 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -462,7 +462,7 @@ public partial class Block8x8FTests : JpegFixture // 3. 
DisableAvx2 - call fallback code of float implementation FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } [Theory] From 8a23d42bfdd6a1aaa68dc64870458e2514573ce5 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 8 May 2025 20:56:50 +1000 Subject: [PATCH 07/12] Port more V256 code --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 124 +++++++++--------- .../Common/Helpers/Vector128Utilities.cs | 15 ++- .../Common/Helpers/Vector256Utilities.cs | 58 +++++--- .../Common/Helpers/Vector512Utilities.cs | 17 +-- .../Jpeg/Components/Block8x8F.Vector256.cs | 112 ++++++---------- .../Formats/Jpeg/Components/Block8x8F.cs | 12 +- .../Block8x8F_LoadFromInt16.cs | 2 +- .../Formats/Jpg/Block8x8FTests.cs | 2 +- 8 files changed, 164 insertions(+), 178 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 449dc37d0..8533b2151 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -66,9 +66,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat)) + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) || + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -112,9 +112,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) || + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte)) + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) { int remainder = source.Length % (Vector128.Count * 3); @@ -190,7 +190,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { int remainder = source.Length % (Vector128.Count * 3); @@ -223,7 +223,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { int remainder = source.Length & ((Vector128.Count * 
4) - 1); // bit-hack for modulo @@ -249,7 +249,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) { ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -263,21 +263,21 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512_.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector512_.ShuffleNative(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control); } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) { ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -291,21 +291,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256_.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector256_.ShuffleNative(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control); } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat) + else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat) { ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -319,17 +319,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - 
vd0 = Vector128_.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector128_.ShuffleNative(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control); } } } @@ -341,7 +341,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) { Span temp = stackalloc byte[Vector512.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -359,17 +359,17 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512_.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector512_.ShuffleNative(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); } } } @@ -391,21 +391,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256_.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector256_.ShuffleNative(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte) + else if (Vector128.IsHardwareAccelerated 
&& Vector128_.SupportsShuffleNativeByte) { Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -423,17 +423,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector128_.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector128_.ShuffleNative(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); } } } @@ -445,7 +445,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); @@ -472,15 +472,15 @@ internal static partial class SimdUtils v2 = Vector128_.AlignRight(v2, v1, 8); v1 = Vector128_.AlignRight(v1, v0, 12); - v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16), mask); - v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16), mask); - v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16), mask); - v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16), mask); + v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16), mask); + v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16), mask); + v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16), mask); + v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16), mask); - v0 = Vector128_.Shuffle(v0, maskE); - v1 = Vector128_.Shuffle(v1, maskSlice4Nx16); - v2 = Vector128_.Shuffle(v2, maskE); - v3 = Vector128_.Shuffle(v3, maskSlice4Nx16); + v0 = Vector128_.ShuffleNative(v0, maskE); + v1 = Vector128_.ShuffleNative(v1, maskSlice4Nx16); + v2 = Vector128_.ShuffleNative(v2, maskE); + v3 = Vector128_.ShuffleNative(v3, maskSlice4Nx16); v0 = Vector128_.AlignRight(v1, v0, 4); v3 = Vector128_.AlignRight(v3, v2, 12); @@ -505,7 +505,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -534,10 +534,10 @@ internal static partial class SimdUtils ref Vector128 vd = ref Unsafe.Add(ref destinationBase, j); - vd = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 1) = Vector128_.Shuffle(Vector128_.Shuffle(v1, 
maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 2) = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 3) = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16) | fill, mask); + vd = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 1) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 2) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 3) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16) | fill, mask); } } } @@ -548,7 +548,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); @@ -574,10 +574,10 @@ internal static partial class SimdUtils Vector128 v2 = Unsafe.Add(ref vs, 2); Vector128 v3 = Unsafe.Add(ref vs, 3); - v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, mask), maskE); - v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, mask), maskSlice4Nx16); - v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, mask), maskE); - v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, mask), maskSlice4Nx16); + v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, mask), maskE); + v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, mask), maskSlice4Nx16); + v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, mask), maskE); + v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, mask), maskSlice4Nx16); v0 = Vector128_.AlignRight(v1, v0, 4); v3 = Vector128_.AlignRight(v3, v2, 12); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 85b09b351..3471acbd3 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -26,7 +26,7 @@ internal static class Vector128_ /// /// Gets a value indicating whether shuffle operations are supported. /// - public static bool SupportsShuffleFloat + public static bool SupportsShuffleNativeFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Sse.IsSupported; @@ -35,10 +35,10 @@ internal static class Vector128_ /// /// Gets a value indicating whether shuffle operations are supported. /// - public static bool SupportsShuffleByte + public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported; + get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported; } /// @@ -66,7 +66,7 @@ internal static class Vector128_ /// The shuffle control byte. /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 Shuffle(Vector128 vector, [ConstantExpected] byte control) + public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpected] byte control) { if (Sse.IsSupported) { @@ -89,7 +89,7 @@ internal static class Vector128_ /// A new vector containing the values from selected by the given . 
/// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 Shuffle(Vector128 vector, Vector128 indices) + public static Vector128 ShuffleNative(Vector128 vector, Vector128 indices) { if (Ssse3.IsSupported) { @@ -101,6 +101,11 @@ internal static class Vector128_ return AdvSimd.Arm64.VectorTableLookup(vector, indices); } + if (PackedSimd.IsSupported) + { + return PackedSimd.Swizzle(vector, indices); + } + ThrowUnreachableException(); return default; } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 893b6240d..8b22a5137 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -24,10 +24,10 @@ internal static class Vector256_ /// /// Gets a value indicating whether shuffle byte operations are supported. /// - public static bool SupportsShuffleFloat + public static bool SupportsShuffleNativeFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx.IsSupported || Sse.IsSupported; + get => Avx.IsSupported; } /// @@ -46,20 +46,13 @@ internal static class Vector256_ /// The shuffle control byte. /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 Shuffle(Vector256 vector, [ConstantExpected] byte control) + public static Vector256 ShuffleNative(Vector256 vector, [ConstantExpected] byte control) { if (Avx.IsSupported) { return Avx.Shuffle(vector, vector, control); } - if (Sse.IsSupported) - { - Vector128 lower = vector.GetLower(); - Vector128 upper = vector.GetUpper(); - return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control)); - } - ThrowUnreachableException(); return default; } @@ -73,7 +66,7 @@ internal static class Vector256_ /// /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 Shuffle(Vector256 vector, Vector256 indices) + public static Vector256 ShuffleNative(Vector256 vector, Vector256 indices) { if (Avx2.IsSupported) { @@ -98,13 +91,6 @@ internal static class Vector256_ return Avx.ConvertToVector256Int32(vector); } - if (Sse2.IsSupported) - { - Vector128 lower = Sse2.ConvertToVector128Int32(vector.GetLower()); - Vector128 upper = Sse2.ConvertToVector128Int32(vector.GetUpper()); - return Vector256.Create(lower, upper); - } - Vector256 sign = vector & Vector256.Create(-0F); Vector256 val_2p23_f32 = sign | Vector256.Create(8388608F); @@ -154,6 +140,27 @@ internal static class Vector256_ return va + (vm0 * vm1); } + /// + /// Packs signed 32-bit integers to signed 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PackSignedSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.PackSignedSaturate(left, right); + } + + Vector256 min = Vector256.Create((int)short.MinValue); + Vector256 max = Vector256.Create((int)short.MaxValue); + Vector256 lefClamped = Clamp(left, min, max); + Vector256 rightClamped = Clamp(right, min, max); + return Vector256.Narrow(lefClamped, rightClamped); + } + /// /// Restricts a vector between a minimum and a maximum value. /// @@ -166,6 +173,21 @@ internal static class Vector256_ public static Vector256 Clamp(Vector256 value, Vector256 min, Vector256 max) => Vector256.Min(Vector256.Max(value, min), max); + /// + /// Widens a to a . + /// + /// The vector to widen. + /// The widened . 
+ public static Vector256 Widen(Vector128 value) + { + if (Avx2.IsSupported) + { + return Avx2.ConvertToVector256Int32(value); + } + + return Vector256.WidenLower(value.ToVector256()); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 3c773bc52..63de5dc10 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -24,16 +24,16 @@ internal static class Vector512_ /// /// Gets a value indicating whether shuffle float operations are supported. /// - public static bool SupportsShuffleFloat + public static bool SupportsShuffleNativeFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx512F.IsSupported || Avx.IsSupported; + get => Avx512F.IsSupported; } /// /// Gets a value indicating whether shuffle byte operations are supported. /// - public static bool SupportsShuffleByte + public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Avx512BW.IsSupported; @@ -46,20 +46,13 @@ internal static class Vector512_ /// The shuffle control byte. /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector512 Shuffle(Vector512 vector, [ConstantExpected] byte control) + public static Vector512 ShuffleNative(Vector512 vector, [ConstantExpected] byte control) { if (Avx512F.IsSupported) { return Avx512F.Shuffle(vector, vector, control); } - if (Avx.IsSupported) - { - Vector256 lower = vector.GetLower(); - Vector256 upper = vector.GetUpper(); - return Vector512.Create(Avx.Shuffle(lower, lower, control), Avx.Shuffle(upper, upper, control)); - } - ThrowUnreachableException(); return default; } @@ -73,7 +66,7 @@ internal static class Vector512_ /// /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector512 Shuffle(Vector512 vector, Vector512 indices) + public static Vector512 ShuffleNative(Vector512 vector, Vector512 indices) { if (Avx512BW.IsSupported) { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs index 4e4133496..2aaf5c943 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; @@ -60,109 +59,76 @@ internal partial struct Block8x8F } /// - /// Loads values from using extended AVX2 intrinsics. + /// Loads values from using intrinsics. 
/// /// The source - public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) + public void LoadFromInt16ExtendedVector256(ref Block8x8 source) { DebugGuard.IsTrue( - Avx2.IsSupported, - "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!"); + Vector256.IsHardwareAccelerated, + "LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!"); ref short sRef = ref Unsafe.As(ref source); ref Vector256 dRef = ref Unsafe.As>(ref this); - // Vector256.Count == 16 on AVX2 + // Vector256.Count == 16 // We can process 2 block rows in a single step - Vector256 top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef)); - Vector256 bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); - dRef = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); - Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); - Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); - Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom); + Vector256 top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef)); + Vector256 bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); + dRef = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 1) = Vector256.ConvertToSingle(bottom); + + top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); + bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); + Unsafe.Add(ref dRef, 2) = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 3) = Vector256.ConvertToSingle(bottom); + + top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); + bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); + Unsafe.Add(ref dRef, 4) = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 5) = Vector256.ConvertToSingle(bottom); + + top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); + bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); + Unsafe.Add(ref dRef, 6) = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 7) = Vector256.ConvertToSingle(bottom); } [MethodImpl(InliningOptions.ShortMethod)] private static Vector256 NormalizeAndRoundVector256(Vector256 value, Vector256 off, Vector256 max) => Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256.Zero, max)); - private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static unsafe void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { - DebugGuard.IsTrue(Avx2.IsSupported, 
"Avx2 support is required to run this operation!"); + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!"); ref Vector256 aBase = ref a.V256_0; ref Vector256 bBase = ref b.V256_0; - ref Vector256 destRef = ref dest.V01; - Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); for (nuint i = 0; i < 8; i += 2) { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector256 row0 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0)); + Vector256 row1 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1)); - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); + Vector256 row = Vector256_.PackSignedSaturate(row0, row1); + row = Vector256.Shuffle(row.AsInt32(), Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).AsInt16(); Unsafe.Add(ref destRef, i / 2) = row; } } - private void TransposeInPlace_Avx() + private void TransposeInPlaceVector256() { // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - this.V256_0, - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - this.V256_1, - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - this.V256_2, - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - this.V256_3, - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); + Vector256 r0 = this.V256_0.WithUpper(this.V4L.AsVector128()); + Vector256 r1 = this.V256_1.WithUpper(this.V5L.AsVector128()); + Vector256 r2 = this.V256_2.WithUpper(this.V6L.AsVector128()); + Vector256 r3 = this.V256_3.WithUpper(this.V7L.AsVector128()); + Vector256 r4 = this.V0R.AsVector128().ToVector256().WithUpper(this.V4R.AsVector128()); + Vector256 r5 = this.V1R.AsVector128().ToVector256().WithUpper(this.V5R.AsVector128()); + Vector256 r6 = this.V2R.AsVector128().ToVector256().WithUpper(this.V6R.AsVector128()); + Vector256 r7 = this.V3R.AsVector128().ToVector256().WithUpper(this.V7R.AsVector128()); Vector256 t0 = Avx.UnpackLow(r0, r1); Vector256 t2 = Avx.UnpackLow(r2, r3); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 6f9b4fd16..a4a7d3ed0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -277,9 +277,9 @@ internal partial struct Block8x8F : IEquatable /// The quantization table. 
public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); + MultiplyIntoInt16Vector256(ref block, ref qt, ref dest); ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest); } else if (Vector128.IsHardwareAccelerated) @@ -387,9 +387,9 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void LoadFrom(ref Block8x8 source) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - this.LoadFromInt16ExtendedAvx2(ref source); + this.LoadFromInt16ExtendedVector256(ref source); return; } else if (Vector128.IsHardwareAccelerated) @@ -601,9 +601,9 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void TransposeInPlace() { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - this.TransposeInPlace_Avx(); + this.TransposeInPlaceVector256(); } else { diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs index 7a8502c2c..25b5e973e 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs @@ -32,7 +32,7 @@ public class Block8x8F_LoadFromInt16 public void Scalar() => this.destination.LoadFromInt16Scalar(ref this.source); [Benchmark] - public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedAvx2(ref this.source); + public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedVector256(ref this.source); // RESULT: // Method | Mean | Error | StdDev | Scaled | diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index d1ade761c..ab205c8a3 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -433,7 +433,7 @@ public partial class Block8x8FTests : JpegFixture Block8x8 source = Block8x8.Load(data); Block8x8F dest = default; - dest.LoadFromInt16ExtendedAvx2(ref source); + dest.LoadFromInt16ExtendedVector256(ref source); for (int i = 0; i < Block8x8F.Size; i++) { From 6238f00895c8440624a8be7476eb0d0be01e38fd Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 8 May 2025 21:31:29 +1000 Subject: [PATCH 08/12] Modernize additional V256 code from review --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 23 --- .../Common/Helpers/Vector256Utilities.cs | 22 +++ .../Formats/Jpeg/Components/Block8x8.cs | 4 +- .../Components/FloatingPointDCT.Intrinsic.cs | 142 ------------------ .../Components/FloatingPointDCT.Vector256.cs | 142 ++++++++++++++++++ .../Jpeg/Components/FloatingPointDCT.cs | 14 +- .../Formats/Jpg/Block8x8Tests.cs | 2 +- .../Jpg/Utils/LibJpegTools.ComponentData.cs | 2 +- 8 files changed, 175 insertions(+), 176 deletions(-) delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs create mode 100644 src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 8533b2151..e155e4536 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -619,29 +619,6 @@ internal static partial class 
SimdUtils return va + (vm0 * vm1); } - /// - /// Performs a multiplication and a subtraction of the . - /// TODO: Fix. The arguments are in a different order to the FMA intrinsic. - /// - /// ret = (vm0 * vm1) - vs - /// The vector to subtract from the intermediate result. - /// The first vector to multiply. - /// The second vector to multiply. - /// The . - [MethodImpl(InliningOptions.ShortMethod)] - public static Vector256 MultiplySubtract( - Vector256 vs, - Vector256 vm0, - Vector256 vm1) - { - if (Fma.IsSupported) - { - return Fma.MultiplySubtract(vm1, vm0, vs); - } - - return Avx.Subtract(Avx.Multiply(vm0, vm1), vs); - } - /// /// Performs a multiplication and a negated addition of the . /// diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 8b22a5137..c835d267d 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -140,6 +140,28 @@ internal static class Vector256_ return va + (vm0 * vm1); } + /// + /// Performs a multiplication and a subtraction of the . + /// + /// ret = (vm0 * vm1) - vs + /// The vector to subtract from the intermediate result. + /// The first vector to multiply. + /// The second vector to multiply. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplySubtract( + Vector256 vs, + Vector256 vm0, + Vector256 vm1) + { + if (Fma.IsSupported) + { + return Fma.MultiplySubtract(vm1, vm0, vs); + } + + return (vm0 * vm1) - vs; + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 01d112bd6..731ad0f76 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -211,10 +211,10 @@ internal partial struct Block8x8 } /// - /// Transpose the block inplace. + /// Transpose the block in place. /// [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInplace() + public void TransposeInPlace() { ref short elemRef = ref Unsafe.As(ref this); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs deleted file mode 100644 index 862c77469..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal static partial class FloatingPointDCT -{ - /// - /// Apply floating point FDCT inplace using simd operations. - /// - /// Input block. 
- private static void FDCT8x8_Avx(ref Block8x8F block) - { - DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - - // First pass - process columns - FDCT8x8_1D_Avx(ref block); - - // Second pass - process rows - block.TransposeInPlace(); - FDCT8x8_1D_Avx(ref block); - - // Applies 1D floating point FDCT inplace - static void FDCT8x8_1D_Avx(ref Block8x8F block) - { - Vector256 tmp0 = Avx.Add(block.V256_0, block.V256_7); - Vector256 tmp7 = Avx.Subtract(block.V256_0, block.V256_7); - Vector256 tmp1 = Avx.Add(block.V256_1, block.V256_6); - Vector256 tmp6 = Avx.Subtract(block.V256_1, block.V256_6); - Vector256 tmp2 = Avx.Add(block.V256_2, block.V256_5); - Vector256 tmp5 = Avx.Subtract(block.V256_2, block.V256_5); - Vector256 tmp3 = Avx.Add(block.V256_3, block.V256_4); - Vector256 tmp4 = Avx.Subtract(block.V256_3, block.V256_4); - - // Even part - Vector256 tmp10 = Avx.Add(tmp0, tmp3); - Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); - Vector256 tmp11 = Avx.Add(tmp1, tmp2); - Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); - - block.V256_0 = Avx.Add(tmp10, tmp11); - block.V256_4 = Avx.Subtract(tmp10, tmp11); - - var mm256_F_0_7071 = Vector256.Create(0.707106781f); - Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); - block.V256_2 = Avx.Add(tmp13, z1); - block.V256_6 = Avx.Subtract(tmp13, z1); - - // Odd part - tmp10 = Avx.Add(tmp4, tmp5); - tmp11 = Avx.Add(tmp5, tmp6); - tmp12 = Avx.Add(tmp6, tmp7); - - Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826 - Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411 - Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065 - Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); - - Vector256 z11 = Avx.Add(tmp7, z3); - Vector256 z13 = Avx.Subtract(tmp7, z3); - - block.V256_5 = Avx.Add(z13, z2); - block.V256_3 = Avx.Subtract(z13, z2); - block.V256_1 = Avx.Add(z11, z4); - block.V256_7 = Avx.Subtract(z11, z4); - } - } - - /// - /// Apply floating point IDCT inplace using simd operations. - /// - /// Transposed input block. 
- private static void IDCT8x8_Avx(ref Block8x8F transposedBlock) - { - DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - - // First pass - process columns - IDCT8x8_1D_Avx(ref transposedBlock); - - // Second pass - process rows - transposedBlock.TransposeInPlace(); - IDCT8x8_1D_Avx(ref transposedBlock); - - // Applies 1D floating point FDCT inplace - static void IDCT8x8_1D_Avx(ref Block8x8F block) - { - // Even part - Vector256 tmp0 = block.V256_0; - Vector256 tmp1 = block.V256_2; - Vector256 tmp2 = block.V256_4; - Vector256 tmp3 = block.V256_6; - - Vector256 z5 = tmp0; - Vector256 tmp10 = Avx.Add(z5, tmp2); - Vector256 tmp11 = Avx.Subtract(z5, tmp2); - - var mm256_F_1_4142 = Vector256.Create(1.414213562f); - Vector256 tmp13 = Avx.Add(tmp1, tmp3); - Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142); - - tmp0 = Avx.Add(tmp10, tmp13); - tmp3 = Avx.Subtract(tmp10, tmp13); - tmp1 = Avx.Add(tmp11, tmp12); - tmp2 = Avx.Subtract(tmp11, tmp12); - - // Odd part - Vector256 tmp4 = block.V256_1; - Vector256 tmp5 = block.V256_3; - Vector256 tmp6 = block.V256_5; - Vector256 tmp7 = block.V256_7; - - Vector256 z13 = Avx.Add(tmp6, tmp5); - Vector256 z10 = Avx.Subtract(tmp6, tmp5); - Vector256 z11 = Avx.Add(tmp4, tmp7); - Vector256 z12 = Avx.Subtract(tmp4, tmp7); - - tmp7 = Avx.Add(z11, z13); - tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142); - - z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477 - - tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823 - tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131 - - tmp6 = Avx.Subtract(tmp12, tmp7); - tmp5 = Avx.Subtract(tmp11, tmp6); - tmp4 = Avx.Subtract(tmp10, tmp5); - - block.V256_0 = Avx.Add(tmp0, tmp7); - block.V256_7 = Avx.Subtract(tmp0, tmp7); - block.V256_1 = Avx.Add(tmp1, tmp6); - block.V256_6 = Avx.Subtract(tmp1, tmp6); - block.V256_2 = Avx.Add(tmp2, tmp5); - block.V256_5 = Avx.Subtract(tmp2, tmp5); - block.V256_3 = Avx.Add(tmp3, tmp4); - block.V256_4 = Avx.Subtract(tmp3, tmp4); - } - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs new file mode 100644 index 000000000..bcd8c7043 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs @@ -0,0 +1,142 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Common.Helpers; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal static partial class FloatingPointDCT +{ + /// + /// Apply floating point FDCT in place using simd operations. + /// + /// Input block. 
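+    /// <remarks>
+    /// The factorization and constants used below (0.707106781, 0.541196100, 1.306562965,
+    /// 0.382683433) follow the scaled AAN (Arai, Agui, Nakajima) float FDCT, as in
+    /// libjpeg's jfdctflt.c; only the operations are expressed with portable Vector256 operators.
+    /// </remarks>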
+ private static void FDCT8x8_Vector256(ref Block8x8F block) + { + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); + + // First pass - process columns + FDCT8x8_1D_Vector256(ref block); + + // Second pass - process rows + block.TransposeInPlace(); + FDCT8x8_1D_Vector256(ref block); + + // Applies 1D floating point FDCT in place + static void FDCT8x8_1D_Vector256(ref Block8x8F block) + { + Vector256 tmp0 = block.V256_0 + block.V256_7; + Vector256 tmp7 = block.V256_0 - block.V256_7; + Vector256 tmp1 = block.V256_1 + block.V256_6; + Vector256 tmp6 = block.V256_1 - block.V256_6; + Vector256 tmp2 = block.V256_2 + block.V256_5; + Vector256 tmp5 = block.V256_2 - block.V256_5; + Vector256 tmp3 = block.V256_3 + block.V256_4; + Vector256 tmp4 = block.V256_3 - block.V256_4; + + // Even part + Vector256 tmp10 = tmp0 + tmp3; + Vector256 tmp13 = tmp0 - tmp3; + Vector256 tmp11 = tmp1 + tmp2; + Vector256 tmp12 = tmp1 - tmp2; + + block.V256_0 = tmp10 + tmp11; + block.V256_4 = tmp10 - tmp11; + + Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + Vector256 z1 = (tmp12 + tmp13) * mm256_F_0_7071; + block.V256_2 = tmp13 + z1; + block.V256_6 = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + Vector256 z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f); // mm256_F_0_3826 + Vector256 z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411 + Vector256 z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065 + Vector256 z3 = tmp11 * mm256_F_0_7071; + + Vector256 z11 = tmp7 + z3; + Vector256 z13 = tmp7 - z3; + + block.V256_5 = z13 + z2; + block.V256_3 = z13 - z2; + block.V256_1 = z11 + z4; + block.V256_7 = z11 - z4; + } + } + + /// + /// Apply floating point IDCT in place using simd operations. + /// + /// Transposed input block. 
+ private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock) + { + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); + + // First pass - process columns + IDCT8x8_1D_Vector256(ref transposedBlock); + + // Second pass - process rows + transposedBlock.TransposeInPlace(); + IDCT8x8_1D_Vector256(ref transposedBlock); + + // Applies 1D floating point FDCT in place + static void IDCT8x8_1D_Vector256(ref Block8x8F block) + { + // Even part + Vector256 tmp0 = block.V256_0; + Vector256 tmp1 = block.V256_2; + Vector256 tmp2 = block.V256_4; + Vector256 tmp3 = block.V256_6; + + Vector256 z5 = tmp0; + Vector256 tmp10 = z5 + tmp2; + Vector256 tmp11 = z5 - tmp2; + + Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f); + Vector256 tmp13 = tmp1 + tmp3; + Vector256 tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142); + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd part + Vector256 tmp4 = block.V256_1; + Vector256 tmp5 = block.V256_3; + Vector256 tmp6 = block.V256_5; + Vector256 tmp7 = block.V256_7; + + Vector256 z13 = tmp6 + tmp5; + Vector256 z10 = tmp6 - tmp5; + Vector256 z11 = tmp4 + tmp7; + Vector256 z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; + tmp11 = (z11 - z13) * mm256_F_1_4142; + + z5 = (z10 + z12) * Vector256.Create(1.847759065f); // mm256_F_1_8477 + + tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823 + tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131 + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + block.V256_0 = tmp0 + tmp7; + block.V256_7 = tmp0 - tmp7; + block.V256_1 = tmp1 + tmp6; + block.V256_6 = tmp1 - tmp6; + block.V256_2 = tmp2 + tmp5; + block.V256_5 = tmp2 - tmp5; + block.V256_3 = tmp3 + tmp4; + block.V256_4 = tmp3 - tmp4; + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs index 4c22307cf..8122d8daa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs @@ -4,7 +4,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -101,7 +101,7 @@ internal static partial class FloatingPointDCT } /// - /// Apply 2D floating point IDCT inplace. + /// Apply 2D floating point IDCT in place. /// /// /// Input block must be dequantized with quantization table @@ -110,9 +110,9 @@ internal static partial class FloatingPointDCT /// Input block. public static void TransformIDCT(ref Block8x8F block) { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - IDCT8x8_Avx(ref block); + IDCT8x8_Vector256(ref block); } else { @@ -121,7 +121,7 @@ internal static partial class FloatingPointDCT } /// - /// Apply 2D floating point IDCT inplace. + /// Apply 2D floating point IDCT in place. /// /// /// Input block must be quantized after this method with quantization @@ -130,9 +130,9 @@ internal static partial class FloatingPointDCT /// Input block. 
public static void TransformFDCT(ref Block8x8F block) { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - FDCT8x8_Avx(ref block); + FDCT8x8_Vector256(ref block); } else { diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index b5d364dd3..cb8f52a96 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -271,7 +271,7 @@ public class Block8x8Tests : JpegFixture Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData()); - block8x8.TransposeInplace(); + block8x8.TransposeInPlace(); short[] actual = new short[64]; block8x8.CopyTo(actual); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs index 65d0a01ff..975378b5f 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs @@ -60,7 +60,7 @@ internal static partial class LibJpegTools internal void MakeBlock(Block8x8 block, int y, int x) { - block.TransposeInplace(); + block.TransposeInPlace(); this.MakeBlock(block.ToArray(), y, x); } From 505ecce3fa8d5c7cf8f967c7dd27f1f2f831fab0 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 12 May 2025 09:46:17 +1000 Subject: [PATCH 09/12] Update ShuffleNative (byte) --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 20 +++++++---- .../Common/Helpers/Vector128Utilities.cs | 35 +++++++++++-------- .../Common/Helpers/Vector256Utilities.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 1 - 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index e155e4536..dc610a6f9 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -113,7 +113,7 @@ internal static partial class SimdUtils [ConstantExpected] byte control) { if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) || (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)) { int remainder = 0; @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight) { int remainder = source.Length % (Vector128.Count * 3); @@ -373,7 +373,7 @@ internal static partial class SimdUtils } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) { Span temp = stackalloc byte[Vector256.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -445,7 +445,9 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && + Vector128_.SupportsShuffleNativeByte && + Vector128_.SupportsAlignRight) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 
maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); @@ -505,7 +507,10 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && + Vector128_.SupportsShuffleNativeByte && + Vector128_.SupportsShiftByte && + Vector128_.SupportsAlignRight) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -548,7 +553,10 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && + Vector128_.SupportsShuffleNativeByte && + Vector128_.SupportsShiftByte && + Vector128_.SupportsAlignRight) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 3471acbd3..83b842e13 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -4,6 +4,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; @@ -38,13 +39,26 @@ internal static class Vector128_ public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported; + get + { + if (Vector128.IsHardwareAccelerated) + { + if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64) + { + return Ssse3.IsSupported; + } + + return true; + } + + return false; + } } /// /// Gets a value indicating whether right align operations are supported. /// - public static bool SupportsRightAlign + public static bool SupportsAlignRight { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Ssse3.IsSupported || AdvSimd.IsSupported; @@ -91,23 +105,16 @@ internal static class Vector128_ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 ShuffleNative(Vector128 vector, Vector128 indices) { + // For x64 we use the SSSE3 shuffle intrinsic to avoid additional instructions. 3 vs 1. if (Ssse3.IsSupported) { return Ssse3.Shuffle(vector, indices); } - if (AdvSimd.Arm64.IsSupported) - { - return AdvSimd.Arm64.VectorTableLookup(vector, indices); - } - - if (PackedSimd.IsSupported) - { - return PackedSimd.Swizzle(vector, indices); - } - - ThrowUnreachableException(); - return default; + // For ARM and WASM, codegen will be optimal. + // We don't throw for x86/x64 so we should never use this method without + // checking for support. + return Vector128.Shuffle(vector, indices); } /// diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index c835d267d..817d6e607 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -33,7 +33,7 @@ internal static class Vector256_ /// /// Gets a value indicating whether shuffle byte operations are supported. 
/// - public static bool SupportsShuffleByte + public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Avx2.IsSupported; diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index a4a7d3ed0..49b519201 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -5,7 +5,6 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; From 55a8c732326b19416d58323dcb3660dea10b0687 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 12 May 2025 10:47:34 +1000 Subject: [PATCH 10/12] Expand v128 native shuffle (float) support --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 4 ++-- .../Common/Helpers/Vector128Utilities.cs | 19 ++++++++----------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index dc610a6f9..a1bf7dad3 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -68,7 +68,7 @@ internal static partial class SimdUtils { if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) || (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat)) + Vector128.IsHardwareAccelerated) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -305,7 +305,7 @@ internal static partial class SimdUtils } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat) + else if (Vector128.IsHardwareAccelerated) { ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 83b842e13..322423e1a 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -24,15 +24,6 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector128_ #pragma warning restore SA1649 // File name should match first type name { - /// - /// Gets a value indicating whether shuffle operations are supported. - /// - public static bool SupportsShuffleNativeFloat - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Sse.IsSupported; - } - /// /// Gets a value indicating whether shuffle operations are supported. /// @@ -87,8 +78,14 @@ internal static class Vector128_ return Sse.Shuffle(vector, vector, control); } - ThrowUnreachableException(); - return default; + // Don't use InverseMMShuffle here as we want to avoid the cast. 
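+        // The control byte packs four 2-bit lane indices (the classic _MM_SHUFFLE layout):
+        // bits [1:0] select the source element for lane 0, bits [3:2] for lane 1, and so on,
+        // so it expands directly into an index vector for the portable Vector128.Shuffle fallback.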
+ Vector128 indices = Vector128.Create( + control & 0x3, + (control >> 2) & 0x3, + (control >> 4) & 0x3, + (control >> 6) & 0x3); + + return Vector128.Shuffle(vector, indices); } /// From a59c900e9f7743b50c7d9721f09e55faa6d66186 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 14 May 2025 20:55:57 +1000 Subject: [PATCH 11/12] More optimizations based on feedback --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 13 ++++++++++--- src/ImageSharp/Common/Helpers/Vector128Utilities.cs | 10 ++++++++++ .../Formats/Jpeg/Components/Block8x8F.Vector128.cs | 1 - 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a1bf7dad3..4911653ce 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -1008,6 +1008,8 @@ internal static partial class SimdUtils ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); Vector128 scale = Vector128.Create((float)byte.MaxValue); + Vector128 min = Vector128.Zero; + Vector128 max = Vector128.Create((int)byte.MaxValue); for (nuint i = 0; i < n; i++) { @@ -1023,10 +1025,15 @@ internal static partial class SimdUtils Vector128 w2 = Vector128_.ConvertToInt32RoundToEven(f2); Vector128 w3 = Vector128_.ConvertToInt32RoundToEven(f3); - Vector128 u0 = Vector128_.PackSignedSaturate(w0, w1); - Vector128 u1 = Vector128_.PackSignedSaturate(w2, w3); + w0 = Vector128_.Clamp(w0, min, max); + w1 = Vector128_.Clamp(w1, min, max); + w2 = Vector128_.Clamp(w2, min, max); + w3 = Vector128_.Clamp(w3, min, max); - Unsafe.Add(ref destinationBase, i) = Vector128_.PackUnsignedSaturate(u0, u1); + Vector128 u0 = Vector128.Narrow(w0, w1); + Vector128 u1 = Vector128.Narrow(w2, w3); + + Unsafe.Add(ref destinationBase, i) = Vector128.Narrow(u0, u1).AsByte(); } } } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 322423e1a..dbe0a1fce 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -205,6 +205,11 @@ internal static class Vector128_ return AdvSimd.ConvertToInt32RoundToEven(vector); } + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertToInt32Saturate(PackedSimd.RoundToNearest(vector)); + } + Vector128 sign = vector & Vector128.Create(-0F); Vector128 val_2p23_f32 = sign | Vector128.Create(8388608F); @@ -230,6 +235,11 @@ internal static class Vector128_ return AdvSimd.RoundToNearest(vector); } + if (PackedSimd.IsSupported) + { + return PackedSimd.RoundToNearest(vector); + } + Vector128 sign = vector & Vector128.Create(-0F); Vector128 val_2p23_f32 = sign | Vector128.Create(8388608F); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index 3daa47693..d4c0398d9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Common.Helpers; From 4c1ecfad49a0b76e1ad46a2a99222305e4096cd5 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 May 2025 09:16:43 +1000 Subject: [PATCH 12/12] Fix v128 narrowing. 
--- src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 4911653ce..96ddb7976 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -1030,10 +1030,10 @@ internal static partial class SimdUtils w2 = Vector128_.Clamp(w2, min, max); w3 = Vector128_.Clamp(w3, min, max); - Vector128 u0 = Vector128.Narrow(w0, w1); - Vector128 u1 = Vector128.Narrow(w2, w3); + Vector128 u0 = Vector128.Narrow(w0, w1).AsUInt16(); + Vector128 u1 = Vector128.Narrow(w2, w3).AsUInt16(); - Unsafe.Add(ref destinationBase, i) = Vector128.Narrow(u0, u1).AsByte(); + Unsafe.Add(ref destinationBase, i) = Vector128.Narrow(u0, u1); } } }
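Note on the final two patches above (the clamp-and-narrow change and the narrowing fix): the removed PackSignedSaturate/PackUnsignedSaturate pair clamped implicitly through x86 saturation, whereas the cross-platform Vector128.Narrow truncates, so the values are now clamped to [0, 255] explicitly before narrowing, and the intermediate Vector128<short> produced by Narrow(Vector128<int>, Vector128<int>) is reinterpreted as ushort so that the second Narrow yields Vector128<byte> directly. The sketch below is for illustration only: it uses public System.Runtime.Intrinsics APIs with plain Min/Max in place of the internal Vector128_.Clamp helper, and the NarrowingSketch/PackToBytes names are invented for the example.

using System.Runtime.Intrinsics;

internal static class NarrowingSketch
{
    // Packs four vectors of already-rounded 32-bit values into one vector of bytes.
    // Clamp to [0, 255] first because Vector128.Narrow truncates rather than saturates;
    // after clamping, the short and ushort bit patterns agree, so AsUInt16 is a free
    // reinterpretation and the final Narrow produces the desired byte lanes.
    public static Vector128<byte> PackToBytes(
        Vector128<int> w0, Vector128<int> w1, Vector128<int> w2, Vector128<int> w3)
    {
        Vector128<int> min = Vector128<int>.Zero;
        Vector128<int> max = Vector128.Create((int)byte.MaxValue);

        w0 = Vector128.Min(Vector128.Max(w0, min), max);
        w1 = Vector128.Min(Vector128.Max(w1, min), max);
        w2 = Vector128.Min(Vector128.Max(w2, min), max);
        w3 = Vector128.Min(Vector128.Max(w3, min), max);

        Vector128<ushort> u0 = Vector128.Narrow(w0, w1).AsUInt16();
        Vector128<ushort> u1 = Vector128.Narrow(w2, w3).AsUInt16();
        return Vector128.Narrow(u0, u1);
    }
}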