diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 5a87d045e..bd22864bb 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms public Vector4 ConvolveCore(ref Vector4 rowStartRef) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) + if (Fma.IsSupported) { float* bufferStart = this.bufferPtr; float* bufferEnd = bufferStart + (this.Length & ~1); @@ -80,11 +80,20 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms while (bufferStart < bufferEnd) { - Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); - Vector256 bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask); - Vector256 multiply256 = Avx.Multiply(rowItem256, bufferItem256); - - result256 = Avx.Add(multiply256, result256); + // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps + // for the FMA operation, executing it directly on the target register and reading directly from + // memory for the first parameter. This skips initializing a SIMD register, and an extra copy. + // The code below should compile to the following assembly on .NET 5 x64: + // + // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] + // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] + // vfmadd231ps ymm0, ymm2, [r8] ; result256 = (pixels * factors) + result256 + // + // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. 
+ result256 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256); bufferStart += 2; rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); @@ -94,11 +103,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms if ((this.Length & 1) != 0) { - Vector128 rowItem128 = Unsafe.As>(ref rowStartRef); - var bufferItem128 = Vector128.Create(*bufferStart); - Vector128 multiply128 = Sse.Multiply(rowItem128, bufferItem128); - - result128 = Sse.Add(multiply128, result128); + result128 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Vector128.Create(*bufferStart), + result128); } return *(Vector4*)&result128;