diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index bd22864bb..b537cdfdf 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -74,8 +74,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms if (Fma.IsSupported) { float* bufferStart = this.bufferPtr; - float* bufferEnd = bufferStart + (this.Length & ~1); - Vector256 result256 = Vector256.Zero; + float* bufferEnd = bufferStart + (this.Length & ~3); + Vector256 result256_0 = Vector256.Zero; + Vector256 result256_1 = Vector256.Zero; var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1); while (bufferStart < bufferEnd) @@ -87,19 +88,36 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms // // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] - // vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256 + // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0 // // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. - result256 = Fma.MultiplyAdd( + // Additionally, we're also unrolling two computations per each loop iterations to leverage the + // fact that most CPUs have two ports to schedule multiply operations for FMA instructions. + result256_0 = Fma.MultiplyAdd( Unsafe.As>(ref rowStartRef), Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), - result256); + result256_0); - bufferStart += 2; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + result256_1 = Fma.MultiplyAdd( + Unsafe.As>(ref Unsafe.Add(ref rowStartRef, 2)), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask), + result256_1); + + bufferStart += 4; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 4); + } + + result256_0 = Avx.Add(result256_0, result256_1); + + if ((this.Length & 3) >= 2) + { + result256_0 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256_0); } - Vector128 result128 = Sse.Add(result256.GetLower(), result256.GetUpper()); + Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); if ((this.Length & 1) != 0) {