diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 128218aac2..7d2bab259e 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -533,6 +533,7 @@ internal static partial class SimdUtils
///
/// Performs a multiplication and an addition of the given vectors.
+ /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
///
/// ret = (vm0 * vm1) + va
/// The vector to add to the intermediate result.
@@ -555,6 +556,7 @@ internal static partial class SimdUtils
///
/// Performs a multiplication and a subtraction of the given vectors.
+ /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
///
/// ret = (vm0 * vm1) - vs
/// The vector to subtract from the intermediate result.
@@ -575,6 +577,28 @@ internal static partial class SimdUtils
return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
}
+ ///
+ /// Performs a negated multiply-add of the given vectors: the product of a and b is subtracted from c.
+ ///
+ /// ret = c - (a * b)
+ /// The first vector to multiply.
+ /// The second vector to multiply.
+ /// The vector that the negated product (a * b) is added to.
+ /// The result vector, c - (a * b).
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static Vector256 MultiplyAddNegated(
+ in Vector256 a,
+ in Vector256 b,
+ in Vector256 c)
+ {
+ if (Fma.IsSupported)
+ {
+ return Fma.MultiplyAddNegated(a, b, c);
+ }
+
+ return Avx.Subtract(c, Avx.Multiply(a, b)); // Fallback when FMA is unavailable: separate AVX multiply and subtract.
+ }
+
///
/// as many elements as possible, slicing them down (keeping the remainder).
///
diff --git a/src/ImageSharp/PixelFormats/PixelBlenders/PorterDuffFunctions.cs b/src/ImageSharp/PixelFormats/PixelBlenders/PorterDuffFunctions.cs
index d1bd5bad31..2d47f1a628 100644
--- a/src/ImageSharp/PixelFormats/PixelBlenders/PorterDuffFunctions.cs
+++ b/src/ImageSharp/PixelFormats/PixelBlenders/PorterDuffFunctions.cs
@@ -124,7 +124,7 @@ internal static partial class PorterDuffFunctions
public static Vector256 Screen(Vector256 backdrop, Vector256 source)
{
Vector256 vOne = Vector256.Create(1F);
- return Avx.Subtract(vOne, Avx.Multiply(Avx.Subtract(vOne, backdrop), Avx.Subtract(vOne, source)));
+ return SimdUtils.HwIntrinsics.MultiplyAddNegated(Avx.Subtract(vOne, backdrop), Avx.Subtract(vOne, source), vOne);
}
///
@@ -244,10 +244,10 @@ internal static partial class PorterDuffFunctions
public static Vector256 OverlayValueFunction(Vector256 backdrop, Vector256 source)
{
Vector256 vOne = Vector256.Create(1F);
- Vector256 vTwo = Vector256.Create(2F);
Vector256 left = Avx.Multiply(Avx.Add(backdrop, backdrop), source);
- Vector256 right = Avx.Subtract(vOne, Avx.Multiply(Avx.Multiply(vTwo, Avx.Subtract(vOne, source)), Avx.Subtract(vOne, backdrop)));
+ Vector256 vOneMinusSource = Avx.Subtract(vOne, source);
+ Vector256 right = SimdUtils.HwIntrinsics.MultiplyAddNegated(Avx.Add(vOneMinusSource, vOneMinusSource), Avx.Subtract(vOne, backdrop), vOne);
Vector256 cmp = Avx.CompareGreaterThan(backdrop, Vector256.Create(.5F));
return Avx.BlendVariable(left, right, cmp);
}
@@ -430,9 +430,7 @@ internal static partial class PorterDuffFunctions
public static Vector256 Out(Vector256 destination, Vector256 source)
{
// calculate alpha
- Vector256 sW = Avx.Shuffle(source, source, ShuffleAlphaControl);
- Vector256 dW = Avx.Shuffle(destination, destination, ShuffleAlphaControl);
- Vector256 alpha = Avx.Multiply(Avx.Subtract(Vector256.Create(1F), dW), sW);
+ Vector256 alpha = Avx.Permute(Avx.Multiply(source, Avx.Subtract(Vector256.Create(1F), destination)), ShuffleAlphaControl);
// premultiply
Vector256 color = Avx.Multiply(source, alpha);