diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 001244a89..7c7504371 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -114,7 +114,6 @@ jobs:
if: ${{ matrix.options.sdk-preview != true }}
uses: actions/setup-dotnet@v3
with:
- include-prerelease: true
dotnet-version: |
6.0.x
@@ -122,7 +121,6 @@ jobs:
if: ${{ matrix.options.sdk-preview == true }}
uses: actions/setup-dotnet@v3
with:
- include-prerelease: true
dotnet-version: |
7.0.x
diff --git a/ci-pack.ps1 b/ci-pack.ps1
index 09f45347e..55c69fb59 100644
--- a/ci-pack.ps1
+++ b/ci-pack.ps1
@@ -3,4 +3,4 @@ dotnet clean -c Release
$repositoryUrl = "https://github.com/$env:GITHUB_REPOSITORY"
# Building for packing and publishing.
-dotnet pack -c Release --output "$PSScriptRoot/artifacts" /p:RepositoryUrl=$repositoryUrl
+dotnet pack -c Release -p:PackageOutputPath="$PSScriptRoot/artifacts" -p:RepositoryUrl=$repositoryUrl
diff --git a/src/ImageSharp/Common/Constants.cs b/src/ImageSharp/Common/Constants.cs
index fa2f72c74..d4640f133 100644
--- a/src/ImageSharp/Common/Constants.cs
+++ b/src/ImageSharp/Common/Constants.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
namespace SixLabors.ImageSharp;
diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index fc6cfd585..81cc4b539 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -5,6 +5,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp;
@@ -808,6 +809,25 @@ internal static class Numerics
return Sse2.ConvertToInt32(vsum);
}
+ /// <summary>
+ /// Reduces elements of the vector into one sum.
+ /// </summary>
+ /// <param name="accumulator">The accumulator to reduce.</param>
+ /// <returns>The sum of all elements.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static int ReduceSumArm(Vector128<uint> accumulator)
+ {
+ if (AdvSimd.Arm64.IsSupported) // AddAcross lives under AdvSimd.Arm64, so it needs its own capability check.
+ {
+ Vector64<uint> sum = AdvSimd.Arm64.AddAcross(accumulator);
+ return (int)AdvSimd.Extract(sum, 0);
+ }
+
+ Vector128<ulong> sum2 = AdvSimd.AddPairwiseWidening(accumulator); // Fallback: fold adjacent lanes into two 64-bit partial sums.
+ Vector64<uint> sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32());
+ return (int)AdvSimd.Extract(sum3, 0);
+ }
+
///
/// Reduces even elements of the vector into one sum.
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 4bc0040c6..7d2bab259 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -532,7 +532,8 @@ internal static partial class SimdUtils
}
///
- /// Performs a multiplication and an addition of the .
+ /// Performs a multiplication and an addition of the .
+ /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
///
/// ret = (vm0 * vm1) + va
/// The vector to add to the intermediate result.
@@ -549,22 +550,21 @@ internal static partial class SimdUtils
{
return Fma.MultiplyAdd(vm1, vm0, va);
}
- else
- {
- return Avx.Add(Avx.Multiply(vm0, vm1), va);
- }
+
+ return Avx.Add(Avx.Multiply(vm0, vm1), va);
}
///
- /// Performs a multiplication and a substraction of the .
+ /// Performs a multiplication and a subtraction of the .
+ /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
///
/// ret = (vm0 * vm1) - vs
- /// The vector to substract from the intermediate result.
+ /// The vector to subtract from the intermediate result.
/// The first vector to multiply.
/// The second vector to multiply.
/// The .
[MethodImpl(InliningOptions.ShortMethod)]
- public static Vector256 MultiplySubstract(
+ public static Vector256 MultiplySubtract(
in Vector256 vs,
in Vector256 vm0,
in Vector256 vm1)
@@ -573,10 +573,30 @@ internal static partial class SimdUtils
{
return Fma.MultiplySubtract(vm1, vm0, vs);
}
- else
+
+ return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+ }
+
+ /// <summary>
+ /// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
+ /// </summary>
+ /// <remarks>ret = c - (a * b)</remarks>
+ /// <param name="a">The first vector to multiply.</param>
+ /// <param name="b">The second vector to multiply.</param>
+ /// <param name="c">The vector to add negated to the intermediate result.</param>
+ /// <returns>The <see cref="Vector256{T}"/>.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static Vector256<float> MultiplyAddNegated(
+ in Vector256<float> a,
+ in Vector256<float> b,
+ in Vector256<float> c)
+ {
+ if (Fma.IsSupported)
{
- return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+ return Fma.MultiplyAddNegated(a, b, c);
}
+
+ return Avx.Subtract(c, Avx.Multiply(a, b));
}
///
diff --git a/src/ImageSharp/Formats/DecoderOptions.cs b/src/ImageSharp/Formats/DecoderOptions.cs
index 989fc49fc..6243a071d 100644
--- a/src/ImageSharp/Formats/DecoderOptions.cs
+++ b/src/ImageSharp/Formats/DecoderOptions.cs
@@ -15,15 +15,25 @@ public sealed class DecoderOptions
private uint maxFrames = int.MaxValue;
+ // Used by the FileProvider in the unit tests to set the configuration on the fly.
+#pragma warning disable IDE0032 // Use auto property
+ private Configuration configuration = Configuration.Default;
+#pragma warning restore IDE0032 // Use auto property
+
/// <summary>
/// Gets the shared default general decoder options instance.
+ /// Used internally to reduce allocations for default decoding operations.
/// </summary>
internal static DecoderOptions Default { get; } = LazyOptions.Value;
/// <summary>
/// Gets a custom configuration instance to be used by the image processing pipeline.
/// </summary>
- public Configuration Configuration { get; internal set; } = Configuration.Default;
+#pragma warning disable IDE0032 // Use auto property
+#pragma warning disable RCS1085 // Use auto-implemented property.
+ public Configuration Configuration { get => this.configuration; init => this.configuration = value; }
+#pragma warning restore RCS1085 // Use auto-implemented property.
+#pragma warning restore IDE0032 // Use auto property
///
/// Gets the target size to decode the image into. Scaling should use an operation equivalent to .
@@ -44,4 +54,6 @@ public sealed class DecoderOptions
/// Gets the maximum number of image frames to decode, inclusive.
///
public uint MaxFrames { get => this.maxFrames; init => this.maxFrames = Math.Clamp(value, 1, int.MaxValue); }
+
+ internal void SetConfiguration(Configuration configuration) => this.configuration = configuration; // Writes the backing field directly, since the Configuration property is init-only.
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
index cae89fc3c..7e102f696 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
@@ -99,7 +99,7 @@ internal static partial class FloatingPointDCT
var mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256 tmp13 = Avx.Add(tmp1, tmp3);
- Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
+ Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
tmp0 = Avx.Add(tmp10, tmp13);
tmp3 = Avx.Subtract(tmp10, tmp13);
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 322a5f643..316c705e3 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -5,6 +5,7 @@ using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
// ReSharper disable InconsistentNaming
@@ -26,6 +27,11 @@ internal static class LossyUtils
return Vp8_Sse16xN_Sse2(a, b, 8);
}
+ if (AdvSimd.IsSupported)
+ {
+ return Vp8_Sse16x16_Neon(a, b);
+ }
+
return Vp8_SseNxN(a, b, 16, 16);
}
@@ -43,6 +49,11 @@ internal static class LossyUtils
return Vp8_Sse16xN_Sse2(a, b, 4);
}
+ if (AdvSimd.IsSupported)
+ {
+ return Vp8_Sse16x8_Neon(a, b);
+ }
+
return Vp8_SseNxN(a, b, 16, 8);
}
@@ -119,6 +130,11 @@ internal static class LossyUtils
return Numerics.ReduceSum(sum);
}
+ if (AdvSimd.IsSupported)
+ {
+ return Vp8_Sse4x4_Neon(a, b);
+ }
+
return Vp8_SseNxN(a, b, 4, 4);
}
@@ -199,6 +215,106 @@ internal static class LossyUtils
return Numerics.ReduceSum(sum);
}
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static unsafe int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b)
+ {
+ Vector128<uint> sum = Vector128<uint>.Zero;
+ fixed (byte* aRef = &MemoryMarshal.GetReference(a))
+ {
+ fixed (byte* bRef = &MemoryMarshal.GetReference(b))
+ {
+ for (int y = 0; y < 16; y++) // One 16-byte row per iteration; rows are WebpConstants.Bps bytes apart.
+ {
+ sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
+ }
+ }
+ }
+
+#if NET7_0_OR_GREATER
+ return (int)Vector128.Sum(sum);
+#else
+ return Numerics.ReduceSumArm(sum);
+#endif
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static unsafe int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
+ {
+ Vector128<uint> sum = Vector128<uint>.Zero;
+ fixed (byte* aRef = &MemoryMarshal.GetReference(a))
+ {
+ fixed (byte* bRef = &MemoryMarshal.GetReference(b))
+ {
+ for (int y = 0; y < 8; y++) // One 16-byte row per iteration; rows are WebpConstants.Bps bytes apart.
+ {
+ sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
+ }
+ }
+ }
+
+#if NET7_0_OR_GREATER
+ return (int)Vector128.Sum(sum);
+#else
+ return Numerics.ReduceSumArm(sum);
+#endif
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
+ {
+ Vector128<byte> a0 = Load4x4Neon(a).AsByte();
+ Vector128<byte> b0 = Load4x4Neon(b).AsByte();
+ Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
+ Vector64<byte> absDiffLower = absDiff.GetLower().AsByte();
+ Vector64<byte> absDiffUpper = absDiff.GetUpper().AsByte();
+ Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); // Squared differences, widened to 16 bits.
+ Vector128<ushort> prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper);
+
+ // pair-wise adds and widen.
+ Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
+ Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
+
+ Vector128<uint> sum = AdvSimd.Add(sum1, sum2);
+#if NET7_0_OR_GREATER
+ return (int)Vector128.Sum(sum);
+#else
+ return Numerics.ReduceSumArm(sum);
+#endif
+ }
+
+ // Load all 4x4 pixels into a single Vector128<uint>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
+ {
+ fixed (byte* srcRef = &MemoryMarshal.GetReference(src))
+ {
+ Vector128<uint> output = Vector128<uint>.Zero;
+ output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef); // Each 32-bit lane receives one 4-byte row.
+ output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps));
+ output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2)));
+ output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3)));
+ return output;
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
+ {
+ Vector128<byte> a0 = AdvSimd.LoadVector128(a);
+ Vector128<byte> b0 = AdvSimd.LoadVector128(b);
+
+ Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
+ Vector64<byte> absDiffLower = absDiff.GetLower();
+ Vector64<byte> absDiffUpper = absDiff.GetUpper();
+ Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); // Squared differences, widened to 16 bits.
+ Vector128<ushort> prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper);
+
+ // pair-wise adds and widen.
+ Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
+ Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
+ return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2));
+ }
+
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b)
{
diff --git a/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs b/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs
index cf1910121..2db61a06f 100644
--- a/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs
+++ b/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs
@@ -3,6 +3,10 @@
//
using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp.PixelFormats.PixelBlenders;
@@ -43,18 +47,85 @@ internal static class DefaultPixelBlenders
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
- for (int i = 0; i < destination.Length; i++)
+
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ Vector256<float> opacity = Vector256.Create(amount);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ destinationBase = PorterDuffFunctions.NormalSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.NormalSrc(background[i], source[i], amount);
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.NormalSrc(background[i], source[i], amount);
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.NormalSrc(background[i], source[i], amount);
+ }
}
}
///
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
{
- for (int i = 0; i < destination.Length; i++)
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ ref float amountBase = ref MemoryMarshal.GetReference(amount);
+
+ Vector256<float> vOne = Vector256.Create(1F);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ // We need to create a Vector256<float> containing the current and next amount values
+ // taking up each half of the Vector256<float> and then clamp them.
+ Vector256<float> opacity = Vector256.Create(
+ Vector128.Create(amountBase),
+ Vector128.Create(Unsafe.Add(ref amountBase, 1)));
+ opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
+
+ destinationBase = PorterDuffFunctions.NormalSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ amountBase = ref Unsafe.Add(ref amountBase, 2);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.NormalSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.NormalSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.NormalSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
}
}
}
@@ -81,18 +152,85 @@ internal static class DefaultPixelBlenders
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
- for (int i = 0; i < destination.Length; i++)
+
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ Vector256<float> opacity = Vector256.Create(amount);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ destinationBase = PorterDuffFunctions.MultiplySrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.MultiplySrc(background[i], source[i], amount);
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.MultiplySrc(background[i], source[i], amount);
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.MultiplySrc(background[i], source[i], amount);
+ }
}
}
///
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
{
- for (int i = 0; i < destination.Length; i++)
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ ref float amountBase = ref MemoryMarshal.GetReference(amount);
+
+ Vector256<float> vOne = Vector256.Create(1F);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ // We need to create a Vector256<float> containing the current and next amount values
+ // taking up each half of the Vector256<float> and then clamp them.
+ Vector256<float> opacity = Vector256.Create(
+ Vector128.Create(amountBase),
+ Vector128.Create(Unsafe.Add(ref amountBase, 1)));
+ opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
+
+ destinationBase = PorterDuffFunctions.MultiplySrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ amountBase = ref Unsafe.Add(ref amountBase, 2);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.MultiplySrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.MultiplySrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.MultiplySrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
}
}
}
@@ -119,18 +257,85 @@ internal static class DefaultPixelBlenders
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
- for (int i = 0; i < destination.Length; i++)
+
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ Vector256<float> opacity = Vector256.Create(amount);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ destinationBase = PorterDuffFunctions.AddSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.AddSrc(background[i], source[i], amount);
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.AddSrc(background[i], source[i], amount);
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.AddSrc(background[i], source[i], amount);
+ }
}
}
///
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
{
- for (int i = 0; i < destination.Length; i++)
+ if (Avx2.IsSupported && destination.Length >= 2)
{
- destination[i] = PorterDuffFunctions.AddSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ ref float amountBase = ref MemoryMarshal.GetReference(amount);
+
+ Vector256<float> vOne = Vector256.Create(1F);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ // We need to create a Vector256<float> containing the current and next amount values
+ // taking up each half of the Vector256<float> and then clamp them.
+ Vector256<float> opacity = Vector256.Create(
+ Vector128.Create(amountBase),
+ Vector128.Create(Unsafe.Add(ref amountBase, 1)));
+ opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
+
+ destinationBase = PorterDuffFunctions.AddSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ amountBase = ref Unsafe.Add(ref amountBase, 2);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.AddSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
+ }
+ else
+ {
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.AddSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
}
}
}
@@ -157,18 +362,85 @@ internal static class DefaultPixelBlenders
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
- for (int i = 0; i < destination.Length; i++)
+
+ if (Avx2.IsSupported && destination.Length >= 2)
{
- destination[i] = PorterDuffFunctions.SubtractSrc(background[i], source[i], amount);
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ Vector256<float> opacity = Vector256.Create(amount);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ destinationBase = PorterDuffFunctions.SubtractSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.SubtractSrc(background[i], source[i], amount);
+ }
+ }
+ else
+ {
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.SubtractSrc(background[i], source[i], amount);
+ }
}
}
///
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
{
- for (int i = 0; i < destination.Length; i++)
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ ref float amountBase = ref MemoryMarshal.GetReference(amount);
+
+ Vector256<float> vOne = Vector256.Create(1F);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ // We need to create a Vector256<float> containing the current and next amount values
+ // taking up each half of the Vector256<float> and then clamp them.
+ Vector256<float> opacity = Vector256.Create(
+ Vector128.Create(amountBase),
+ Vector128.Create(Unsafe.Add(ref amountBase, 1)));
+ opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
+
+ destinationBase = PorterDuffFunctions.SubtractSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ amountBase = ref Unsafe.Add(ref amountBase, 2);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.SubtractSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.SubtractSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.SubtractSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
}
}
}
@@ -195,18 +467,85 @@ internal static class DefaultPixelBlenders
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
- for (int i = 0; i < destination.Length; i++)
+
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ Vector256<float> opacity = Vector256.Create(amount);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ destinationBase = PorterDuffFunctions.ScreenSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.ScreenSrc(background[i], source[i], amount);
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.ScreenSrc(background[i], source[i], amount);
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.ScreenSrc(background[i], source[i], amount);
+ }
}
}
///
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
{
- for (int i = 0; i < destination.Length; i++)
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ ref float amountBase = ref MemoryMarshal.GetReference(amount);
+
+ Vector256<float> vOne = Vector256.Create(1F);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ // We need to create a Vector256<float> containing the current and next amount values
+ // taking up each half of the Vector256<float> and then clamp them.
+ Vector256<float> opacity = Vector256.Create(
+ Vector128.Create(amountBase),
+ Vector128.Create(Unsafe.Add(ref amountBase, 1)));
+ opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
+
+ destinationBase = PorterDuffFunctions.ScreenSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ amountBase = ref Unsafe.Add(ref amountBase, 2);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.ScreenSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.ScreenSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.ScreenSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
}
}
}
@@ -233,18 +572,85 @@ internal static class DefaultPixelBlenders
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
- for (int i = 0; i < destination.Length; i++)
+
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Reinterpret the Vector4 spans as Vector256<float>: one Vector256<float> holds two Vector4 (8 floats vs 4), so Length / 2 whole pairs are blended per-vector.
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ Vector256<float> opacity = Vector256.Create(amount);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ destinationBase = PorterDuffFunctions.DarkenSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Elements are consumed in pairs, so an odd length leaves exactly one trailing element; blend it individually.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.DarkenSrc(background[i], source[i], amount);
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.DarkenSrc(background[i], source[i], amount);
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.DarkenSrc(background[i], source[i], amount);
+ }
}
}
/// <inheritdoc />
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
{
- for (int i = 0; i < destination.Length; i++)
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Reinterpret the Vector4 spans as Vector256<float>: one Vector256<float> holds two Vector4 (8 floats vs 4), so Length / 2 whole pairs are blended per-vector.
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ ref float amountBase = ref MemoryMarshal.GetReference(amount);
+
+ Vector256<float> vOne = Vector256.Create(1F);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ // Broadcast the current and the next amount value into the lower and upper 128-bit halves
+ // of a single Vector256<float>, then clamp every lane to the range [0, 1].
+ Vector256<float> opacity = Vector256.Create(
+ Vector128.Create(amountBase),
+ Vector128.Create(Unsafe.Add(ref amountBase, 1)));
+ opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
+
+ destinationBase = PorterDuffFunctions.DarkenSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ amountBase = ref Unsafe.Add(ref amountBase, 2);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Elements are consumed in pairs, so an odd length leaves exactly one trailing element; blend it individually.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.DarkenSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.DarkenSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.DarkenSrc(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+ }
}
}
}
@@ -271,18 +677,85 @@ internal static class DefaultPixelBlenders
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
- for (int i = 0; i < destination.Length; i++)
+
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Reinterpret the Vector4 spans as Vector256<float>: one Vector256<float> holds two Vector4 (8 floats vs 4), so Length / 2 whole pairs are blended per-vector.
+ ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+ ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+ ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+ ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+ Vector256<float> opacity = Vector256.Create(amount);
+
+ while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+ {
+ destinationBase = PorterDuffFunctions.LightenSrc(backgroundBase, sourceBase, opacity);
+ destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+ backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+ sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+ }
+
+ if (Numerics.Modulo2(destination.Length) != 0)
+ {
+ // Elements are consumed in pairs, so an odd length leaves exactly one trailing element; blend it individually.
+ int i = destination.Length - 1;
+ destination[i] = PorterDuffFunctions.LightenSrc(background[i], source[i], amount);
+ }
+ }
+ else
{
- destination[i] = PorterDuffFunctions.LightenSrc(background[i], source[i], amount);
+ for (int i = 0; i < destination.Length; i++)
+ {
+ destination[i] = PorterDuffFunctions.LightenSrc(background[i], source[i], amount);
+ }
}
}
/// <inheritdoc />
protected override void BlendFunction(Span destination, ReadOnlySpan background, ReadOnlySpan source, ReadOnlySpan amount)
{
- for (int i = 0; i < destination.Length; i++)
+ if (Avx2.IsSupported && destination.Length >= 2)
+ {
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256
+ ref Vector256