diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index fc6cfd585..81cc4b539 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -5,6 +5,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp;
@@ -808,6 +809,25 @@ internal static class Numerics
return Sse2.ConvertToInt32(vsum);
}
+    /// <summary>
+    /// Reduces elements of the vector into one sum using ARM AdvSimd instructions.
+    /// </summary>
+    /// <param name="accumulator">The accumulator to reduce.</param>
+    /// <returns>The sum of all four elements, reinterpreted as a signed 32-bit integer.</returns>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    public static int ReduceSumArm(Vector128<uint> accumulator)
+    {
+        if (AdvSimd.Arm64.IsSupported)
+        {
+            Vector64<uint> sum = AdvSimd.Arm64.AddAcross(accumulator); // single horizontal add on Arm64
+            return (int)AdvSimd.Extract(sum, 0);
+        }
+
+        Vector128<ulong> sum2 = AdvSimd.AddPairwiseWidening(accumulator); // [e0 + e1, e2 + e3] as 64-bit lanes
+        Vector64<uint> sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32()); // lane 0 adds the low halves
+        return (int)AdvSimd.Extract(sum3, 0);
+    }
+
///
/// Reduces even elements of the vector into one sum.
///
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 322a5f643..316c705e3 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -5,6 +5,7 @@ using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
// ReSharper disable InconsistentNaming
@@ -26,6 +27,11 @@ internal static class LossyUtils
return Vp8_Sse16xN_Sse2(a, b, 8);
}
+ if (AdvSimd.IsSupported)
+ {
+ return Vp8_Sse16x16_Neon(a, b);
+ }
+
return Vp8_SseNxN(a, b, 16, 16);
}
@@ -43,6 +49,11 @@ internal static class LossyUtils
return Vp8_Sse16xN_Sse2(a, b, 4);
}
+ if (AdvSimd.IsSupported)
+ {
+ return Vp8_Sse16x8_Neon(a, b);
+ }
+
return Vp8_SseNxN(a, b, 16, 8);
}
@@ -119,6 +130,11 @@ internal static class LossyUtils
return Numerics.ReduceSum(sum);
}
+ if (AdvSimd.IsSupported)
+ {
+ return Vp8_Sse4x4_Neon(a, b);
+ }
+
return Vp8_SseNxN(a, b, 4, 4);
}
@@ -199,6 +215,106 @@ internal static class LossyUtils
return Numerics.ReduceSum(sum);
}
+    /// <summary>
+    /// Sums the squared byte differences of 16 rows of 16 bytes from <paramref name="a"/> and <paramref name="b"/> (ARM AdvSimd path).
+    /// </summary>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b)
+    {
+        Vector128<uint> sum = Vector128<uint>.Zero;
+        fixed (byte* aRef = &MemoryMarshal.GetReference(a), bRef = &MemoryMarshal.GetReference(b))
+        {
+            for (int y = 0; y < 16; y++)
+            {
+                sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
+            }
+        }
+
+#if NET7_0_OR_GREATER
+        return (int)Vector128.Sum(sum);
+#else
+        return Numerics.ReduceSumArm(sum);
+#endif
+    }
+
+    /// <summary>
+    /// Sums the squared byte differences of 8 rows of 16 bytes from <paramref name="a"/> and <paramref name="b"/> (ARM AdvSimd path).
+    /// </summary>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
+    {
+        Vector128<uint> sum = Vector128<uint>.Zero;
+        fixed (byte* aRef = &MemoryMarshal.GetReference(a), bRef = &MemoryMarshal.GetReference(b))
+        {
+            for (int y = 0; y < 8; y++)
+            {
+                sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
+            }
+        }
+
+#if NET7_0_OR_GREATER
+        return (int)Vector128.Sum(sum);
+#else
+        return Numerics.ReduceSumArm(sum);
+#endif
+    }
+
+    /// <summary>Sums the squared byte differences of two 4x4 blocks gathered by <c>Load4x4Neon</c> (ARM AdvSimd path).</summary>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
+    {
+        Vector128<byte> a0 = Load4x4Neon(a).AsByte();
+        Vector128<byte> b0 = Load4x4Neon(b).AsByte();
+        Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
+
+        Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiff.GetLower(), absDiff.GetLower()); // squares of bytes 0..7
+        Vector128<ushort> prod2 = AdvSimd.MultiplyWideningUpper(absDiff, absDiff); // squares of bytes 8..15
+
+        // Pairwise-add neighbouring 16-bit squares, widening each result to 32 bit.
+        Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
+        Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
+
+        Vector128<uint> sum = AdvSimd.Add(sum1, sum2);
+#if NET7_0_OR_GREATER
+        return (int)Vector128.Sum(sum);
+#else
+        return Numerics.ReduceSumArm(sum);
+#endif
+    }
+
+    /// <summary>Loads 4 bytes from each of 4 rows (<see cref="WebpConstants.Bps"/> bytes apart) into one vector, one row per 32-bit lane.</summary>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
+    {
+        fixed (byte* srcRef = &MemoryMarshal.GetReference(src))
+        {
+            Vector128<uint> output = Vector128<uint>.Zero;
+            output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef);
+            output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps));
+            output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2)));
+            output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3)));
+            return output;
+        }
+    }
+
+    /// <summary>Adds the squared differences of the 16 bytes at <paramref name="a"/> and <paramref name="b"/> to <paramref name="sum"/>.</summary>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
+    {
+        Vector128<byte> a0 = AdvSimd.LoadVector128(a);
+        Vector128<byte> b0 = AdvSimd.LoadVector128(b);
+
+        Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
+        Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiff.GetLower(), absDiff.GetLower()); // squares of bytes 0..7
+        Vector128<ushort> prod2 = AdvSimd.MultiplyWideningUpper(absDiff, absDiff); // squares of bytes 8..15
+
+        // Pairwise-add neighbouring 16-bit squares, widening each result to 32 bit.
+        Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
+        Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
+
+        return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2));
+    }
+
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b)
{
diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
index e7f9ade36..73e7044f5 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
+using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Formats.Webp.Lossy;
using SixLabors.ImageSharp.Tests.TestUtilities;
@@ -222,62 +223,99 @@ public class LossyUtilsTests
public void HadamardTransform_Works() => RunHadamardTransformTest();
[Fact]
- public void TransformTwo_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);
+    public void TransformTwo_WithHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);
[Fact]
- public void TransformTwo_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);
+    public void TransformTwo_WithoutHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);
[Fact]
- public void TransformOne_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);
+    public void TransformOne_WithHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);
[Fact]
- public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
+    public void TransformOne_WithoutHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
- // This will test the AVX2 version.
+ // This will test the AVX2 or ARM version.
[Fact]
- public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
+    public void Vp8Sse16X16_WithHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
// This will test the SSE2 version.
[Fact]
- public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
+    public void Vp8Sse16X16_WithoutAVX2_Works()
+    {
+        // Nothing to run on Arm64: this case only exercises the x86 code path with AVX2 switched off.
+        bool isArm64 = RuntimeInformation.ProcessArchitecture == Architecture.Arm64;
+        if (!isArm64)
+        {
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
+        }
+    }
// This will test the fallback scalar version.
[Fact]
- public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
+ public void Vp8Sse16X16_WithoutHwIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableHWIntrinsic); // NOTE(review): the 16x8 and 4x4 counterparts spell this "WithoutHardwareIntrinsics"; consider aligning the name.
- // This will test the AVX2 version.
+ // This will test the AVX2 or ARM version.
[Fact]
- public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
+    public void Vp8Sse16X8_WithHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
// This will test the SSE2 version.
[Fact]
- public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
+    public void Vp8Sse16X8_WithoutAVX2_Works()
+    {
+        // Nothing to run on Arm64: this case only exercises the x86 code path with AVX2 switched off.
+        bool isArm64 = RuntimeInformation.ProcessArchitecture == Architecture.Arm64;
+        if (!isArm64)
+        {
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
+        }
+    }
// This will test the fallback scalar version.
[Fact]
- public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
+    public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableHWIntrinsic);
- // This will test the AVX2 version.
+ // This will test the AVX2 or ARM version.
[Fact]
- public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
+    public void Vp8Sse4X4_WithHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
// This will test the SSE2 version.
[Fact]
- public void Vp8Sse4X4_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
+    public void Vp8Sse4X4_WithoutAVX2_Works()
+    {
+        // Nothing to run on Arm64: this case only exercises the x86 code path with AVX2 switched off.
+        bool isArm64 = RuntimeInformation.ProcessArchitecture == Architecture.Arm64;
+        if (!isArm64)
+        {
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
+        }
+    }
// This will test the fallback scalar version.
[Fact]
- public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
+    public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);
[Fact]
- public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
+    public void Mean16x4_WithHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
[Fact]
- public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
+    public void Mean16x4_WithoutHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
[Fact]
- public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
+    public void HadamardTransform_WithHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
[Fact]
- public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
+    public void HadamardTransform_WithoutHardwareIntrinsics_Works()
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
}