diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index fc6cfd585..81cc4b539 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -5,6 +5,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; namespace SixLabors.ImageSharp; @@ -808,6 +809,25 @@ internal static class Numerics return Sse2.ConvertToInt32(vsum); } + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(InliningOptions.ShortMethod)] + public static int ReduceSumArm(Vector128 accumulator) + { + if (AdvSimd.Arm64.IsSupported) + { + Vector64 sum = AdvSimd.Arm64.AddAcross(accumulator); + return (int)AdvSimd.Extract(sum, 0); + } + + Vector128 sum2 = AdvSimd.AddPairwiseWidening(accumulator); + Vector64 sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32()); + return (int)AdvSimd.Extract(sum3, 0); + } + /// /// Reduces even elements of the vector into one sum. /// diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 322a5f643..316c705e3 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -5,6 +5,7 @@ using System.Buffers.Binary; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; // ReSharper disable InconsistentNaming @@ -26,6 +27,11 @@ internal static class LossyUtils return Vp8_Sse16xN_Sse2(a, b, 8); } + if (AdvSimd.IsSupported) + { + return Vp8_Sse16x16_Neon(a, b); + } + return Vp8_SseNxN(a, b, 16, 16); } @@ -43,6 +49,11 @@ internal static class LossyUtils return Vp8_Sse16xN_Sse2(a, b, 4); } + if (AdvSimd.IsSupported) + { + return Vp8_Sse16x8_Neon(a, b); + } + return Vp8_SseNxN(a, b, 16, 8); } @@ -119,6 +130,11 @@ internal static class LossyUtils return Numerics.ReduceSum(sum); } + if (AdvSimd.IsSupported) + { + return Vp8_Sse4x4_Neon(a, b); + } + return Vp8_SseNxN(a, b, 4, 4); } @@ -199,6 +215,106 @@ internal static class LossyUtils return Numerics.ReduceSum(sum); } + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe int Vp8_Sse16x16_Neon(Span a, Span b) + { + Vector128 sum = Vector128.Zero; + fixed (byte* aRef = &MemoryMarshal.GetReference(a)) + { + fixed (byte* bRef = &MemoryMarshal.GetReference(b)) + { + for (int y = 0; y < 16; y++) + { + sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); + } + } + } + +#if NET7_0_OR_GREATER + return (int)Vector128.Sum(sum); +#else + return Numerics.ReduceSumArm(sum); +#endif + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe int Vp8_Sse16x8_Neon(Span a, Span b) + { + Vector128 sum = Vector128.Zero; + fixed (byte* aRef = &MemoryMarshal.GetReference(a)) + { + fixed (byte* bRef = &MemoryMarshal.GetReference(b)) + { + for (int y = 0; y < 8; y++) + { + sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); + } + } + } + +#if NET7_0_OR_GREATER + return (int)Vector128.Sum(sum); +#else + return Numerics.ReduceSumArm(sum); +#endif + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse4x4_Neon(Span a, Span b) + { + Vector128 a0 = Load4x4Neon(a).AsByte(); + Vector128 b0 = Load4x4Neon(b).AsByte(); + Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); + Vector64 absDiffLower = absDiff.GetLower().AsByte(); + Vector64 absDiffUpper = absDiff.GetUpper().AsByte(); + Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); + Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); + + // pair-wise adds and widen. + Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); + Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); + + Vector128 sum = AdvSimd.Add(sum1, sum2); +#if NET7_0_OR_GREATER + return (int)Vector128.Sum(sum); +#else + return Numerics.ReduceSumArm(sum); +#endif + } + + // Load all 4x4 pixels into a single Vector128 + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe Vector128 Load4x4Neon(Span src) + { + fixed (byte* srcRef = &MemoryMarshal.GetReference(src)) + { + Vector128 output = Vector128.Zero; + output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef); + output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps)); + output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2))); + output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3))); + return output; + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe Vector128 AccumulateSSE16Neon(byte* a, byte* b, Vector128 sum) + { + Vector128 a0 = AdvSimd.LoadVector128(a); + Vector128 b0 = AdvSimd.LoadVector128(b); + + Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); + Vector64 absDiffLower = absDiff.GetLower(); + Vector64 absDiffUpper = absDiff.GetUpper(); + Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); + Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); + + // pair-wise adds and widen. + Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); + Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); + return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2)); + } + [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) { diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index e7f9ade36..73e7044f5 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. +using System.Runtime.InteropServices; using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Tests.TestUtilities; @@ -222,62 +223,99 @@ public class LossyUtilsTests public void HadamardTransform_Works() => RunHadamardTransformTest(); [Fact] - public void TransformTwo_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll); + public void TransformTwo_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll); [Fact] - public void TransformTwo_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic); + public void TransformTwo_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic); [Fact] - public void TransformOne_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll); + public void TransformOne_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll); [Fact] - public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic); + public void TransformOne_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic); - // This will test the AVX2 version. + // This will test the AVX2 or ARM version. [Fact] - public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll); + public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll); // This will test the SSE2 version. [Fact] - public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2); + public void Vp8Sse16X16_WithoutAVX2_Works() + { + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + return; + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2); + } // This will test the fallback scalar version. [Fact] - public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX); + public void Vp8Sse16X16_WithoutHwIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableHWIntrinsic); - // This will test the AVX2 version. + // This will test the AVX2 or ARM version. [Fact] - public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll); + public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll); // This will test the SSE2 version. [Fact] - public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2); + public void Vp8Sse16X8_WithoutAVX2_Works() + { + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + return; + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2); + } // This will test the fallback scalar version. [Fact] - public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX); + public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableHWIntrinsic); - // This will test the AVX2 version. + // This will test the AVX2 version or ARM version. [Fact] - public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); + public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); // This will test the SSE2 version. [Fact] - public void Vp8Sse4X4_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2); + public void Vp8Sse4X4_WithoutAVX2_Works() + { + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + return; + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2); + } // This will test the fallback scalar version. [Fact] - public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX); + public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic); [Fact] - public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); + public void Mean16x4_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); [Fact] - public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic); + public void Mean16x4_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic); [Fact] - public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll); + public void HadamardTransform_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll); [Fact] - public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic); + public void HadamardTransform_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic); }