From cbeeca5710853637768b9ab3675dffa1a8ab91e5 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 12 Feb 2023 12:41:01 +0100 Subject: [PATCH 1/8] Add ARM version of calculating mode score --- .../Formats/Webp/Lossy/LossyUtils.cs | 107 +++++++++++++++++- src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 12 +- .../Formats/WebP/LossyUtilsTests.cs | 6 +- 3 files changed, 113 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 4756dea86..f46574136 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -5,6 +5,7 @@ using System.Buffers.Binary; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; // ReSharper disable InconsistentNaming @@ -14,7 +15,7 @@ internal static class LossyUtils { // Note: method name in libwebp reference implementation is called VP8SSE16x16. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8_Sse16X16(Span a, Span b) + public static int Vp8_Sse16x16(Span a, Span b) { if (Avx2.IsSupported) { @@ -26,12 +27,17 @@ internal static class LossyUtils return Vp8_Sse16xN_Sse2(a, b, 8); } + if (AdvSimd.IsSupported) + { + return Vp8_Sse16x16_Neon(a, b); + } + return Vp8_SseNxN(a, b, 16, 16); } // Note: method name in libwebp reference implementation is called VP8SSE16x8. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8_Sse16X8(Span a, Span b) + public static int Vp8_Sse16x8(Span a, Span b) { if (Avx2.IsSupported) { @@ -43,12 +49,17 @@ internal static class LossyUtils return Vp8_Sse16xN_Sse2(a, b, 4); } + if (AdvSimd.IsSupported) + { + return Vp8_Sse16x8_Neon(a, b); + } + return Vp8_SseNxN(a, b, 16, 8); } // Note: method name in libwebp reference implementation is called VP8SSE4x4. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8_Sse4X4(Span a, Span b) + public static int Vp8_Sse4x4(Span a, Span b) { if (Avx2.IsSupported) { @@ -119,6 +130,11 @@ internal static class LossyUtils return Numerics.ReduceSum(sum); } + if (AdvSimd.IsSupported) + { + return Vp8_Sse4x4_Neon(a, b); + } + return Vp8_SseNxN(a, b, 4, 4); } @@ -201,6 +217,91 @@ internal static class LossyUtils return Numerics.ReduceSum(sum); } + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse16x16_Neon(Span a, Span b) + { + Vector128 sum = Vector128.Zero; + for (int y = 0; y < 16; y++) + { + sum = AccumulateSSE16Neon(a.Slice(y * WebpConstants.Bps), b.Slice(y * WebpConstants.Bps), sum); + } + + return ReduceSum(sum); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse16x8_Neon(Span a, Span b) + { + Vector128 sum = Vector128.Zero; + for (int y = 0; y < 8; y++) + { + sum = AccumulateSSE16Neon(a.Slice(y * WebpConstants.Bps), b.Slice(y * WebpConstants.Bps), sum); + } + + return ReduceSum(sum); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse4x4_Neon(Span a, Span b) + { + Vector128 a0 = Load4x4Neon(a).AsByte(); + Vector128 b0 = Load4x4Neon(b).AsByte(); + Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); + Vector64 absDiffLower = absDiff.GetLower().AsByte(); + Vector64 absDiffUpper = absDiff.GetUpper().AsByte(); + Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); + Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); + + // pair-wise adds and widen. + Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); + Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); + return ReduceSum(AdvSimd.Add(sum1, sum2)); + } + + // Load all 4x4 pixels into a single Vector128 + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe Vector128 Load4x4Neon(Span src) + { + fixed (byte* srcRef = &MemoryMarshal.GetReference(src)) + { + Vector128 output = Vector128.Zero; + output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef); + output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps)); + output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2))); + output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3))); + return output; + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static int ReduceSum(Vector128 sum) + { + Vector128 sum2 = AdvSimd.AddPairwiseWidening(sum); + Vector64 sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32()); + return (int)AdvSimd.Extract(sum3, 0); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 AccumulateSSE16Neon(Span a, Span b, Vector128 sum) + { + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); + + Vector128 a0 = Unsafe.As>(ref aRef); + Vector128 b0 = Unsafe.As>(ref bRef); + + Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); + Vector64 absDiffLower = absDiff.GetLower(); + Vector64 absDiffUpper = absDiff.GetUpper(); + Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); + Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); + + // pair-wise adds and widen. + Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); + Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); + return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2)); + } + [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) { diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index fed9c16d4..fca0d03f2 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -53,7 +53,7 @@ internal static unsafe class QuantEnc rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode); // Measure RD-score. - rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst); + rdCur.D = LossyUtils.Vp8_Sse16x16(src, tmpDst); rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0; rdCur.H = WebpConstants.Vp8FixedCostsI16[mode]; rdCur.R = it.GetCostLuma16(rdCur, proba, res); @@ -145,7 +145,7 @@ internal static unsafe class QuantEnc rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode); // Compute RD-score. - rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst); + rdTmp.D = LossyUtils.Vp8_Sse4x4(src, tmpDst); rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0; rdTmp.H = modeCosts[mode]; @@ -235,7 +235,7 @@ internal static unsafe class QuantEnc rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode); // Compute RD-score - rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst); + rdUv.D = LossyUtils.Vp8_Sse16x8(src, tmpDst); rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas. rdUv.H = WebpConstants.Vp8FixedCostsUv[mode]; rdUv.R = it.GetCostUv(rdUv, proba, res); @@ -389,7 +389,7 @@ internal static unsafe class QuantEnc for (mode = 0; mode < WebpConstants.NumPredModes; ++mode) { Span reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]); - long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16); + long score = (LossyUtils.Vp8_Sse16x16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16); if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit) { @@ -436,7 +436,7 @@ internal static unsafe class QuantEnc for (mode = 0; mode < WebpConstants.NumBModes; ++mode) { Span reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]); - long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4); + long score = (LossyUtils.Vp8_Sse4x4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4); if (score < bestI4Score) { bestI4Mode = mode; @@ -485,7 +485,7 @@ internal static unsafe class QuantEnc for (mode = 0; mode < WebpConstants.NumPredModes; ++mode) { Span reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]); - long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv); + long score = (LossyUtils.Vp8_Sse16x8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv); if (score < bestUvScore) { bestMode = mode; diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 69b503b5e..4682536d0 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -140,7 +140,7 @@ public class LossyUtilsTests int expected = 2063; // act - int actual = LossyUtils.Vp8_Sse16X16(a, b); + int actual = LossyUtils.Vp8_Sse16x16(a, b); // assert Assert.Equal(expected, actual); @@ -186,7 +186,7 @@ public class LossyUtilsTests int expected = 749; // act - int actual = LossyUtils.Vp8_Sse16X8(a, b); + int actual = LossyUtils.Vp8_Sse16x8(a, b); // assert Assert.Equal(expected, actual); @@ -218,7 +218,7 @@ public class LossyUtilsTests int expected = 27; // act - int actual = LossyUtils.Vp8_Sse4X4(a, b); + int actual = LossyUtils.Vp8_Sse4x4(a, b); // assert Assert.Equal(expected, actual); From 7483802692509896b5dd1ae1e6aba905d727f708 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 12 Feb 2023 12:44:07 +0100 Subject: [PATCH 2/8] Move reduce sum to numerics --- src/ImageSharp/Common/Helpers/Numerics.cs | 14 ++++++++++++++ src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 15 ++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index fc6cfd585..f2f9aaad2 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -5,6 +5,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; namespace SixLabors.ImageSharp; @@ -808,6 +809,19 @@ internal static class Numerics return Sse2.ConvertToInt32(vsum); } + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(InliningOptions.ShortMethod)] + public static int ReduceSumArm(Vector128 accumulator) + { + Vector128 sum2 = AdvSimd.AddPairwiseWidening(accumulator); + Vector64 sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32()); + return (int)AdvSimd.Extract(sum3, 0); + } + /// /// Reduces even elements of the vector into one sum. /// diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index f46574136..e594ed9a8 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -226,7 +226,7 @@ internal static class LossyUtils sum = AccumulateSSE16Neon(a.Slice(y * WebpConstants.Bps), b.Slice(y * WebpConstants.Bps), sum); } - return ReduceSum(sum); + return Numerics.ReduceSumArm(sum); } [MethodImpl(InliningOptions.ShortMethod)] @@ -238,7 +238,7 @@ internal static class LossyUtils sum = AccumulateSSE16Neon(a.Slice(y * WebpConstants.Bps), b.Slice(y * WebpConstants.Bps), sum); } - return ReduceSum(sum); + return Numerics.ReduceSumArm(sum); } [MethodImpl(InliningOptions.ShortMethod)] @@ -255,7 +255,8 @@ internal static class LossyUtils // pair-wise adds and widen. Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - return ReduceSum(AdvSimd.Add(sum1, sum2)); + + return Numerics.ReduceSumArm(AdvSimd.Add(sum1, sum2)); } // Load all 4x4 pixels into a single Vector128 @@ -273,14 +274,6 @@ internal static class LossyUtils } } - [MethodImpl(InliningOptions.ShortMethod)] - private static int ReduceSum(Vector128 sum) - { - Vector128 sum2 = AdvSimd.AddPairwiseWidening(sum); - Vector64 sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32()); - return (int)AdvSimd.Extract(sum3, 0); - } - [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 AccumulateSSE16Neon(Span a, Span b, Vector128 sum) { From 7ed4c69349a3320685ba1624d7a01788d0677741 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 12 Feb 2023 13:13:01 +0100 Subject: [PATCH 3/8] Disable ARM for testing scalar version of calculating mode score --- .../Formats/WebP/LossyUtilsTests.cs | 57 ++++++++++++------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index e7f9ade36..04f90b6ee 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -222,62 +222,79 @@ public class LossyUtilsTests public void HadamardTransform_Works() => RunHadamardTransformTest(); [Fact] - public void TransformTwo_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll); + public void TransformTwo_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll); [Fact] - public void TransformTwo_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic); + public void TransformTwo_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic); [Fact] - public void TransformOne_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll); + public void TransformOne_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll); [Fact] - public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic); + public void TransformOne_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic); - // This will test the AVX2 version. + // This will test the AVX2 or ARM version. [Fact] - public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll); + public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll); // This will test the SSE2 version. [Fact] - public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2); + public void Vp8Sse16X16_WithoutAVX2_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2); // This will test the fallback scalar version. [Fact] - public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX); + public void Vp8Sse16X16_WithoutSSE2_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX | HwIntrinsics.DisableArm64AdvSimd); - // This will test the AVX2 version. + // This will test the AVX2 or ARM version. [Fact] - public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll); + public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll); // This will test the SSE2 version. [Fact] - public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2); + public void Vp8Sse16X8_WithoutAVX2_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2); // This will test the fallback scalar version. [Fact] - public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX); + public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX | HwIntrinsics.DisableArm64AdvSimd); - // This will test the AVX2 version. + // This will test the AVX2 version or ARM version. [Fact] - public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); + public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); // This will test the SSE2 version. [Fact] - public void Vp8Sse4X4_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2); + public void Vp8Sse4X4_WithoutAVX2_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2); // This will test the fallback scalar version. [Fact] - public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX); + public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX | HwIntrinsics.DisableArm64AdvSimd); [Fact] - public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); + public void Mean16x4_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); [Fact] - public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic); + public void Mean16x4_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic); [Fact] - public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll); + public void HadamardTransform_WithHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll); [Fact] - public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic); + public void HadamardTransform_WithoutHardwareIntrinsics_Works() => + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic); } From 2f673b9942ab616e1123b61ea22fe9109e9706e7 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 12 Feb 2023 15:07:58 +0100 Subject: [PATCH 4/8] Use ref parameter for AccumulateSSE16Neon --- .../Formats/Webp/Lossy/LossyUtils.cs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index d59af537d..850f3d876 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -219,9 +219,14 @@ internal static class LossyUtils private static int Vp8_Sse16x16_Neon(Span a, Span b) { Vector128 sum = Vector128.Zero; + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); for (int y = 0; y < 16; y++) { - sum = AccumulateSSE16Neon(a.Slice(y * WebpConstants.Bps), b.Slice(y * WebpConstants.Bps), sum); + sum = AccumulateSSE16Neon( + ref Unsafe.Add(ref aRef, y * WebpConstants.Bps), + ref Unsafe.Add(ref bRef, y * WebpConstants.Bps), + sum); } return Numerics.ReduceSumArm(sum); @@ -231,9 +236,14 @@ internal static class LossyUtils private static int Vp8_Sse16x8_Neon(Span a, Span b) { Vector128 sum = Vector128.Zero; + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); for (int y = 0; y < 8; y++) { - sum = AccumulateSSE16Neon(a.Slice(y * WebpConstants.Bps), b.Slice(y * WebpConstants.Bps), sum); + sum = AccumulateSSE16Neon( + ref Unsafe.Add(ref aRef, y * WebpConstants.Bps), + ref Unsafe.Add(ref bRef, y * WebpConstants.Bps), + sum); } return Numerics.ReduceSumArm(sum); @@ -273,11 +283,8 @@ internal static class LossyUtils } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 AccumulateSSE16Neon(Span a, Span b, Vector128 sum) + private static Vector128 AccumulateSSE16Neon(ref byte aRef, ref byte bRef, Vector128 sum) { - ref byte aRef = ref MemoryMarshal.GetReference(a); - ref byte bRef = ref MemoryMarshal.GetReference(b); - Vector128 a0 = Unsafe.As>(ref aRef); Vector128 b0 = Unsafe.As>(ref bRef); From a526d84cbbeeddfa6fe949e38c9c9c977c5c3baa Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 13 Feb 2023 19:34:14 +0100 Subject: [PATCH 5/8] Skip WithoutAVX2 tests on ARM --- .../Formats/WebP/LossyUtilsTests.cs | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 04f90b6ee..73e7044f5 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. +using System.Runtime.InteropServices; using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Tests.TestUtilities; @@ -244,13 +245,19 @@ public class LossyUtilsTests // This will test the SSE2 version. [Fact] - public void Vp8Sse16X16_WithoutAVX2_Works() => + public void Vp8Sse16X16_WithoutAVX2_Works() + { + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + return; + } + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2); + } // This will test the fallback scalar version. [Fact] - public void Vp8Sse16X16_WithoutSSE2_Works() => - FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX | HwIntrinsics.DisableArm64AdvSimd); + public void Vp8Sse16X16_WithoutHwIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableHWIntrinsic); // This will test the AVX2 or ARM version. [Fact] @@ -259,13 +266,20 @@ public class LossyUtilsTests // This will test the SSE2 version. [Fact] - public void Vp8Sse16X8_WithoutAVX2_Works() => + public void Vp8Sse16X8_WithoutAVX2_Works() + { + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + return; + } + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2); + } // This will test the fallback scalar version. [Fact] public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => - FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX | HwIntrinsics.DisableArm64AdvSimd); + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableHWIntrinsic); // This will test the AVX2 version or ARM version. [Fact] @@ -274,13 +288,20 @@ public class LossyUtilsTests // This will test the SSE2 version. [Fact] - public void Vp8Sse4X4_WithoutAVX2_Works() => + public void Vp8Sse4X4_WithoutAVX2_Works() + { + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + return; + } + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2); + } // This will test the fallback scalar version. [Fact] public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => - FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX | HwIntrinsics.DisableArm64AdvSimd); + FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic); [Fact] public void Mean16x4_WithHardwareIntrinsics_Works() => From e345857cd9ba15ac865aa55906a4a2b47ff20c78 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 15 Feb 2023 18:44:51 +0100 Subject: [PATCH 6/8] Use AddAcross for reduce sum, if available --- src/ImageSharp/Common/Helpers/Numerics.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index f2f9aaad2..81cc4b539 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -817,6 +817,12 @@ internal static class Numerics [MethodImpl(InliningOptions.ShortMethod)] public static int ReduceSumArm(Vector128 accumulator) { + if (AdvSimd.Arm64.IsSupported) + { + Vector64 sum = AdvSimd.Arm64.AddAcross(accumulator); + return (int)AdvSimd.Extract(sum, 0); + } + Vector128 sum2 = AdvSimd.AddPairwiseWidening(accumulator); Vector64 sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32()); return (int)AdvSimd.Extract(sum3, 0); From b0bfb0a035abd455ff50fa447f55b87d974fc5cf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 17 Feb 2023 13:29:26 +0100 Subject: [PATCH 7/8] Use Vector128.sum() for reduce sum in NET7.0 --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 850f3d876..13f5662e7 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -229,7 +229,11 @@ internal static class LossyUtils sum); } +#if NET7_0_OR_GREATER + return (int)Vector128.Sum(sum); +#else return Numerics.ReduceSumArm(sum); +#endif } [MethodImpl(InliningOptions.ShortMethod)] @@ -246,7 +250,11 @@ internal static class LossyUtils sum); } +#if NET7_0_OR_GREATER + return (int)Vector128.Sum(sum); +#else return Numerics.ReduceSumArm(sum); +#endif } [MethodImpl(InliningOptions.ShortMethod)] @@ -264,7 +272,12 @@ internal static class LossyUtils Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - return Numerics.ReduceSumArm(AdvSimd.Add(sum1, sum2)); + Vector128 sum = AdvSimd.Add(sum1, sum2); +#if NET7_0_OR_GREATER + return (int)Vector128.Sum(sum); +#else + return Numerics.ReduceSumArm(sum); +#endif } // Load all 4x4 pixels into a single Vector128 From ae7306beb98367cdd42235605adfc3305490fc45 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 17 Feb 2023 14:24:28 +0100 Subject: [PATCH 8/8] Change arguments of AccumulateSSE16Neon to pointers for better code generation --- .../Formats/Webp/Lossy/LossyUtils.cs | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 13f5662e7..316c705e3 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -216,17 +216,18 @@ internal static class LossyUtils } [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse16x16_Neon(Span a, Span b) + private static unsafe int Vp8_Sse16x16_Neon(Span a, Span b) { Vector128 sum = Vector128.Zero; - ref byte aRef = ref MemoryMarshal.GetReference(a); - ref byte bRef = ref MemoryMarshal.GetReference(b); - for (int y = 0; y < 16; y++) + fixed (byte* aRef = &MemoryMarshal.GetReference(a)) { - sum = AccumulateSSE16Neon( - ref Unsafe.Add(ref aRef, y * WebpConstants.Bps), - ref Unsafe.Add(ref bRef, y * WebpConstants.Bps), - sum); + fixed (byte* bRef = &MemoryMarshal.GetReference(b)) + { + for (int y = 0; y < 16; y++) + { + sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); + } + } } #if NET7_0_OR_GREATER @@ -237,17 +238,18 @@ internal static class LossyUtils } [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse16x8_Neon(Span a, Span b) + private static unsafe int Vp8_Sse16x8_Neon(Span a, Span b) { Vector128 sum = Vector128.Zero; - ref byte aRef = ref MemoryMarshal.GetReference(a); - ref byte bRef = ref MemoryMarshal.GetReference(b); - for (int y = 0; y < 8; y++) + fixed (byte* aRef = &MemoryMarshal.GetReference(a)) { - sum = AccumulateSSE16Neon( - ref Unsafe.Add(ref aRef, y * WebpConstants.Bps), - ref Unsafe.Add(ref bRef, y * WebpConstants.Bps), - sum); + fixed (byte* bRef = &MemoryMarshal.GetReference(b)) + { + for (int y = 0; y < 8; y++) + { + sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); + } + } } #if NET7_0_OR_GREATER @@ -296,10 +298,10 @@ internal static class LossyUtils } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 AccumulateSSE16Neon(ref byte aRef, ref byte bRef, Vector128 sum) + private static unsafe Vector128 AccumulateSSE16Neon(byte* a, byte* b, Vector128 sum) { - Vector128 a0 = Unsafe.As>(ref aRef); - Vector128 b0 = Unsafe.As>(ref bRef); + Vector128 a0 = AdvSimd.LoadVector128(a); + Vector128 b0 = AdvSimd.LoadVector128(b); Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); Vector64 absDiffLower = absDiff.GetLower();