diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs index 35769d96a7..e4fd9bce60 100644 --- a/src/ImageSharp/Common/Helpers/ImageMaths.cs +++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs @@ -39,6 +39,22 @@ namespace SixLabors.ImageSharp return (a / GreatestCommonDivisor(a, b)) * b; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Modulo4(int a) => a & 3; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Modulo8(int a) => a & 7; + + /// <summary> + /// Fast (mod m) calculator, + /// where <paramref name="m"/> should be a power of 2. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int ModuloP2(int a, int m) + { + return a & (m - 1); + } + /// /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation. /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs index a8b3434980..c7fd21a8f0 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs @@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp internal static partial class SimdUtils { /// - /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*) + /// Implementation with 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen etc.) 
/// public static class BasicIntrinsics256 { diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index fd263b54c5..996a08fb4b 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -10,8 +10,9 @@ namespace SixLabors.ImageSharp internal static partial class SimdUtils { /// - /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) - /// PR: + /// Implementation methods based on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*). + /// Accelerated only on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) + /// See: /// https://github.com/dotnet/coreclr/pull/10662 /// API Proposal: /// https://github.com/dotnet/corefx/issues/15957 diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs new file mode 100644 index 0000000000..bb21474660 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs @@ -0,0 +1,143 @@ +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + /// <summary> + /// Fallback implementation based on <see cref="Vector4"/> (128bit). + /// For <see cref="Vector4"/>, efficient software fallback implementations are present, + /// and possibly even Mono can emit intrinsics for that type. + /// </summary> + public static class FallbackIntrinsics128 + { + /// <summary> + /// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many elements as possible (a multiple of 4), slicing both spans down (keeping the remainder). 
+ /// </summary> + internal static void BulkConvertByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + int remainder = source.Length % 4; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertByteToNormalizedFloat( + source.Slice(0, alignedCount), + dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + + /// <summary> + /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many elements as possible (a multiple of 4), slicing both spans down (keeping the remainder). + /// </summary> + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + int remainder = source.Length % 4; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows( + source.Slice(0, alignedCount), + dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + + /// <summary> + /// Implementation of the byte to normalized float conversion using <see cref="Vector4"/>. + /// </summary> + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!"); + + int count = dest.Length / 4; + if (count == 0) + { + return; + } + + ref ByteVector4 sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref Vector4 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + + const float Scale = 1f / 255f; + Vector4 d = default; + + for (int i = 0; i < count; i++) + { + ref ByteVector4 s = ref Unsafe.Add(ref sBase, i); + d.X = s.X; + d.Y = s.Y; + d.Z = s.Z; + d.W = s.W; + d *= Scale; + Unsafe.Add(ref dBase, i) = d; + } + } + + /// <summary> + /// Implementation of the normalized float to byte conversion (scaled, rounded and clamped) using <see cref="Vector4"/>. 
+ /// </summary> + internal static void BulkConvertNormalizedFloatToByteClampOverflows( + ReadOnlySpan source, + Span dest) + { + DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!"); + + int count = source.Length / 4; + if (count == 0) + { + return; + } + + ref Vector4 sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref ByteVector4 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + + var half = new Vector4(0.5f); + var maxBytes = new Vector4(255f); + + for (int i = 0; i < count; i++) + { + Vector4 s = Unsafe.Add(ref sBase, i); + s *= maxBytes; + s += half; + + // I'm not sure if Clamp() is properly implemented with intrinsics. + s = Vector4.Max(Vector4.Zero, s); + s = Vector4.Min(maxBytes, s); + + ref ByteVector4 d = ref Unsafe.Add(ref dBase, i); + d.X = (byte)s.X; + d.Y = (byte)s.Y; + d.Z = (byte)s.Z; + d.W = (byte)s.W; + } + } + + [StructLayout(LayoutKind.Sequential)] + private struct ByteVector4 + { + public byte X; + public byte Y; + public byte Z; + public byte W; + } + } + } +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 111ac22408..bc75dc8caa 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -55,7 +55,7 @@ namespace SixLabors.ImageSharp } /// - /// Converts `dest.Length` -s to -s normalized into [0..1]. + /// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1]. /// should be the of the same size as , /// but there are no restrictions on the span's length. 
/// @@ -67,6 +67,7 @@ namespace SixLabors.ImageSharp ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); + FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); // Deal with the remainder: int count = source.Length; @@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp } /// - /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of . + /// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>. /// The values are scaled up into [0-255] and rounded, overflows are clamped. /// should be the of the same size as , /// but there are no restrictions on the span's length. @@ -96,6 +97,7 @@ namespace SixLabors.ImageSharp ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); + FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); // Deal with the remainder: int count = source.Length; diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index 7a212b0523..a56082fcd3 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -72,30 +72,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class PackFromVector4_Rgba32 : PackFromVector4 { [Benchmark] - public void BasicBulk() + public void FallbackIntrinsics128() { - ref Vector4 sBase = ref this.source.GetSpan()[0]; - ref Rgba32 dBase = ref this.destination.GetSpan()[0]; - - Vector4 maxBytes = new Vector4(255); - Vector4 half = new Vector4(0.5f); + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - for (int i = 0; i 
< this.Count; i++) - { - Vector4 v = Unsafe.Add(ref sBase, i); - v *= maxBytes; - v += half; - v = Vector4.Clamp(v, Vector4.Zero, maxBytes); - ref Rgba32 d = ref Unsafe.Add(ref dBase, i); - d.R = (byte)v.X; - d.G = (byte)v.Y; - d.B = (byte)v.Z; - d.A = (byte)v.W; - } + SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } [Benchmark(Baseline = true)] - public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows() + public void BasicIntrinsics256() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); @@ -104,7 +90,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark] - public void ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows() + public void ExtendedIntrinsic() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 4a801d64ef..519edaa31f 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -79,29 +79,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class ToVector4_Rgba32 : ToVector4 { [Benchmark] - public void BasicBulk() + public void FallbackIntrinsics128() { - ref Rgba32 sBase = ref this.source.GetSpan()[0]; - ref Vector4 dBase = ref this.destination.GetSpan()[0]; - - Vector4 scale = new Vector4(1f / 255f); - - Vector4 v = default; + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - for (int i = 0; i < this.Count; i++) - { - ref Rgba32 s = ref Unsafe.Add(ref sBase, i); - v.X = s.R; - v.Y = s.G; - v.Z = s.B; - v.W = s.A; - v *= scale; - Unsafe.Add(ref dBase, i) = v; - } + 
SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } [Benchmark(Baseline = true)] - public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat() + public void BasicIntrinsics256() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); @@ -110,7 +97,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark] - public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat() + public void ExtendedIntrinsics() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 2dcba2b74b..feefd17580 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) + public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) + public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -161,6 +161,7 @@ namespace SixLabors.ImageSharp.Tests.Common } public static readonly TheoryData ArraySizesDivisibleBy8 = new TheoryData { 0, 8, 16, 1024 }; + public static readonly TheoryData ArraySizesDivisibleBy4 = new TheoryData { 0, 4, 8, 28, 1020 }; public static readonly TheoryData ArraySizesDivisibleBy32 = new TheoryData { 0, 32, 512 }; @@ -170,9 +171,18 
@@ namespace SixLabors.ImageSharp.Tests.Common 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520, }; + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); + } + [Theory] [MemberData(nameof(ArraySizesDivisibleBy8))] - public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count) + public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat(int count) { if (this.SkipOnNonAvx2()) { @@ -215,9 +225,18 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); + } + [Theory] [MemberData(nameof(ArraySizesDivisibleBy8))] - public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) + public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows(int count) { if (this.SkipOnNonAvx2()) { diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs index 6c2979fe9e..aec4d0b810 100644 --- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs @@ -9,6 +9,60 @@ namespace SixLabors.ImageSharp.Tests.Helpers public class ImageMathsTests { + [Theory] + [InlineData(0, 0)] + [InlineData(1, 1)] + [InlineData(2, 2)] + [InlineData(3, 3)] + [InlineData(4, 0)] + [InlineData(100, 0)] + [InlineData(123, 3)] + [InlineData(53436353, 1)] + public void Modulo4(int a, 
int expected) + { + int actual = ImageMaths.Modulo4(a); + Assert.Equal(expected, actual); + } + + [Theory] + [InlineData(0, 0)] + [InlineData(1, 1)] + [InlineData(2, 2)] + [InlineData(6, 6)] + [InlineData(7, 7)] + [InlineData(8, 0)] + [InlineData(100, 4)] + [InlineData(123, 3)] + [InlineData(53436353, 1)] + [InlineData(975, 7)] + public void Modulo8(int a, int expected) + { + int actual = ImageMaths.Modulo8(a); + Assert.Equal(expected, actual); + } + + [Theory] + [InlineData(0, 2, 0)] + [InlineData(1, 2, 1)] + [InlineData(2, 2, 0)] + [InlineData(0, 4, 0)] + [InlineData(3, 4, 3)] + [InlineData(5, 4, 1)] + [InlineData(5, 8, 5)] + [InlineData(8, 8, 0)] + [InlineData(8, 16, 8)] + [InlineData(15, 16, 15)] + [InlineData(17, 16, 1)] + [InlineData(17, 32, 17)] + [InlineData(31, 32, 31)] + [InlineData(32, 32, 0)] + [InlineData(33, 32, 1)] + public void Modulo2P(int a, int m, int expected) + { + int actual = ImageMaths.ModuloP2(a, m); + Assert.Equal(expected, actual); + } + [Fact] public void FasAbsResultMatchesMath() {