diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs
index 35769d96a7..e4fd9bce60 100644
--- a/src/ImageSharp/Common/Helpers/ImageMaths.cs
+++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@@ -39,6 +39,22 @@ namespace SixLabors.ImageSharp
return (a / GreatestCommonDivisor(a, b)) * b;
}
+ /// <summary>
+ /// Fast (a mod 4) calculator: keeps the two lowest bits of <paramref name="a"/>.
+ /// For negative input the result is still in [0..3], unlike the % operator.
+ /// </summary>
+ /// <param name="a">The value to reduce.</param>
+ /// <returns>The value of <paramref name="a"/> masked with 3.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int Modulo4(int a)
+ {
+     return a & 3;
+ }
+
+ /// <summary>
+ /// Fast (a mod 8) calculator: keeps the three lowest bits of <paramref name="a"/>.
+ /// For negative input the result is still in [0..7], unlike the % operator.
+ /// </summary>
+ /// <param name="a">The value to reduce.</param>
+ /// <returns>The value of <paramref name="a"/> masked with 7.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int Modulo8(int a)
+ {
+     return a & 7;
+ }
+
+ /// <summary>
+ /// Fast (a mod m) calculator,
+ /// where <paramref name="m"/> should be a power of 2.
+ /// </summary>
+ /// <param name="a">The value to reduce.</param>
+ /// <param name="m">The modulus; the result is only a modulo when this is a power of 2.</param>
+ /// <returns>The value of <paramref name="a"/> masked with (<paramref name="m"/> - 1).</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int ModuloP2(int a, int m)
+ {
+     // For a power of 2, (m - 1) has exactly the bits below m set.
+     int mask = m - 1;
+     return a & mask;
+ }
+
///
/// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation.
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
index a8b3434980..c7fd21a8f0 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp
internal static partial class SimdUtils
{
///
- /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
+ /// Implementation with 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen etc.)
///
public static class BasicIntrinsics256
{
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
index fd263b54c5..996a08fb4b 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@@ -10,8 +10,9 @@ namespace SixLabors.ImageSharp
internal static partial class SimdUtils
{
///
- /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
- /// PR:
+ /// Implementation methods based on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
+ /// Accelerated only on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
+ /// See:
/// https://github.com/dotnet/coreclr/pull/10662
/// API Proposal:
/// https://github.com/dotnet/corefx/issues/15957
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
new file mode 100644
index 0000000000..bb21474660
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
@@ -0,0 +1,143 @@
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ /// <summary>
+ /// Fallback implementation based on <see cref="Vector4"/> (128bit).
+ /// For <see cref="Vector4"/>, efficient software fallback implementations are present,
+ /// and maybe even Mono can emit intrinsics for that type :P
+ /// </summary>
+ public static class FallbackIntrinsics128
+ {
+ /// <summary>
+ /// Converts as many elements as possible using <see cref="BulkConvertByteToNormalizedFloat"/>,
+ /// then slices both spans down (keeping only the unconverted remainder).
+ /// </summary>
+ /// <param name="source">The source bytes; on return only the trailing (length mod 4) elements are left.</param>
+ /// <param name="dest">The destination floats; sliced by the same amount as <paramref name="source"/>.</param>
+ internal static void BulkConvertByteToNormalizedFloatReduce(
+     ref ReadOnlySpan<byte> source,
+     ref Span<float> dest)
+ {
+     DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+     // Only whole groups of 4 elements are handed to the Vector4 based implementation.
+     int remainder = ImageMaths.Modulo4(source.Length);
+     int alignedCount = source.Length - remainder;
+
+     if (alignedCount > 0)
+     {
+         BulkConvertByteToNormalizedFloat(
+             source.Slice(0, alignedCount),
+             dest.Slice(0, alignedCount));
+
+         // Leave the remainder for the caller.
+         source = source.Slice(alignedCount);
+         dest = dest.Slice(alignedCount);
+     }
+ }
+
+ /// <summary>
+ /// Converts as many elements as possible using <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/>,
+ /// then slices both spans down (keeping only the unconverted remainder).
+ /// </summary>
+ /// <param name="source">The source floats; on return only the trailing (length mod 4) elements are left.</param>
+ /// <param name="dest">The destination bytes; sliced by the same amount as <paramref name="source"/>.</param>
+ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+     ref ReadOnlySpan<float> source,
+     ref Span<byte> dest)
+ {
+     DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+     // Only whole groups of 4 elements are handed to the Vector4 based implementation.
+     int remainder = ImageMaths.Modulo4(source.Length);
+     int alignedCount = source.Length - remainder;
+
+     if (alignedCount > 0)
+     {
+         BulkConvertNormalizedFloatToByteClampOverflows(
+             source.Slice(0, alignedCount),
+             dest.Slice(0, alignedCount));
+
+         // Leave the remainder for the caller.
+         source = source.Slice(alignedCount);
+         dest = dest.Slice(alignedCount);
+     }
+ }
+
+ /// <summary>
+ /// Converts bytes to floats normalized into [0..1] (value / 255),
+ /// processing 4 elements at a time using <see cref="Vector4"/>.
+ /// </summary>
+ /// <param name="source">
+ /// The source bytes. They are read through unchecked references,
+ /// so the span must contain at least as many elements as <paramref name="dest"/>.
+ /// </param>
+ /// <param name="dest">The destination floats; the length should be divisible by 4.</param>
+ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+ {
+     DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!");
+
+     int count = dest.Length / 4;
+     if (count == 0)
+     {
+         return;
+     }
+
+     // Reinterpret both buffers as sequences of 4-element groups.
+     ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
+     ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));
+
+     const float Scale = 1f / 255f;
+     Vector4 d = default;
+
+     for (int i = 0; i < count; i++)
+     {
+         ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);
+         d.X = s.X;
+         d.Y = s.Y;
+         d.Z = s.Z;
+         d.W = s.W;
+         d *= Scale;
+         Unsafe.Add(ref dBase, i) = d;
+     }
+ }
+
+ /// <summary>
+ /// Converts normalized floats to bytes, processing 4 elements at a time using <see cref="Vector4"/>.
+ /// Every value is scaled by 255, offset by 0.5 so that the final cast rounds instead of truncating,
+ /// clamped into [0..255] and then cast to <see cref="byte"/>.
+ /// </summary>
+ /// <param name="source">The source floats; the length should be divisible by 4.</param>
+ /// <param name="dest">
+ /// The destination bytes. They are written through unchecked references,
+ /// so the span must be able to hold at least as many elements as <paramref name="source"/>.
+ /// </param>
+ internal static void BulkConvertNormalizedFloatToByteClampOverflows(
+     ReadOnlySpan<float> source,
+     Span<byte> dest)
+ {
+     DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!");
+
+     int count = source.Length / 4;
+     if (count == 0)
+     {
+         return;
+     }
+
+     // Reinterpret both buffers as sequences of 4-element groups.
+     ref Vector4 sBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(source));
+     ref ByteVector4 dBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(dest));
+
+     var half = new Vector4(0.5f);
+     var maxBytes = new Vector4(255f);
+
+     for (int i = 0; i < count; i++)
+     {
+         Vector4 s = Unsafe.Add(ref sBase, i);
+         s *= maxBytes;
+         s += half;
+
+         // I'm not sure if Clamp() is properly implemented with intrinsics.
+         s = Vector4.Max(Vector4.Zero, s);
+         s = Vector4.Min(maxBytes, s);
+
+         ref ByteVector4 d = ref Unsafe.Add(ref dBase, i);
+         d.X = (byte)s.X;
+         d.Y = (byte)s.Y;
+         d.Z = (byte)s.Z;
+         d.W = (byte)s.W;
+     }
+ }
+
+ /// <summary>
+ /// Four consecutive bytes. The bulk conversion methods of this class reinterpret
+ /// byte buffers as sequences of this struct (via Unsafe.As), so the sequential
+ /// layout and the X, Y, Z, W field order must match the buffer's element order.
+ /// </summary>
+ [StructLayout(LayoutKind.Sequential)]
+ private struct ByteVector4
+ {
+ public byte X;
+ public byte Y;
+ public byte Z;
+ public byte W;
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs
index 111ac22408..bc75dc8caa 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@@ -55,7 +55,7 @@ namespace SixLabors.ImageSharp
}
///
- /// Converts `dest.Length` -s to -s normalized into [0..1].
+ /// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// should be the of the same size as ,
/// but there are no restrictions on the span's length.
///
@@ -67,6 +67,7 @@ namespace SixLabors.ImageSharp
ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
+ FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
// Deal with the remainder:
int count = source.Length;
@@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp
}
///
- /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of .
+ /// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// should be the of the same size as ,
/// but there are no restrictions on the span's length.
@@ -96,6 +97,7 @@ namespace SixLabors.ImageSharp
ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
+ FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
// Deal with the remainder:
int count = source.Length;
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
index 7a212b0523..a56082fcd3 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
@@ -72,30 +72,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
public class PackFromVector4_Rgba32 : PackFromVector4
{
[Benchmark]
- public void BasicBulk()
+ public void FallbackIntrinsics128()
{
- ref Vector4 sBase = ref this.source.GetSpan()[0];
- ref Rgba32 dBase = ref this.destination.GetSpan()[0];
-
- Vector4 maxBytes = new Vector4(255);
- Vector4 half = new Vector4(0.5f);
+ // View the Vector4 source as raw floats and the Rgba32 destination as raw bytes.
+ Span<float> sFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
+ Span<byte> dBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
- for (int i = 0; i < this.Count; i++)
- {
- Vector4 v = Unsafe.Add(ref sBase, i);
- v *= maxBytes;
- v += half;
- v = Vector4.Clamp(v, Vector4.Zero, maxBytes);
- ref Rgba32 d = ref Unsafe.Add(ref dBase, i);
- d.R = (byte)v.X;
- d.G = (byte)v.Y;
- d.B = (byte)v.Z;
- d.A = (byte)v.W;
- }
+ SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sFloats, dBytes);
}
[Benchmark(Baseline = true)]
- public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows()
+ public void BasicIntrinsics256()
{
Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan());
@@ -104,7 +90,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
}
[Benchmark]
- public void ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows()
+ public void ExtendedIntrinsic()
{
Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan());
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
index 4a801d64ef..519edaa31f 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
@@ -79,29 +79,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
public class ToVector4_Rgba32 : ToVector4
{
[Benchmark]
- public void BasicBulk()
+ public void FallbackIntrinsics128()
{
- ref Rgba32 sBase = ref this.source.GetSpan()[0];
- ref Vector4 dBase = ref this.destination.GetSpan()[0];
-
- Vector4 scale = new Vector4(1f / 255f);
-
- Vector4 v = default;
+ // View the Rgba32 source as raw bytes and the Vector4 destination as raw floats.
+ Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
+ Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
- for (int i = 0; i < this.Count; i++)
- {
- ref Rgba32 s = ref Unsafe.Add(ref sBase, i);
- v.X = s.R;
- v.Y = s.G;
- v.Z = s.B;
- v.W = s.A;
- v *= scale;
- Unsafe.Add(ref dBase, i) = v;
- }
+ SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
}
[Benchmark(Baseline = true)]
- public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat()
+ public void BasicIntrinsics256()
{
Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan());
@@ -110,7 +97,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
}
[Benchmark]
- public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat()
+ public void ExtendedIntrinsics()
{
Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan());
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 2dcba2b74b..feefd17580 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
- public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
+ public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
- public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
+ public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@@ -161,6 +161,7 @@ namespace SixLabors.ImageSharp.Tests.Common
}
public static readonly TheoryData ArraySizesDivisibleBy8 = new TheoryData { 0, 8, 16, 1024 };
+ // Lengths that are multiples of 4 (one Vector4 holds 4 elements), including the empty case.
+ public static readonly TheoryData<int> ArraySizesDivisibleBy4 = new TheoryData<int> { 0, 4, 8, 28, 1020 };
public static readonly TheoryData ArraySizesDivisibleBy32 = new TheoryData { 0, 32, 512 };
@@ -170,9 +171,18 @@ namespace SixLabors.ImageSharp.Tests.Common
0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520,
};
+ // Runs the shared byte -> normalized float test routine against the Vector4 based fallback.
+ [Theory]
+ [MemberData(nameof(ArraySizesDivisibleBy4))]
+ public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count)
+     => TestImpl_BulkConvertByteToNormalizedFloat(
+         count,
+         (src, dst) => SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(src.Span, dst.Span));
+
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy8))]
- public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count)
+ public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat(int count)
{
if (this.SkipOnNonAvx2())
{
@@ -215,9 +225,18 @@ namespace SixLabors.ImageSharp.Tests.Common
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
+ // Runs the shared normalized float -> byte test routine against the Vector4 based fallback.
+ [Theory]
+ [MemberData(nameof(ArraySizesDivisibleBy4))]
+ public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+     => TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
+         count,
+         (src, dst) => SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(src.Span, dst.Span));
+
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy8))]
- public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+ public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
if (this.SkipOnNonAvx2())
{
diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
index 6c2979fe9e..aec4d0b810 100644
--- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
+++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
@@ -9,6 +9,60 @@ namespace SixLabors.ImageSharp.Tests.Helpers
public class ImageMathsTests
{
+ // ImageMaths.Modulo4(a) should return a mod 4 for non-negative input.
+ [Theory]
+ [InlineData(0, 0)]
+ [InlineData(1, 1)]
+ [InlineData(2, 2)]
+ [InlineData(3, 3)]
+ [InlineData(4, 0)]
+ [InlineData(100, 0)]
+ [InlineData(123, 3)]
+ [InlineData(53436353, 1)]
+ public void Modulo4(int a, int expected)
+     => Assert.Equal(expected, ImageMaths.Modulo4(a));
+
+ // ImageMaths.Modulo8(a) should return a mod 8 for non-negative input.
+ [Theory]
+ [InlineData(0, 0)]
+ [InlineData(1, 1)]
+ [InlineData(2, 2)]
+ [InlineData(6, 6)]
+ [InlineData(7, 7)]
+ [InlineData(8, 0)]
+ [InlineData(100, 4)]
+ [InlineData(123, 3)]
+ [InlineData(53436353, 1)]
+ [InlineData(975, 7)]
+ public void Modulo8(int a, int expected)
+     => Assert.Equal(expected, ImageMaths.Modulo8(a));
+
+ // ImageMaths.ModuloP2(a, m) should return a mod m when m is a power of 2.
+ [Theory]
+ [InlineData(0, 2, 0)]
+ [InlineData(1, 2, 1)]
+ [InlineData(2, 2, 0)]
+ [InlineData(0, 4, 0)]
+ [InlineData(3, 4, 3)]
+ [InlineData(5, 4, 1)]
+ [InlineData(5, 8, 5)]
+ [InlineData(8, 8, 0)]
+ [InlineData(8, 16, 8)]
+ [InlineData(15, 16, 15)]
+ [InlineData(17, 16, 1)]
+ [InlineData(17, 32, 17)]
+ [InlineData(31, 32, 31)]
+ [InlineData(32, 32, 0)]
+ [InlineData(33, 32, 1)]
+ public void Modulo2P(int a, int m, int expected)
+     => Assert.Equal(expected, ImageMaths.ModuloP2(a, m));
+
[Fact]
public void FasAbsResultMatchesMath()
{