diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs
new file mode 100644
index 000000000..ec52b90ef
--- /dev/null
+++ b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs
@@ -0,0 +1,64 @@
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ /// <summary>
+ /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
+ /// PR:
+ /// https://github.com/dotnet/coreclr/pull/10662
+ /// API Proposal:
+ /// https://github.com/dotnet/corefx/issues/15957
+ /// </summary>
+ public static class ExtendedIntrinsics
+ {
+ public static bool IsAvailable { get; } =
+#if NETCOREAPP2_1
+// Reported as available only in the netcoreapp2.1 build. TODO: Add a build target for .NET 4.7.2
+ true;
+#else
+ false;
+#endif
+
+            /// <summary>
+            /// Converts every <see cref="byte"/> in 'source' into a <see cref="float"/> in 'dest' scaled by 1/255, i.e. normalized into [0..1].
+            /// 'source.Length' must be divisible by <see cref="Vector{T}.Count"/> of <see cref="byte"/>; 'dest' must hold at least 'source.Length' values.
+            /// </summary>
+            // ReSharper disable once MemberHidesStaticFromOuterClass
+            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            {
+                Guard.IsTrue(
+                    source.Length % Vector<byte>.Count == 0,
+                    nameof(source),
+                    "source.Length should be divisable by Vector<byte>.Count!");
+
+                // The loop writes through raw references, so an undersized 'dest' must be rejected up front.
+                Guard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest.Length should be at least source.Length!");
+
+                int n = source.Length / Vector<byte>.Count;
+                ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
+                ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));
+                var scale = new Vector<float>(1f / 255f);
+
+                for (int i = 0; i < n; i++)
+                {
+                    // byte -> ushort -> uint: one byte vector widens into four uint vectors.
+                    Vector.Widen(Unsafe.Add(ref sourceBase, i), out Vector<ushort> s0, out Vector<ushort> s1);
+                    Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
+                    Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
+
+                    // Each source vector yields four consecutive float vectors in 'dest'.
+                    ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
+                    d = Vector.ConvertToSingle(w0) * scale;
+                    Unsafe.Add(ref d, 1) = Vector.ConvertToSingle(w1) * scale;
+                    Unsafe.Add(ref d, 2) = Vector.ConvertToSingle(w2) * scale;
+                    Unsafe.Add(ref d, 3) = Vector.ConvertToSingle(w3) * scale;
+                }
+            }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Extensions/SimdUtils.cs
index 481e0726d..3630ede32 100644
--- a/src/ImageSharp/Common/Extensions/SimdUtils.cs
+++ b/src/ImageSharp/Common/Extensions/SimdUtils.cs
@@ -14,12 +14,12 @@ namespace SixLabors.ImageSharp
///
/// Various extension and utility methods for and utilizing SIMD capabilities
///
- internal static class SimdUtils
+ internal static partial class SimdUtils
{
///
/// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte.
///
- public static bool IsAvx2CompatibleArchitecture => Vector.Count == 8 && Vector.Count == 8;
+        public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8; // 8 lanes of 32-bit float and int; evaluated once
internal static void GuardAvx2(string operation)
{
@@ -61,7 +61,8 @@ namespace SixLabors.ImageSharp
///
/// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of values.
- /// The values gonna be scaled up into [0-255] and rounded.
+ /// The values are scaled up into [0-255] and rounded.
+ /// The implementation is SIMD optimized and works only with `source.Length` divisible by <see cref="Vector{T}.Count"/>.
/// Based on:
///
/// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
@@ -106,46 +107,13 @@ namespace SixLabors.ImageSharp
}
///
- /// Fast -> conversion for RyuJIT runtimes having dotnet/coreclr#10662 merged.
+ /// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
+ /// The implementation is SIMD optimized and works only with `dest.Length` divisible by <see cref="Vector{T}.Count"/>.
+ /// Implementation adapted from:
///
- /// https://github.com/dotnet/coreclr/pull/10662
+ /// http://stackoverflow.com/a/5362789
///
///
- internal static void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(ReadOnlySpan source, Span dest)
- {
- Guard.IsTrue(
- source.Length % Vector.Count == 0,
- nameof(source),
- "dest.Length should be divisable by Vector.Count!");
-
- int n = source.Length / Vector.Count;
-
- ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
- ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
-
- var scale = new Vector(1f / 255f);
-
- for (int i = 0; i < n; i++)
- {
- Vector b = Unsafe.Add(ref sourceBase, i);
-
- Vector.Widen(b, out Vector s0, out Vector s1);
- Vector.Widen(s0, out Vector w0, out Vector w1);
- Vector.Widen(s1, out Vector w2, out Vector w3);
-
- Vector f0 = Vector.ConvertToSingle(w0) * scale;
- Vector f1 = Vector.ConvertToSingle(w1) * scale;
- Vector f2 = Vector.ConvertToSingle(w2) * scale;
- Vector f3 = Vector.ConvertToSingle(w3) * scale;
-
- ref Vector d = ref Unsafe.Add(ref destBase, i * 4);
- d = f0;
- Unsafe.Add(ref d, 1) = f1;
- Unsafe.Add(ref d, 2) = f2;
- Unsafe.Add(ref d, 3) = f3;
- }
- }
-
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest)
{
GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
@@ -188,7 +156,7 @@ namespace SixLabors.ImageSharp
///
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest)
{
- GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
+ GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!");
diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
index 76e119ba4..6745079da 100644
--- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
+++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
@@ -3,7 +3,6 @@
using System;
using System.Numerics;
-using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.Memory;
@@ -19,99 +18,37 @@ namespace SixLabors.ImageSharp.PixelFormats
///
internal partial class PixelOperations : PixelOperations
{
- ///
- /// SIMD optimized bulk implementation of
- /// that works only with `count` divisible by .
- ///
- /// The to the source colors.
- /// The to the dstination vectors.
- /// The number of pixels to convert.
- ///
- /// Implementation adapted from:
- ///
- /// http://stackoverflow.com/a/5362789
- ///
- /// TODO: We can replace this implementation in the future using new Vector API-s:
- ///
- /// https://github.com/dotnet/corefx/issues/15957
- ///
- ///
- internal static void ToVector4SimdAligned(ReadOnlySpan sourceColors, Span destVectors, int count)
- {
- if (!Vector.IsHardwareAccelerated)
- {
- throw new InvalidOperationException(
- "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!");
- }
-
- DebugGuard.IsTrue(
- count % Vector.Count == 0,
- nameof(count),
- "Argument 'count' should divisible by Vector.Count!");
-
- var bVec = new Vector(256.0f / 255.0f);
- var magicFloat = new Vector(32768.0f);
- var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f
- var mask = new Vector(255);
-
- int unpackedRawCount = count * 4;
-
- ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceColors));
- ref WideRgba destBaseAsWide = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors));
- ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsWide);
- ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWide);
-
- for (int i = 0; i < count; i++)
- {
- uint sVal = Unsafe.Add(ref sourceBase, i);
- ref WideRgba dst = ref Unsafe.Add(ref destBaseAsWide, i);
-
- // This call is the bottleneck now:
- dst.Load(sVal);
- }
-
- int numOfVectors = unpackedRawCount / Vector.Count;
-
- for (int i = 0; i < numOfVectors; i++)
- {
- Vector vi = Unsafe.Add(ref destBaseAsUInt, i);
-
- vi &= mask;
- vi |= magicInt;
-
- var vf = Vector.AsVectorSingle(vi);
- vf = (vf - magicFloat) * bVec;
-
- Unsafe.Add(ref destBaseAsFloat, i) = vf;
- }
- }
-
///
internal override void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count)
{
Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors));
Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors));
- if (count < 256 || !Vector.IsHardwareAccelerated)
+ if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
{
// Doesn't worth to bother with SIMD:
base.ToVector4(sourceColors, destinationVectors, count);
return;
}
- int remainder = count % Vector.Count;
+ int remainder = count % 2;
int alignedCount = count - remainder;
if (alignedCount > 0)
{
- ToVector4SimdAligned(sourceColors, destinationVectors, alignedCount);
+                    // Slice both spans to the SIMD-aligned range so the bulk converter
+                    // is handed matching source and destination lengths.
+                    ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors.Slice(0, alignedCount));
+                    Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));
+
+                    SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
}
if (remainder > 0)
{
- sourceColors = sourceColors.Slice(alignedCount);
- destinationVectors = destinationVectors.Slice(alignedCount);
- base.ToVector4(sourceColors, destinationVectors, remainder);
+ // actually: remainder == 1
+ int lastIdx = count - 1;
+ destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4();
}
}
@@ -120,7 +57,7 @@ namespace SixLabors.ImageSharp.PixelFormats
{
GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);
- if (!SimdUtils.IsAvx2CompatibleArchitecture)
+ if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
{
base.PackFromVector4(sourceVectors, destinationColors, count);
return;
@@ -131,10 +68,10 @@ namespace SixLabors.ImageSharp.PixelFormats
if (alignedCount > 0)
{
- ReadOnlySpan flatSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount));
- Span flatDest = MemoryMarshal.Cast(destinationColors);
+ ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount));
+ Span rawDest = MemoryMarshal.Cast(destinationColors);
- SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(flatSrc, flatDest);
+ SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
}
if (remainder > 0)
@@ -172,30 +109,6 @@ namespace SixLabors.ImageSharp.PixelFormats
sourcePixels.Slice(0, count).CopyTo(dest);
}
-
- ///
- /// Value type to store -s widened into multiple -s.
- ///
- [StructLayout(LayoutKind.Sequential)]
- private struct WideRgba
- {
- private uint r;
-
- private uint g;
-
- private uint b;
-
- private uint a;
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public void Load(uint p)
- {
- this.r = p;
- this.g = p >> GreenShift;
- this.b = p >> BlueShift;
- this.a = p >> AlphaShift;
- }
- }
}
}
}
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
index a5fa59ba0..bdae7d065 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
@@ -23,7 +23,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
private IMemoryOwner destination;
- [Params(16, 128, 512)]
+ [Params(
+ //64,
+ 2048)]
public int Count { get; set; }
[GlobalSetup]
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 4e39af70f..0488dd5e1 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -205,12 +205,12 @@ namespace SixLabors.ImageSharp.Tests.Common
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
-
+
[Theory]
[InlineData(1, 0)]
[InlineData(2, 32)]
[InlineData(3, 128)]
- public void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(int seed, int count)
+ public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count)
{
if (!Vector.IsHardwareAccelerated)
{
@@ -221,7 +221,7 @@ namespace SixLabors.ImageSharp.Tests.Common
float[] result = new float[count];
float[] expected = source.Select(b => (float)b / 255f).ToArray();
- SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(source, result);
+ SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result);
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
index 4d7ec71e7..535952e05 100644
--- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
@@ -17,43 +17,26 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
{
public class Rgba32 : PixelOperationsTests
{
+ public const string SkipProfilingBenchmarks =
+#if true // flip to 'false' to select the null branch instead of the skip message
+ "Profiling benchmark - enable manually!";
+#else
+ null;
+#endif
+
public Rgba32(ITestOutputHelper output)
: base(output)
{
}
- // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
- public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 };
-
[Fact]
public void IsSpecialImplementation()
{
Assert.IsType(PixelOperations.Instance);
}
- [Fact]
- public void ToVector4SimdAligned()
- {
- if (!Vector.IsHardwareAccelerated)
- {
- return;
- }
-
- ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64);
- Vector4[] expected = CreateExpectedVector4Data(source);
-
- TestOperation(
- source,
- expected,
- (s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64)
- );
- }
-
-
- // [Fact] // Profiling benchmark - enable manually!
-#pragma warning disable xUnit1013 // Public method should be marked as test
+ [Fact(Skip = SkipProfilingBenchmarks)]
public void Benchmark_ToVector4()
-#pragma warning restore xUnit1013 // Public method should be marked as test
{
int times = 200000;
int count = 1024;
@@ -73,13 +56,10 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
public class Argb32 : PixelOperationsTests
{
- // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
public Argb32(ITestOutputHelper output)
: base(output)
{
}
-
- public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 };
}
[Theory]
@@ -110,7 +90,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
{
}
- public static TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 };
+        public static TheoryData<int> ArraySizesData => new TheoryData<int> { 0, 1, 2, 7, 16, 1111 }; // includes the 0, 1 and 2 element edge cases
private static PixelOperations Operations => PixelOperations.Instance;