Browse Source

uniformize conversion code

af/merge-core
Anton Firszov 7 years ago
parent
commit
f72fcbdc0f
  1. 64
      src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs
  2. 50
      src/ImageSharp/Common/Extensions/SimdUtils.cs
  3. 117
      src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
  4. 4
      tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
  5. 6
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
  6. 38
      tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs

64
src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs

@ -0,0 +1,64 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp
{
    internal static partial class SimdUtils
    {
        /// <summary>
        /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
        /// PR:
        /// https://github.com/dotnet/coreclr/pull/10662
        /// API Proposal:
        /// https://github.com/dotnet/corefx/issues/15957
        /// </summary>
        public static class ExtendedIntrinsics
        {
            /// <summary>
            /// Gets a value indicating whether this build targets a runtime for which the extended intrinsics
            /// are enabled. The value is fixed at compile time: it is <c>true</c> only when the
            /// <c>NETCOREAPP2_1</c> symbol is defined.
            /// </summary>
            public static bool IsAvailable { get; } =
#if NETCOREAPP2_1
                // TODO: Add a build target for .NET 4.7.2
                true;
#else
                false;
#endif

            // ReSharper disable once MemberHidesStaticFromOuterClass

            /// <summary>
            /// Converts every <see cref="byte"/> of <paramref name="source"/> into a <see cref="float"/>
            /// scaled by 1/255 and writes the results, in order, to the start of <paramref name="dest"/>.
            /// </summary>
            /// <param name="source">The bytes to convert. Its length must be a multiple of <c>Vector&lt;byte&gt;.Count</c>.</param>
            /// <param name="dest">The buffer receiving one float per source byte. Must hold at least <c>source.Length</c> elements.</param>
            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
            {
                Guard.IsTrue(
                    source.Length % Vector<byte>.Count == 0,
                    nameof(source),
                    "source.Length should be divisible by Vector<byte>.Count!");

                // The loop below writes through unchecked references, so a short destination
                // would be overrun silently. Reject it up front.
                Guard.IsTrue(
                    dest.Length >= source.Length,
                    nameof(dest),
                    "dest.Length should be at least source.Length!");

                int n = source.Length / Vector<byte>.Count;

                ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
                ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));

                var scale = new Vector<float>(1f / 255f);

                for (int i = 0; i < n; i++)
                {
                    Vector<byte> b = Unsafe.Add(ref sourceBase, i);

                    // byte -> ushort -> uint: one byte vector fans out into four uint vectors.
                    Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1);
                    Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
                    Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);

                    Vector<float> f0 = Vector.ConvertToSingle(w0) * scale;
                    Vector<float> f1 = Vector.ConvertToSingle(w1) * scale;
                    Vector<float> f2 = Vector.ConvertToSingle(w2) * scale;
                    Vector<float> f3 = Vector.ConvertToSingle(w3) * scale;

                    // Each source vector produces four consecutive destination vectors.
                    ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
                    d = f0;
                    Unsafe.Add(ref d, 1) = f1;
                    Unsafe.Add(ref d, 2) = f2;
                    Unsafe.Add(ref d, 3) = f3;
                }
            }
        }
    }
}

50
src/ImageSharp/Common/Extensions/SimdUtils.cs

@ -14,12 +14,12 @@ namespace SixLabors.ImageSharp
/// <summary>
/// Various extension and utility methods for <see cref="Vector4"/> and <see cref="Vector{T}"/> utilizing SIMD capabilities
/// </summary>
internal static class SimdUtils
internal static partial class SimdUtils
{
/// <summary>
/// Gets a value indicating whether the code is being executed on an AVX2 CPU where both float and integer registers are 256 bits wide.
/// </summary>
public static bool IsAvx2CompatibleArchitecture => Vector<float>.Count == 8 && Vector<int>.Count == 8;
public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
internal static void GuardAvx2(string operation)
{
@ -61,7 +61,8 @@ namespace SixLabors.ImageSharp
/// <summary>
/// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/> values.
/// The values gonna be scaled up into [0-255] and rounded.
/// The values are scaled up into [0-255] and rounded.
/// The implementation is SIMD optimized and works only with `source.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
/// Based on:
/// <see>
/// <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
@ -106,46 +107,13 @@ namespace SixLabors.ImageSharp
}
/// <summary>
/// Fast <see cref="byte"/> -> <see cref="float"/> conversion for RyuJIT runtimes having dotnet/coreclr#10662 merged.
/// Converts `source.Length` <see cref="byte"/>-s from 'source' into <see cref="float"/>-s normalized into [0..1] in 'dest'.
/// The implementation is SIMD optimized and works only with `source.Length` divisible by <see cref="Vector{Byte}.Count"/>;
/// 'dest' must hold at least `source.Length` elements.
/// Implementation adapted from:
/// <see>
/// <cref>https://github.com/dotnet/coreclr/pull/10662</cref>
/// <cref>http://stackoverflow.com/a/5362789</cref>
/// </see>
/// </summary>
/// <param name="source">The bytes to convert.</param>
/// <param name="dest">The buffer receiving one float per source byte.</param>
internal static void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(ReadOnlySpan<byte> source, Span<float> dest)
{
    Guard.IsTrue(
        source.Length % Vector<byte>.Count == 0,
        nameof(source),
        "source.Length should be divisible by Vector<byte>.Count!");

    // The loop below writes through unchecked references, so a short destination
    // would be overrun silently. Reject it up front.
    Guard.IsTrue(
        dest.Length >= source.Length,
        nameof(dest),
        "dest.Length should be at least source.Length!");

    int n = source.Length / Vector<byte>.Count;

    ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));

    var scale = new Vector<float>(1f / 255f);

    for (int i = 0; i < n; i++)
    {
        Vector<byte> b = Unsafe.Add(ref sourceBase, i);

        // byte -> ushort -> uint: one byte vector fans out into four uint vectors.
        Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1);
        Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
        Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);

        Vector<float> f0 = Vector.ConvertToSingle(w0) * scale;
        Vector<float> f1 = Vector.ConvertToSingle(w1) * scale;
        Vector<float> f2 = Vector.ConvertToSingle(w2) * scale;
        Vector<float> f3 = Vector.ConvertToSingle(w3) * scale;

        // Each source vector produces four consecutive destination vectors.
        ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
        d = f0;
        Unsafe.Add(ref d, 1) = f1;
        Unsafe.Add(ref d, 2) = f2;
        Unsafe.Add(ref d, 3) = f3;
    }
}
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
@ -188,7 +156,7 @@ namespace SixLabors.ImageSharp
/// </summary>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");

117
src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs

@ -3,7 +3,6 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.Memory;
@ -19,99 +18,37 @@ namespace SixLabors.ImageSharp.PixelFormats
/// </summary>
internal partial class PixelOperations : PixelOperations<Rgba32>
{
/// <summary>
/// Bulk, SIMD based counterpart of <see cref="IPixel.PackFromVector4(Vector4)"/> going in the
/// <see cref="Rgba32"/> -> <see cref="Vector4"/> direction.
/// Only valid when `count` is divisible by <see cref="Vector{UInt32}.Count"/>.
/// </summary>
/// <param name="sourceColors">The <see cref="Span{T}"/> holding the packed source colors.</param>
/// <param name="destVectors">The <see cref="Span{T}"/> receiving the unpacked destination vectors.</param>
/// <param name="count">The number of pixels to convert.</param>
/// <remarks>
/// Implementation adapted from:
/// <see>
/// <cref>http://stackoverflow.com/a/5362789</cref>
/// </see>
/// TODO: We can replace this implementation in the future using new Vector API-s:
/// <see>
/// <cref>https://github.com/dotnet/corefx/issues/15957</cref>
/// </see>
/// </remarks>
internal static void ToVector4SimdAligned(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destVectors, int count)
{
    if (!Vector.IsHardwareAccelerated)
    {
        throw new InvalidOperationException(
            "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!");
    }

    DebugGuard.IsTrue(
        count % Vector<uint>.Count == 0,
        nameof(count),
        "Argument 'count' should divisible by Vector<uint>.Count!");

    // Constants for the integer -> float trick used in the second pass:
    // 1191182336 is the bit pattern of 32768.0f, whose lowest mantissa bit is worth 1/256.
    // OR-ing a byte into those bits yields 32768 + byte / 256 when reinterpreted as a float.
    var byteMask = new Vector<uint>(255);
    var biasBits = new Vector<uint>(1191182336);
    var bias = new Vector<float>(32768.0f);
    var rescale = new Vector<float>(256.0f / 255.0f);

    ref uint packedBase = ref Unsafe.As<Rgba32, uint>(ref MemoryMarshal.GetReference(sourceColors));
    ref WideRgba wideBase = ref Unsafe.As<Vector4, WideRgba>(ref MemoryMarshal.GetReference(destVectors));

    // Pass 1: spread every packed pixel over four uint slots in the destination memory.
    // This scalar step is the bottleneck of the method.
    for (int pixel = 0; pixel < count; pixel++)
    {
        ref WideRgba slot = ref Unsafe.Add(ref wideBase, pixel);
        slot.Load(Unsafe.Add(ref packedBase, pixel));
    }

    // Pass 2: walk the same memory vector by vector, turning each uint slot into a float in place.
    ref Vector<uint> asUInts = ref Unsafe.As<WideRgba, Vector<uint>>(ref wideBase);
    ref Vector<float> asFloats = ref Unsafe.As<WideRgba, Vector<float>>(ref wideBase);

    int slotCount = count * 4;
    int vectorCount = slotCount / Vector<uint>.Count;

    for (int v = 0; v < vectorCount; v++)
    {
        // Keep only the low byte of each slot, then plant it in the mantissa of 32768.0f.
        Vector<uint> bits = (Unsafe.Add(ref asUInts, v) & byteMask) | biasBits;

        // (32768 + byte / 256 - 32768) * 256 / 255 == byte / 255
        Vector<float> normalized = (Vector.AsVectorSingle(bits) - bias) * rescale;

        Unsafe.Add(ref asFloats, v) = normalized;
    }
}
/// <inheritdoc />
internal override void ToVector4(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destinationVectors, int count)
{
Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors));
Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors));
if (count < 256 || !Vector.IsHardwareAccelerated)
if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
{
// Not worth bothering with SIMD:
base.ToVector4(sourceColors, destinationVectors, count);
return;
}
int remainder = count % Vector<uint>.Count;
int remainder = count % 2;
int alignedCount = count - remainder;
if (alignedCount > 0)
{
ToVector4SimdAligned(sourceColors, destinationVectors, alignedCount);
ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors);
Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));
SimdUtils.BulkConvertByteToNormalizedFloat(
rawSrc,
rawDest);
}
if (remainder > 0)
{
sourceColors = sourceColors.Slice(alignedCount);
destinationVectors = destinationVectors.Slice(alignedCount);
base.ToVector4(sourceColors, destinationVectors, remainder);
// actually: remainder == 1
int lastIdx = count - 1;
destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4();
}
}
@ -120,7 +57,7 @@ namespace SixLabors.ImageSharp.PixelFormats
{
GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);
if (!SimdUtils.IsAvx2CompatibleArchitecture)
if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
{
base.PackFromVector4(sourceVectors, destinationColors, count);
return;
@ -131,10 +68,10 @@ namespace SixLabors.ImageSharp.PixelFormats
if (alignedCount > 0)
{
ReadOnlySpan<float> flatSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
Span<byte> flatDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);
ReadOnlySpan<float> rawSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
Span<byte> rawDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);
SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(flatSrc, flatDest);
SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
}
if (remainder > 0)
@ -172,30 +109,6 @@ namespace SixLabors.ImageSharp.PixelFormats
sourcePixels.Slice(0, count).CopyTo(dest);
}
/// <summary>
/// Value type holding an <see cref="Rgba32"/> widened so that each channel occupies its own <see cref="uint"/>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private struct WideRgba
{
    // The declaration order fixes the sequential layout: r, g, b, a.
    private uint r;
    private uint g;
    private uint b;
    private uint a;

    /// <summary>
    /// Fills the four fields from a packed pixel value by shifting it right by the
    /// matching channel shift. Nothing is masked here, so each field still carries
    /// the bits above its own channel.
    /// </summary>
    /// <param name="p">The packed pixel value.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Load(uint p)
    {
        // GreenShift / BlueShift / AlphaShift are declared outside this struct.
        this.a = p >> AlphaShift;
        this.b = p >> BlueShift;
        this.g = p >> GreenShift;
        this.r = p;
    }
}
}
}
}

4
tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs

@ -23,7 +23,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
private IMemoryOwner<TPixel> destination;
[Params(16, 128, 512)]
[Params(
//64,
2048)]
public int Count { get; set; }
[GlobalSetup]

6
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -205,12 +205,12 @@ namespace SixLabors.ImageSharp.Tests.Common
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
[Theory]
[InlineData(1, 0)]
[InlineData(2, 32)]
[InlineData(3, 128)]
public void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(int seed, int count)
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count)
{
if (!Vector.IsHardwareAccelerated)
{
@ -221,7 +221,7 @@ namespace SixLabors.ImageSharp.Tests.Common
float[] result = new float[count];
float[] expected = source.Select(b => (float)b / 255f).ToArray();
SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(source, result);
SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result);
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}

38
tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs

@ -17,43 +17,26 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
{
public class Rgba32 : PixelOperationsTests<ImageSharp.PixelFormats.Rgba32>
{
public const string SkipProfilingBenchmarks =
#if true
"Profiling benchmark - enable manually!";
#else
null;
#endif
public Rgba32(ITestOutputHelper output)
: base(output)
{
}
// For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
public static new TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
[Fact]
public void IsSpecialImplementation()
{
Assert.IsType<ImageSharp.PixelFormats.Rgba32.PixelOperations>(PixelOperations<ImageSharp.PixelFormats.Rgba32>.Instance);
}
[Fact]
public void ToVector4SimdAligned()
{
if (!Vector.IsHardwareAccelerated)
{
return;
}
ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64);
Vector4[] expected = CreateExpectedVector4Data(source);
TestOperation(
source,
expected,
(s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64)
);
}
// [Fact] // Profiling benchmark - enable manually!
#pragma warning disable xUnit1013 // Public method should be marked as test
[Fact(Skip = SkipProfilingBenchmarks)]
public void Benchmark_ToVector4()
#pragma warning restore xUnit1013 // Public method should be marked as test
{
int times = 200000;
int count = 1024;
@ -73,13 +56,10 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
public class Argb32 : PixelOperationsTests<ImageSharp.PixelFormats.Argb32>
{
// For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
public Argb32(ITestOutputHelper output)
: base(output)
{
}
public static new TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
}
[Theory]
@ -110,7 +90,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
{
}
public static TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
public static TheoryData<int> ArraySizesData => new TheoryData<int> { 0, 1, 2, 7, 16, 1111 };
private static PixelOperations<TPixel> Operations => PixelOperations<TPixel>.Instance;

Loading…
Cancel
Save