Browse Source

FallbackIntrinsics128 + ImageMaths.Modulo* implementations

af/merge-core
Anton Firszov 8 years ago
parent
commit
8793880447
  1. 16
      src/ImageSharp/Common/Helpers/ImageMaths.cs
  2. 2
      src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
  3. 5
      src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
  4. 143
      src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
  5. 6
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  6. 26
      tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
  7. 25
      tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
  8. 27
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
  9. 54
      tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs

16
src/ImageSharp/Common/Helpers/ImageMaths.cs

@ -39,6 +39,22 @@ namespace SixLabors.ImageSharp
return (a / GreatestCommonDivisor(a, b)) * b; return (a / GreatestCommonDivisor(a, b)) * b;
} }
/// <summary>
/// Calculates <paramref name="a"/> (mod 4) by keeping only the two lowest bits.
/// For a negative <paramref name="a"/> the result is still in [0, 3],
/// which differs from the sign-preserving <c>%</c> operator.
/// </summary>
/// <param name="a">The value to reduce.</param>
/// <returns>The two lowest bits of <paramref name="a"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int Modulo4(int a)
{
    const int LowBitsMask = 4 - 1;
    return a & LowBitsMask;
}
/// <summary>
/// Calculates <paramref name="a"/> (mod 8) by keeping only the three lowest bits.
/// For a negative <paramref name="a"/> the result is still in [0, 7],
/// which differs from the sign-preserving <c>%</c> operator.
/// </summary>
/// <param name="a">The value to reduce.</param>
/// <returns>The three lowest bits of <paramref name="a"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int Modulo8(int a)
{
    const int LowBitsMask = 8 - 1;
    return a & LowBitsMask;
}
/// <summary>
/// Fast (mod m) calculator implemented as a bit mask with <c>m - 1</c>.
/// The result equals the mathematical modulus only when
/// <paramref name="m"/> is a power of 2; this is not checked.
/// </summary>
/// <param name="a">The value to reduce.</param>
/// <param name="m">The modulus, which should be a power of 2.</param>
/// <returns><paramref name="a"/> masked with <c>m - 1</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int ModuloP2(int a, int m) => a & (m - 1);
/// <summary> /// <summary>
/// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation. /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation.
/// </summary> /// </summary>

2
src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs

@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp
internal static partial class SimdUtils internal static partial class SimdUtils
{ {
/// <summary> /// <summary>
/// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*) /// Implementation with 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen etc.)
/// </summary> /// </summary>
public static class BasicIntrinsics256 public static class BasicIntrinsics256
{ {

5
src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs

@ -10,8 +10,9 @@ namespace SixLabors.ImageSharp
internal static partial class SimdUtils internal static partial class SimdUtils
{ {
/// <summary> /// <summary>
/// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) /// Implementation methods based on newer <see cref="Vector{T}"/> API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
/// PR: /// Only accelerated on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
/// See:
/// https://github.com/dotnet/coreclr/pull/10662 /// https://github.com/dotnet/coreclr/pull/10662
/// API Proposal: /// API Proposal:
/// https://github.com/dotnet/corefx/issues/15957 /// https://github.com/dotnet/corefx/issues/15957

143
src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs

@ -0,0 +1,143 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
{
/// <summary>
/// Fallback implementation based on <see cref="Vector4"/> (128bit).
/// For <see cref="Vector4"/>, efficient software fallback implementations are present,
/// and possibly even Mono can emit intrinsics for that type.
/// </summary>
public static class FallbackIntrinsics128
{
    /// <summary>
    /// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many elements as possible
    /// (the longest prefix whose length is divisible by 4), then slices both spans down
    /// so that only the unprocessed remainder is left in them.
    /// </summary>
    /// <param name="source">The source bytes; on return, sliced to the unprocessed remainder.</param>
    /// <param name="dest">The destination floats; on return, sliced to the unprocessed remainder.</param>
    internal static void BulkConvertByteToNormalizedFloatReduce(
        ref ReadOnlySpan<byte> source,
        ref Span<float> dest)
    {
        DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

        int remainder = source.Length % 4;
        int alignedCount = source.Length - remainder;

        if (alignedCount > 0)
        {
            BulkConvertByteToNormalizedFloat(
                source.Slice(0, alignedCount),
                dest.Slice(0, alignedCount));

            source = source.Slice(alignedCount);
            dest = dest.Slice(alignedCount);
        }
    }

    /// <summary>
    /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many elements as possible
    /// (the longest prefix whose length is divisible by 4), then slices both spans down
    /// so that only the unprocessed remainder is left in them.
    /// </summary>
    /// <param name="source">The source floats; on return, sliced to the unprocessed remainder.</param>
    /// <param name="dest">The destination bytes; on return, sliced to the unprocessed remainder.</param>
    internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
        ref ReadOnlySpan<float> source,
        ref Span<byte> dest)
    {
        DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

        int remainder = source.Length % 4;
        int alignedCount = source.Length - remainder;

        if (alignedCount > 0)
        {
            BulkConvertNormalizedFloatToByteClampOverflows(
                source.Slice(0, alignedCount),
                dest.Slice(0, alignedCount));

            source = source.Slice(alignedCount);
            dest = dest.Slice(alignedCount);
        }
    }

    /// <summary>
    /// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
    /// </summary>
    /// <param name="source">The source bytes; must have the same length as <paramref name="dest"/>.</param>
    /// <param name="dest">The destination floats; the length must be divisible by 4.</param>
    internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
    {
        // The loop below addresses both buffers through unchecked references,
        // so a length mismatch would read past the end of 'source'.
        DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
        DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!");

        int count = dest.Length / 4;
        if (count == 0)
        {
            return;
        }

        // Reinterpret both buffers as sequences of 4-element groups.
        ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
        ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));

        const float Scale = 1f / 255f;
        Vector4 d = default;

        for (int i = 0; i < count; i++)
        {
            ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);
            d.X = s.X;
            d.Y = s.Y;
            d.Z = s.Z;
            d.W = s.W;

            // Normalize all 4 components into [0..1] with a single vector multiplication.
            d *= Scale;
            Unsafe.Add(ref dBase, i) = d;
        }
    }

    /// <summary>
    /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
    /// </summary>
    /// <param name="source">The source floats; the length must be divisible by 4.</param>
    /// <param name="dest">The destination bytes; must have the same length as <paramref name="source"/>.</param>
    internal static void BulkConvertNormalizedFloatToByteClampOverflows(
        ReadOnlySpan<float> source,
        Span<byte> dest)
    {
        // The loop below addresses both buffers through unchecked references,
        // so a length mismatch would write past the end of 'dest'.
        DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
        DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!");

        int count = source.Length / 4;
        if (count == 0)
        {
            return;
        }

        // Reinterpret both buffers as sequences of 4-element groups.
        ref Vector4 sBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(source));
        ref ByteVector4 dBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(dest));

        var half = new Vector4(0.5f);
        var maxBytes = new Vector4(255f);

        for (int i = 0; i < count; i++)
        {
            Vector4 s = Unsafe.Add(ref sBase, i);

            // Scale up to [0..255]; adding 0.5 makes the truncating byte casts below round to nearest.
            s *= maxBytes;
            s += half;

            // Max/Min are used instead of Vector4.Clamp(), because the original author
            // was not sure whether Clamp() is properly implemented with intrinsics.
            s = Vector4.Max(Vector4.Zero, s);
            s = Vector4.Min(maxBytes, s);

            ref ByteVector4 d = ref Unsafe.Add(ref dBase, i);
            d.X = (byte)s.X;
            d.Y = (byte)s.Y;
            d.Z = (byte)s.Z;
            d.W = (byte)s.W;
        }
    }

    /// <summary>
    /// A group of 4 sequentially laid out bytes, used to address a byte buffer
    /// in steps that match the 4 components of a <see cref="Vector4"/>.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    private struct ByteVector4
    {
        public byte X;
        public byte Y;
        public byte Z;
        public byte W;
    }
}
}
}

6
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -55,7 +55,7 @@ namespace SixLabors.ImageSharp
} }
/// <summary> /// <summary>
/// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1]. /// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>, /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length. /// but there are no restrictions on the span's length.
/// </summary> /// </summary>
@ -67,6 +67,7 @@ namespace SixLabors.ImageSharp
ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
// Deal with the remainder: // Deal with the remainder:
int count = source.Length; int count = source.Length;
@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp
} }
/// <summary> /// <summary>
/// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>. /// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped. /// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>, /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length. /// but there are no restrictions on the span's length.
@ -96,6 +97,7 @@ namespace SixLabors.ImageSharp
ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
// Deal with the remainder: // Deal with the remainder:
int count = source.Length; int count = source.Length;

26
tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs

@ -72,30 +72,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
public class PackFromVector4_Rgba32 : PackFromVector4<Rgba32> public class PackFromVector4_Rgba32 : PackFromVector4<Rgba32>
{ {
[Benchmark] [Benchmark]
public void BasicBulk() public void FallbackIntrinsics128()
{ {
ref Vector4 sBase = ref this.source.GetSpan()[0]; Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
ref Rgba32 dBase = ref this.destination.GetSpan()[0]; Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
Vector4 maxBytes = new Vector4(255);
Vector4 half = new Vector4(0.5f);
for (int i = 0; i < this.Count; i++) SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats);
{
Vector4 v = Unsafe.Add(ref sBase, i);
v *= maxBytes;
v += half;
v = Vector4.Clamp(v, Vector4.Zero, maxBytes);
ref Rgba32 d = ref Unsafe.Add(ref dBase, i);
d.R = (byte)v.X;
d.G = (byte)v.Y;
d.B = (byte)v.Z;
d.A = (byte)v.W;
}
} }
[Benchmark(Baseline = true)] [Benchmark(Baseline = true)]
public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows() public void BasicIntrinsics256()
{ {
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan()); Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan()); Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
@ -104,7 +90,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
} }
[Benchmark] [Benchmark]
public void ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows() public void ExtendedIntrinsic()
{ {
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan()); Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan()); Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

25
tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs

@ -79,29 +79,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
public class ToVector4_Rgba32 : ToVector4<Rgba32> public class ToVector4_Rgba32 : ToVector4<Rgba32>
{ {
[Benchmark] [Benchmark]
public void BasicBulk() public void FallbackIntrinsics128()
{ {
ref Rgba32 sBase = ref this.source.GetSpan()[0]; Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
ref Vector4 dBase = ref this.destination.GetSpan()[0]; Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Vector4 scale = new Vector4(1f / 255f);
Vector4 v = default;
for (int i = 0; i < this.Count; i++) SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
{
ref Rgba32 s = ref Unsafe.Add(ref sBase, i);
v.X = s.R;
v.Y = s.G;
v.Z = s.B;
v.W = s.A;
v *= scale;
Unsafe.Add(ref dBase, i) = v;
}
} }
[Benchmark(Baseline = true)] [Benchmark(Baseline = true)]
public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat() public void BasicIntrinsics256()
{ {
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan()); Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan()); Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
@ -110,7 +97,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
} }
[Benchmark] [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat() public void ExtendedIntrinsics()
{ {
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan()); Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan()); Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());

27
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)] [InlineData(1, 8)]
[InlineData(2, 16)] [InlineData(2, 16)]
[InlineData(3, 128)] [InlineData(3, 128)]
public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
{ {
if (this.SkipOnNonAvx2()) if (this.SkipOnNonAvx2())
{ {
@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)] [InlineData(1, 8)]
[InlineData(2, 16)] [InlineData(2, 16)]
[InlineData(3, 128)] [InlineData(3, 128)]
public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
{ {
if (this.SkipOnNonAvx2()) if (this.SkipOnNonAvx2())
{ {
@ -161,6 +161,7 @@ namespace SixLabors.ImageSharp.Tests.Common
} }
public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 }; public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };
// Element counts divisible by 4 (including 0), used as theory data by the FallbackIntrinsics128 tests.
public static readonly TheoryData<int> ArraySizesDivisibleBy4 = new TheoryData<int> { 0, 4, 8, 28, 1020 };
public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 }; public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };
@ -170,9 +171,18 @@ namespace SixLabors.ImageSharp.Tests.Common
0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520, 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520,
}; };
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count)
{
    // Runs the shared byte -> normalized float test body against the Vector4-based fallback.
    TestImpl_BulkConvertByteToNormalizedFloat(
        count,
        (source, dest) =>
            SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(source.Span, dest.Span));
}
[Theory] [Theory]
[MemberData(nameof(ArraySizesDivisibleBy8))] [MemberData(nameof(ArraySizesDivisibleBy8))]
public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count) public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat(int count)
{ {
if (this.SkipOnNonAvx2()) if (this.SkipOnNonAvx2())
{ {
@ -215,9 +225,18 @@ namespace SixLabors.ImageSharp.Tests.Common
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
} }
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
    // Runs the shared normalized float -> byte test body against the Vector4-based fallback.
    TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
        count,
        (source, dest) =>
            SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(source.Span, dest.Span));
}
[Theory] [Theory]
[MemberData(nameof(ArraySizesDivisibleBy8))] [MemberData(nameof(ArraySizesDivisibleBy8))]
public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{ {
if (this.SkipOnNonAvx2()) if (this.SkipOnNonAvx2())
{ {

54
tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs

@ -9,6 +9,60 @@ namespace SixLabors.ImageSharp.Tests.Helpers
public class ImageMathsTests public class ImageMathsTests
{ {
[Theory]
[InlineData(0, 0)]
[InlineData(1, 1)]
[InlineData(2, 2)]
[InlineData(3, 3)]
[InlineData(4, 0)]
[InlineData(100, 0)]
[InlineData(123, 3)]
[InlineData(53436353, 1)]
public void Modulo4(int a, int expected)
{
    // Each case pairs an input with its expected remainder after division by 4.
    Assert.Equal(expected, ImageMaths.Modulo4(a));
}
[Theory]
[InlineData(0, 0)]
[InlineData(1, 1)]
[InlineData(2, 2)]
[InlineData(6, 6)]
[InlineData(7, 7)]
[InlineData(8, 0)]
[InlineData(100, 4)]
[InlineData(123, 3)]
[InlineData(53436353, 1)]
[InlineData(975, 7)]
public void Modulo8(int a, int expected)
{
    // Each case pairs an input with its expected remainder after division by 8.
    Assert.Equal(expected, ImageMaths.Modulo8(a));
}
[Theory]
[InlineData(0, 2, 0)]
[InlineData(1, 2, 1)]
[InlineData(2, 2, 0)]
[InlineData(0, 4, 0)]
[InlineData(3, 4, 3)]
[InlineData(5, 4, 1)]
[InlineData(5, 8, 5)]
[InlineData(8, 8, 0)]
[InlineData(8, 16, 8)]
[InlineData(15, 16, 15)]
[InlineData(17, 16, 1)]
[InlineData(17, 32, 17)]
[InlineData(31, 32, 31)]
[InlineData(32, 32, 0)]
[InlineData(33, 32, 1)]
public void Modulo2P(int a, int m, int expected)
{
    // Exercises ImageMaths.ModuloP2 with power-of-two moduli only (2, 4, 8, 16, 32).
    Assert.Equal(expected, ImageMaths.ModuloP2(a, m));
}
[Fact] [Fact]
public void FasAbsResultMatchesMath() public void FasAbsResultMatchesMath()
{ {

Loading…
Cancel
Save