Browse Source

FallbackIntrinsics128 + ImageMaths.Modulo* implementations

af/merge-core
Anton Firszov 8 years ago
parent
commit
8793880447
  1. 16
      src/ImageSharp/Common/Helpers/ImageMaths.cs
  2. 2
      src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
  3. 5
      src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
  4. 143
      src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
  5. 6
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  6. 26
      tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
  7. 25
      tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
  8. 27
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
  9. 54
      tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs

16
src/ImageSharp/Common/Helpers/ImageMaths.cs

@ -39,6 +39,22 @@ namespace SixLabors.ImageSharp
return (a / GreatestCommonDivisor(a, b)) * b;
}
/// <summary>
/// Calculates <paramref name="a"/> modulo 4 with a bit mask instead of a division.
/// </summary>
/// <param name="a">The value to reduce.</param>
/// <returns>The two lowest bits of <paramref name="a"/>, i.e. a value in the range [0, 3].</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int Modulo4(int a)
{
    // 4 is a power of two, so (4 - 1) selects exactly the bits below it.
    const int Mask = 4 - 1;
    return a & Mask;
}
/// <summary>
/// Calculates <paramref name="a"/> modulo 8 with a bit mask instead of a division.
/// </summary>
/// <param name="a">The value to reduce.</param>
/// <returns>The three lowest bits of <paramref name="a"/>, i.e. a value in the range [0, 7].</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int Modulo8(int a)
{
    // 8 is a power of two, so (8 - 1) selects exactly the bits below it.
    const int Mask = 8 - 1;
    return a & Mask;
}
/// <summary>
/// Fast (mod m) calculator for a modulus that is a power of 2.
/// The result is obtained by masking <paramref name="a"/> with <c>m - 1</c>,
/// so it is only a valid remainder when <paramref name="m"/> is a power of 2.
/// </summary>
/// <param name="a">The value to reduce.</param>
/// <param name="m">The modulus; should be a power of 2.</param>
/// <returns><paramref name="a"/> masked with <c>m - 1</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int ModuloP2(int a, int m) => a & (m - 1);
/// <summary>
/// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation.
/// </summary>

2
src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs

@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp
internal static partial class SimdUtils
{
/// <summary>
/// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
/// Implementation with 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen etc.)
/// </summary>
public static class BasicIntrinsics256
{

5
src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs

@ -10,8 +10,9 @@ namespace SixLabors.ImageSharp
internal static partial class SimdUtils
{
/// <summary>
/// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
/// PR:
/// Implementation methods based on newer <see cref="Vector{T}"/> API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
/// Only accelerated on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
/// See:
/// https://github.com/dotnet/coreclr/pull/10662
/// API Proposal:
/// https://github.com/dotnet/corefx/issues/15957

143
src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs

@ -0,0 +1,143 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
{
/// <summary>
/// Fallback implementation based on <see cref="Vector4"/> (128bit).
/// For <see cref="Vector4"/>, efficient software fallback implementations are present,
/// and Mono may even be able to emit intrinsics for that type.
/// </summary>
public static class FallbackIntrinsics128
{
/// <summary>
/// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many elements as possible
/// (the largest leading part whose length is a multiple of 4), then slices both spans
/// down so that only the unprocessed remainder is left for the caller.
/// </summary>
/// <param name="source">The input bytes; on return, only the unconverted tail remains.</param>
/// <param name="dest">The output floats; on return, only the unwritten tail remains.</param>
internal static void BulkConvertByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

    // Largest multiple of 4 that fits into the input.
    int vectorizableLength = source.Length - (source.Length % 4);
    if (vectorizableLength <= 0)
    {
        // Fewer than 4 elements: leave everything to the caller.
        return;
    }

    ReadOnlySpan<byte> sourceHead = source.Slice(0, vectorizableLength);
    Span<float> destHead = dest.Slice(0, vectorizableLength);
    BulkConvertByteToNormalizedFloat(sourceHead, destHead);

    // Hand the remaining (at most 3) elements back through the ref parameters.
    source = source.Slice(vectorizableLength);
    dest = dest.Slice(vectorizableLength);
}
/// <summary>
/// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many elements as possible
/// (the largest leading part whose length is a multiple of 4), then slices both spans
/// down so that only the unprocessed remainder is left for the caller.
/// </summary>
/// <param name="source">The input floats; on return, only the unconverted tail remains.</param>
/// <param name="dest">The output bytes; on return, only the unwritten tail remains.</param>
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

    // Largest multiple of 4 that fits into the input.
    int vectorizableLength = source.Length - (source.Length % 4);
    if (vectorizableLength <= 0)
    {
        // Fewer than 4 elements: leave everything to the caller.
        return;
    }

    ReadOnlySpan<float> sourceHead = source.Slice(0, vectorizableLength);
    Span<byte> destHead = dest.Slice(0, vectorizableLength);
    BulkConvertNormalizedFloatToByteClampOverflows(sourceHead, destHead);

    // Hand the remaining (at most 3) elements back through the ref parameters.
    source = source.Slice(vectorizableLength);
    dest = dest.Slice(vectorizableLength);
}
/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
/// Each group of 4 bytes is widened into a <see cref="Vector4"/> and multiplied by 1/255.
/// </summary>
/// <param name="source">The bytes to convert. Must contain at least <c>dest.Length</c> elements.</param>
/// <param name="dest">
/// The destination buffer. Its length must be divisible by 4 and determines how many elements are converted.
/// </param>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
    DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!");

    // The loop below walks both buffers through raw references without bounds checks
    // and takes its iteration count from dest, so a shorter source would be read past its end.
    DebugGuard.IsTrue(source.Length >= dest.Length, nameof(source), "source should contain at least dest.Length elements!");

    int count = dest.Length / 4;
    if (count == 0)
    {
        return;
    }

    // Reinterpret both buffers as sequences of 4-element groups.
    ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
    ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));

    const float Scale = 1f / 255f;
    Vector4 d = default;

    for (int i = 0; i < count; i++)
    {
        ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);

        // Widen the 4 bytes to floats, then normalize all 4 lanes at once.
        d.X = s.X;
        d.Y = s.Y;
        d.Z = s.Z;
        d.W = s.W;
        d *= Scale;

        Unsafe.Add(ref dBase, i) = d;
    }
}
/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
/// Each group of 4 floats is multiplied by 255, offset by 0.5, clamped into [0, 255]
/// and then truncated to bytes by the cast.
/// </summary>
/// <param name="source">
/// The normalized floats to convert. Its length must be divisible by 4 and determines how many elements are converted.
/// </param>
/// <param name="dest">The destination buffer. Must have room for at least <c>source.Length</c> elements.</param>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!");

    // The loop below walks both buffers through raw references without bounds checks
    // and takes its iteration count from source, so a shorter dest would be written past its end.
    DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest should have room for at least source.Length elements!");

    int count = source.Length / 4;
    if (count == 0)
    {
        return;
    }

    // Reinterpret both buffers as sequences of 4-element groups.
    ref Vector4 sBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(source));
    ref ByteVector4 dBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(dest));

    var half = new Vector4(0.5f);
    var maxBytes = new Vector4(255f);

    for (int i = 0; i < count; i++)
    {
        Vector4 s = Unsafe.Add(ref sBase, i);

        // Scale into [0, 255]; adding 0.5 makes the truncating casts below round to nearest.
        s *= maxBytes;
        s += half;

        // Max/Min are used instead of Vector4.Clamp() because it is unclear
        // whether Clamp() is properly implemented with intrinsics.
        s = Vector4.Max(Vector4.Zero, s);
        s = Vector4.Min(maxBytes, s);

        ref ByteVector4 d = ref Unsafe.Add(ref dBase, i);
        d.X = (byte)s.X;
        d.Y = (byte)s.Y;
        d.Z = (byte)s.Z;
        d.W = (byte)s.W;
    }
}
/// <summary>
/// Four <see cref="byte"/> fields laid out sequentially.
/// The bulk conversion methods of this class reinterpret byte buffers as a sequence of these
/// (through <c>Unsafe.As</c>) so that 4 bytes can be accessed together as X, Y, Z and W.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private struct ByteVector4
{
// Field order is significant: it is the order of the bytes in the reinterpreted buffer.
public byte X;
public byte Y;
public byte Z;
public byte W;
}
}
}
}

6
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -55,7 +55,7 @@ namespace SixLabors.ImageSharp
}
/// <summary>
/// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
/// </summary>
@ -67,6 +67,7 @@ namespace SixLabors.ImageSharp
ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
// Deal with the remainder:
int count = source.Length;
@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp
}
/// <summary>
/// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
/// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
@ -96,6 +97,7 @@ namespace SixLabors.ImageSharp
ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
// Deal with the remainder:
int count = source.Length;

26
tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs

@ -72,30 +72,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
public class PackFromVector4_Rgba32 : PackFromVector4<Rgba32>
{
[Benchmark]
public void BasicBulk()
public void FallbackIntrinsics128()
{
ref Vector4 sBase = ref this.source.GetSpan()[0];
ref Rgba32 dBase = ref this.destination.GetSpan()[0];
Vector4 maxBytes = new Vector4(255);
Vector4 half = new Vector4(0.5f);
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
for (int i = 0; i < this.Count; i++)
{
Vector4 v = Unsafe.Add(ref sBase, i);
v *= maxBytes;
v += half;
v = Vector4.Clamp(v, Vector4.Zero, maxBytes);
ref Rgba32 d = ref Unsafe.Add(ref dBase, i);
d.R = (byte)v.X;
d.G = (byte)v.Y;
d.B = (byte)v.Z;
d.A = (byte)v.W;
}
SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats);
}
[Benchmark(Baseline = true)]
public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows()
public void BasicIntrinsics256()
{
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
@ -104,7 +90,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
}
[Benchmark]
public void ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows()
public void ExtendedIntrinsic()
{
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

25
tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs

@ -79,29 +79,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
public class ToVector4_Rgba32 : ToVector4<Rgba32>
{
[Benchmark]
public void BasicBulk()
public void FallbackIntrinsics128()
{
ref Rgba32 sBase = ref this.source.GetSpan()[0];
ref Vector4 dBase = ref this.destination.GetSpan()[0];
Vector4 scale = new Vector4(1f / 255f);
Vector4 v = default;
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
for (int i = 0; i < this.Count; i++)
{
ref Rgba32 s = ref Unsafe.Add(ref sBase, i);
v.X = s.R;
v.Y = s.G;
v.Z = s.B;
v.W = s.A;
v *= scale;
Unsafe.Add(ref dBase, i) = v;
}
SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
}
[Benchmark(Baseline = true)]
public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat()
public void BasicIntrinsics256()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
@ -110,7 +97,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
}
[Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat()
public void ExtendedIntrinsics()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());

27
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@ -161,6 +161,7 @@ namespace SixLabors.ImageSharp.Tests.Common
}
public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };
public static readonly TheoryData<int> ArraySizesDivisibleBy4 = new TheoryData<int> { 0, 4, 8, 28, 1020 };
public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };
@ -170,9 +171,18 @@ namespace SixLabors.ImageSharp.Tests.Common
0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520,
};
// Exercises the Vector4-based fallback directly; every size in ArraySizesDivisibleBy4 is a multiple of 4.
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count)
{
    TestImpl_BulkConvertByteToNormalizedFloat(
        count,
        (source, dest) =>
        {
            SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(source.Span, dest.Span);
        });
}
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy8))]
public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count)
public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat(int count)
{
if (this.SkipOnNonAvx2())
{
@ -215,9 +225,18 @@ namespace SixLabors.ImageSharp.Tests.Common
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
// Exercises the Vector4-based fallback directly; every size in ArraySizesDivisibleBy4 is a multiple of 4.
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
    TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
        count,
        (source, dest) =>
        {
            SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(source.Span, dest.Span);
        });
}
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy8))]
public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
if (this.SkipOnNonAvx2())
{

54
tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs

@ -9,6 +9,60 @@ namespace SixLabors.ImageSharp.Tests.Helpers
public class ImageMathsTests
{
// Each case pairs an input with its expected remainder modulo 4.
[Theory]
[InlineData(0, 0)]
[InlineData(1, 1)]
[InlineData(2, 2)]
[InlineData(3, 3)]
[InlineData(4, 0)]
[InlineData(100, 0)]
[InlineData(123, 3)]
[InlineData(53436353, 1)]
public void Modulo4(int a, int expected)
    => Assert.Equal(expected, ImageMaths.Modulo4(a));
// Each case pairs an input with its expected remainder modulo 8.
[Theory]
[InlineData(0, 0)]
[InlineData(1, 1)]
[InlineData(2, 2)]
[InlineData(6, 6)]
[InlineData(7, 7)]
[InlineData(8, 0)]
[InlineData(100, 4)]
[InlineData(123, 3)]
[InlineData(53436353, 1)]
[InlineData(975, 7)]
public void Modulo8(int a, int expected)
    => Assert.Equal(expected, ImageMaths.Modulo8(a));
// Each case is (value, power-of-2 modulus, expected remainder) for ImageMaths.ModuloP2.
[Theory]
[InlineData(0, 2, 0)]
[InlineData(1, 2, 1)]
[InlineData(2, 2, 0)]
[InlineData(0, 4, 0)]
[InlineData(3, 4, 3)]
[InlineData(5, 4, 1)]
[InlineData(5, 8, 5)]
[InlineData(8, 8, 0)]
[InlineData(8, 16, 8)]
[InlineData(15, 16, 15)]
[InlineData(17, 16, 1)]
[InlineData(17, 32, 17)]
[InlineData(31, 32, 31)]
[InlineData(32, 32, 0)]
[InlineData(33, 32, 1)]
public void Modulo2P(int a, int m, int expected)
    => Assert.Equal(expected, ImageMaths.ModuloP2(a, m));
[Fact]
public void FasAbsResultMatchesMath()
{

Loading…
Cancel
Save