Browse Source

Merge branch 'master' into webp

pull/1552/head
Brian Popow 6 years ago
committed by GitHub
parent
commit
7abb70759a
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 6
      src/ImageSharp/Common/Helpers/ImageMaths.cs
  2. 103
      src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
  3. 301
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  4. 7
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  5. 79
      src/ImageSharp/Common/Helpers/Vector4Utilities.cs
  6. 76
      src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
  7. 1
      src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
  8. 8
      tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs
  9. 68
      tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs
  10. 13
      tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs
  11. 68
      tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs
  12. 4
      tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
  13. 36
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
  14. 15
      tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
  15. 2
      tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs
  16. 50
      tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs

6
src/ImageSharp/Common/Helpers/ImageMaths.cs

@ -132,6 +132,12 @@ namespace SixLabors.ImageSharp
return (a / GreatestCommonDivisor(a, b)) * b; return (a / GreatestCommonDivisor(a, b)) * b;
} }
/// <summary>
/// Returns the lowest bit of <paramref name="x"/>, which equals <paramref name="x"/> % 2
/// for every non-negative <paramref name="x"/>.
/// </summary>
/// <param name="x">The value to reduce.</param>
/// <returns>0 when <paramref name="x"/> is even; otherwise 1.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Modulo2(int x)
{
    return x & 1;
}
/// <summary> /// <summary>
/// Calculates <paramref name="x"/> % 4 /// Calculates <paramref name="x"/> % 4
/// </summary> /// </summary>

103
src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs

@ -1,103 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
{
public static class Avx2Intrinsics
{
// Control data for Avx2.PermuteVar8x32. The 32 bytes are reinterpreted as eight 32-bit integers
// (little-endian on x86), giving the element indices 0, 4, 1, 5, 2, 6, 3, 7.
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
/// <summary>
/// Converts as many elements as possible with <see cref="NormalizedFloatToByteSaturate"/>,
/// then slices both spans down so that only the unconverted remainder is left.
/// </summary>
/// <param name="source">The source floats. On return it refers only to the elements that were not converted.</param>
/// <param name="dest">The destination bytes. On return it refers only to the slots that were not written.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (Avx2.IsSupported)
    {
        // The bulk routine works in blocks of Vector256<byte>.Count elements, so the remainder
        // must be taken against that width and not against the runtime-dependent Vector<byte>.Count.
        int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
        int adjustedCount = source.Length - remainder;

        if (adjustedCount > 0)
        {
            NormalizedFloatToByteSaturate(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount));

            // Hand the tail back to the caller.
            source = source.Slice(adjustedCount);
            dest = dest.Slice(adjustedCount);
        }
    }
}
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// Every float is multiplied by 255, converted to a 32-bit integer and narrowed to a byte with saturation.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
/// </remarks>
/// <param name="source">The source floats. Only whole blocks of <c>Vector256&lt;byte&gt;.Count</c> elements are processed.</param>
/// <param name="dest">The destination bytes.</param>
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
{
// NOTE(review): VerifySpanInput is defined elsewhere; presumably it checks the span lengths
// against the vector width - confirm against its definition.
VerifySpanInput(source, dest, Vector256<byte>.Count);
// Number of output vectors; each one holds Vector256<byte>.Count bytes.
int n = dest.Length / Vector256<byte>.Count;
ref Vector256<float> sourceBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
var maxBytes = Vector256.Create(255f);
// Load the permute control once, outside the loop.
ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);
for (int i = 0; i < n; i++)
{
// Four float vectors (4 x 8 floats) are consumed for every byte vector produced.
ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
Vector256<float> f0 = s;
Vector256<float> f1 = Unsafe.Add(ref s, 1);
Vector256<float> f2 = Unsafe.Add(ref s, 2);
Vector256<float> f3 = Unsafe.Add(ref s, 3);
// Scale by 255 and convert to Int32.
Vector256<int> w0 = ConvertToInt32(f0, maxBytes);
Vector256<int> w1 = ConvertToInt32(f1, maxBytes);
Vector256<int> w2 = ConvertToInt32(f2, maxBytes);
Vector256<int> w3 = ConvertToInt32(f3, maxBytes);
// Narrow Int32 -> Int16 -> Byte with saturation. The 256-bit pack instructions work on each
// 128-bit half separately, so the packed bytes end up interleaved between the two halves.
Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);
// Reorder the eight 4-byte groups back into source order.
b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
Unsafe.Add(ref destBase, i) = b;
}
}
/// <summary>
/// Multiplies <paramref name="vf"/> by <paramref name="scale"/> element-wise and converts
/// the products to 32-bit integers.
/// </summary>
/// <param name="vf">The values to scale.</param>
/// <param name="scale">The per-element scale factors.</param>
/// <returns>The scaled values converted to <see cref="int"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
    => Avx.ConvertToVector256Int32(Avx.Multiply(vf, scale));
}
}
}
#endif

301
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -0,0 +1,301 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
{
public static class HwIntrinsics
{
// Control data for Avx2.PermuteVar8x32. The 32 bytes are reinterpreted as eight 32-bit integers
// (little-endian on x86), giving the element indices 0, 4, 1, 5, 2, 6, 3, 7.
public static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
// Control data for Avx2.PermuteVar8x32 giving the element indices 0, 2, 4, 6, 1, 3, 5, 7:
// even-indexed elements first, then odd-indexed ones.
public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
/// <summary>
/// Computes <c>(vm0 * vm1) + va</c> element-wise, using a fused multiply-add instruction
/// when FMA is available and a separate multiply and add otherwise.
/// </summary>
/// <param name="va">The vector added to the product.</param>
/// <param name="vm0">The first factor.</param>
/// <param name="vm1">The second factor.</param>
/// <returns>The resulting <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplyAdd(
    in Vector256<float> va,
    in Vector256<float> vm0,
    in Vector256<float> vm1)
{
    if (!Fma.IsSupported)
    {
        Vector256<float> product = Avx.Multiply(vm0, vm1);
        return Avx.Add(product, va);
    }

    return Fma.MultiplyAdd(vm1, vm0, va);
}
/// <summary>
/// Converts as many elements as possible with <see cref="ByteToNormalizedFloat"/>,
/// then slices both spans down so that only the unconverted remainder is left.
/// </summary>
/// <param name="source">The source bytes. On return it refers only to the elements that were not converted.</param>
/// <param name="dest">The destination floats. On return it refers only to the slots that were not written.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (!Avx2.IsSupported && !Sse2.IsSupported)
    {
        return;
    }

    // Block size of the widest instruction set available.
    int blockSize = Avx2.IsSupported ? Vector256<byte>.Count : Vector128<byte>.Count;
    int bulkLength = source.Length - ImageMaths.ModuloP2(source.Length, blockSize);

    if (bulkLength <= 0)
    {
        return;
    }

    ByteToNormalizedFloat(source.Slice(0, bulkLength), dest.Slice(0, bulkLength));

    // Hand the tail back to the caller.
    source = source.Slice(bulkLength);
    dest = dest.Slice(bulkLength);
}
/// <summary>
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// Every byte is widened to a 32-bit integer, converted to <see cref="float"/> and multiplied by 1/255.
/// </summary>
/// <param name="source">
/// The source bytes. Only whole blocks of <c>Vector256&lt;byte&gt;.Count</c> (AVX2) or
/// <c>Vector128&lt;byte&gt;.Count</c> (SSE) elements are converted.
/// </param>
/// <param name="dest">The destination floats.</param>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
/// </remarks>
internal static unsafe void ByteToNormalizedFloat(
    ReadOnlySpan<byte> source,
    Span<float> dest)
{
    // The widening loads below take a raw pointer. Pin the source for the whole conversion so the
    // pointer cannot dangle if the GC relocates the underlying buffer.
    fixed (byte* sourceBase = source)
    {
        if (Avx2.IsSupported)
        {
            VerifySpanInput(source, dest, Vector256<byte>.Count);

            // Number of 32-byte input blocks.
            int n = dest.Length / Vector256<byte>.Count;

            ref Vector256<float> destBase =
                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));

            var scale = Vector256.Create(1 / (float)byte.MaxValue);

            for (int i = 0; i < n; i++)
            {
                int si = Vector256<byte>.Count * i;

                // Each load widens 8 consecutive bytes to 8 Int32 values.
                Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
                Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
                Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
                Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));

                Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
                Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
                Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
                Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));

                // One input block produces four float vectors.
                ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);

                d = f0;
                Unsafe.Add(ref d, 1) = f1;
                Unsafe.Add(ref d, 2) = f2;
                Unsafe.Add(ref d, 3) = f3;
            }
        }
        else
        {
            // Sse
            VerifySpanInput(source, dest, Vector128<byte>.Count);

            // Number of 16-byte input blocks.
            int n = dest.Length / Vector128<byte>.Count;

            ref Vector128<float> destBase =
                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));

            var scale = Vector128.Create(1 / (float)byte.MaxValue);
            Vector128<byte> zero = Vector128<byte>.Zero;

            for (int i = 0; i < n; i++)
            {
                int si = Vector128<byte>.Count * i;

                Vector128<int> i0, i1, i2, i3;
                if (Sse41.IsSupported)
                {
                    // Each load widens 4 consecutive bytes to 4 Int32 values.
                    i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
                    i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
                    i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
                    i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
                }
                else
                {
                    // Without SSE4.1, zero-extend by interleaving with zero twice:
                    // byte -> Int16, then Int16 -> Int32.
                    Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
                    Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
                    Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();

                    i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
                    i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
                    i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
                    i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
                }

                Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
                Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
                Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
                Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));

                // One input block produces four float vectors.
                ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);

                d = f0;
                Unsafe.Add(ref d, 1) = f1;
                Unsafe.Add(ref d, 2) = f2;
                Unsafe.Add(ref d, 3) = f3;
            }
        }
    }
}
/// <summary>
/// Converts as many elements as possible with <see cref="NormalizedFloatToByteSaturate"/>,
/// then slices both spans down so that only the unconverted remainder is left.
/// </summary>
/// <param name="source">The source floats. On return it refers only to the elements that were not converted.</param>
/// <param name="dest">The destination bytes. On return it refers only to the slots that were not written.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (!Avx2.IsSupported && !Sse2.IsSupported)
    {
        return;
    }

    // Block size of the widest instruction set available.
    int blockSize = Avx2.IsSupported ? Vector256<byte>.Count : Vector128<byte>.Count;
    int bulkLength = source.Length - ImageMaths.ModuloP2(source.Length, blockSize);

    if (bulkLength <= 0)
    {
        return;
    }

    NormalizedFloatToByteSaturate(
        source.Slice(0, bulkLength),
        dest.Slice(0, bulkLength));

    // Hand the tail back to the caller.
    source = source.Slice(bulkLength);
    dest = dest.Slice(bulkLength);
}
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// Every float is multiplied by 255, converted to a 32-bit integer and narrowed to a byte with saturation.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
/// </remarks>
/// <param name="source">
/// The source floats. Only whole blocks of <c>Vector256&lt;byte&gt;.Count</c> (AVX2) or
/// <c>Vector128&lt;byte&gt;.Count</c> (SSE) elements are processed.
/// </param>
/// <param name="dest">The destination bytes.</param>
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
{
if (Avx2.IsSupported)
{
// NOTE(review): VerifySpanInput is defined elsewhere; presumably it checks the span lengths
// against the vector width - confirm against its definition.
VerifySpanInput(source, dest, Vector256<byte>.Count);
// Number of 32-byte output vectors.
int n = dest.Length / Vector256<byte>.Count;
ref Vector256<float> sourceBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<byte> destBase =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
var scale = Vector256.Create((float)byte.MaxValue);
// Load the permute control once, outside the loop.
ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);
for (int i = 0; i < n; i++)
{
// Four float vectors (4 x 8 floats) are consumed for every byte vector produced.
ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
Vector256<float> f0 = Avx.Multiply(scale, s);
Vector256<float> f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1));
Vector256<float> f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2));
Vector256<float> f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3));
Vector256<int> w0 = Avx.ConvertToVector256Int32(f0);
Vector256<int> w1 = Avx.ConvertToVector256Int32(f1);
Vector256<int> w2 = Avx.ConvertToVector256Int32(f2);
Vector256<int> w3 = Avx.ConvertToVector256Int32(f3);
// Narrow Int32 -> Int16 -> Byte with saturation. The 256-bit pack instructions work on each
// 128-bit half separately, so the packed bytes end up interleaved between the two halves.
Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);
// Reorder the eight 4-byte groups back into source order.
b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
Unsafe.Add(ref destBase, i) = b;
}
}
else
{
// Sse
VerifySpanInput(source, dest, Vector128<byte>.Count);
// Number of 16-byte output vectors.
int n = dest.Length / Vector128<byte>.Count;
ref Vector128<float> sourceBase =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
ref Vector128<byte> destBase =
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));
var scale = Vector128.Create((float)byte.MaxValue);
for (int i = 0; i < n; i++)
{
// Four float vectors (4 x 4 floats) are consumed for every byte vector produced.
ref Vector128<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
Vector128<float> f0 = Sse.Multiply(scale, s);
Vector128<float> f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1));
Vector128<float> f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2));
Vector128<float> f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3));
Vector128<int> w0 = Sse2.ConvertToVector128Int32(f0);
Vector128<int> w1 = Sse2.ConvertToVector128Int32(f1);
Vector128<int> w2 = Sse2.ConvertToVector128Int32(f2);
Vector128<int> w3 = Sse2.ConvertToVector128Int32(f3);
// The 128-bit packs keep the elements in source order, so no permute is needed here.
Vector128<short> u0 = Sse2.PackSignedSaturate(w0, w1);
Vector128<short> u1 = Sse2.PackSignedSaturate(w2, w3);
Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1);
}
}
}
}
}
}
#endif

7
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -79,8 +79,9 @@ namespace SixLabors.ImageSharp
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest) internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{ {
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
#if SUPPORTS_RUNTIME_INTRINSICS
#if SUPPORTS_EXTENDED_INTRINSICS HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
#elif SUPPORTS_EXTENDED_INTRINSICS
ExtendedIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest); ExtendedIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
#else #else
BasicIntrinsics256.ByteToNormalizedFloatReduce(ref source, ref dest); BasicIntrinsics256.ByteToNormalizedFloatReduce(ref source, ref dest);
@ -110,7 +111,7 @@ namespace SixLabors.ImageSharp
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
#if SUPPORTS_RUNTIME_INTRINSICS #if SUPPORTS_RUNTIME_INTRINSICS
Avx2Intrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
#elif SUPPORTS_EXTENDED_INTRINSICS #elif SUPPORTS_EXTENDED_INTRINSICS
ExtendedIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); ExtendedIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
#else #else

79
src/ImageSharp/Common/Helpers/Vector4Utilities.cs

@ -5,6 +5,10 @@ using System;
using System.Numerics; using System.Numerics;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
namespace SixLabors.ImageSharp namespace SixLabors.ImageSharp
{ {
@ -13,6 +17,9 @@ namespace SixLabors.ImageSharp
/// </summary> /// </summary>
internal static class Vector4Utilities internal static class Vector4Utilities
{ {
// Immediate for Avx.Blend: bits 3 and 7 are set, so the fourth element of each 128-bit half is taken
// from the second operand. At the call sites in this class the second operand is the unmodified
// source, which keeps the original W (alpha) value.
private const int BlendAlphaControl = 0b_10_00_10_00;
// Immediate for Avx.Shuffle: every 2-bit selector is 3, so the fourth element (W) of each 128-bit half
// is copied to all four positions of that half.
private const int ShuffleAlphaControl = 0b_11_11_11_11;
/// <summary> /// <summary>
/// Restricts a vector between a minimum and a maximum value. /// Restricts a vector between a minimum and a maximum value.
/// 5x Faster than <see cref="Vector4.Clamp(Vector4, Vector4, Vector4)"/>. /// 5x Faster than <see cref="Vector4.Clamp(Vector4, Vector4, Vector4)"/>.
@ -56,13 +63,39 @@ namespace SixLabors.ImageSharp
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public static void Premultiply(Span<Vector4> vectors) public static void Premultiply(Span<Vector4> vectors)
{ {
// TODO: This method can be AVX2 optimized using Vector<float> #if SUPPORTS_RUNTIME_INTRINSICS
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); if (Avx2.IsSupported && vectors.Length >= 2)
{
ref Vector256<float> vectorsBase =
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
for (int i = 0; i < vectors.Length; i++) // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
{
Vector256<float> source = vectorsBase;
Vector256<float> multiply = Avx.Shuffle(source, source, ShuffleAlphaControl);
vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl);
vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
}
if (ImageMaths.Modulo2(vectors.Length) != 0)
{
// Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
}
}
else
#endif
{ {
ref Vector4 v = ref Unsafe.Add(ref baseRef, i); ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
Premultiply(ref v);
for (int i = 0; i < vectors.Length; i++)
{
ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
Premultiply(ref v);
}
} }
} }
@ -73,13 +106,39 @@ namespace SixLabors.ImageSharp
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public static void UnPremultiply(Span<Vector4> vectors) public static void UnPremultiply(Span<Vector4> vectors)
{ {
// TODO: This method can be AVX2 optimized using Vector<float> #if SUPPORTS_RUNTIME_INTRINSICS
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); if (Avx2.IsSupported && vectors.Length >= 2)
{
ref Vector256<float> vectorsBase =
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
for (int i = 0; i < vectors.Length; i++) // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
{
Vector256<float> source = vectorsBase;
Vector256<float> multiply = Avx.Shuffle(source, source, ShuffleAlphaControl);
vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl);
vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
}
if (ImageMaths.Modulo2(vectors.Length) != 0)
{
// Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
}
}
else
#endif
{ {
ref Vector4 v = ref Unsafe.Add(ref baseRef, i); ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
UnPremultiply(ref v);
for (int i = 0; i < vectors.Length; i++)
{
ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
UnPremultiply(ref v);
}
} }
} }

76
src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs

@ -1,11 +1,15 @@
// Copyright (c) Six Labors. // Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0. // Licensed under the Apache License, Version 2.0.
using System; using System;
using System.Numerics; using System.Numerics;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static SixLabors.ImageSharp.SimdUtils;
#endif
using SixLabors.ImageSharp.Tuples; using SixLabors.ImageSharp.Tuples;
// ReSharper disable ImpureMethodCallOnReadonlyValueField // ReSharper disable ImpureMethodCallOnReadonlyValueField
@ -47,6 +51,73 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
"JpegColorConverter.FromYCbCrSimd256 can be used only on architecture having 256 byte floating point SIMD registers!"); "JpegColorConverter.FromYCbCrSimd256 can be used only on architecture having 256 byte floating point SIMD registers!");
} }
#if SUPPORTS_RUNTIME_INTRINSICS
ref Vector256<float> yBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector256<float> cbBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector256<float> crBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector256<float> resultBase =
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));
// Used for the color conversion
var chromaOffset = Vector256.Create(-halfValue);
var scale = Vector256.Create(1 / maxValue);
var rCrMult = Vector256.Create(1.402F);
var gCbMult = Vector256.Create(-0.344136F);
var gCrMult = Vector256.Create(-0.714136F);
var bCbMult = Vector256.Create(1.772F);
// Used for packing.
var va = Vector256.Create(1F);
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);
// Walking 8 elements at one step:
int n = result.Length / 8;
for (int i = 0; i < n; i++)
{
// y = yVals[i];
// cb = cbVals[i] - 128F;
// cr = crVals[i] - 128F;
Vector256<float> y = Unsafe.Add(ref yBase, i);
Vector256<float> cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
Vector256<float> cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);
y = Avx2.PermuteVar8x32(y, vcontrol);
cb = Avx2.PermuteVar8x32(cb, vcontrol);
cr = Avx2.PermuteVar8x32(cr, vcontrol);
// r = y + (1.402F * cr);
// g = y - (0.344136F * cb) - (0.714136F * cr);
// b = y + (1.772F * cb);
// Adding & multiplying 8 elements at one time:
Vector256<float> r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult);
Vector256<float> g = HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult);
Vector256<float> b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult);
// TODO: We should be saving to RGBA not Vector4
r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale);
g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale);
b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale);
Vector256<float> vte = Avx.UnpackLow(r, b);
Vector256<float> vto = Avx.UnpackLow(g, va);
ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
destination = Avx.UnpackLow(vte, vto);
Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);
vte = Avx.UnpackHigh(r, b);
vto = Avx.UnpackHigh(g, va);
Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
}
#else
ref Vector<float> yBase = ref Vector<float> yBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0)); ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector<float> cbBase = ref Vector<float> cbBase =
@ -104,6 +175,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
destination.Pack(ref rr, ref gg, ref bb); destination.Pack(ref rr, ref gg, ref bb);
} }
#endif
} }
} }
} }

1
src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs

@ -4,7 +4,6 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Numerics; using System.Numerics;
using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Tuples; using SixLabors.ImageSharp.Tuples;

8
tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs

@ -13,15 +13,13 @@ using System.Runtime.Intrinsics.X86;
#endif #endif
using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Environments;
using BenchmarkDotNet.Jobs;
using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats; using SixLabors.ImageSharp.PixelFormats;
// ReSharper disable InconsistentNaming // ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{ {
[Config(typeof(Config.ShortClr))] [Config(typeof(Config.ShortCore31))]
public abstract class FromVector4<TPixel> public abstract class FromVector4<TPixel>
where TPixel : unmanaged, IPixel<TPixel> where TPixel : unmanaged, IPixel<TPixel>
{ {
@ -104,12 +102,12 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
#if SUPPORTS_RUNTIME_INTRINSICS #if SUPPORTS_RUNTIME_INTRINSICS
[Benchmark] [Benchmark]
public void UseAvx2() public void UseHwIntrinsics()
{ {
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan()); Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan()); Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats);
} }
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };

68
tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs

@ -0,0 +1,68 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
/// <summary>
/// Benchmarks <c>Vector4Utilities.Premultiply</c> against a plain per-element loop
/// over the same 2048 pseudo-random vectors.
/// </summary>
[Config(typeof(Config.ShortCore31))]
public class PremultiplyVector4
{
    private static readonly Vector4[] Vectors = CreateVectors();

    /// <summary>
    /// Baseline: premultiplies every vector one at a time.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void PremultiplyBaseline()
    {
        ref Vector4 first = ref MemoryMarshal.GetReference<Vector4>(Vectors);

        for (int index = 0; index < Vectors.Length; index++)
        {
            Premultiply(ref Unsafe.Add(ref first, index));
        }
    }

    /// <summary>
    /// Premultiplies the whole buffer through the library's bulk routine.
    /// </summary>
    [Benchmark]
    public void Premultiply() => Vector4Utilities.Premultiply(Vectors);

    /// <summary>
    /// Multiplies X, Y and Z by W and leaves W itself unchanged.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static void Premultiply(ref Vector4 source)
    {
        float alpha = source.W;
        source = source * alpha;

        // The multiplication also scaled W; restore it.
        source.W = alpha;
    }

    // Fixed seed so that every run measures the same data.
    private static Vector4[] CreateVectors() => GenerateRandomVectorArray(new Random(42), 2048, 0, 1);

    private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
    {
        var result = new Vector4[length];

        for (int index = 0; index < result.Length; index++)
        {
            // Constructor arguments are evaluated left to right: X, Y, Z, W.
            result[index] = new Vector4(
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal));
        }

        return result;
    }

    private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
    {
        float range = maxVal - minVal;
        return ((float)rnd.NextDouble() * range) + minVal;
    }
}
}

13
tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs

@ -13,7 +13,7 @@ using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{ {
[Config(typeof(Config.ShortClr))] [Config(typeof(Config.ShortCore31))]
public class ToVector4_Rgba32 : ToVector4<Rgba32> public class ToVector4_Rgba32 : ToVector4<Rgba32>
{ {
[Benchmark] [Benchmark]
@ -52,6 +52,17 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(sBytes, dFloats); SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(sBytes, dFloats);
} }
#if SUPPORTS_RUNTIME_INTRINSICS
/// <summary>
/// Measures <c>SimdUtils.HwIntrinsics.ByteToNormalizedFloat</c> over the raw bytes of the source pixels.
/// </summary>
[Benchmark]
public void HwIntrinsics()
{
    // Reinterpret both buffers as flat element spans.
    Span<float> destFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
    Span<byte> sourceBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());

    SimdUtils.HwIntrinsics.ByteToNormalizedFloat(sourceBytes, destFloats);
}
#endif
// [Benchmark] // [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
{ {

68
tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs

@ -0,0 +1,68 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
/// <summary>
/// Benchmarks <c>Vector4Utilities.UnPremultiply</c> against a plain per-element loop
/// over the same 2048 pseudo-random vectors.
/// </summary>
[Config(typeof(Config.ShortCore31))]
public class UnPremultiplyVector4
{
    private static readonly Vector4[] Vectors = CreateVectors();

    /// <summary>
    /// Baseline: reverses the premultiplication of every vector one at a time.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void UnPremultiplyBaseline()
    {
        ref Vector4 first = ref MemoryMarshal.GetReference<Vector4>(Vectors);

        for (int index = 0; index < Vectors.Length; index++)
        {
            UnPremultiply(ref Unsafe.Add(ref first, index));
        }
    }

    /// <summary>
    /// Processes the whole buffer through the library's bulk routine.
    /// </summary>
    [Benchmark]
    public void UnPremultiply() => Vector4Utilities.UnPremultiply(Vectors);

    /// <summary>
    /// Divides X, Y and Z by W and leaves W itself unchanged.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static void UnPremultiply(ref Vector4 source)
    {
        float alpha = source.W;
        source = source / alpha;

        // The division also changed W; restore it.
        source.W = alpha;
    }

    // Fixed seed so that every run measures the same data.
    private static Vector4[] CreateVectors() => GenerateRandomVectorArray(new Random(42), 2048, 0, 1);

    private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
    {
        var result = new Vector4[length];

        for (int index = 0; index < result.Length; index++)
        {
            // Constructor arguments are evaluated left to right: X, Y, Z, W.
            result[index] = new Vector4(
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal));
        }

        return result;
    }

    private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
    {
        float range = maxVal - minVal;
        return ((float)rnd.NextDouble() * range) + minVal;
    }
}
}

4
tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs

@ -73,7 +73,9 @@ namespace SixLabors.ImageSharp.Benchmarks
} }
#endif #endif
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
.WithEnvironmentVariables(new EnvironmentVariable(EnableHWIntrinsic, Off)) .WithEnvironmentVariables(
new EnvironmentVariable(EnableHWIntrinsic, Off),
new EnvironmentVariable(FeatureSIMD, Off))
.WithId("No HwIntrinsics")); .WithId("No HwIntrinsics"));
} }
} }

36
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -7,7 +7,7 @@ using System.Numerics;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Common.Tuples; using SixLabors.ImageSharp.Common.Tuples;
using SixLabors.ImageSharp.Tests.TestUtilities;
using Xunit; using Xunit;
using Xunit.Abstractions; using Xunit.Abstractions;
@ -204,6 +204,25 @@ namespace SixLabors.ImageSharp.Tests.Common
(s, d) => SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); (s, d) => SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(s.Span, d.Span));
} }
#if SUPPORTS_RUNTIME_INTRINSICS
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count)
{
    // The runner hands the argument over as a string, so the body is a
    // static local function that rebuilds the element count before testing.
    static void ConvertAndVerify(string serialized)
    {
        int length = FeatureTestRunner.Deserialize(serialized);

        TestImpl_BulkConvertByteToNormalizedFloat(
            length,
            (source, dest) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(source.Span, dest.Span));
    }

    // Request the default configuration plus the AVX2-disabled and SSE4.1-disabled ones.
    HwIntrinsics features = HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41;

    FeatureTestRunner.RunWithHwIntrinsicsFeature(ConvertAndVerify, features, count);
}
#endif
[Theory] [Theory]
[MemberData(nameof(ArbitraryArraySizes))] [MemberData(nameof(ArbitraryArraySizes))]
public void BulkConvertByteToNormalizedFloat(int count) public void BulkConvertByteToNormalizedFloat(int count)
@ -281,16 +300,19 @@ namespace SixLabors.ImageSharp.Tests.Common
[Theory] [Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))] [MemberData(nameof(ArraySizesDivisibleBy32))]
public void Avx2_BulkConvertNormalizedFloatToByteClampOverflows(int count) public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{ {
if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported) static void RunTest(string serialized)
{ {
return; TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
FeatureTestRunner.Deserialize(serialized),
(s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span));
} }
TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( FeatureTestRunner.RunWithHwIntrinsicsFeature(
count, RunTest,
(s, d) => SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2,
count);
} }
#endif #endif

15
tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs

@ -10,6 +10,21 @@ namespace SixLabors.ImageSharp.Tests.Helpers
{ {
public class ImageMathsTests public class ImageMathsTests
{ {
// Checks ImageMaths.Modulo2 against the built-in remainder operator.
// All inputs here are non-negative.
[Theory]
[InlineData(0)]
[InlineData(1)]
[InlineData(2)]
[InlineData(3)]
[InlineData(4)]
[InlineData(100)]
[InlineData(123)]
[InlineData(53436353)]
public void Modulo2(int x)
    => Assert.Equal(x % 2, ImageMaths.Modulo2(x));
[Theory] [Theory]
[InlineData(0)] [InlineData(0)]
[InlineData(1)] [InlineData(1)]

2
tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs

@ -17,6 +17,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers
[InlineData(0)] [InlineData(0)]
[InlineData(1)] [InlineData(1)]
[InlineData(30)] [InlineData(30)]
[InlineData(63)]
public void Premultiply_VectorSpan(int length) public void Premultiply_VectorSpan(int length)
{ {
var rnd = new Random(42); var rnd = new Random(42);
@ -36,6 +37,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers
[InlineData(0)] [InlineData(0)]
[InlineData(1)] [InlineData(1)]
[InlineData(30)] [InlineData(30)]
[InlineData(63)]
public void UnPremultiply_VectorSpan(int length) public void UnPremultiply_VectorSpan(int length)
{ {
var rnd = new Random(42); var rnd = new Random(42);

50
tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs

@ -33,6 +33,14 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
where T : IXunitSerializable where T : IXunitSerializable
=> BasicSerializer.Deserialize<T>(value); => BasicSerializer.Deserialize<T>(value);
/// <summary>
/// Converts the string form of an integer argument, as handed to a feature test,
/// back into an <see cref="int"/>.
/// </summary>
/// <param name="value">The string value to deserialize.</param>
/// <returns>The <see cref="int"/> value.</returns>
public static int Deserialize(string value)
{
    return Convert.ToInt32(value);
}
/// <summary> /// <summary>
/// Runs the given test <paramref name="action"/> within an environment /// Runs the given test <paramref name="action"/> within an environment
/// where the given <paramref name="intrinsics"/> features. /// where the given <paramref name="intrinsics"/> features.
@ -201,6 +209,48 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
} }
} }
/// <summary>
/// Runs the given test <paramref name="action"/> once for each entry produced from
/// <paramref name="intrinsics"/>: in-process for <see cref="HwIntrinsics.AllowAll"/>,
/// otherwise in a separate process with the matching COMPlus_ environment variable set to "0".
/// Does nothing when remote execution is not supported.
/// </summary>
/// <param name="action">The test action to run.</param>
/// <param name="intrinsics">The intrinsics features.</param>
/// <param name="serializable">The value to pass as a parameter to the test action.</param>
public static void RunWithHwIntrinsicsFeature(
    Action<string> action,
    HwIntrinsics intrinsics,
    int serializable)
{
    if (!RemoteExecutor.IsSupported)
    {
        return;
    }

    // The action only accepts a string, so the integer travels in its text form.
    string argument = serializable.ToString();

    foreach (KeyValuePair<HwIntrinsics, string> feature in intrinsics.ToFeatureKeyValueCollection())
    {
        if (feature.Key == HwIntrinsics.AllowAll)
        {
            // Since we are running using the default architecture there is no
            // point creating the overhead of running the action in a separate process.
            action(argument);
            continue;
        }

        var startInfo = new ProcessStartInfo();
        startInfo.Environment[$"COMPlus_{feature.Value}"] = "0";

        var options = new RemoteInvokeOptions
        {
            StartInfo = startInfo
        };

        RemoteExecutor.Invoke(action, argument, options).Dispose();
    }
}
internal static Dictionary<HwIntrinsics, string> ToFeatureKeyValueCollection(this HwIntrinsics intrinsics) internal static Dictionary<HwIntrinsics, string> ToFeatureKeyValueCollection(this HwIntrinsics intrinsics)
{ {
// Loop through and translate the given values into COMPlus equivaluents // Loop through and translate the given values into COMPlus equivaluents

Loading…
Cancel
Save