Browse Source

Merge pull request #2654 from SixLabors/js/xplat-intrinsics

Cleanup SimdUtils
js/sanitize-foreground-rectangle
James Jackson-South 2 years ago
committed by GitHub
parent
commit
1f22bceef8
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 40
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 78
      src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs
  3. 182
      src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
  4. 144
      src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
  5. 335
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  6. 118
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  7. 74
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  8. 39
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  9. 37
      src/ImageSharp/Common/Helpers/Vector512Utilities.cs
  10. 42
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  11. 16
      src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs
  12. 75
      src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs
  13. 102
      tests/ImageSharp.Benchmarks/Bulk/FromVector4.cs
  14. 66
      tests/ImageSharp.Benchmarks/Bulk/FromVector4_Rgb24.cs
  15. 25
      tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs
  16. 4
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs
  17. 4
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs
  18. 72
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs
  19. 2
      tests/ImageSharp.Benchmarks/LoadResizeSave/README.md
  20. 59
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
  21. 22
      tests/ImageSharp.Tests/TestUtilities/BasicSerializer.cs
  22. 81
      tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs

40
src/ImageSharp/Common/Helpers/Numerics.cs

@ -1010,6 +1010,26 @@ internal static class Numerics
where TVector : struct
=> (uint)span.Length / (uint)Vector256<TVector>.Count;
/// <summary>
/// Computes how many whole groups of <see cref="Vector512{T}"/> lanes fit into the given span.
/// </summary>
/// <typeparam name="TVector">The element type of the vector whose lane count is used as the divisor.</typeparam>
/// <param name="span">The span whose length is measured.</param>
/// <returns>The span length divided (rounding down) by <c>Vector512&lt;TVector&gt;.Count</c>.</returns>
public static nuint Vector512Count<TVector>(this Span<byte> span)
where TVector : struct
{
    // Both operands are treated as unsigned so the division is a plain unsigned divide.
    uint lanesPerVector = (uint)Vector512<TVector>.Count;
    uint elementCount = (uint)span.Length;
    return elementCount / lanesPerVector;
}
/// <summary>
/// Computes how many whole groups of <see cref="Vector512{T}"/> lanes fit into the given read-only span.
/// </summary>
/// <typeparam name="TVector">The element type of the vector whose lane count is used as the divisor.</typeparam>
/// <param name="span">The span whose length is measured.</param>
/// <returns>The span length divided (rounding down) by <c>Vector512&lt;TVector&gt;.Count</c>.</returns>
public static nuint Vector512Count<TVector>(this ReadOnlySpan<byte> span)
where TVector : struct
{
    // Both operands are treated as unsigned so the division is a plain unsigned divide.
    uint lanesPerVector = (uint)Vector512<TVector>.Count;
    uint elementCount = (uint)span.Length;
    return elementCount / lanesPerVector;
}
/// <summary>
/// Gets the count of vectors that safely fit into the given span.
/// </summary>
@ -1049,4 +1069,24 @@ internal static class Numerics
public static nuint Vector256Count<TVector>(int length)
where TVector : struct
{
    // Unsigned division of the length by the number of TVector lanes in a 256-bit vector.
    uint lanesPerVector = (uint)Vector256<TVector>.Count;
    uint elementCount = (uint)length;
    return elementCount / lanesPerVector;
}
/// <summary>
/// Computes how many whole groups of <see cref="Vector512{T}"/> lanes fit into the given span of floats.
/// </summary>
/// <typeparam name="TVector">The element type of the vector whose lane count is used as the divisor.</typeparam>
/// <param name="span">The span whose length is measured.</param>
/// <returns>The span length divided (rounding down) by <c>Vector512&lt;TVector&gt;.Count</c>.</returns>
public static nuint Vector512Count<TVector>(this Span<float> span)
where TVector : struct
{
    // Both operands are treated as unsigned so the division is a plain unsigned divide.
    uint lanesPerVector = (uint)Vector512<TVector>.Count;
    uint elementCount = (uint)span.Length;
    return elementCount / lanesPerVector;
}
/// <summary>
/// Computes how many whole groups of <see cref="Vector512{T}"/> lanes fit into the given length.
/// </summary>
/// <typeparam name="TVector">The element type of the vector whose lane count is used as the divisor.</typeparam>
/// <param name="length">The element count to measure.</param>
/// <returns>The length divided (rounding down) by <c>Vector512&lt;TVector&gt;.Count</c>.</returns>
public static nuint Vector512Count<TVector>(int length)
where TVector : struct
{
    // Both operands are treated as unsigned so the division is a plain unsigned divide.
    uint lanesPerVector = (uint)Vector512<TVector>.Count;
    uint elementCount = (uint)length;
    return elementCount / lanesPerVector;
}
}

78
src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs

@ -0,0 +1,78 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
    /// <summary>
    /// Converts every <see cref="byte"/> of <paramref name="source"/> into a <see cref="float"/> normalized into [0..1].
    /// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
    /// but there are no restrictions on the span's length.
    /// </summary>
    /// <param name="source">The source span of bytes</param>
    /// <param name="destination">The destination span of floats</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> destination)
    {
        DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");

        // The vectorized pass receives both spans by reference; whatever it leaves
        // in them afterwards is converted one element at a time.
        HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref destination);

        if (source.IsEmpty)
        {
            return;
        }

        ConvertByteToNormalizedFloatRemainder(source, destination);
    }

    /// <summary>
    /// Converts all <see cref="float"/> values normalized into [0..1] from <paramref name="source"/>
    /// into the <paramref name="destination"/> buffer of <see cref="byte"/>.
    /// The values are scaled up into [0-255] and rounded, overflows are clamped.
    /// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
    /// but there are no restrictions on the span's length.
    /// </summary>
    /// <param name="source">The source span of floats</param>
    /// <param name="destination">The destination span of bytes</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> destination)
    {
        DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");

        // Same two-stage scheme as above: vectorized pass first, scalar pass for the leftover.
        HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination);

        if (source.IsEmpty)
        {
            return;
        }

        ConvertNormalizedFloatToByteRemainder(source, destination);
    }

    /// <summary>
    /// Scalar conversion of each byte in <paramref name="source"/> to <c>value / 255f</c>.
    /// </summary>
    /// <param name="source">The bytes to convert.</param>
    /// <param name="destination">Receives one float per source byte.</param>
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> destination)
    {
        ref byte input = ref MemoryMarshal.GetReference(source);
        ref float output = ref MemoryMarshal.GetReference(destination);
        nuint count = (uint)source.Length;

        for (nuint offset = 0; offset < count; offset++)
        {
            float value = Unsafe.Add(ref input, offset);
            Unsafe.Add(ref output, offset) = value / 255f;
        }
    }

    /// <summary>
    /// Scalar conversion of each float in <paramref name="source"/> to a byte via <see cref="ConvertToByte"/>.
    /// </summary>
    /// <param name="source">The floats to convert.</param>
    /// <param name="destination">Receives one byte per source float.</param>
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> destination)
    {
        ref float input = ref MemoryMarshal.GetReference(source);
        ref byte output = ref MemoryMarshal.GetReference(destination);
        nuint count = (uint)source.Length;

        for (nuint offset = 0; offset < count; offset++)
        {
            float value = Unsafe.Add(ref input, offset);
            Unsafe.Add(ref output, offset) = ConvertToByte(value);
        }
    }

    /// <summary>
    /// Scales <paramref name="f"/> by 255, adds 0.5, clamps the result into [0, 255] and casts it to a byte.
    /// </summary>
    /// <param name="f">The float value to convert.</param>
    /// <returns>The resulting byte.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static byte ConvertToByte(float f)
    {
        float scaled = (f * 255f) + 0.5f;
        return (byte)Numerics.Clamp(scaled, 0, 255f);
    }
}

182
src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs

@ -1,182 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Implementation methods based on newer <see cref="Vector{T}"/> API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
/// Only accelerated on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
/// See:
/// https://github.com/dotnet/coreclr/pull/10662
/// API Proposal:
/// https://github.com/dotnet/corefx/issues/15957
/// </summary>
public static class ExtendedIntrinsics
{
/// <summary>
/// Gets a value indicating whether <see cref="Vector.IsHardwareAccelerated"/> reported hardware acceleration.
/// The "Reduce" methods of this class return without doing anything when this is <see langword="false"/>.
/// </summary>
public static bool IsAvailable { get; } = Vector.IsHardwareAccelerated;
/// <summary>
/// Widen and convert a vector of <see cref="short"/> values into 2 vectors of <see cref="float"/>-s.
/// </summary>
/// <param name="source">The vector of 16-bit integers to convert.</param>
/// <param name="dest1">Receives the converted first output of <c>Vector.Widen</c>.</param>
/// <param name="dest2">Receives the converted second output of <c>Vector.Widen</c>.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ConvertToSingle(
Vector<short> source,
out Vector<float> dest1,
out Vector<float> dest2)
{
Vector.Widen(source, out Vector<int> i1, out Vector<int> i2);
dest1 = Vector.ConvertToSingle(i1);
dest2 = Vector.ConvertToSingle(i2);
}
/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source bytes; on return it is sliced to the elements that were not converted.</param>
/// <param name="dest">The destination floats; on return it is sliced by the same amount as <paramref name="source"/>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
if (!IsAvailable)
{
return;
}
// NOTE(review): ModuloP2 presumably returns source.Length modulo the (power of two) vector size - confirm in Numerics.
int remainder = Numerics.ModuloP2(source.Length, Vector<byte>.Count);
int adjustedCount = source.Length - remainder;
if (adjustedCount > 0)
{
ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
// Hand back only the unprocessed tail to the caller.
source = source[adjustedCount..];
dest = dest[adjustedCount..];
}
}
/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source floats; on return it is sliced to the elements that were not converted.</param>
/// <param name="dest">The destination bytes; on return it is sliced by the same amount as <paramref name="source"/>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
if (!IsAvailable)
{
return;
}
// NOTE(review): ModuloP2 presumably returns source.Length modulo the (power of two) vector size - confirm in Numerics.
int remainder = Numerics.ModuloP2(source.Length, Vector<byte>.Count);
int adjustedCount = source.Length - remainder;
if (adjustedCount > 0)
{
NormalizedFloatToByteSaturate(source[..adjustedCount], dest[..adjustedCount]);
// Hand back only the unprocessed tail to the caller.
source = source[adjustedCount..];
dest = dest[adjustedCount..];
}
}
/// <summary>
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// </summary>
/// <param name="source">The source bytes, read one <see cref="Vector{T}"/> of <see cref="byte"/> at a time.</param>
/// <param name="dest">The destination floats, written four <see cref="Vector{T}"/> of <see cref="float"/> at a time.</param>
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
// NOTE(review): VerifySpanInput is defined elsewhere; presumably it checks equal lengths and divisibility by the given count - confirm.
VerifySpanInput(source, dest, Vector<byte>.Count);
// NOTE(review): VectorCount<byte>() presumably yields dest.Length / Vector<byte>.Count - confirm in Numerics.
nuint n = dest.VectorCount<byte>();
ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));
for (nuint i = 0; i < n; i++)
{
Vector<byte> b = Unsafe.Add(ref sourceBase, i);
// Two widening steps: byte -> ushort -> uint, giving four uint vectors per byte vector.
Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1);
Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
// The private ConvertToSingle(Vector<uint>) overload also applies the 1/255 scale.
Vector<float> f0 = ConvertToSingle(w0);
Vector<float> f1 = ConvertToSingle(w1);
Vector<float> f2 = ConvertToSingle(w2);
Vector<float> f3 = ConvertToSingle(w3);
// Each byte vector produces four consecutive float vectors in the destination.
ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
}
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// </summary>
/// <param name="source">The source floats, read four <see cref="Vector{T}"/> of <see cref="float"/> at a time.</param>
/// <param name="dest">The destination bytes, written one <see cref="Vector{T}"/> of <see cref="byte"/> at a time.</param>
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
{
// NOTE(review): VerifySpanInput is defined elsewhere; presumably it checks equal lengths and divisibility by the given count - confirm.
VerifySpanInput(source, dest, Vector<byte>.Count);
// NOTE(review): VectorCount<byte>() presumably yields dest.Length / Vector<byte>.Count - confirm in Numerics.
nuint n = dest.VectorCount<byte>();
ref Vector<float> sourceBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
ref Vector<byte> destBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(dest));
for (nuint i = 0; i < n; i++)
{
// Four consecutive float vectors are consumed for every byte vector produced.
ref Vector<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
Vector<float> f0 = s;
Vector<float> f1 = Unsafe.Add(ref s, 1);
Vector<float> f2 = Unsafe.Add(ref s, 2);
Vector<float> f3 = Unsafe.Add(ref s, 3);
// ConvertToUInt32 scales, offsets and clamps into [0, 255] before the integer conversion.
Vector<uint> w0 = ConvertToUInt32(f0);
Vector<uint> w1 = ConvertToUInt32(f1);
Vector<uint> w2 = ConvertToUInt32(f2);
Vector<uint> w3 = ConvertToUInt32(f3);
// Two narrowing steps: uint -> ushort -> byte.
var u0 = Vector.Narrow(w0, w1);
var u1 = Vector.Narrow(w2, w3);
Unsafe.Add(ref destBase, i) = Vector.Narrow(u0, u1);
}
}
/// <summary>
/// Multiplies <paramref name="vf"/> by 255, adds 0.5, clamps the result into [0, 255],
/// converts it to 32-bit integers and reinterprets those as unsigned.
/// </summary>
/// <param name="vf">The float vector to convert.</param>
/// <returns>The converted values as a vector of <see cref="uint"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<uint> ConvertToUInt32(Vector<float> vf)
{
var maxBytes = new Vector<float>(255f);
vf *= maxBytes;
vf += new Vector<float>(0.5f);
vf = Vector.Min(Vector.Max(vf, Vector<float>.Zero), maxBytes);
var vi = Vector.ConvertToInt32(vf);
return Vector.AsVectorUInt32(vi);
}
/// <summary>
/// Reinterprets <paramref name="u"/> as signed 32-bit integers, converts them to floats
/// and multiplies the result by 1/255.
/// </summary>
/// <param name="u">The unsigned integer vector to convert.</param>
/// <returns>The scaled float vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<float> ConvertToSingle(Vector<uint> u)
{
var vi = Vector.AsVectorInt32(u);
var v = Vector.ConvertToSingle(vi);
v *= new Vector<float>(1f / 255f);
return v;
}
}
}

144
src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs

@ -1,144 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Fallback implementation based on <see cref="Vector4"/> (128bit).
/// For <see cref="Vector4"/>, efficient software fallback implementations are present,
/// and we hope that even mono's JIT is able to emit SIMD instructions for that type :P
/// </summary>
public static class FallbackIntrinsics128
{
/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source bytes; on return it is sliced to the elements that were not converted.</param>
/// <param name="dest">The destination floats; on return it is sliced by the same amount as <paramref name="source"/>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
// NOTE(review): Modulo4 presumably returns source.Length % 4 - confirm in Numerics.
int remainder = Numerics.Modulo4(source.Length);
int adjustedCount = source.Length - remainder;
if (adjustedCount > 0)
{
ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
// Hand back only the unprocessed tail to the caller.
source = source[adjustedCount..];
dest = dest[adjustedCount..];
}
}
/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source floats; on return it is sliced to the elements that were not converted.</param>
/// <param name="dest">The destination bytes; on return it is sliced by the same amount as <paramref name="source"/>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
// NOTE(review): Modulo4 presumably returns source.Length % 4 - confirm in Numerics.
int remainder = Numerics.Modulo4(source.Length);
int adjustedCount = source.Length - remainder;
if (adjustedCount > 0)
{
NormalizedFloatToByteSaturate(
source[..adjustedCount],
dest[..adjustedCount]);
// Hand back only the unprocessed tail to the caller.
source = source[adjustedCount..];
dest = dest[adjustedCount..];
}
}
/// <summary>
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/> using <see cref="Vector4"/>.
/// </summary>
/// <param name="source">The source bytes, read in groups of four.</param>
/// <param name="dest">The destination floats, written one <see cref="Vector4"/> at a time.</param>
[MethodImpl(InliningOptions.ColdPath)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
// NOTE(review): VerifySpanInput is defined elsewhere; presumably it checks equal lengths and divisibility by 4 - confirm.
VerifySpanInput(source, dest, 4);
uint count = (uint)dest.Length / 4;
if (count == 0)
{
return;
}
// View the source as 4-byte groups and the destination as Vector4 values.
ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));
const float scale = 1f / 255f;
Vector4 d = default;
for (nuint i = 0; i < count; i++)
{
ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);
// Every component is overwritten on each iteration, so reusing 'd' carries no state over.
d.X = s.X;
d.Y = s.Y;
d.Z = s.Z;
d.W = s.W;
d *= scale;
Unsafe.Add(ref dBase, i) = d;
}
}
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/> using <see cref="Vector4"/>.
/// </summary>
/// <param name="source">The source floats, read one <see cref="Vector4"/> at a time.</param>
/// <param name="dest">The destination bytes, written in groups of four.</param>
[MethodImpl(InliningOptions.ColdPath)]
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
{
// NOTE(review): VerifySpanInput is defined elsewhere; presumably it checks equal lengths and divisibility by 4 - confirm.
VerifySpanInput(source, dest, 4);
uint count = (uint)source.Length / 4;
if (count == 0)
{
return;
}
// View the source as Vector4 values and the destination as 4-byte groups.
ref Vector4 sBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(source));
ref ByteVector4 dBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(dest));
var half = new Vector4(0.5f);
var maxBytes = new Vector4(255f);
for (nuint i = 0; i < count; i++)
{
Vector4 s = Unsafe.Add(ref sBase, i);
// Scale to [0, 255], add 0.5 and clamp before the per-component byte casts.
s *= maxBytes;
s += half;
s = Numerics.Clamp(s, Vector4.Zero, maxBytes);
ref ByteVector4 d = ref Unsafe.Add(ref dBase, i);
d.X = (byte)s.X;
d.Y = (byte)s.Y;
d.Z = (byte)s.Z;
d.W = (byte)s.W;
}
}
/// <summary>
/// Four sequentially laid out bytes, used to address a byte buffer in groups of four
/// matching the X, Y, Z and W components of a <see cref="Vector4"/>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private struct ByteVector4
{
public byte X;
public byte Y;
public byte Z;
public byte W;
}
}
}

335
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -17,8 +17,13 @@ internal static partial class SimdUtils
{
public static class HwIntrinsics
{
#pragma warning disable SA1117 // Parameters should be on same line or separate lines
#pragma warning disable SA1137 // Elements should have the same indentation
[MethodImpl(MethodImplOptions.AggressiveInlining)] // too much IL for JIT to inline, so give a hint
public static Vector256<int> PermuteMaskDeinterleave8x32() => Vector256.Create(0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsInt32();
public static Vector256<int> PermuteMaskDeinterleave8x32() => Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<int> PermuteMaskDeinterleave16x32() => Vector512.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<uint> PermuteMaskEvenOdd8x32() => Vector256.Create(0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0).AsUInt32();
@ -38,17 +43,15 @@ internal static partial class SimdUtils
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> ShuffleMaskSlice4Nx16() => Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80);
#pragma warning disable SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> ShuffleMaskShiftAlpha() => Vector256.Create((byte)
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15,
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
private static Vector256<byte> ShuffleMaskShiftAlpha() => Vector256.Create(
(byte)0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15,
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<uint> PermuteMaskShiftAlpha8x32() => Vector256.Create(
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0,
5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsUInt32();
#pragma warning restore SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines
public static Vector256<uint> PermuteMaskShiftAlpha8x32() => Vector256.Create(0u, 1, 2, 4, 5, 6, 3, 7);
#pragma warning restore SA1137 // Elements should have the same indentation
#pragma warning restore SA1117 // Parameters should be on same line or separate lines
/// <summary>
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
@ -749,17 +752,23 @@ internal static partial class SimdUtils
/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
ref Span<float> destination)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
if (Avx2.IsSupported || Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
int remainder;
if (Avx2.IsSupported)
if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count);
}
else if (Avx2.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count);
}
@ -772,10 +781,10 @@ internal static partial class SimdUtils
if (adjustedCount > 0)
{
ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
ByteToNormalizedFloat(source[..adjustedCount], destination[..adjustedCount]);
source = source[adjustedCount..];
dest = dest[adjustedCount..];
destination = destination[adjustedCount..];
}
}
}
@ -783,97 +792,126 @@ internal static partial class SimdUtils
/// <summary>
/// Implementation <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
/// </remarks>
internal static unsafe void ByteToNormalizedFloat(
ReadOnlySpan<byte> source,
Span<float> dest)
Span<float> destination)
{
fixed (byte* sourceBase = source)
if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported)
{
if (Avx2.IsSupported)
{
VerifySpanInput(source, dest, Vector256<byte>.Count);
nuint n = dest.Vector256Count<byte>();
DebugVerifySpanInput(source, destination, Vector512<byte>.Count);
ref Vector256<float> destBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
nuint n = destination.Vector512Count<byte>();
Vector256<float> scale = Vector256.Create(1 / (float)byte.MaxValue);
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector256<byte>.Count * i;
Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector512<byte>.Count * i;
Vector512<int> i0 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si));
Vector512<int> i1 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector512<int>.Count));
Vector512<int> i2 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 2)));
Vector512<int> i3 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 3)));
// Declare multiplier on each line. Codegen is better.
Vector512<float> f0 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i0);
Vector512<float> f1 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i1);
Vector512<float> f2 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i2);
Vector512<float> f3 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i3);
ref Vector512<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
else
}
else if (Avx2.IsSupported)
{
DebugVerifySpanInput(source, destination, Vector256<byte>.Count);
nuint n = destination.Vector256Count<byte>();
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
for (nuint i = 0; i < n; i++)
{
// Sse
VerifySpanInput(source, dest, Vector128<byte>.Count);
nuint si = (uint)Vector256<byte>.Count * i;
Vector256<int> i0 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si));
Vector256<int> i1 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector256<int>.Count));
Vector256<int> i2 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector256<int>.Count * 2)));
// Ensure overreads past 16 byte boundary do not happen in debug due to lack of containment.
ref ulong refULong = ref Unsafe.As<byte, ulong>(ref Unsafe.Add(ref sourceBase, si));
Vector256<int> i3 = Avx2.ConvertToVector256Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refULong, 3)).AsByte());
// Declare multiplier on each line. Codegen is better.
Vector256<float> f0 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i0);
Vector256<float> f1 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i1);
Vector256<float> f2 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i2);
Vector256<float> f3 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i3);
ref Vector256<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
}
else if (Vector128.IsHardwareAccelerated)
{
DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
nuint n = dest.Vector128Count<byte>();
nuint n = destination.Vector128Count<byte>();
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
ref Vector128<float> destBase =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector128<byte>.Count * i;
Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
Vector128<byte> zero = Vector128<byte>.Zero;
Vector128<int> i0, i1, i2, i3;
if (Sse41.IsSupported)
{
ref int refInt = ref Unsafe.As<byte, int>(ref Unsafe.Add(ref sourceBase, si));
for (nuint i = 0; i < n; i++)
i0 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(refInt).AsByte());
i1 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 1)).AsByte());
i2 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 2)).AsByte());
i3 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 3)).AsByte());
}
else
{
nuint si = (uint)Vector128<byte>.Count * i;
Vector128<int> i0, i1, i2, i3;
if (Sse41.IsSupported)
{
i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
}
else
{
Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
}
Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
// Sse2, AdvSimd, etc
Vector128<byte> b = Vector128.LoadUnsafe(ref sourceBase, si);
(Vector128<ushort> s0, Vector128<ushort> s1) = Vector128.Widen(b);
(i0, i1) = Vector128.Widen(s0.AsInt16());
(i2, i3) = Vector128.Widen(s1.AsInt16());
}
Vector128<float> f0 = scale * Vector128.ConvertToSingle(i0);
Vector128<float> f1 = scale * Vector128.ConvertToSingle(i1);
Vector128<float> f2 = scale * Vector128.ConvertToSingle(i2);
Vector128<float> f3 = scale * Vector128.ConvertToSingle(i3);
ref Vector128<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
}
}
@ -881,17 +919,24 @@ internal static partial class SimdUtils
/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
ref Span<byte> destination)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
if (Avx2.IsSupported || Sse2.IsSupported)
if (Sse2.IsSupported || AdvSimd.IsSupported)
{
int remainder;
if (Avx2.IsSupported)
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count);
}
else if (Avx2.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count);
}
@ -906,10 +951,10 @@ internal static partial class SimdUtils
{
NormalizedFloatToByteSaturate(
source[..adjustedCount],
dest[..adjustedCount]);
destination[..adjustedCount]);
source = source[adjustedCount..];
dest = dest[adjustedCount..];
destination = destination[adjustedCount..];
}
}
}
@ -917,25 +962,58 @@ internal static partial class SimdUtils
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
/// </remarks>
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
Span<byte> destination)
{
if (Avx2.IsSupported)
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported)
{
VerifySpanInput(source, dest, Vector256<byte>.Count);
DebugVerifySpanInput(source, destination, Vector512<byte>.Count);
nuint n = dest.Vector256Count<byte>();
nuint n = destination.Vector512Count<byte>();
ref Vector256<float> sourceBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
ref Vector512<byte> destinationBase = ref Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(destination));
ref Vector256<byte> destBase =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
Vector512<float> scale = Vector512.Create((float)byte.MaxValue);
Vector512<int> mask = PermuteMaskDeinterleave16x32();
for (nuint i = 0; i < n; i++)
{
ref Vector512<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
Vector512<float> f0 = scale * s;
Vector512<float> f1 = scale * Unsafe.Add(ref s, 1);
Vector512<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector512<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector512<int> w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0);
Vector512<int> w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1);
Vector512<int> w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2);
Vector512<int> w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3);
Vector512<short> u0 = Avx512BW.PackSignedSaturate(w0, w1);
Vector512<short> u1 = Avx512BW.PackSignedSaturate(w2, w3);
Vector512<byte> b = Avx512BW.PackUnsignedSaturate(u0, u1);
b = Avx512F.PermuteVar16x32(b.AsInt32(), mask).AsByte();
Unsafe.Add(ref destinationBase, i) = b;
}
}
else if (Avx2.IsSupported)
{
DebugVerifySpanInput(source, destination, Vector256<byte>.Count);
nuint n = destination.Vector256Count<byte>();
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<byte> destinationBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(destination));
Vector256<float> scale = Vector256.Create((float)byte.MaxValue);
Vector256<int> mask = PermuteMaskDeinterleave8x32();
@ -944,36 +1022,33 @@ internal static partial class SimdUtils
{
ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
Vector256<float> f0 = Avx.Multiply(scale, s);
Vector256<float> f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1));
Vector256<float> f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2));
Vector256<float> f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3));
Vector256<float> f0 = scale * s;
Vector256<float> f1 = scale * Unsafe.Add(ref s, 1);
Vector256<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector256<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector256<int> w0 = Avx.ConvertToVector256Int32(f0);
Vector256<int> w1 = Avx.ConvertToVector256Int32(f1);
Vector256<int> w2 = Avx.ConvertToVector256Int32(f2);
Vector256<int> w3 = Avx.ConvertToVector256Int32(f3);
Vector256<int> w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0);
Vector256<int> w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1);
Vector256<int> w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2);
Vector256<int> w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3);
Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);
b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
Unsafe.Add(ref destBase, i) = b;
Unsafe.Add(ref destinationBase, i) = b;
}
}
else
else if (Sse2.IsSupported || AdvSimd.IsSupported)
{
// Sse
VerifySpanInput(source, dest, Vector128<byte>.Count);
// Sse, AdvSimd
DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
nuint n = dest.Vector128Count<byte>();
nuint n = destination.Vector128Count<byte>();
ref Vector128<float> sourceBase =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
ref Vector128<byte> destBase =
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));
ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
ref Vector128<byte> destinationBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination));
Vector128<float> scale = Vector128.Create((float)byte.MaxValue);
@ -981,20 +1056,20 @@ internal static partial class SimdUtils
{
ref Vector128<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
Vector128<float> f0 = Sse.Multiply(scale, s);
Vector128<float> f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1));
Vector128<float> f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2));
Vector128<float> f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3));
Vector128<float> f0 = scale * s;
Vector128<float> f1 = scale * Unsafe.Add(ref s, 1);
Vector128<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector128<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector128<int> w0 = Sse2.ConvertToVector128Int32(f0);
Vector128<int> w1 = Sse2.ConvertToVector128Int32(f1);
Vector128<int> w2 = Sse2.ConvertToVector128Int32(f2);
Vector128<int> w3 = Sse2.ConvertToVector128Int32(f3);
Vector128<int> w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0);
Vector128<int> w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1);
Vector128<int> w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2);
Vector128<int> w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3);
Vector128<short> u0 = Sse2.PackSignedSaturate(w0, w1);
Vector128<short> u1 = Sse2.PackSignedSaturate(w2, w3);
Vector128<short> u0 = Vector128Utilities.PackSignedSaturate(w0, w1);
Vector128<short> u1 = Vector128Utilities.PackSignedSaturate(w2, w3);
Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1);
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1);
}
}
}

118
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -22,13 +22,6 @@ internal static partial class SimdUtils
public static bool HasVector8 { get; } =
Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
/// <summary>
/// Gets a value indicating whether <see cref="Vector{T}"/> code is being JIT-ed to SSE instructions
/// where float and integer registers are 128 bits wide.
/// </summary>
public static bool HasVector4 { get; } =
Vector.IsHardwareAccelerated && Vector<float>.Count == 4;
/// <summary>
/// Transform all scalars in 'v' in a way that converting them to <see cref="int"/> would have rounding semantics.
/// </summary>
@ -69,111 +62,8 @@ internal static partial class SimdUtils
}
}
/// <summary>
/// Normalizes every <see cref="byte"/> of <paramref name="source"/> into a <see cref="float"/> in the [0..1] range
/// and writes the result to <paramref name="dest"/>.
/// Both spans are expected to have the same length; the length itself may be arbitrary.
/// </summary>
/// <param name="source">The bytes to convert.</param>
/// <param name="dest">The span receiving the normalized floats.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    // The spans are passed by reference: each stage converts what it can and
    // leaves whatever remains for the stage that follows it.
    HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
    FallbackIntrinsics128.ByteToNormalizedFloatReduce(ref source, ref dest);

    if (source.Length <= 0)
    {
        // Nothing left over, the vectorized stages handled everything.
        return;
    }

    // Scalar conversion of the last few elements.
    ConvertByteToNormalizedFloatRemainder(source, dest);
}
/// <summary>
/// Converts the <see cref="float"/> values of <paramref name="source"/>, normalized into [0..1],
/// to <see cref="byte"/> values stored in <paramref name="dest"/>.
/// Values are scaled up into [0-255] and rounded; anything overflowing that range is clamped.
/// Both spans are expected to have the same length; the length itself may be arbitrary.
/// </summary>
/// <param name="source">The normalized floats to convert.</param>
/// <param name="dest">The span receiving the bytes.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    // The spans are passed by reference: each stage converts what it can and
    // leaves whatever remains for the stage that follows it.
    HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
    FallbackIntrinsics128.NormalizedFloatToByteSaturateReduce(ref source, ref dest);

    if (source.Length <= 0)
    {
        // Nothing left over, the vectorized stages handled everything.
        return;
    }

    // Scalar conversion of the last few elements.
    ConvertNormalizedFloatToByteRemainder(source, dest);
}
/// <summary>
/// Scalar tail conversion: divides each of the 1 to 3 bytes in <paramref name="source"/> by 255
/// and stores the result at the same index of <paramref name="dest"/>.
/// Any other length leaves <paramref name="dest"/> untouched.
/// </summary>
/// <param name="source">The remaining bytes.</param>
/// <param name="dest">The remaining destination floats.</param>
[MethodImpl(InliningOptions.ColdPath)]
private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest)
{
    ref byte sBase = ref MemoryMarshal.GetReference(source);
    ref float dBase = ref MemoryMarshal.GetReference(dest);

    int remaining = source.Length;
    if (remaining < 1 || remaining > 3)
    {
        return;
    }

    // At most three elements are handled, written from the highest index down;
    // straight-line code is cheaper than a loop here.
    if (remaining == 3)
    {
        Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2) / 255f;
    }

    if (remaining >= 2)
    {
        Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1) / 255f;
    }

    dBase = sBase / 255f;
}
/// <summary>
/// Scalar tail conversion: passes each of the 1 to 3 floats in <paramref name="source"/> through
/// <c>ConvertToByte</c> and stores the result at the same index of <paramref name="dest"/>.
/// Any other length leaves <paramref name="dest"/> untouched.
/// </summary>
/// <param name="source">The remaining normalized floats.</param>
/// <param name="dest">The remaining destination bytes.</param>
[MethodImpl(InliningOptions.ColdPath)]
private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> dest)
{
    ref float sBase = ref MemoryMarshal.GetReference(source);
    ref byte dBase = ref MemoryMarshal.GetReference(dest);

    int remaining = source.Length;
    if (remaining is 1 or 2 or 3)
    {
        // Walk from the last element back to the first.
        for (int i = remaining - 1; i >= 0; i--)
        {
            Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i));
        }
    }
}
/// <summary>
/// Scales <paramref name="f"/> by 255, adds 0.5, clamps the result via <c>Numerics.Clamp</c> to [0, 255]
/// and casts it to <see cref="byte"/>.
/// </summary>
/// <param name="f">The value to convert.</param>
/// <returns>The converted <see cref="byte"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static byte ConvertToByte(float f)
{
    // The +0.5 bias turns the truncating cast below into rounding to nearest.
    float biased = (f * 255F) + 0.5F;
    return (byte)Numerics.Clamp(biased, 0, 255F);
}
/// <summary>
/// Debug-only check that throws when <c>HasVector8</c> is false.
/// </summary>
/// <param name="operation">The name of the operation, used in the exception message.</param>
/// <exception cref="NotSupportedException">Thrown when <c>HasVector8</c> is false.</exception>
[Conditional("DEBUG")]
private static void VerifyHasVector8(string operation)
{
    if (HasVector8)
    {
        return;
    }

    throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!");
}
[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest, int shouldBeDivisibleBy)
private static void DebugVerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest, int shouldBeDivisibleBy)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
DebugGuard.IsTrue(
@ -183,11 +73,11 @@ internal static partial class SimdUtils
}
[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest, int shouldBeDivisibleBy)
private static void DebugVerifySpanInput(ReadOnlySpan<float> source, Span<byte> destination, int shouldBeDivisibleBy)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
DebugGuard.IsTrue(
Numerics.ModuloP2(dest.Length, shouldBeDivisibleBy) == 0,
Numerics.ModuloP2(destination.Length, shouldBeDivisibleBy) == 0,
nameof(source),
$"length should be divisible by {shouldBeDivisibleBy}!");
}

74
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -62,6 +62,7 @@ internal static class Vector128Utilities
/// <param name="vector">The input vector from which values are selected.</param>
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector128{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpected] byte control)
{
if (Sse.IsSupported)
@ -84,6 +85,7 @@ internal static class Vector128Utilities
/// <returns>
/// A new vector containing the values from <paramref name="vector" /> selected by the given <paramref name="indices" />.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
{
if (Ssse3.IsSupported)
@ -155,6 +157,7 @@ internal static class Vector128Utilities
/// <param name="right">The right hand source vector.</param>
/// <param name="mask">An 8-bit mask used for the operation.</param>
/// <returns>The <see cref="Vector128{Byte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<byte> AlignRight(Vector128<byte> left, Vector128<byte> right, [ConstantExpected(Max = (byte)15)] byte mask)
{
if (Ssse3.IsSupported)
@ -171,6 +174,77 @@ internal static class Vector128Utilities
return default;
}
/// <summary>
/// Converts a 128-bit vector of 4 single-precision floating-point values to a 128-bit vector of 4 signed 32-bit integers.
/// Midpoints are rounded as with <see cref="MidpointRounding.ToEven"/>.
/// </summary>
/// <param name="vector">The value to convert.</param>
/// <returns>The <see cref="Vector128{Int32}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<int> ConvertToInt32RoundToEven(Vector128<float> vector)
{
    if (Sse2.IsSupported)
    {
        return Sse2.ConvertToVector128Int32(vector);
    }
    else if (AdvSimd.IsSupported)
    {
        return AdvSimd.ConvertToInt32RoundToEven(vector);
    }

    // Software fallback: adding and then subtracting 2^23 (carrying the sign of each lane)
    // makes the floating-point addition itself discard the fractional bits.
    Vector128<float> signBits = Vector128.Create(-0.0f) & vector;
    Vector128<float> signedTwoPow23 = Vector128.Create(8388608.0f) | signBits;
    Vector128<float> rounded = (vector + signedTwoPow23) - signedTwoPow23;

    // Re-apply the sign bit before the final conversion.
    return Vector128.ConvertToInt32(rounded | signBits);
}
/// <summary>
/// Packs signed 16-bit integers to unsigned 8-bit integers and saturates.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector128{Byte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<byte> PackUnsignedSaturate(Vector128<short> left, Vector128<short> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.PackUnsignedSaturate(left, right);
    }

    if (AdvSimd.IsSupported)
    {
        // Narrow 'left' first, then narrow 'right' into the upper part of the same result.
        return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right);
    }

    // Neither instruction set is available.
    ThrowUnreachableException();
    return default;
}
/// <summary>
/// Narrows the signed 32-bit integers of both inputs to signed 16-bit integers with saturation
/// and packs them into a single vector.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector128{Int16}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<short> PackSignedSaturate(Vector128<int> left, Vector128<int> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.PackSignedSaturate(left, right);
    }
    else if (AdvSimd.IsSupported)
    {
        // Narrow 'left' first, then narrow 'right' into the upper part of the same result.
        Vector64<short> narrowedLeft = AdvSimd.ExtractNarrowingSaturateLower(left);
        return AdvSimd.ExtractNarrowingSaturateUpper(narrowedLeft, right);
    }

    // Neither instruction set is available.
    ThrowUnreachableException();
    return default;
}
/// <summary>
/// Throws an <see cref="UnreachableException"/>.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

39
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -25,7 +25,7 @@ internal static class Vector256Utilities
public static bool SupportsShuffleFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx.IsSupported;
get => Avx.IsSupported || Sse.IsSupported;
}
/// <summary>
@ -43,6 +43,7 @@ internal static class Vector256Utilities
/// <param name="vector">The input vector from which values are selected.</param>
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpected] byte control)
{
if (Avx.IsSupported)
@ -50,6 +51,13 @@ internal static class Vector256Utilities
return Avx.Shuffle(vector, vector, control);
}
if (Sse.IsSupported)
{
Vector128<float> lower = vector.GetLower();
Vector128<float> upper = vector.GetUpper();
return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control));
}
ThrowUnreachableException();
return default;
}
@ -62,6 +70,7 @@ internal static class Vector256Utilities
/// The per-element indices used to select a value from <paramref name="vector" />.
/// </param>
/// <returns>The <see cref="Vector256{Byte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> Shuffle(Vector256<byte> vector, Vector256<byte> indices)
{
if (Avx2.IsSupported)
@ -73,6 +82,34 @@ internal static class Vector256Utilities
return default;
}
/// <summary>
/// Converts a 256-bit vector of 8 single-precision floating-point values to a 256-bit vector of 8 signed 32-bit integers.
/// Midpoints are rounded as with <see cref="MidpointRounding.ToEven"/>.
/// </summary>
/// <param name="vector">The value to convert.</param>
/// <returns>The <see cref="Vector256{Int32}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<int> ConvertToInt32RoundToEven(Vector256<float> vector)
{
    if (Avx.IsSupported)
    {
        return Avx.ConvertToVector256Int32(vector);
    }
    else if (Sse2.IsSupported)
    {
        // Convert each 128-bit half on its own and stitch the halves back together.
        return Vector256.Create(
            Sse2.ConvertToVector128Int32(vector.GetLower()),
            Sse2.ConvertToVector128Int32(vector.GetUpper()));
    }

    // Software fallback: adding and then subtracting 2^23 (carrying the sign of each lane)
    // makes the floating-point addition itself discard the fractional bits.
    Vector256<float> signBits = Vector256.Create(-0.0f) & vector;
    Vector256<float> signedTwoPow23 = Vector256.Create(8388608.0f) | signBits;
    Vector256<float> rounded = (vector + signedTwoPow23) - signedTwoPow23;

    // Re-apply the sign bit before the final conversion.
    return Vector256.ConvertToInt32(rounded | signBits);
}
/// <summary>
/// Throws an <see cref="UnreachableException"/>.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

37
src/ImageSharp/Common/Helpers/Vector512Utilities.cs

@ -25,7 +25,7 @@ internal static class Vector512Utilities
public static bool SupportsShuffleFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512F.IsSupported;
get => Avx512F.IsSupported || Avx.IsSupported;
}
/// <summary>
@ -51,6 +51,13 @@ internal static class Vector512Utilities
return Avx512F.Shuffle(vector, vector, control);
}
if (Avx.IsSupported)
{
Vector256<float> lower = vector.GetLower();
Vector256<float> upper = vector.GetUpper();
return Vector512.Create(Avx.Shuffle(lower, lower, control), Avx.Shuffle(upper, upper, control));
}
ThrowUnreachableException();
return default;
}
@ -75,6 +82,34 @@ internal static class Vector512Utilities
return default;
}
/// <summary>
/// Converts a 512-bit vector of 16 single-precision floating-point values to a 512-bit vector of 16 signed 32-bit integers.
/// Midpoints are rounded as with <see cref="MidpointRounding.ToEven"/>.
/// </summary>
/// <param name="vector">The value to convert.</param>
/// <returns>The <see cref="Vector512{Int32}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
{
    if (Avx512F.IsSupported)
    {
        return Avx512F.ConvertToVector512Int32(vector);
    }
    else if (Avx.IsSupported)
    {
        // Convert each 256-bit half on its own and stitch the halves back together.
        return Vector512.Create(
            Avx.ConvertToVector256Int32(vector.GetLower()),
            Avx.ConvertToVector256Int32(vector.GetUpper()));
    }

    // Software fallback: adding and then subtracting 2^23 (carrying the sign of each lane)
    // makes the floating-point addition itself discard the fractional bits.
    Vector512<float> signBits = Vector512.Create(-0.0f) & vector;
    Vector512<float> signedTwoPow23 = Vector512.Create(8388608.0f) | signBits;
    Vector512<float> rounded = (vector + signedTwoPow23) - signedTwoPow23;

    // Re-apply the sign bit before the final conversion.
    return Vector512.ConvertToInt32(rounded | signBits);
}
/// <summary>
/// Throws an <see cref="UnreachableException"/>.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

42
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -386,29 +386,33 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
{
DebugGuard.IsTrue(
SimdUtils.HasVector8,
Avx2.IsSupported,
"LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
ref Vector<short> sRef = ref Unsafe.As<Block8x8, Vector<short>>(ref source);
ref Vector<float> dRef = ref Unsafe.As<Block8x8F, Vector<float>>(ref this);
ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
// Vector<ushort>.Count == 16 on AVX2
// Vector256<ushort>.Count == 16 on AVX2
// We can process 2 block rows in a single step
SimdUtils.ExtendedIntrinsics.ConvertToSingle(sRef, out Vector<float> top, out Vector<float> bottom);
dRef = top;
Unsafe.Add(ref dRef, 1) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 1), out top, out bottom);
Unsafe.Add(ref dRef, 2) = top;
Unsafe.Add(ref dRef, 3) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 2), out top, out bottom);
Unsafe.Add(ref dRef, 4) = top;
Unsafe.Add(ref dRef, 5) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 3), out top, out bottom);
Unsafe.Add(ref dRef, 6) = top;
Unsafe.Add(ref dRef, 7) = bottom;
Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
}
/// <summary>

16
src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs

@ -20,15 +20,15 @@ public partial struct Rgba32
/// <inheritdoc />
public override void ToVector4(
Configuration configuration,
ReadOnlySpan<Rgba32> sourcePixels,
ReadOnlySpan<Rgba32> source,
Span<Vector4> destinationVectors,
PixelConversionModifiers modifiers)
{
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationVectors, nameof(destinationVectors));
Guard.DestinationShouldNotBeTooShort(source, destinationVectors, nameof(destinationVectors));
destinationVectors = destinationVectors[..sourcePixels.Length];
destinationVectors = destinationVectors[..source.Length];
SimdUtils.ByteToNormalizedFloat(
MemoryMarshal.Cast<Rgba32, byte>(sourcePixels),
MemoryMarshal.Cast<Rgba32, byte>(source),
MemoryMarshal.Cast<Vector4, float>(destinationVectors));
Vector4Converters.ApplyForwardConversionModifiers(destinationVectors, modifiers);
}
@ -37,16 +37,16 @@ public partial struct Rgba32
public override void FromVector4Destructive(
Configuration configuration,
Span<Vector4> sourceVectors,
Span<Rgba32> destinationPixels,
Span<Rgba32> destination,
PixelConversionModifiers modifiers)
{
Guard.DestinationShouldNotBeTooShort(sourceVectors, destinationPixels, nameof(destinationPixels));
Guard.DestinationShouldNotBeTooShort(sourceVectors, destination, nameof(destination));
destinationPixels = destinationPixels[..sourceVectors.Length];
destination = destination[..sourceVectors.Length];
Vector4Converters.ApplyBackwardConversionModifiers(sourceVectors, modifiers);
SimdUtils.NormalizedFloatToByteSaturate(
MemoryMarshal.Cast<Vector4, float>(sourceVectors),
MemoryMarshal.Cast<Rgba32, byte>(destinationPixels));
MemoryMarshal.Cast<Rgba32, byte>(destination));
}
/// <inheritdoc />

75
src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs

@ -5,6 +5,7 @@ using System.Buffers;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace SixLabors.ImageSharp.PixelFormats.Utils;
@ -31,74 +32,86 @@ internal static partial class Vector4Converters
/// Provides an efficient default implementation for <see cref="PixelOperations{TPixel}.ToVector4(Configuration,ReadOnlySpan{TPixel},Span{Vector4},PixelConversionModifiers)"/>
/// The method works by internally converting to a <see cref="Rgba32"/> therefore it's not applicable for that type!
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
/// <typeparam name="TPixel">The type of pixel format.</typeparam>
/// <param name="configuration">The configuration.</param>
/// <param name="pixelOperations">The pixel operations instance.</param>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <param name="modifiers">The conversion modifier flags.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ToVector4<TPixel>(
Configuration configuration,
PixelOperations<TPixel> pixelOperations,
ReadOnlySpan<TPixel> sourcePixels,
Span<Vector4> destVectors,
ReadOnlySpan<TPixel> source,
Span<Vector4> destination,
PixelConversionModifiers modifiers)
where TPixel : unmanaged, IPixel<TPixel>
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destVectors, nameof(destVectors));
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
int count = sourcePixels.Length;
int count = source.Length;
// Not worth for small buffers:
if (count < Vector4ConversionThreshold)
{
Default.UnsafeToVector4(sourcePixels, destVectors, modifiers);
Default.UnsafeToVector4(source, destination, modifiers);
return;
}
// Using the last quarter of 'destVectors' as a temporary buffer to avoid allocation:
// Using the last quarter of 'destination' as a temporary buffer to avoid allocation:
int countWithoutLastItem = count - 1;
ReadOnlySpan<TPixel> reducedSource = sourcePixels[..countWithoutLastItem];
Span<Rgba32> lastQuarterOfDestBuffer = MemoryMarshal.Cast<Vector4, Rgba32>(destVectors).Slice((3 * count) + 1, countWithoutLastItem);
pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestBuffer);
ReadOnlySpan<TPixel> reducedSource = source[..countWithoutLastItem];
Span<Rgba32> lastQuarterOfDestination = MemoryMarshal.Cast<Vector4, Rgba32>(destination).Slice((3 * count) + 1, countWithoutLastItem);
pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestination);
// 'destVectors' and 'lastQuarterOfDestBuffer' are overlapping buffers,
// 'destination' and 'lastQuarterOfDestination' are overlapping buffers,
// but we are always reading/writing at different positions:
SimdUtils.ByteToNormalizedFloat(
MemoryMarshal.Cast<Rgba32, byte>(lastQuarterOfDestBuffer),
MemoryMarshal.Cast<Vector4, float>(destVectors[..countWithoutLastItem]));
MemoryMarshal.Cast<Rgba32, byte>(lastQuarterOfDestination),
MemoryMarshal.Cast<Vector4, float>(destination[..countWithoutLastItem]));
destVectors[countWithoutLastItem] = sourcePixels[countWithoutLastItem].ToVector4();
destination[countWithoutLastItem] = source[countWithoutLastItem].ToVector4();
// TODO: Investigate optimized 1-pass approach!
ApplyForwardConversionModifiers(destVectors, modifiers);
ApplyForwardConversionModifiers(destination, modifiers);
}
/// <summary>
/// Provides an efficient default implementation for <see cref="PixelOperations{TPixel}.FromVector4Destructive(Configuration,Span{Vector4},Span{TPixel},PixelConversionModifiers)"/>
/// The method works by internally converting to a <see cref="Rgba32"/> therefore it's not applicable for that type!
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
/// <typeparam name="TPixel">The type of pixel format.</typeparam>
/// <param name="configuration">The configuration.</param>
/// <param name="pixelOperations">The pixel operations instance.</param>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <param name="modifiers">The conversion modifier flags.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void FromVector4<TPixel>(
Configuration configuration,
PixelOperations<TPixel> pixelOperations,
Span<Vector4> sourceVectors,
Span<TPixel> destPixels,
Span<Vector4> source,
Span<TPixel> destination,
PixelConversionModifiers modifiers)
where TPixel : unmanaged, IPixel<TPixel>
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourceVectors, destPixels, nameof(destPixels));
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
int count = sourceVectors.Length;
int count = source.Length;
// Not worth for small buffers:
if (count < Vector4ConversionThreshold)
{
Default.UnsafeFromVector4(sourceVectors, destPixels, modifiers);
Default.UnsafeFromVector4(source, destination, modifiers);
return;
}
// TODO: Investigate optimized 1-pass approach!
ApplyBackwardConversionModifiers(sourceVectors, modifiers);
ApplyBackwardConversionModifiers(source, modifiers);
// For the opposite direction it's not easy to implement the trick used in RunRgba32CompatibleToVector4Conversion,
// so let's allocate a temporary buffer as usual:
@ -106,20 +119,30 @@ internal static partial class Vector4Converters
Span<Rgba32> tempSpan = tempBuffer.Memory.Span;
SimdUtils.NormalizedFloatToByteSaturate(
MemoryMarshal.Cast<Vector4, float>(sourceVectors),
MemoryMarshal.Cast<Vector4, float>(source),
MemoryMarshal.Cast<Rgba32, byte>(tempSpan));
pixelOperations.FromRgba32(configuration, tempSpan, destPixels);
pixelOperations.FromRgba32(configuration, tempSpan, destination);
}
private static int CalculateVector4ConversionThreshold()
{
if (!Vector.IsHardwareAccelerated)
if (!Vector128.IsHardwareAccelerated)
{
return int.MaxValue;
}
return SimdUtils.ExtendedIntrinsics.IsAvailable && SimdUtils.HasVector8 ? 256 : 128;
if (Vector512.IsHardwareAccelerated)
{
return 512;
}
if (Vector256.IsHardwareAccelerated)
{
return 256;
}
return 128;
}
}
}

102
tests/ImageSharp.Benchmarks/Bulk/FromVector4.cs

@ -18,9 +18,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk;
public abstract class FromVector4<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
protected IMemoryOwner<Vector4> source;
protected IMemoryOwner<Vector4> Source { get; set; }
protected IMemoryOwner<TPixel> destination;
protected IMemoryOwner<TPixel> Destination { get; set; }
protected Configuration Configuration => Configuration.Default;
@ -31,22 +31,22 @@ public abstract class FromVector4<TPixel>
[GlobalSetup]
public void Setup()
{
this.destination = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.source = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
this.Destination = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.Source = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
}
[GlobalCleanup]
public void Cleanup()
{
this.destination.Dispose();
this.source.Dispose();
this.Destination.Dispose();
this.Source.Dispose();
}
// [Benchmark]
public void PerElement()
{
ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan());
ref TPixel d = ref MemoryMarshal.GetReference(this.destination.GetSpan());
ref Vector4 s = ref MemoryMarshal.GetReference(this.Source.GetSpan());
ref TPixel d = ref MemoryMarshal.GetReference(this.Destination.GetSpan());
for (nuint i = 0; i < (uint)this.Count; i++)
{
Unsafe.Add(ref d, i) = TPixel.FromVector4(Unsafe.Add(ref s, i));
@ -55,38 +55,20 @@ public abstract class FromVector4<TPixel>
[Benchmark(Baseline = true)]
public void PixelOperations_Base()
=> new PixelOperations<TPixel>().FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan());
=> new PixelOperations<TPixel>().FromVector4Destructive(this.Configuration, this.Source.GetSpan(), this.Destination.GetSpan());
[Benchmark]
public void PixelOperations_Specialized()
=> PixelOperations<TPixel>.Instance.FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan());
=> PixelOperations<TPixel>.Instance.FromVector4Destructive(this.Configuration, this.Source.GetSpan(), this.Destination.GetSpan());
}
public class FromVector4Rgba32 : FromVector4<Rgba32>
{
[Benchmark]
public void FallbackIntrinsics128()
{
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
SimdUtils.FallbackIntrinsics128.NormalizedFloatToByteSaturate(sBytes, dFloats);
}
[Benchmark]
public void ExtendedIntrinsic()
{
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
SimdUtils.ExtendedIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats);
}
[Benchmark]
public void UseHwIntrinsics()
{
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.Source.GetSpan());
Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.Destination.GetSpan());
SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats);
}
@ -96,8 +78,8 @@ public class FromVector4Rgba32 : FromVector4<Rgba32>
[Benchmark]
public void UseAvx2_Grouped()
{
Span<float> src = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
Span<byte> dest = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
Span<float> src = MemoryMarshal.Cast<Vector4, float>(this.Source.GetSpan());
Span<byte> dest = MemoryMarshal.Cast<Rgba32, byte>(this.Destination.GetSpan());
nuint n = (uint)dest.Length / (uint)Vector<byte>.Count;
@ -107,7 +89,7 @@ public class FromVector4Rgba32 : FromVector4<Rgba32>
ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);
var maxBytes = Vector256.Create(255f);
Vector256<float> maxBytes = Vector256.Create(255f);
for (nuint i = 0; i < n; i++)
{
@ -137,25 +119,37 @@ public class FromVector4Rgba32 : FromVector4<Rgba32>
}
}
/// <summary>
/// Multiplies each lane of <paramref name="vf"/> by the matching lane of
/// <paramref name="scale"/> and converts the products to 32-bit integers.
/// </summary>
/// <param name="vf">The source vector of single-precision values.</param>
/// <param name="scale">The per-lane multiplier applied before conversion.</param>
/// <returns>The scaled values converted via <see cref="Avx.ConvertToVector256Int32(Vector256{float})"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
    => Avx.ConvertToVector256Int32(Avx.Multiply(scale, vf));
// *** RESULTS 2020 March: ***
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
// .NET Core SDK=3.1.200-preview-014971
// Job-IUZXZT : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
//
// | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
// |---------------------------- |------ |-----------:|------------:|----------:|------:|--------:|------:|------:|------:|----------:|
// | FallbackIntrinsics128 | 1024 | 2,952.6 ns | 1,680.77 ns | 92.13 ns | 3.32 | 0.16 | - | - | - | - |
// | BasicIntrinsics256 | 1024 | 1,664.5 ns | 928.11 ns | 50.87 ns | 1.87 | 0.09 | - | - | - | - |
// | ExtendedIntrinsic | 1024 | 890.6 ns | 375.48 ns | 20.58 ns | 1.00 | 0.00 | - | - | - | - |
// | UseAvx2 | 1024 | 299.0 ns | 30.47 ns | 1.67 ns | 0.34 | 0.01 | - | - | - | - |
// | UseAvx2_Grouped | 1024 | 318.1 ns | 48.19 ns | 2.64 ns | 0.36 | 0.01 | - | - | - | - |
// | PixelOperations_Base | 1024 | 8,136.9 ns | 1,834.82 ns | 100.57 ns | 9.14 | 0.26 | - | - | - | 24 B |
// | PixelOperations_Specialized | 1024 | 951.1 ns | 123.93 ns | 6.79 ns | 1.07 | 0.03 | - | - | - | - |
/*
BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3)
11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores
.NET SDK 8.0.200-preview.23624.5
[Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Job-YJYLLR : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3
LaunchCount=1 WarmupCount=3
| Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio |
|---------------------------- |------ |------------:|-------------:|-----------:|------:|--------:|----------:|------------:|
| PixelOperations_Base | 64 | 114.80 ns | 16.459 ns | 0.902 ns | 1.00 | 0.00 | - | NA |
| PixelOperations_Specialized | 64 | 28.91 ns | 80.482 ns | 4.411 ns | 0.25 | 0.04 | - | NA |
| FallbackIntrinsics128 | 64 | 133.60 ns | 23.750 ns | 1.302 ns | 1.16 | 0.02 | - | NA |
| ExtendedIntrinsic | 64 | 40.11 ns | 10.183 ns | 0.558 ns | 0.35 | 0.01 | - | NA |
| UseHwIntrinsics | 64 | 14.71 ns | 4.860 ns | 0.266 ns | 0.13 | 0.00 | - | NA |
| UseAvx2_Grouped | 64 | 20.23 ns | 11.619 ns | 0.637 ns | 0.18 | 0.00 | - | NA |
| | | | | | | | | |
| PixelOperations_Base | 256 | 387.94 ns | 31.591 ns | 1.732 ns | 1.00 | 0.00 | - | NA |
| PixelOperations_Specialized | 256 | 50.93 ns | 22.388 ns | 1.227 ns | 0.13 | 0.00 | - | NA |
| FallbackIntrinsics128 | 256 | 509.72 ns | 249.926 ns | 13.699 ns | 1.31 | 0.04 | - | NA |
| ExtendedIntrinsic | 256 | 140.32 ns | 9.353 ns | 0.513 ns | 0.36 | 0.00 | - | NA |
| UseHwIntrinsics | 256 | 41.99 ns | 16.000 ns | 0.877 ns | 0.11 | 0.00 | - | NA |
| UseAvx2_Grouped | 256 | 63.81 ns | 2.360 ns | 0.129 ns | 0.16 | 0.00 | - | NA |
| | | | | | | | | |
| PixelOperations_Base | 2048 | 2,979.49 ns | 2,023.706 ns | 110.926 ns | 1.00 | 0.00 | - | NA |
| PixelOperations_Specialized | 2048 | 326.19 ns | 19.077 ns | 1.046 ns | 0.11 | 0.00 | - | NA |
| FallbackIntrinsics128 | 2048 | 3,885.95 ns | 411.078 ns | 22.533 ns | 1.31 | 0.05 | - | NA |
| ExtendedIntrinsic | 2048 | 1,078.58 ns | 136.960 ns | 7.507 ns | 0.36 | 0.01 | - | NA |
| UseHwIntrinsics | 2048 | 312.07 ns | 68.662 ns | 3.764 ns | 0.10 | 0.00 | - | NA |
| UseAvx2_Grouped | 2048 | 451.83 ns | 41.742 ns | 2.288 ns | 0.15 | 0.01 | - | NA |
*/
}

66
tests/ImageSharp.Benchmarks/Bulk/FromVector4_Rgb24.cs

@ -7,48 +7,26 @@ using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Benchmarks.Bulk;
[Config(typeof(Config.Short))]
public class FromVector4_Rgb24 : FromVector4<Rgb24>
{
}
public class FromVector4_Rgb24 : FromVector4<Rgb24>;
// 2020-11-02
// ##########
//
// BenchmarkDotNet = v0.12.1, OS = Windows 10.0.19041.572(2004 /?/ 20H1)
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
// .NET Core SDK=3.1.403
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT
// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT
// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
//
// IterationCount=3 LaunchCount=1 WarmupCount=3
//
// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:|
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 64 | 343.2 ns | 305.91 ns | 16.77 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 64 | 320.8 ns | 19.93 ns | 1.09 ns | 0.94 | 0.05 | - | - | - | - |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 64 | 234.3 ns | 17.98 ns | 0.99 ns | 1.00 | 0.00 | 0.0052 | - | - | 24 B |
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 64 | 246.0 ns | 82.34 ns | 4.51 ns | 1.05 | 0.02 | - | - | - | - |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 64 | 222.3 ns | 39.46 ns | 2.16 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 64 | 243.4 ns | 33.58 ns | 1.84 ns | 1.09 | 0.01 | - | - | - | - |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 256 | 824.9 ns | 32.77 ns | 1.80 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 256 | 967.0 ns | 39.09 ns | 2.14 ns | 1.17 | 0.01 | 0.0172 | - | - | 72 B |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 256 | 756.9 ns | 94.43 ns | 5.18 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B |
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 256 | 1,003.3 ns | 3,192.09 ns | 174.97 ns | 1.32 | 0.22 | 0.0172 | - | - | 72 B |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 256 | 748.6 ns | 248.03 ns | 13.60 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 256 | 437.0 ns | 36.48 ns | 2.00 ns | 0.58 | 0.01 | 0.0172 | - | - | 72 B |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 2048 | 5,751.6 ns | 704.24 ns | 38.60 ns | 1.00 | 0.00 | - | - | - | 24 B |
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 2048 | 4,391.6 ns | 718.17 ns | 39.37 ns | 0.76 | 0.00 | 0.0153 | - | - | 72 B |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 2048 | 6,202.0 ns | 1,815.18 ns | 99.50 ns | 1.00 | 0.00 | - | - | - | 24 B |
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 2048 | 4,225.6 ns | 1,004.03 ns | 55.03 ns | 0.68 | 0.01 | 0.0153 | - | - | 72 B |
// | | | | | | | | | | | | | |
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 2048 | 6,157.1 ns | 2,516.98 ns | 137.96 ns | 1.00 | 0.00 | - | - | - | 24 B |
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 2048 | 1,822.7 ns | 1,764.43 ns | 96.71 ns | 0.30 | 0.02 | 0.0172 | - | - | 72 B |
/*
BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3)
11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores
.NET SDK 8.0.200-preview.23624.5
[Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Job-NEHCEM : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3
LaunchCount=1 WarmupCount=3
| Method | Count | Mean | Error | StdDev | Ratio | Gen0 | Allocated | Alloc Ratio |
|---------------------------- |------ |------------:|----------:|---------:|------:|-------:|----------:|------------:|
| PixelOperations_Base | 64 | 95.87 ns | 13.60 ns | 0.745 ns | 1.00 | - | - | NA |
| PixelOperations_Specialized | 64 | 97.34 ns | 30.34 ns | 1.663 ns | 1.02 | - | - | NA |
| | | | | | | | | |
| PixelOperations_Base | 256 | 337.80 ns | 88.10 ns | 4.829 ns | 1.00 | - | - | NA |
| PixelOperations_Specialized | 256 | 195.07 ns | 30.54 ns | 1.674 ns | 0.58 | 0.0153 | 96 B | NA |
| | | | | | | | | |
| PixelOperations_Base | 2048 | 2,561.79 ns | 162.45 ns | 8.905 ns | 1.00 | - | - | NA |
| PixelOperations_Specialized | 2048 | 741.85 ns | 18.05 ns | 0.989 ns | 0.29 | 0.0153 | 96 B | NA |
*/

25
tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs

@ -14,9 +14,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk;
public abstract class ToVector4<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
protected IMemoryOwner<TPixel> source;
protected IMemoryOwner<TPixel> Source { get; set; }
protected IMemoryOwner<Vector4> destination;
protected IMemoryOwner<Vector4> Destination { get; set; }
protected Configuration Configuration => Configuration.Default;
@ -26,22 +26,22 @@ public abstract class ToVector4<TPixel>
[GlobalSetup]
public void Setup()
{
this.source = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.destination = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
this.Source = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.Destination = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
}
[GlobalCleanup]
public void Cleanup()
{
this.source.Dispose();
this.destination.Dispose();
this.Source.Dispose();
this.Destination.Dispose();
}
// [Benchmark]
public void Naive()
{
Span<TPixel> s = this.source.GetSpan();
Span<Vector4> d = this.destination.GetSpan();
Span<TPixel> s = this.Source.GetSpan();
Span<Vector4> d = this.Destination.GetSpan();
for (int i = 0; i < this.Count; i++)
{
@ -50,11 +50,8 @@ public abstract class ToVector4<TPixel>
}
[Benchmark]
public void PixelOperations_Specialized()
{
PixelOperations<TPixel>.Instance.ToVector4(
public void PixelOperations_Specialized() => PixelOperations<TPixel>.Instance.ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
}
this.Source.GetSpan(),
this.Destination.GetSpan());
}

4
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs

@ -16,8 +16,8 @@ public class ToVector4_Bgra32 : ToVector4<Bgra32>
{
new PixelOperations<Bgra32>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
}
// RESULTS:

4
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs

@ -16,8 +16,8 @@ public class ToVector4_Rgb24 : ToVector4<Rgb24>
{
new PixelOperations<Rgb24>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
}
}

72
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs

@ -14,36 +14,18 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk;
[Config(typeof(Config.Short))]
public class ToVector4_Rgba32 : ToVector4<Rgba32>
{
[Benchmark]
public void FallbackIntrinsics128()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(sBytes, dFloats);
}
[Benchmark]
public void PixelOperations_Base()
=> new PixelOperations<Rgba32>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
[Benchmark]
public void ExtendedIntrinsics()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(sBytes, dFloats);
}
this.Source.GetSpan(),
this.Destination.GetSpan());
[Benchmark]
public void HwIntrinsics()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
SimdUtils.HwIntrinsics.ByteToNormalizedFloat(sBytes, dFloats);
}
@ -51,8 +33,8 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
// [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
nuint n = (uint)dFloats.Length / (uint)Vector<byte>.Count;
@ -76,14 +58,14 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
}
n = (uint)(dFloats.Length / Vector<float>.Count);
var scale = new Vector<float>(1f / 255f);
Vector<float> scale = new(1f / 255f);
for (nuint i = 0; i < n; i++)
{
ref Vector<float> dRef = ref Unsafe.Add(ref destBase, i);
var du = Vector.AsVectorInt32(dRef);
var v = Vector.ConvertToSingle(du);
Vector<int> du = Vector.AsVectorInt32(dRef);
Vector<float> v = Vector.ConvertToSingle(du);
v *= scale;
dRef = v;
@ -93,14 +75,14 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
// [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
nuint n = (uint)dFloats.Length / (uint)Vector<byte>.Count;
ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference((ReadOnlySpan<byte>)sBytes));
ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dFloats));
var scale = new Vector<float>(1f / 255f);
Vector<float> scale = new(1f / 255f);
for (nuint i = 0; i < n; i++)
{
@ -126,8 +108,8 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<float> ConvertToNormalizedSingle(Vector<uint> u, Vector<float> scale)
{
var vi = Vector.AsVectorInt32(u);
var v = Vector.ConvertToSingle(vi);
Vector<int> vi = Vector.AsVectorInt32(u);
Vector<float> v = Vector.ConvertToSingle(vi);
v *= scale;
return v;
}
@ -160,4 +142,30 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
PixelOperations_Base | Core | 2048 | 6,752.68 ns | 272.820 ns | 15.4148 ns | 1.67 | 0.02 | - | 24 B |
PixelOperations_Specialized | Core | 2048 | 1,126.13 ns | 79.192 ns | 4.4745 ns |!! 0.28 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock!
*/
/*
BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3)
11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores
.NET SDK 8.0.200-preview.23624.5
[Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Job-DFEQJT : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3
LaunchCount=1 WarmupCount=3
| Method | Count | Mean | Error | StdDev | Allocated |
|---------------------------- |------ |------------:|-----------:|----------:|----------:|
| FallbackIntrinsics128 | 64 | 139.66 ns | 27.429 ns | 1.503 ns | - |
| PixelOperations_Base | 64 | 124.65 ns | 29.653 ns | 1.625 ns | - |
| HwIntrinsics | 64 | 18.16 ns | 4.731 ns | 0.259 ns | - |
| PixelOperations_Specialized | 64 | 27.94 ns | 15.220 ns | 0.834 ns | - |
| FallbackIntrinsics128 | 256 | 525.07 ns | 34.397 ns | 1.885 ns | - |
| PixelOperations_Base | 256 | 464.17 ns | 46.897 ns | 2.571 ns | - |
| HwIntrinsics | 256 | 43.88 ns | 4.525 ns | 0.248 ns | - |
| PixelOperations_Specialized | 256 | 55.57 ns | 14.587 ns | 0.800 ns | - |
| FallbackIntrinsics128 | 2048 | 4,148.44 ns | 476.583 ns | 26.123 ns | - |
| PixelOperations_Base | 2048 | 3,608.42 ns | 66.293 ns | 3.634 ns | - |
| HwIntrinsics | 2048 | 361.42 ns | 35.576 ns | 1.950 ns | - |
| PixelOperations_Specialized | 2048 | 374.82 ns | 33.371 ns | 1.829 ns | - |
*/
}

2
tests/ImageSharp.Benchmarks/LoadResizeSave/README.md

@ -1,4 +1,4 @@
The benchmarks have been adapted from the
The benchmarks have been adapted from the
[PhotoSauce's MemoryStress project](https://github.com/saucecontrol/core-imaging-playground/tree/beeees/MemoryStress).
### Setup

59
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -3,6 +3,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Tests.TestUtilities;
@ -112,26 +113,15 @@ public partial class SimdUtilsTests
public static readonly TheoryData<int> ArraySizesDivisibleBy4 = new() { 0, 4, 8, 28, 1020 };
public static readonly TheoryData<int> ArraySizesDivisibleBy3 = new() { 0, 3, 9, 36, 957 };
public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new() { 0, 32, 512 };
public static readonly TheoryData<int> ArraySizesDivisibleBy64 = new() { 0, 64, 512 };
public static readonly TheoryData<int> ArbitraryArraySizes = new() { 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520 };
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count) => TestImpl_BulkConvertByteToNormalizedFloat(
count,
(s, d) => SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(s.Span, d.Span));
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count) => TestImpl_BulkConvertByteToNormalizedFloat(
count,
(s, d) => SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(s.Span, d.Span));
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
[MemberData(nameof(ArraySizesDivisibleBy64))]
public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count)
{
if (!Sse2.IsSupported)
if (!Sse2.IsSupported && !AdvSimd.IsSupported)
{
return;
}
@ -143,7 +133,7 @@ public partial class SimdUtilsTests
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
count,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41);
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41);
}
[Theory]
@ -166,43 +156,10 @@ public partial class SimdUtilsTests
}
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count) => TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
count,
(s, d) => SimdUtils.FallbackIntrinsics128.NormalizedFloatToByteSaturate(s.Span, d.Span));
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) => TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
count,
(s, d) => SimdUtils.ExtendedIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span));
[Theory]
[InlineData(1234)]
public void ExtendedIntrinsics_ConvertToSingle(short scale)
{
int n = Vector<float>.Count;
short[] sData = new Random(scale).GenerateRandomInt16Array(2 * n, (short)-scale, scale);
float[] fData = sData.Select(u => (float)u).ToArray();
Vector<short> source = new(sData);
Vector<float> expected1 = new(fData, 0);
Vector<float> expected2 = new(fData, n);
// Act:
SimdUtils.ExtendedIntrinsics.ConvertToSingle(source, out Vector<float> actual1, out Vector<float> actual2);
// Assert:
Assert.Equal(expected1, actual1);
Assert.Equal(expected2, actual2);
}
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
[MemberData(nameof(ArraySizesDivisibleBy64))]
public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
if (!Sse2.IsSupported)
if (!Sse2.IsSupported && !AdvSimd.IsSupported)
{
return;
}
@ -214,7 +171,7 @@ public partial class SimdUtilsTests
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
count,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512BW | HwIntrinsics.DisableAVX2);
}
[Theory]

22
tests/ImageSharp.Tests/TestUtilities/BasicSerializer.cs

@ -13,14 +13,14 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities;
/// </summary>
internal class BasicSerializer : IXunitSerializationInfo
{
private readonly Dictionary<string, string> map = new Dictionary<string, string>();
private readonly Dictionary<string, string> map = [];
public const char Separator = ':';
private string DumpToString(Type type)
{
using var ms = new MemoryStream();
using var writer = new StreamWriter(ms);
using MemoryStream ms = new();
using StreamWriter writer = new(ms);
writer.WriteLine(type.FullName);
foreach (KeyValuePair<string, string> kv in this.map)
{
@ -29,16 +29,16 @@ internal class BasicSerializer : IXunitSerializationInfo
writer.Flush();
byte[] data = ms.ToArray();
return System.Convert.ToBase64String(data);
return Convert.ToBase64String(data);
}
private Type LoadDump(string dump)
{
byte[] data = System.Convert.FromBase64String(dump);
byte[] data = Convert.FromBase64String(dump);
using var ms = new MemoryStream(data);
using var reader = new StreamReader(ms);
var type = Type.GetType(reader.ReadLine());
using MemoryStream ms = new(data);
using StreamReader reader = new(ms);
Type type = Type.GetType(reader.ReadLine());
for (string s = reader.ReadLine(); s != null; s = reader.ReadLine())
{
string[] kv = s.Split(Separator);
@ -50,7 +50,7 @@ internal class BasicSerializer : IXunitSerializationInfo
public static string Serialize(IXunitSerializable serializable)
{
var serializer = new BasicSerializer();
BasicSerializer serializer = new();
serializable.Serialize(serializer);
return serializer.DumpToString(serializable.GetType());
}
@ -58,10 +58,10 @@ internal class BasicSerializer : IXunitSerializationInfo
public static T Deserialize<T>(string dump)
where T : IXunitSerializable
{
var serializer = new BasicSerializer();
BasicSerializer serializer = new();
Type type = serializer.LoadDump(dump);
var result = (T)Activator.CreateInstance(type);
T result = (T)Activator.CreateInstance(type);
result.Deserialize(serializer);
return result;
}

81
tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs

@ -2,6 +2,7 @@
// Licensed under the Six Labors Split License.
using System.Diagnostics;
using System.Globalization;
using Microsoft.DotNet.RemoteExecutor;
using Xunit.Abstractions;
@ -12,7 +13,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities;
/// </summary>
public static class FeatureTestRunner
{
private static readonly char[] SplitChars = { ',', ' ' };
private static readonly char[] SplitChars = [',', ' '];
/// <summary>
/// Allows the deserialization of parameters passed to the feature test.
@ -40,7 +41,7 @@ public static class FeatureTestRunner
/// <returns>The <typeparamref name="T"/> value.</returns>
public static T Deserialize<T>(string value)
where T : IConvertible
=> (T)Convert.ChangeType(value, typeof(T));
=> (T)Convert.ChangeType(value, typeof(T), CultureInfo.InvariantCulture);
/// <summary>
/// Runs the given test <paramref name="action"/> within an environment
@ -127,6 +128,7 @@ public static class FeatureTestRunner
/// Runs the given test <paramref name="action"/> within an environment
/// with the given <paramref name="intrinsics"/> features applied.
/// </summary>
/// <typeparam name="T">The type of argument.</typeparam>
/// <param name="action">The test action to run.</param>
/// <param name="intrinsics">The intrinsics features.</param>
/// <param name="serializable">The value to pass as a parameter to the test action.</param>
@ -170,6 +172,7 @@ public static class FeatureTestRunner
/// Runs the given test <paramref name="action"/> within an environment
/// with the given <paramref name="intrinsics"/> features applied.
/// </summary>
/// <typeparam name="T">The type of argument.</typeparam>
/// <param name="action">The test action to run.</param>
/// <param name="intrinsics">The intrinsics features.</param>
/// <param name="serializable">The value to pass as a parameter to the test action.</param>
@ -214,6 +217,8 @@ public static class FeatureTestRunner
/// Runs the given test <paramref name="action"/> within an environment
/// with the given <paramref name="intrinsics"/> features applied.
/// </summary>
/// <typeparam name="T">The type of argument.</typeparam>
/// <typeparam name="T2">The additional type of argument.</typeparam>
/// <param name="action">The test action to run.</param>
/// <param name="intrinsics">The intrinsics features.</param>
/// <param name="arg1">The value to pass as a parameter to the test action.</param>
@ -261,6 +266,7 @@ public static class FeatureTestRunner
/// Runs the given test <paramref name="action"/> within an environment
/// with the given <paramref name="intrinsics"/> features applied.
/// </summary>
/// <typeparam name="T">The type of argument.</typeparam>
/// <param name="action">The test action to run.</param>
/// <param name="intrinsics">The intrinsics features.</param>
/// <param name="arg1">The value to pass as a parameter to the test action.</param>
@ -307,6 +313,7 @@ public static class FeatureTestRunner
/// Runs the given test <paramref name="action"/> within an environment
/// with the given <paramref name="intrinsics"/> features applied.
/// </summary>
/// <typeparam name="T">The type of argument.</typeparam>
/// <param name="action">The test action to run.</param>
/// <param name="serializable">The value to pass as a parameter to the test action.</param>
/// <param name="intrinsics">The intrinsics features.</param>
@ -350,6 +357,7 @@ public static class FeatureTestRunner
/// Runs the given test <paramref name="action"/> within an environment
/// with the given <paramref name="intrinsics"/> features applied.
/// </summary>
/// <typeparam name="T">The type of argument.</typeparam>
/// <param name="action">The test action to run.</param>
/// <param name="arg0">The value to pass as a parameter #0 to the test action.</param>
/// <param name="arg1">The value to pass as a parameter #1 to the test action.</param>
@ -395,10 +403,10 @@ public static class FeatureTestRunner
internal static Dictionary<HwIntrinsics, string> ToFeatureKeyValueCollection(this HwIntrinsics intrinsics)
{
// Loop through and translate the given values into COMPlus equivalents
Dictionary<HwIntrinsics, string> features = new();
Dictionary<HwIntrinsics, string> features = [];
foreach (string intrinsic in intrinsics.ToString("G").Split(SplitChars, StringSplitOptions.RemoveEmptyEntries))
{
HwIntrinsics key = (HwIntrinsics)Enum.Parse(typeof(HwIntrinsics), intrinsic);
HwIntrinsics key = Enum.Parse<HwIntrinsics>(intrinsic);
switch (intrinsic)
{
case nameof(HwIntrinsics.AllowAll):
@ -418,40 +426,47 @@ public static class FeatureTestRunner
}
/// <summary>
/// See <see href="https://github.com/dotnet/runtime/blob/50ac454d8d8a1915188b2a4bb3fff3b81bf6c0cf/src/coreclr/src/jit/jitconfigvalues.h#L224"/>
/// <remarks>
/// <see cref="DisableSIMD"/> ends up impacting all SIMD support (including System.Numerics)
/// but not things like <see cref="DisableBMI1"/>, <see cref="DisableBMI2"/>, and <see cref="DisableLZCNT"/>.
/// </remarks>
/// See <see href="https://github.com/dotnet/runtime/blob/58601ba7da092fe82bb71d087d30df95472968b6/src/coreclr/jit/jitconfigvalues.h#L315"/>
/// </summary>
[Flags]
#pragma warning disable RCS1135 // Declare enum member with zero value (when enum has FlagsAttribute).
public enum HwIntrinsics
public enum HwIntrinsics : long
#pragma warning restore RCS1135 // Declare enum member with zero value (when enum has FlagsAttribute).
{
// Use flags so we can pass multiple values without using params.
// Don't base on 0 or use inverse for All as that doesn't translate to string values.
DisableHWIntrinsic = 1 << 0,
DisableSSE = 1 << 1,
DisableSSE2 = 1 << 2,
DisableAES = 1 << 3,
DisablePCLMULQDQ = 1 << 4,
DisableSSE3 = 1 << 5,
DisableSSSE3 = 1 << 6,
DisableSSE41 = 1 << 7,
DisableSSE42 = 1 << 8,
DisablePOPCNT = 1 << 9,
DisableAVX = 1 << 10,
DisableFMA = 1 << 11,
DisableAVX2 = 1 << 12,
DisableBMI1 = 1 << 13,
DisableBMI2 = 1 << 14,
DisableLZCNT = 1 << 15,
DisableArm64AdvSimd = 1 << 16,
DisableArm64Crc32 = 1 << 17,
DisableArm64Dp = 1 << 18,
DisableArm64Aes = 1 << 19,
DisableArm64Sha1 = 1 << 20,
DisableArm64Sha256 = 1 << 21,
AllowAll = 1 << 22
DisableHWIntrinsic = 1L << 0,
DisableSSE = 1L << 1,
DisableSSE2 = 1L << 2,
DisableAES = 1L << 3,
DisablePCLMULQDQ = 1L << 4,
DisableSSE3 = 1L << 5,
DisableSSSE3 = 1L << 6,
DisableSSE41 = 1L << 7,
DisableSSE42 = 1L << 8,
DisablePOPCNT = 1L << 9,
DisableAVX = 1L << 10,
DisableFMA = 1L << 11,
DisableAVX2 = 1L << 12,
DisableAVXVNNI = 1L << 13,
DisableAVX512BW = 1L << 14,
DisableAVX512BW_VL = 1L << 15,
DisableAVX512CD = 1L << 16,
DisableAVX512CD_VL = 1L << 17,
DisableAVX512DQ = 1L << 18,
DisableAVX512DQ_VL = 1L << 19,
DisableAVX512F = 1L << 20,
DisableAVX512F_VL = 1L << 21,
DisableAVX512VBMI = 1L << 22,
DisableAVX512VBMI_VL = 1L << 23,
DisableBMI1 = 1L << 24,
DisableBMI2 = 1L << 25,
DisableLZCNT = 1L << 26,
DisableArm64AdvSimd = 1L << 27,
DisableArm64Crc32 = 1L << 28,
DisableArm64Dp = 1L << 29,
DisableArm64Aes = 1L << 30,
DisableArm64Sha1 = 1L << 31,
DisableArm64Sha256 = 1L << 32,
AllowAll = 1L << 33
}

Loading…
Cancel
Save