Browse Source

Optimize and cleanup ByteToNormalizedFloatReduce

pull/2654/head
James Jackson-South 2 years ago
parent
commit
c6758df08b
  1. 20
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 77
      src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs
  3. 38
      src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
  4. 83
      src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
  5. 190
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  6. 97
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  7. 13
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  8. 42
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  9. 75
      src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs
  10. 25
      tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs
  11. 4
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs
  12. 4
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs
  13. 63
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs
  14. 2
      tests/ImageSharp.Benchmarks/LoadResizeSave/README.md
  15. 36
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

20
src/ImageSharp/Common/Helpers/Numerics.cs

@ -1069,4 +1069,24 @@ internal static class Numerics
public static nuint Vector256Count<TVector>(int length)
where TVector : struct
=> (uint)length / (uint)Vector256<TVector>.Count;
/// <summary>
/// Gets the count of vectors that safely fit into the given span.
/// </summary>
/// <typeparam name="TVector">The type of the vector.</typeparam>
/// <param name="span">The given span.</param>
/// <returns>Count of vectors that safely fit into the span.</returns>
public static nuint Vector512Count<TVector>(this Span<float> span)
where TVector : struct
=> (uint)span.Length / (uint)Vector512<TVector>.Count;
/// <summary>
/// Gets the count of vectors that safely fit into length.
/// </summary>
/// <typeparam name="TVector">The type of the vector.</typeparam>
/// <param name="length">The given length.</param>
/// <returns>Count of vectors that safely fit into the length.</returns>
public static nuint Vector512Count<TVector>(int length)
where TVector : struct
=> (uint)length / (uint)Vector512<TVector>.Count;
}

77
src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs

@ -0,0 +1,77 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="destination">The destination span of floats</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> destination)
{
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref destination);
if (source.Length > 0)
{
ConvertByteToNormalizedFloatRemainder(source, destination);
}
}
/// <summary>
/// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'destination' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="destination">The destination span of bytes</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> destination)
{
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination);
if (source.Length > 0)
{
ConvertNormalizedFloatToByteRemainder(source, destination);
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> destination)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref float dBase = ref MemoryMarshal.GetReference(destination);
for (int i = 0; i < source.Length; i++)
{
Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f;
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> destination)
{
ref float sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(destination);
for (int i = 0; i < source.Length; i++)
{
Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i));
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static byte ConvertToByte(float f) => (byte)Numerics.Clamp((f * 255F) + 0.5F, 0, 255F);
}

38
src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs

@ -1,38 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Implementation methods based on newer <see cref="Vector{T}"/> API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
/// Only accelerated on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+, .NET 4.7.2+)
/// See:
/// https://github.com/dotnet/coreclr/pull/10662
/// API Proposal:
/// https://github.com/dotnet/corefx/issues/15957
/// </summary>
public static class ExtendedIntrinsics
{
public static bool IsAvailable { get; } = Vector.IsHardwareAccelerated;
/// <summary>
/// Widen and convert a vector of <see cref="short"/> values into 2 vectors of <see cref="float"/>-s.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ConvertToSingle(
Vector<short> source,
out Vector<float> dest1,
out Vector<float> dest2)
{
Vector.Widen(source, out Vector<int> i1, out Vector<int> i2);
dest1 = Vector.ConvertToSingle(i1);
dest2 = Vector.ConvertToSingle(i2);
}
}
}

83
src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs

@ -1,83 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Fallback implementation based on <see cref="Vector4"/> (128bit).
/// For <see cref="Vector4"/>, efficient software fallback implementations are present,
/// and we hope that even mono's JIT is able to emit SIMD instructions for that type :P
/// </summary>
public static class FallbackIntrinsics128
{
/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
int remainder = Numerics.Modulo4(source.Length);
int adjustedCount = source.Length - remainder;
if (adjustedCount > 0)
{
ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
source = source[adjustedCount..];
dest = dest[adjustedCount..];
}
}
/// <summary>
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/> using <see cref="Vector4"/>.
/// </summary>
[MethodImpl(InliningOptions.ColdPath)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
DebugVerifySpanInput(source, dest, 4);
uint count = (uint)dest.Length / 4;
if (count == 0)
{
return;
}
ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));
const float scale = 1f / 255f;
Vector4 d = default;
for (nuint i = 0; i < count; i++)
{
ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);
d.X = s.X;
d.Y = s.Y;
d.Z = s.Z;
d.W = s.W;
d *= scale;
Unsafe.Add(ref dBase, i) = d;
}
}
[StructLayout(LayoutKind.Sequential)]
private struct ByteVector4
{
public byte X;
public byte Y;
public byte Z;
public byte W;
}
}
}

190
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -752,17 +752,23 @@ internal static partial class SimdUtils
/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
ref Span<float> destination)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
if (Avx2.IsSupported || Sse2.IsSupported)
{
int remainder;
if (Avx2.IsSupported)
if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count);
}
else if (Avx2.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count);
}
@ -775,10 +781,10 @@ internal static partial class SimdUtils
if (adjustedCount > 0)
{
ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
ByteToNormalizedFloat(source[..adjustedCount], destination[..adjustedCount]);
source = source[adjustedCount..];
dest = dest[adjustedCount..];
destination = destination[adjustedCount..];
}
}
}
@ -786,97 +792,127 @@ internal static partial class SimdUtils
/// <summary>
/// Implementation <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
/// </remarks>
internal static unsafe void ByteToNormalizedFloat(
ReadOnlySpan<byte> source,
Span<float> dest)
Span<float> destination)
{
fixed (byte* sourceBase = source)
if (Avx512F.IsSupported)
{
if (Avx2.IsSupported)
{
DebugVerifySpanInput(source, dest, Vector256<byte>.Count);
nuint n = dest.Vector256Count<byte>();
DebugVerifySpanInput(source, destination, Vector512<byte>.Count);
ref Vector256<float> destBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
nuint n = destination.Vector512Count<byte>();
Vector256<float> scale = Vector256.Create(1 / (float)byte.MaxValue);
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector256<byte>.Count * i;
Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector512<byte>.Count * i;
Vector512<int> i0 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si));
Vector512<int> i1 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector512<int>.Count));
Vector512<int> i2 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 2)));
Vector512<int> i3 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 3)));
// Declare multiplier on each line. Codegen is better.
Vector512<float> f0 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i0);
Vector512<float> f1 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i1);
Vector512<float> f2 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i2);
Vector512<float> f3 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i3);
ref Vector512<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
else
}
else if (Avx2.IsSupported)
{
DebugVerifySpanInput(source, destination, Vector256<byte>.Count);
nuint n = destination.Vector256Count<byte>();
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
for (nuint i = 0; i < n; i++)
{
// Sse
DebugVerifySpanInput(source, dest, Vector128<byte>.Count);
nuint si = (uint)Vector256<byte>.Count * i;
Vector256<int> i0 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si));
Vector256<int> i1 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector256<int>.Count));
Vector256<int> i2 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector256<int>.Count * 2)));
// Ensure overreads past 16 byte boundary do not happen in debug due to lack of containment.
ref ulong refULong = ref Unsafe.As<byte, ulong>(ref Unsafe.Add(ref sourceBase, si));
Vector256<int> i3 = Avx2.ConvertToVector256Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refULong, 3)).AsByte());
// Declare multiplier on each line. Codegen is better.
Vector256<float> f0 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i0);
Vector256<float> f1 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i1);
Vector256<float> f2 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i2);
Vector256<float> f3 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i3);
ref Vector256<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
}
else if (Sse2.IsSupported || AdvSimd.IsSupported)
{
DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
nuint n = dest.Vector128Count<byte>();
nuint n = destination.Vector128Count<byte>();
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
ref Vector128<float> destBase =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
Vector128<byte> zero = Vector128<byte>.Zero;
Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
Vector128<byte> zero = Vector128<byte>.Zero;
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector128<byte>.Count * i;
for (nuint i = 0; i < n; i++)
Vector128<int> i0, i1, i2, i3;
if (Sse41.IsSupported)
{
nuint si = (uint)Vector128<byte>.Count * i;
Vector128<int> i0, i1, i2, i3;
if (Sse41.IsSupported)
{
i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
}
else
{
Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
}
Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
ref int refInt = ref Unsafe.As<byte, int>(ref Unsafe.Add(ref sourceBase, si));
i0 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(refInt).AsByte());
i1 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 1)).AsByte());
i2 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 2)).AsByte());
i3 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 3)).AsByte());
}
else
{
// Sse2, AdvSimd
Vector128<byte> b = Vector128.LoadUnsafe(ref sourceBase, si);
(Vector128<ushort> s0, Vector128<ushort> s1) = Vector128.Widen(b);
(i0, i1) = Vector128.Widen(s0.AsInt16());
(i2, i3) = Vector128.Widen(s1.AsInt16());
}
Vector128<float> f0 = scale * Vector128.ConvertToSingle(i0);
Vector128<float> f1 = scale * Vector128.ConvertToSingle(i1);
Vector128<float> f2 = scale * Vector128.ConvertToSingle(i2);
Vector128<float> f3 = scale * Vector128.ConvertToSingle(i3);
ref Vector128<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
}
}

97
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -22,13 +22,6 @@ internal static partial class SimdUtils
public static bool HasVector8 { get; } =
Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
/// <summary>
/// Gets a value indicating whether <see cref="Vector{T}"/> code is being JIT-ed to SSE instructions
/// where float and integer registers are of size 128 byte.
/// </summary>
public static bool HasVector4 { get; } =
Vector.IsHardwareAccelerated && Vector<float>.Count == 4;
/// <summary>
/// Transform all scalars in 'v' in a way that converting them to <see cref="int"/> would have rounding semantics.
/// </summary>
@ -69,96 +62,6 @@ internal static partial class SimdUtils
}
}
/// <summary>
/// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="dest">The destination span of floats</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
// Also deals with the remainder from previous conversions:
FallbackIntrinsics128.ByteToNormalizedFloatReduce(ref source, ref dest);
// Deal with the remainder:
if (source.Length > 0)
{
ConvertByteToNormalizedFloatRemainder(source, dest);
}
}
/// <summary>
/// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'destination' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="destination">The destination span of bytes</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> destination)
{
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination);
// Deal with the remainder:
if (source.Length > 0)
{
ConvertNormalizedFloatToByteRemainder(source, destination);
}
}
[MethodImpl(InliningOptions.ColdPath)]
private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> destination)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref float dBase = ref MemoryMarshal.GetReference(destination);
// There are at most 3 elements at this point, having a for loop is overkill.
// Let's minimize the no. of instructions!
switch (source.Length)
{
case 3:
Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2) / 255f;
goto case 2;
case 2:
Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1) / 255f;
goto case 1;
case 1:
dBase = sBase / 255f;
break;
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> destination)
{
ref float sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(destination);
for (int i = 0; i < source.Length; i++)
{
Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i));
}
}
[MethodImpl(InliningOptions.ShortMethod)]
private static byte ConvertToByte(float f) => (byte)Numerics.Clamp((f * 255F) + 0.5F, 0, 255F);
[Conditional("DEBUG")]
private static void VerifyHasVector8(string operation)
{
if (!HasVector8)
{
throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!");
}
}
[Conditional("DEBUG")]
private static void DebugVerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest, int shouldBeDivisibleBy)
{

13
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -26,7 +26,7 @@ internal static class Vector128Utilities
public static bool SupportsShuffleFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Sse.IsSupported || AdvSimd.IsSupported;
get => Sse.IsSupported;
}
/// <summary>
@ -70,17 +70,6 @@ internal static class Vector128Utilities
return Sse.Shuffle(vector, vector, control);
}
if (AdvSimd.IsSupported)
{
#pragma warning disable CA1857 // A constant is expected for the parameter
Vector128<float> result = Vector128.Create(AdvSimd.Extract(vector, (byte)(control & 0x3)));
result = AdvSimd.Insert(result, 1, AdvSimd.Extract(vector, (byte)((control >> 2) & 0x3)));
result = AdvSimd.Insert(result, 2, AdvSimd.Extract(vector, (byte)((control >> 4) & 0x3)));
result = AdvSimd.Insert(result, 3, AdvSimd.Extract(vector, (byte)((control >> 6) & 0x3)));
#pragma warning restore CA1857 // A constant is expected for the parameter
return result;
}
ThrowUnreachableException();
return default;
}

42
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -386,29 +386,33 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
{
DebugGuard.IsTrue(
SimdUtils.HasVector8,
Avx2.IsSupported,
"LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
ref Vector<short> sRef = ref Unsafe.As<Block8x8, Vector<short>>(ref source);
ref Vector<float> dRef = ref Unsafe.As<Block8x8F, Vector<float>>(ref this);
ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
// Vector<ushort>.Count == 16 on AVX2
// Vector256<ushort>.Count == 16 on AVX2
// We can process 2 block rows in a single step
SimdUtils.ExtendedIntrinsics.ConvertToSingle(sRef, out Vector<float> top, out Vector<float> bottom);
dRef = top;
Unsafe.Add(ref dRef, 1) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 1), out top, out bottom);
Unsafe.Add(ref dRef, 2) = top;
Unsafe.Add(ref dRef, 3) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 2), out top, out bottom);
Unsafe.Add(ref dRef, 4) = top;
Unsafe.Add(ref dRef, 5) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 3), out top, out bottom);
Unsafe.Add(ref dRef, 6) = top;
Unsafe.Add(ref dRef, 7) = bottom;
Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
}
/// <summary>

75
src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs

@ -5,6 +5,7 @@ using System.Buffers;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace SixLabors.ImageSharp.PixelFormats.Utils;
@ -31,74 +32,86 @@ internal static partial class Vector4Converters
/// Provides an efficient default implementation for <see cref="PixelOperations{TPixel}.ToVector4(Configuration,ReadOnlySpan{TPixel},Span{Vector4},PixelConversionModifiers)"/>
/// The method works by internally converting to a <see cref="Rgba32"/> therefore it's not applicable for that type!
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
/// <typeparam name="TPixel">The type of pixel format.</typeparam>
/// <param name="configuration">The configuration.</param>
/// <param name="pixelOperations">The pixel operations instance.</param>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <param name="modifiers">The conversion modifier flags.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ToVector4<TPixel>(
Configuration configuration,
PixelOperations<TPixel> pixelOperations,
ReadOnlySpan<TPixel> sourcePixels,
Span<Vector4> destVectors,
ReadOnlySpan<TPixel> source,
Span<Vector4> destination,
PixelConversionModifiers modifiers)
where TPixel : unmanaged, IPixel<TPixel>
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destVectors, nameof(destVectors));
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
int count = sourcePixels.Length;
int count = source.Length;
// Not worth for small buffers:
if (count < Vector4ConversionThreshold)
{
Default.UnsafeToVector4(sourcePixels, destVectors, modifiers);
Default.UnsafeToVector4(source, destination, modifiers);
return;
}
// Using the last quarter of 'destVectors' as a temporary buffer to avoid allocation:
// Using the last quarter of 'destination' as a temporary buffer to avoid allocation:
int countWithoutLastItem = count - 1;
ReadOnlySpan<TPixel> reducedSource = sourcePixels[..countWithoutLastItem];
Span<Rgba32> lastQuarterOfDestBuffer = MemoryMarshal.Cast<Vector4, Rgba32>(destVectors).Slice((3 * count) + 1, countWithoutLastItem);
pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestBuffer);
ReadOnlySpan<TPixel> reducedSource = source[..countWithoutLastItem];
Span<Rgba32> lastQuarterOfDestination = MemoryMarshal.Cast<Vector4, Rgba32>(destination).Slice((3 * count) + 1, countWithoutLastItem);
pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestination);
// 'destVectors' and 'lastQuarterOfDestBuffer' are overlapping buffers,
// 'destination' and 'lastQuarterOfDestination' are overlapping buffers,
// but we are always reading/writing at different positions:
SimdUtils.ByteToNormalizedFloat(
MemoryMarshal.Cast<Rgba32, byte>(lastQuarterOfDestBuffer),
MemoryMarshal.Cast<Vector4, float>(destVectors[..countWithoutLastItem]));
MemoryMarshal.Cast<Rgba32, byte>(lastQuarterOfDestination),
MemoryMarshal.Cast<Vector4, float>(destination[..countWithoutLastItem]));
destVectors[countWithoutLastItem] = sourcePixels[countWithoutLastItem].ToVector4();
destination[countWithoutLastItem] = source[countWithoutLastItem].ToVector4();
// TODO: Investigate optimized 1-pass approach!
ApplyForwardConversionModifiers(destVectors, modifiers);
ApplyForwardConversionModifiers(destination, modifiers);
}
/// <summary>
/// Provides an efficient default implementation for <see cref="PixelOperations{TPixel}.FromVector4Destructive(Configuration,Span{Vector4},Span{TPixel},PixelConversionModifiers)"/>
/// The method works by internally converting to a <see cref="Rgba32"/>, therefore it's not applicable for that type!
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
/// <typeparam name="TPixel">The type of pixel format.</typeparam>
/// <param name="configuration">The configuration.</param>
/// <param name="pixelOperations">The pixel operations instance.</param>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <param name="modifiers">The conversion modifier flags.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void FromVector4<TPixel>(
Configuration configuration,
PixelOperations<TPixel> pixelOperations,
Span<Vector4> sourceVectors,
Span<TPixel> destPixels,
Span<Vector4> source,
Span<TPixel> destination,
PixelConversionModifiers modifiers)
where TPixel : unmanaged, IPixel<TPixel>
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourceVectors, destPixels, nameof(destPixels));
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
int count = sourceVectors.Length;
int count = source.Length;
// Not worth for small buffers:
if (count < Vector4ConversionThreshold)
{
Default.UnsafeFromVector4(sourceVectors, destPixels, modifiers);
Default.UnsafeFromVector4(source, destination, modifiers);
return;
}
// TODO: Investigate optimized 1-pass approach!
ApplyBackwardConversionModifiers(sourceVectors, modifiers);
ApplyBackwardConversionModifiers(source, modifiers);
// For the opposite direction it's not easy to implement the trick used in RunRgba32CompatibleToVector4Conversion,
// so let's allocate a temporary buffer as usually:
@ -106,20 +119,30 @@ internal static partial class Vector4Converters
Span<Rgba32> tempSpan = tempBuffer.Memory.Span;
SimdUtils.NormalizedFloatToByteSaturate(
MemoryMarshal.Cast<Vector4, float>(sourceVectors),
MemoryMarshal.Cast<Vector4, float>(source),
MemoryMarshal.Cast<Rgba32, byte>(tempSpan));
pixelOperations.FromRgba32(configuration, tempSpan, destPixels);
pixelOperations.FromRgba32(configuration, tempSpan, destination);
}
private static int CalculateVector4ConversionThreshold()
{
if (!Vector.IsHardwareAccelerated)
if (!Vector128.IsHardwareAccelerated)
{
return int.MaxValue;
}
return SimdUtils.ExtendedIntrinsics.IsAvailable && SimdUtils.HasVector8 ? 256 : 128;
if (Vector512.IsHardwareAccelerated)
{
return 512;
}
if (Vector256.IsHardwareAccelerated)
{
return 256;
}
return 128;
}
}
}

25
tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs

@ -14,9 +14,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk;
public abstract class ToVector4<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
protected IMemoryOwner<TPixel> source;
protected IMemoryOwner<TPixel> Source { get; set; }
protected IMemoryOwner<Vector4> destination;
protected IMemoryOwner<Vector4> Destination { get; set; }
protected Configuration Configuration => Configuration.Default;
@ -26,22 +26,22 @@ public abstract class ToVector4<TPixel>
[GlobalSetup]
public void Setup()
{
this.source = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.destination = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
this.Source = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.Destination = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
}
[GlobalCleanup]
public void Cleanup()
{
this.source.Dispose();
this.destination.Dispose();
this.Source.Dispose();
this.Destination.Dispose();
}
// [Benchmark]
public void Naive()
{
Span<TPixel> s = this.source.GetSpan();
Span<Vector4> d = this.destination.GetSpan();
Span<TPixel> s = this.Source.GetSpan();
Span<Vector4> d = this.Destination.GetSpan();
for (int i = 0; i < this.Count; i++)
{
@ -50,11 +50,8 @@ public abstract class ToVector4<TPixel>
}
[Benchmark]
public void PixelOperations_Specialized()
{
PixelOperations<TPixel>.Instance.ToVector4(
public void PixelOperations_Specialized() => PixelOperations<TPixel>.Instance.ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
}
this.Source.GetSpan(),
this.Destination.GetSpan());
}

4
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs

@ -16,8 +16,8 @@ public class ToVector4_Bgra32 : ToVector4<Bgra32>
{
new PixelOperations<Bgra32>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
}
// RESULTS:

4
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs

@ -16,8 +16,8 @@ public class ToVector4_Rgb24 : ToVector4<Rgb24>
{
new PixelOperations<Rgb24>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
}
}

63
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs

@ -14,27 +14,18 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk;
[Config(typeof(Config.Short))]
public class ToVector4_Rgba32 : ToVector4<Rgba32>
{
[Benchmark]
public void FallbackIntrinsics128()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(sBytes, dFloats);
}
[Benchmark]
public void PixelOperations_Base()
=> new PixelOperations<Rgba32>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
[Benchmark]
public void HwIntrinsics()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
SimdUtils.HwIntrinsics.ByteToNormalizedFloat(sBytes, dFloats);
}
@ -42,8 +33,8 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
// [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
nuint n = (uint)dFloats.Length / (uint)Vector<byte>.Count;
@ -67,14 +58,14 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
}
n = (uint)(dFloats.Length / Vector<float>.Count);
var scale = new Vector<float>(1f / 255f);
Vector<float> scale = new(1f / 255f);
for (nuint i = 0; i < n; i++)
{
ref Vector<float> dRef = ref Unsafe.Add(ref destBase, i);
var du = Vector.AsVectorInt32(dRef);
var v = Vector.ConvertToSingle(du);
Vector<int> du = Vector.AsVectorInt32(dRef);
Vector<float> v = Vector.ConvertToSingle(du);
v *= scale;
dRef = v;
@ -84,14 +75,14 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
// [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
nuint n = (uint)dFloats.Length / (uint)Vector<byte>.Count;
ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference((ReadOnlySpan<byte>)sBytes));
ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dFloats));
var scale = new Vector<float>(1f / 255f);
Vector<float> scale = new(1f / 255f);
for (nuint i = 0; i < n; i++)
{
@ -117,8 +108,8 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<float> ConvertToNormalizedSingle(Vector<uint> u, Vector<float> scale)
{
var vi = Vector.AsVectorInt32(u);
var v = Vector.ConvertToSingle(vi);
Vector<int> vi = Vector.AsVectorInt32(u);
Vector<float> v = Vector.ConvertToSingle(vi);
v *= scale;
return v;
}
@ -151,4 +142,30 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
PixelOperations_Base | Core | 2048 | 6,752.68 ns | 272.820 ns | 15.4148 ns | 1.67 | 0.02 | - | 24 B |
PixelOperations_Specialized | Core | 2048 | 1,126.13 ns | 79.192 ns | 4.4745 ns |!! 0.28 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock!
*/
/*
BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3)
11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores
.NET SDK 8.0.200-preview.23624.5
[Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Job-DFEQJT : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3
LaunchCount=1 WarmupCount=3
| Method | Count | Mean | Error | StdDev | Allocated |
|---------------------------- |------ |------------:|-----------:|----------:|----------:|
| FallbackIntrinsics128 | 64 | 139.66 ns | 27.429 ns | 1.503 ns | - |
| PixelOperations_Base | 64 | 124.65 ns | 29.653 ns | 1.625 ns | - |
| HwIntrinsics | 64 | 18.16 ns | 4.731 ns | 0.259 ns | - |
| PixelOperations_Specialized | 64 | 27.94 ns | 15.220 ns | 0.834 ns | - |
| FallbackIntrinsics128 | 256 | 525.07 ns | 34.397 ns | 1.885 ns | - |
| PixelOperations_Base | 256 | 464.17 ns | 46.897 ns | 2.571 ns | - |
| HwIntrinsics | 256 | 43.88 ns | 4.525 ns | 0.248 ns | - |
| PixelOperations_Specialized | 256 | 55.57 ns | 14.587 ns | 0.800 ns | - |
| FallbackIntrinsics128 | 2048 | 4,148.44 ns | 476.583 ns | 26.123 ns | - |
| PixelOperations_Base | 2048 | 3,608.42 ns | 66.293 ns | 3.634 ns | - |
| HwIntrinsics | 2048 | 361.42 ns | 35.576 ns | 1.950 ns | - |
| PixelOperations_Specialized | 2048 | 374.82 ns | 33.371 ns | 1.829 ns | - |
*/
}

2
tests/ImageSharp.Benchmarks/LoadResizeSave/README.md

@ -1,4 +1,4 @@
The benchmarks have been adapted from the
The benchmarks have been adapted from the
[PhotoSauce's MemoryStress project](https://github.com/saucecontrol/core-imaging-playground/tree/beeees/MemoryStress).
### Setup

36
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -3,6 +3,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Tests.TestUtilities;
@ -117,16 +118,10 @@ public partial class SimdUtilsTests
public static readonly TheoryData<int> ArbitraryArraySizes = new() { 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520 };
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count) => TestImpl_BulkConvertByteToNormalizedFloat(
count,
(s, d) => SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(s.Span, d.Span));
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
[MemberData(nameof(ArraySizesDivisibleBy64))]
public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count)
{
if (!Sse2.IsSupported)
if (!Sse2.IsSupported && !AdvSimd.IsSupported)
{
return;
}
@ -138,7 +133,7 @@ public partial class SimdUtilsTests
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
count,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41);
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41);
}
[Theory]
@ -160,32 +155,11 @@ public partial class SimdUtilsTests
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
[Theory]
[InlineData(1234)]
public void ExtendedIntrinsics_ConvertToSingle(short scale)
{
int n = Vector<float>.Count;
short[] sData = new Random(scale).GenerateRandomInt16Array(2 * n, (short)-scale, scale);
float[] fData = sData.Select(u => (float)u).ToArray();
Vector<short> source = new(sData);
Vector<float> expected1 = new(fData, 0);
Vector<float> expected2 = new(fData, n);
// Act:
SimdUtils.ExtendedIntrinsics.ConvertToSingle(source, out Vector<float> actual1, out Vector<float> actual2);
// Assert:
Assert.Equal(expected1, actual1);
Assert.Equal(expected2, actual2);
}
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy64))]
public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
if (!Sse2.IsSupported)
if (!Sse2.IsSupported && !AdvSimd.IsSupported)
{
return;
}

Loading…
Cancel
Save