mirror of https://github.com/SixLabors/ImageSharp
committed by
GitHub
80 changed files with 4926 additions and 1087 deletions
@ -0,0 +1,7 @@ |
|||
<?xml version="1.0" encoding="utf-8" ?> |
|||
<RunSettings> |
|||
<RunConfiguration> |
|||
<!--Used in conjunction with ActiveIssueAttribute to skip tests with known issues--> |
|||
<TestCaseFilter>category!=failing</TestCaseFilter> |
|||
</RunConfiguration> |
|||
</RunSettings> |
|||
@ -0,0 +1,193 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Buffers.Binary; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
// The JIT can detect and optimize rotation idioms ROTL (Rotate Left)
|
|||
// and ROTR (Rotate Right) emitting efficient CPU instructions:
|
|||
// https://github.com/dotnet/coreclr/pull/1830
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Contract shared by every pixel component shuffle.
/// An implementation exposes its shuffle control byte together with a scalar routine
/// that is used on platforms that do not support Hardware Intrinsics.
/// </summary>
internal interface IComponentShuffle
{
    /// <summary>
    /// Gets the control byte describing the shuffle.
    /// </summary>
    byte Control { get; }

    /// <summary>
    /// Shuffles the 8-bit integers in <paramref name="source"/> without using
    /// Hardware Intrinsics and stores the results in <paramref name="dest"/>.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest);
}
|||
|
|||
/// <summary>
/// Marker contract for shuffles that read and write bytes in groups of four.
/// It declares no members beyond those inherited from <see cref="IComponentShuffle"/>.
/// </summary>
|
|||
internal interface IShuffle4 : IComponentShuffle |
|||
{ |
|||
} |
|||
|
|||
/// <summary>
/// General purpose 4-byte shuffle whose component order is supplied at construction time.
/// </summary>
internal readonly struct DefaultShuffle4 : IShuffle4
{
    private readonly byte p3;
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultShuffle4"/> struct.
    /// Each selector is guarded to lie in the range 0 to 3.
    /// </summary>
    /// <param name="p3">Index, within each 4-byte source group, of the byte written to destination byte 3.</param>
    /// <param name="p2">Index, within each 4-byte source group, of the byte written to destination byte 2.</param>
    /// <param name="p1">Index, within each 4-byte source group, of the byte written to destination byte 1.</param>
    /// <param name="p0">Index, within each 4-byte source group, of the byte written to destination byte 0.</param>
    public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p3, 0, 3, nameof(p3));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 3, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 3, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 3, nameof(p0));

        this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
        this.p0 = p0;
        this.p1 = p1;
        this.p2 = p2;
        this.p3 = p3;
    }

    /// <summary>
    /// Gets the control byte built from the four selectors.
    /// </summary>
    public byte Control { get; }

    /// <summary>
    /// Copies <paramref name="source"/> to <paramref name="dest"/> four bytes at a time,
    /// picking each destination byte from the source position named by the matching selector.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        // Widen the selectors once so the loop works on plain int offsets.
        int sel0 = this.p0;
        int sel1 = this.p1;
        int sel2 = this.p2;
        int sel3 = this.p3;

        ref byte src = ref MemoryMarshal.GetReference(source);
        ref byte dst = ref MemoryMarshal.GetReference(dest);

        int length = source.Length;
        int offset = 0;

        while (offset < length)
        {
            Unsafe.Add(ref dst, offset) = Unsafe.Add(ref src, offset + sel0);
            Unsafe.Add(ref dst, offset + 1) = Unsafe.Add(ref src, offset + sel1);
            Unsafe.Add(ref dst, offset + 2) = Unsafe.Add(ref src, offset + sel2);
            Unsafe.Add(ref dst, offset + 3) = Unsafe.Add(ref src, offset + sel3);

            offset += 4;
        }
    }
}
|||
|
|||
/// <summary>
/// 4-byte shuffle implemented as a rotate-left by 8 bits of each 32-bit group.
/// </summary>
internal readonly struct WXYZShuffle4 : IShuffle4
{
    /// <summary>
    /// Gets the control byte for this shuffle.
    /// </summary>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get { return SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); }
    }

    /// <summary>
    /// Rotates every complete 32-bit group of <paramref name="source"/> left by 8 bits
    /// and stores it in <paramref name="dest"/>.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        int count = source.Length / 4;
        ref uint src = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint dst = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));

        int index = 0;
        while (index < count)
        {
            uint value = Unsafe.Add(ref src, index);

            // value, most significant byte first = [W Z Y X]
            // ROTL(8, value)                      = [Z Y X W]
            uint upper = value << 8;
            uint lower = value >> 24;
            Unsafe.Add(ref dst, index) = upper | lower;

            index++;
        }
    }
}
|||
|
|||
/// <summary>
/// 4-byte shuffle that reverses the byte order of each 32-bit group.
/// </summary>
internal readonly struct WZYXShuffle4 : IShuffle4
{
    /// <summary>
    /// Gets the control byte for this shuffle.
    /// </summary>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get { return SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); }
    }

    /// <summary>
    /// Reverses the bytes of every complete 32-bit group of <paramref name="source"/>
    /// and stores the result in <paramref name="dest"/>.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        int count = source.Length / 4;
        ref uint src = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint dst = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));

        int index = 0;
        while (index < count)
        {
            // value, most significant byte first = [W Z Y X]
            // REVERSE(value)                      = [X Y Z W]
            uint value = Unsafe.Add(ref src, index);
            Unsafe.Add(ref dst, index) = BinaryPrimitives.ReverseEndianness(value);

            index++;
        }
    }
}
|||
|
|||
/// <summary>
/// 4-byte shuffle implemented as a rotate-right by 8 bits of each 32-bit group.
/// </summary>
internal readonly struct YZWXShuffle4 : IShuffle4
{
    /// <summary>
    /// Gets the control byte for this shuffle.
    /// </summary>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get { return SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); }
    }

    /// <summary>
    /// Rotates every complete 32-bit group of <paramref name="source"/> right by 8 bits
    /// and stores it in <paramref name="dest"/>.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        int count = source.Length / 4;
        ref uint src = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint dst = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));

        int index = 0;
        while (index < count)
        {
            uint value = Unsafe.Add(ref src, index);

            // value, most significant byte first = [W Z Y X]
            // ROTR(8, value)                      = [X W Z Y]
            // i.e. the bytes Y, Z, W, X when listed from the least significant byte upwards.
            uint lower = value >> 8;
            uint upper = value << 24;
            Unsafe.Add(ref dst, index) = lower | upper;

            index++;
        }
    }
}
|||
|
|||
/// <summary>
/// 4-byte shuffle that swaps the first and third byte of each 32-bit group
/// and leaves the other two bytes in place.
/// </summary>
internal readonly struct ZYXWShuffle4 : IShuffle4
{
    /// <summary>
    /// Gets the control byte for this shuffle.
    /// </summary>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get { return SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); }
    }

    /// <summary>
    /// Processes every complete 32-bit group of <paramref name="source"/> and stores
    /// the swapped value in <paramref name="dest"/>.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        int count = source.Length / 4;
        ref uint src = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint dst = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));

        int index = 0;
        while (index < count)
        {
            uint value = Unsafe.Add(ref src, index);

            // value, most significant byte first = [W Z Y X]
            // kept                                = [W 0 Y 0]
            // moved                               = [0 Z 0 X]
            // ROTL(16, moved)                     = [0 X 0 Z]
            // kept + ROTL(16, moved)              = [W X Y Z]
            uint kept = value & 0xFF00FF00;
            uint moved = value & 0x00FF00FF;
            uint rotated = (moved << 16) | (moved >> 16);

            Unsafe.Add(ref dst, index) = kept + rotated;

            index++;
        }
    }
}
|||
} |
|||
@ -0,0 +1,103 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Marker contract for shuffles that read 3-byte groups, pad each to four bytes and write 4-byte groups.
/// It declares no members beyond those inherited from <see cref="IComponentShuffle"/>.
/// </summary>
|
|||
internal interface IPad3Shuffle4 : IComponentShuffle |
|||
{ |
|||
} |
|||
|
|||
/// <summary>
/// General purpose shuffle that pads every 3-byte source group with a fourth byte of 0xFF
/// and then emits four destination bytes in the order supplied at construction time.
/// </summary>
internal readonly struct DefaultPad3Shuffle4 : IPad3Shuffle4
{
    private readonly byte p3;
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultPad3Shuffle4"/> struct.
    /// Each selector is guarded to lie in the range 0 to 3; selector value 3 picks the padding byte.
    /// </summary>
    /// <param name="p3">Index into the padded 4-byte group of the byte written to destination byte 3.</param>
    /// <param name="p2">Index into the padded 4-byte group of the byte written to destination byte 2.</param>
    /// <param name="p1">Index into the padded 4-byte group of the byte written to destination byte 1.</param>
    /// <param name="p0">Index into the padded 4-byte group of the byte written to destination byte 0.</param>
    public DefaultPad3Shuffle4(byte p3, byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p3, 0, 3, nameof(p3));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 3, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 3, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 3, nameof(p0));

        this.p3 = p3;
        this.p2 = p2;
        this.p1 = p1;
        this.p0 = p0;
        this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
    }

    /// <summary>
    /// Gets the control byte built from the four selectors.
    /// </summary>
    public byte Control { get; }

    /// <summary>
    /// Expands each 3-byte group of <paramref name="source"/> to a shuffled 4-byte group
    /// in <paramref name="dest"/>.
    /// </summary>
    /// <param name="source">The source span of bytes; consumed three bytes at a time.</param>
    /// <param name="dest">The destination span of bytes; written four bytes at a time.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref byte sBase = ref MemoryMarshal.GetReference(source);
        ref byte dBase = ref MemoryMarshal.GetReference(dest);

        int p3 = this.p3;
        int p2 = this.p2;
        int p1 = this.p1;
        int p0 = this.p0;

        // Scratch pixel: bytes 0-2 are refreshed per iteration, byte 3 is the constant padding.
        Span<byte> temp = stackalloc byte[4];
        ref byte t = ref MemoryMarshal.GetReference(temp);
        Unsafe.Add(ref t, 3) = byte.MaxValue;

        for (int i = 0, j = 0; i < source.Length; i += 3, j += 4)
        {
            ref byte s = ref Unsafe.Add(ref sBase, i);

            // Copy exactly three bytes. Reading the group as a 32-bit integer would touch one
            // byte beyond the end of the source for the final group and would place the
            // padding in the wrong byte on big-endian hardware.
            t = s;
            Unsafe.Add(ref t, 1) = Unsafe.Add(ref s, 1);
            Unsafe.Add(ref t, 2) = Unsafe.Add(ref s, 2);

            Unsafe.Add(ref dBase, j) = Unsafe.Add(ref t, p0);
            Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1);
            Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2);
            Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3);
        }
    }
}
|||
|
|||
/// <summary>
/// Pads every 3-byte source group with a fourth byte of 0xFF while keeping the byte order.
/// </summary>
internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4
{
    /// <summary>
    /// Gets the control byte for this shuffle.
    /// </summary>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0);
    }

    /// <summary>
    /// Expands each 3-byte group of <paramref name="source"/> to a 4-byte group in
    /// <paramref name="dest"/> whose last byte is 0xFF.
    /// </summary>
    /// <param name="source">The source span of bytes; consumed three bytes at a time.</param>
    /// <param name="dest">The destination span of bytes; written four bytes at a time.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref byte sBase = ref MemoryMarshal.GetReference(source);
        ref byte dBase = ref MemoryMarshal.GetReference(dest);

        // Offsets are used instead of an "end minus 4" reference: for sources shorter than
        // four bytes that reference would point before the buffer, and for an empty span
        // with a null reference it would wrap around and let the fast loop run.
        int length = source.Length;
        int fastEnd = length - 4;
        int i = 0;
        int j = 0;

        // Fast path: copy four source bytes at once and overwrite the top byte with 0xFF.
        // It is only taken while at least one more byte follows the current 4-byte read.
        // NOTE(review): the 0xFF000000 mask targets the fourth byte on little-endian hardware only.
        for (; i < fastEnd; i += 3, j += 4)
        {
            Unsafe.As<byte, uint>(ref Unsafe.Add(ref dBase, j)) =
                Unsafe.As<byte, uint>(ref Unsafe.Add(ref sBase, i)) | 0xFF000000;
        }

        // Tail: copy byte by byte so nothing beyond the end of the source is read.
        for (; i < length; i += 3, j += 4)
        {
            ref byte s = ref Unsafe.Add(ref sBase, i);
            ref byte d = ref Unsafe.Add(ref dBase, j);

            Unsafe.Add(ref d, 0) = Unsafe.Add(ref s, 0);
            Unsafe.Add(ref d, 1) = Unsafe.Add(ref s, 1);
            Unsafe.Add(ref d, 2) = Unsafe.Add(ref s, 2);
            Unsafe.Add(ref d, 3) = byte.MaxValue;
        }
    }
}
|||
} |
|||
@ -0,0 +1,53 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Marker contract for shuffles that read and write bytes in groups of three.
/// It declares no members beyond those inherited from <see cref="IComponentShuffle"/>.
/// </summary>
|
|||
internal interface IShuffle3 : IComponentShuffle |
|||
{ |
|||
} |
|||
|
|||
/// <summary>
/// General purpose 3-byte shuffle whose component order is supplied at construction time.
/// </summary>
internal readonly struct DefaultShuffle3 : IShuffle3
{
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultShuffle3"/> struct.
    /// Each selector is guarded to lie in the range 0 to 2.
    /// </summary>
    /// <param name="p2">Index, within each 3-byte source group, of the byte written to destination byte 2.</param>
    /// <param name="p1">Index, within each 3-byte source group, of the byte written to destination byte 1.</param>
    /// <param name="p0">Index, within each 3-byte source group, of the byte written to destination byte 0.</param>
    public DefaultShuffle3(byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 2, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 2, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 2, nameof(p0));

        // The control byte always carries 3 in its highest selector position.
        this.Control = SimdUtils.Shuffle.MmShuffle(3, p2, p1, p0);
        this.p0 = p0;
        this.p1 = p1;
        this.p2 = p2;
    }

    /// <summary>
    /// Gets the control byte built from the three selectors.
    /// </summary>
    public byte Control { get; }

    /// <summary>
    /// Copies <paramref name="source"/> to <paramref name="dest"/> three bytes at a time,
    /// picking each destination byte from the source position named by the matching selector.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        int sel0 = this.p0;
        int sel1 = this.p1;
        int sel2 = this.p2;

        ref byte src = ref MemoryMarshal.GetReference(source);
        ref byte dst = ref MemoryMarshal.GetReference(dest);

        int length = source.Length;
        int offset = 0;

        while (offset < length)
        {
            Unsafe.Add(ref dst, offset) = Unsafe.Add(ref src, offset + sel0);
            Unsafe.Add(ref dst, offset + 1) = Unsafe.Add(ref src, offset + sel1);
            Unsafe.Add(ref dst, offset + 2) = Unsafe.Add(ref src, offset + sel2);

            offset += 3;
        }
    }
}
|||
} |
|||
@ -0,0 +1,101 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Marker contract for shuffles that read 4-byte groups and write 3-byte groups.
/// It declares no members beyond those inherited from <see cref="IComponentShuffle"/>.
/// </summary>
|
|||
internal interface IShuffle4Slice3 : IComponentShuffle |
|||
{ |
|||
} |
|||
|
|||
/// <summary>
/// General purpose shuffle that reads 4-byte source groups and writes the three bytes
/// selected by the lower three selectors; the byte selected by the highest selector is dropped.
/// </summary>
internal readonly struct DefaultShuffle4Slice3 : IShuffle4Slice3
{
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultShuffle4Slice3"/> struct.
    /// Each selector is guarded to lie in the range 0 to 3.
    /// </summary>
    /// <param name="p3">Selector that only contributes to <see cref="Control"/>; the fallback never reads it.</param>
    /// <param name="p2">Index, within each 4-byte source group, of the byte written to destination byte 2.</param>
    /// <param name="p1">Index, within each 4-byte source group, of the byte written to destination byte 1.</param>
    /// <param name="p0">Index, within each 4-byte source group, of the byte written to destination byte 0.</param>
    public DefaultShuffle4Slice3(byte p3, byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p3, 0, 3, nameof(p3));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 3, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 3, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 3, nameof(p0));

        this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
        this.p0 = p0;
        this.p1 = p1;
        this.p2 = p2;
    }

    /// <summary>
    /// Gets the control byte built from all four selectors.
    /// </summary>
    public byte Control { get; }

    /// <summary>
    /// Fills <paramref name="dest"/> three bytes at a time from consecutive 4-byte groups
    /// of <paramref name="source"/>. The loop is bounded by the destination length.
    /// </summary>
    /// <param name="source">The source span of bytes; consumed four bytes at a time.</param>
    /// <param name="dest">The destination span of bytes; written three bytes at a time.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        int sel0 = this.p0;
        int sel1 = this.p1;
        int sel2 = this.p2;

        ref byte src = ref MemoryMarshal.GetReference(source);
        ref byte dst = ref MemoryMarshal.GetReference(dest);

        int destLength = dest.Length;
        int writeAt = 0;
        int readAt = 0;

        while (writeAt < destLength)
        {
            Unsafe.Add(ref dst, writeAt) = Unsafe.Add(ref src, readAt + sel0);
            Unsafe.Add(ref dst, writeAt + 1) = Unsafe.Add(ref src, readAt + sel1);
            Unsafe.Add(ref dst, writeAt + 2) = Unsafe.Add(ref src, readAt + sel2);

            writeAt += 3;
            readAt += 4;
        }
    }
}
|||
|
|||
/// <summary>
/// Copies the first three bytes of every 4-byte source group to the destination, in order.
/// </summary>
internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3
{
    /// <summary>
    /// Gets the control byte for this shuffle.
    /// </summary>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get { return SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0); }
    }

    /// <summary>
    /// Writes one <see cref="Byte3"/> to <paramref name="dest"/> for every complete 32-bit
    /// group in <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The source span of bytes; consumed four bytes at a time.</param>
    /// <param name="dest">The destination span of bytes; written three bytes at a time.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref uint src = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref Byte3 dst = ref Unsafe.As<byte, Byte3>(ref MemoryMarshal.GetReference(dest));

        int count = source.Length / 4;
        int unrolled = count - ImageMaths.Modulo4(count);
        int index = 0;

        // Main loop: four groups per iteration.
        for (; index < unrolled; index += 4)
        {
            ref uint s = ref Unsafe.Add(ref src, index);
            ref Byte3 d = ref Unsafe.Add(ref dst, index);

            Unsafe.Add(ref d, 0) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref s, 0));
            Unsafe.Add(ref d, 1) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref s, 1));
            Unsafe.Add(ref d, 2) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref s, 2));
            Unsafe.Add(ref d, 3) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref s, 3));
        }

        // Remaining groups, at most three, one at a time.
        for (; index < count; index++)
        {
            Unsafe.Add(ref dst, index) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref src, index));
        }
    }
}
|||
|
|||
/// <summary>
/// Field-less struct declared with an explicit size of 3 bytes.
/// <see cref="XYZWShuffle4Slice3"/> reinterprets memory as this type so that a single
/// assignment moves one 3-byte group.
/// </summary>
[StructLayout(LayoutKind.Explicit, Size = 3)] |
|||
internal readonly struct Byte3 |
|||
{ |
|||
} |
|||
} |
|||
@ -1,103 +0,0 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
public static class Avx2Intrinsics |
|||
{ |
|||
// Raw bytes of the eight 32-bit indices passed to Avx2.PermuteVar8x32 in NormalizedFloatToByteSaturate.
// NOTE(review): the bytes read as the indices { 0, 4, 1, 5, 2, 6, 3, 7 } when interpreted as little-endian integers.
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; |
|||
|
|||
/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source span of floats; on return it holds only the unconverted remainder.</param>
/// <param name="dest">The destination span of bytes; on return it holds only the unwritten remainder.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (Avx2.IsSupported)
    {
        // The bulk routine verifies and iterates in units of Vector256<byte>.Count, so the
        // remainder is computed with that same width rather than Vector<byte>.Count.
        int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
        int adjustedCount = source.Length - remainder;

        if (adjustedCount > 0)
        {
            NormalizedFloatToByteSaturate(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount));

            source = source.Slice(adjustedCount);
            dest = dest.Slice(adjustedCount);
        }
    }
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// Each float is multiplied by 255, converted to a 32-bit integer and packed down to a byte with saturation.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
/// </remarks>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of bytes.</param>
internal static void NormalizedFloatToByteSaturate(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    VerifySpanInput(source, dest, Vector256<byte>.Count);

    ref Vector256<float> srcVectors =
        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
    ref Vector256<byte> dstVectors =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    Vector256<float> scale = Vector256.Create(255f);
    Vector256<int> permute =
        Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32));

    int blocks = dest.Length / Vector256<byte>.Count;

    for (int block = 0; block < blocks; block++)
    {
        // One output vector of bytes consumes four input vectors of floats.
        ref Vector256<float> quad = ref Unsafe.Add(ref srcVectors, block * 4);

        Vector256<float> f0 = quad;
        Vector256<float> f1 = Unsafe.Add(ref quad, 1);
        Vector256<float> f2 = Unsafe.Add(ref quad, 2);
        Vector256<float> f3 = Unsafe.Add(ref quad, 3);

        Vector256<short> lower = Avx2.PackSignedSaturate(ConvertToInt32(f0, scale), ConvertToInt32(f1, scale));
        Vector256<short> upper = Avx2.PackSignedSaturate(ConvertToInt32(f2, scale), ConvertToInt32(f3, scale));
        Vector256<byte> packed = Avx2.PackUnsignedSaturate(lower, upper);

        // Reorder the 32-bit elements of the packed result with the permutation mask.
        Unsafe.Add(ref dstVectors, block) = Avx2.PermuteVar8x32(packed.AsInt32(), permute).AsByte();
    }
}
|||
|
|||
/// <summary>
/// Multiplies <paramref name="vf"/> by <paramref name="scale"/> and converts the products to 32-bit integers.
/// </summary>
/// <param name="vf">The vector of floats to convert.</param>
/// <param name="scale">The per-element multiplier applied before conversion.</param>
/// <returns>The converted vector of 32-bit integers.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
    => Avx.ConvertToVector256Int32(Avx.Multiply(vf, scale));
|||
} |
|||
} |
|||
} |
|||
#endif
|
|||
@ -0,0 +1,795 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
public static class HwIntrinsics |
|||
{ |
|||
// Raw bytes of eight 32-bit permutation indices.
// NOTE(review): reads as { 0, 4, 1, 5, 2, 6, 3, 7 } when interpreted as little-endian integers.
public static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; |
|||
|
|||
// Raw bytes of eight 32-bit permutation indices.
// NOTE(review): reads as { 0, 2, 4, 6, 1, 3, 5, 7 } when interpreted as little-endian integers.
public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; |
|||
|
|||
// Byte-shuffle mask that spreads source bytes 0-11 over four 4-byte groups; every fourth
// entry is 0x80. Used as the second operand of Ssse3.Shuffle in Shuffle3.
private static ReadOnlySpan<byte> ShuffleMaskPad4Nx16 => new byte[] { 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80 }; |
|||
|
|||
// Byte-shuffle mask that selects the first three bytes of each 4-byte group into the low
// twelve positions; the last four entries are 0x80. Shuffle3 also uses a 12-byte rotation of it.
private static ReadOnlySpan<byte> ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 }; |
|||
|
|||
/// <summary>
/// Shuffles as many single-precision (32-bit) floating-point elements of <paramref name="source"/>
/// as fit whole vectors, stores them in <paramref name="dest"/> and slices both spans down
/// to the unprocessed remainder. Does nothing when neither AVX nor SSE is supported.
/// </summary>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Reduce(
    ref ReadOnlySpan<float> source,
    ref Span<float> dest,
    byte control)
{
    if (!Avx.IsSupported && !Sse.IsSupported)
    {
        return;
    }

    // Work in units of the widest supported vector.
    int vectorLength = Avx.IsSupported
        ? Vector256<float>.Count
        : Vector128<float>.Count;

    int remainder = ImageMaths.ModuloP2(source.Length, vectorLength);
    int adjustedCount = source.Length - remainder;

    if (adjustedCount <= 0)
    {
        return;
    }

    ReadOnlySpan<float> bulkSource = source.Slice(0, adjustedCount);
    Span<float> bulkDest = dest.Slice(0, adjustedCount);
    Shuffle4(bulkSource, bulkDest, control);

    source = source.Slice(adjustedCount);
    dest = dest.Slice(adjustedCount);
}
|||
|
|||
/// <summary>
/// Shuffles as many 8-bit integers of <paramref name="source"/> as fit whole vectors,
/// stores them in <paramref name="dest"/> and slices both spans down to the unprocessed
/// remainder. Does nothing when neither AVX2 nor SSSE3 is supported.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (!Avx2.IsSupported && !Ssse3.IsSupported)
    {
        return;
    }

    // Work in units of the widest supported vector.
    int vectorLength = Avx2.IsSupported
        ? Vector256<byte>.Count
        : Vector128<byte>.Count;

    int remainder = ImageMaths.ModuloP2(source.Length, vectorLength);
    int adjustedCount = source.Length - remainder;

    if (adjustedCount <= 0)
    {
        return;
    }

    ReadOnlySpan<byte> bulkSource = source.Slice(0, adjustedCount);
    Span<byte> bulkDest = dest.Slice(0, adjustedCount);
    Shuffle4(bulkSource, bulkDest, control);

    source = source.Slice(adjustedCount);
    dest = dest.Slice(adjustedCount);
}
|||
|
|||
/// <summary>
/// Shuffles as many 8-bit integer triplets of <paramref name="source"/> as fit whole
/// blocks of three 128-bit vectors, stores them in <paramref name="dest"/> and slices both
/// spans down to the unprocessed remainder. Does nothing when SSSE3 is not supported.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle3Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    // One block is three vectors, which holds a whole number of triplets.
    int blockLength = Vector128<byte>.Count * 3;
    int adjustedCount = source.Length - (source.Length % blockLength);

    if (adjustedCount <= 0)
    {
        return;
    }

    ReadOnlySpan<byte> bulkSource = source.Slice(0, adjustedCount);
    Span<byte> bulkDest = dest.Slice(0, adjustedCount);
    Shuffle3(bulkSource, bulkDest, control);

    source = source.Slice(adjustedCount);
    dest = dest.Slice(adjustedCount);
}
|||
|
|||
/// <summary>
/// Pads then shuffles as many 8-bit integers of <paramref name="source"/> as fit whole
/// blocks of three 128-bit vectors, stores the 4-byte results in <paramref name="dest"/>
/// and slices both spans down to the unprocessed remainder.
/// Does nothing when SSSE3 is not supported.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Pad3Shuffle4Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (Ssse3.IsSupported)
    {
        int remainder = source.Length % (Vector128<byte>.Count * 3);

        int sourceCount = source.Length - remainder;

        // sourceCount is a whole multiple of 3, so dividing first is exact and keeps the
        // intermediate value within int range (sourceCount * 4 overflows for large spans).
        int destCount = sourceCount / 3 * 4;

        if (sourceCount > 0)
        {
            Pad3Shuffle4(
                source.Slice(0, sourceCount),
                dest.Slice(0, destCount),
                control);

            source = source.Slice(sourceCount);
            dest = dest.Slice(destCount);
        }
    }
}
|||
|
|||
/// <summary>
/// Shuffles then slices as many 8-bit integers of <paramref name="source"/> as fit whole
/// blocks of four 128-bit vectors, stores the 3-byte results in <paramref name="dest"/>
/// and slices both spans down to the unprocessed remainder.
/// Does nothing when SSSE3 is not supported.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Slice3Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (Ssse3.IsSupported)
    {
        int remainder = source.Length % (Vector128<byte>.Count * 4);

        int sourceCount = source.Length - remainder;

        // sourceCount is a whole multiple of 4, so dividing first is exact and keeps the
        // intermediate value within int range (sourceCount * 3 overflows for large spans).
        int destCount = sourceCount / 4 * 3;

        if (sourceCount > 0)
        {
            Shuffle4Slice3(
                source.Slice(0, sourceCount),
                dest.Slice(0, destCount),
                control);

            source = source.Slice(sourceCount);
            dest = dest.Slice(destCount);
        }
    }
}
|||
|
|||
/// <summary>
/// Shuffles the floats of <paramref name="source"/> vector by vector using
/// <paramref name="control"/> and stores the results in <paramref name="dest"/>.
/// Uses 256-bit AVX vectors when available and 128-bit SSE vectors otherwise.
/// The number of vectors processed is derived from the destination length.
/// </summary>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    if (!Avx.IsSupported)
    {
        // Sse
        ref Vector128<float> src128 =
            ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
        ref Vector128<float> dst128 =
            ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));

        int count128 = dest.Length / Vector128<float>.Count;
        int unrolled128 = count128 - ImageMaths.Modulo4(count128);
        int k = 0;

        // Main loop: four vectors per iteration.
        for (; k < unrolled128; k += 4)
        {
            ref Vector128<float> s = ref Unsafe.Add(ref src128, k);
            ref Vector128<float> d = ref Unsafe.Add(ref dst128, k);

            Vector128<float> a = s;
            d = Sse.Shuffle(a, a, control);

            Vector128<float> b = Unsafe.Add(ref s, 1);
            Unsafe.Add(ref d, 1) = Sse.Shuffle(b, b, control);

            Vector128<float> c = Unsafe.Add(ref s, 2);
            Unsafe.Add(ref d, 2) = Sse.Shuffle(c, c, control);

            Vector128<float> e = Unsafe.Add(ref s, 3);
            Unsafe.Add(ref d, 3) = Sse.Shuffle(e, e, control);
        }

        // Leftover vectors, at most three.
        for (; k < count128; k++)
        {
            Vector128<float> v = Unsafe.Add(ref src128, k);
            Unsafe.Add(ref dst128, k) = Sse.Shuffle(v, v, control);
        }

        return;
    }

    ref Vector256<float> src256 =
        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
    ref Vector256<float> dst256 =
        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));

    int count256 = dest.Length / Vector256<float>.Count;
    int unrolled256 = count256 - ImageMaths.Modulo4(count256);
    int i = 0;

    // Main loop: four vectors per iteration.
    for (; i < unrolled256; i += 4)
    {
        ref Vector256<float> d = ref Unsafe.Add(ref dst256, i);
        ref Vector256<float> s = ref Unsafe.Add(ref src256, i);

        d = Avx.Permute(s, control);
        Unsafe.Add(ref d, 1) = Avx.Permute(Unsafe.Add(ref s, 1), control);
        Unsafe.Add(ref d, 2) = Avx.Permute(Unsafe.Add(ref s, 2), control);
        Unsafe.Add(ref d, 3) = Avx.Permute(Unsafe.Add(ref s, 3), control);
    }

    // Leftover vectors, at most three.
    for (; i < count256; i++)
    {
        Unsafe.Add(ref dst256, i) = Avx.Permute(Unsafe.Add(ref src256, i), control);
    }
}
|||
|
|||
/// <summary>
/// Shuffles the bytes of <paramref name="source"/> vector by vector using a byte-shuffle
/// mask expanded from <paramref name="control"/> and stores the results in <paramref name="dest"/>.
/// Uses 256-bit AVX2 vectors when available and 128-bit SSSE3 vectors otherwise.
/// The number of vectors processed is derived from the destination length.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Avx2.IsSupported)
    {
        // Ssse3
        Span<byte> maskBytes128 = stackalloc byte[Vector128<byte>.Count];
        Shuffle.MmShuffleSpan(ref maskBytes128, control);
        Vector128<byte> mask128 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(maskBytes128));

        ref Vector128<byte> src128 =
            ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
        ref Vector128<byte> dst128 =
            ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

        int count128 = dest.Length / Vector128<byte>.Count;
        int unrolled128 = count128 - ImageMaths.Modulo4(count128);
        int k = 0;

        // Main loop: four vectors per iteration.
        for (; k < unrolled128; k += 4)
        {
            ref Vector128<byte> s = ref Unsafe.Add(ref src128, k);
            ref Vector128<byte> d = ref Unsafe.Add(ref dst128, k);

            d = Ssse3.Shuffle(s, mask128);
            Unsafe.Add(ref d, 1) = Ssse3.Shuffle(Unsafe.Add(ref s, 1), mask128);
            Unsafe.Add(ref d, 2) = Ssse3.Shuffle(Unsafe.Add(ref s, 2), mask128);
            Unsafe.Add(ref d, 3) = Ssse3.Shuffle(Unsafe.Add(ref s, 3), mask128);
        }

        // Leftover vectors, at most three.
        for (; k < count128; k++)
        {
            Unsafe.Add(ref dst128, k) = Ssse3.Shuffle(Unsafe.Add(ref src128, k), mask128);
        }

        return;
    }

    // The mask is built on the stack per call as a convenience while the set of shuffle
    // controls offered by the library is still being decided; static read-only span
    // instances can replace it later if needed.
    Span<byte> maskBytes256 = stackalloc byte[Vector256<byte>.Count];
    Shuffle.MmShuffleSpan(ref maskBytes256, control);
    Vector256<byte> mask256 = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(maskBytes256));

    ref Vector256<byte> src256 =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector256<byte> dst256 =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    int count256 = dest.Length / Vector256<byte>.Count;
    int unrolled256 = count256 - ImageMaths.Modulo4(count256);
    int i = 0;

    // Main loop: four vectors per iteration.
    for (; i < unrolled256; i += 4)
    {
        ref Vector256<byte> s = ref Unsafe.Add(ref src256, i);
        ref Vector256<byte> d = ref Unsafe.Add(ref dst256, i);

        d = Avx2.Shuffle(s, mask256);
        Unsafe.Add(ref d, 1) = Avx2.Shuffle(Unsafe.Add(ref s, 1), mask256);
        Unsafe.Add(ref d, 2) = Avx2.Shuffle(Unsafe.Add(ref s, 2), mask256);
        Unsafe.Add(ref d, 3) = Avx2.Shuffle(Unsafe.Add(ref s, 3), mask256);
    }

    // Leftover vectors, at most three.
    for (; i < count256; i++)
    {
        Unsafe.Add(ref dst256, i) = Avx2.Shuffle(Unsafe.Add(ref src256, i), mask256);
    }
}
|||
|
|||
/// <summary>
/// Shuffles 3-byte groups of <paramref name="source"/> into <paramref name="dest"/> using SSSE3.
/// Every iteration loads three 128-bit vectors (48 bytes), splits them into four 12-byte
/// groups, pads each group to 16 bytes, applies the 4-byte shuffle derived from
/// <paramref name="control"/>, removes the padding again and stores three vectors.
/// Does nothing when SSSE3 is not supported.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle3(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    Vector128<byte> padMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16));
    Vector128<byte> sliceLowMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16));

    // The slice mask rotated by 12 bytes: the same selection lands in the upper twelve bytes.
    Vector128<byte> sliceHighMask = Ssse3.AlignRight(sliceLowMask, sliceLowMask, 12);

    Span<byte> shuffleBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref shuffleBytes, control);
    Vector128<byte> shuffleMask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(shuffleBytes));

    ref Vector128<byte> srcVectors =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector128<byte> dstVectors =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    int vectorCount = source.Length / Vector128<byte>.Count;

    for (int i = 0; i < vectorCount; i += 3)
    {
        ref Vector128<byte> block = ref Unsafe.Add(ref srcVectors, i);

        Vector128<byte> in0 = block;
        Vector128<byte> in1 = Unsafe.Add(ref block, 1);
        Vector128<byte> in2 = Unsafe.Add(ref block, 2);

        // Split the 48 input bytes into four groups whose low 12 bytes hold four triplets.
        Vector128<byte> g3 = Sse2.ShiftRightLogical128BitLane(in2, 4);
        Vector128<byte> g2 = Ssse3.AlignRight(in2, in1, 8);
        Vector128<byte> g1 = Ssse3.AlignRight(in1, in0, 12);
        Vector128<byte> g0 = in0;

        // Pad every triplet to four bytes, then apply the requested shuffle.
        g0 = Ssse3.Shuffle(Ssse3.Shuffle(g0, padMask), shuffleMask);
        g1 = Ssse3.Shuffle(Ssse3.Shuffle(g1, padMask), shuffleMask);
        g2 = Ssse3.Shuffle(Ssse3.Shuffle(g2, padMask), shuffleMask);
        g3 = Ssse3.Shuffle(Ssse3.Shuffle(g3, padMask), shuffleMask);

        // Drop the padding again, alternating between the high and low halves.
        g0 = Ssse3.Shuffle(g0, sliceHighMask);
        g1 = Ssse3.Shuffle(g1, sliceLowMask);
        g2 = Ssse3.Shuffle(g2, sliceHighMask);
        g3 = Ssse3.Shuffle(g3, sliceLowMask);

        // Stitch the four 12-byte results back into three full vectors.
        Vector128<byte> out0 = Ssse3.AlignRight(g1, g0, 4);
        Vector128<byte> out2 = Ssse3.AlignRight(g3, g2, 12);
        Vector128<byte> out1 = Ssse3.AlignRight(
            Sse2.ShiftRightLogical128BitLane(g2, 4),
            Sse2.ShiftLeftLogical128BitLane(g1, 4),
            8);

        ref Vector128<byte> target = ref Unsafe.Add(ref dstVectors, i);

        target = out0;
        Unsafe.Add(ref target, 1) = out1;
        Unsafe.Add(ref target, 2) = out2;
    }
}
|||
|
|||
/// <summary>
/// SSSE3 kernel that reads packed 3-byte groups from <paramref name="source"/>, widens them
/// to 4 bytes, applies the shuffle described by <paramref name="control"/> and stores the
/// result in <paramref name="dest"/>. Does nothing when SSSE3 is unavailable.
/// </summary>
/// <remarks>
/// Each iteration consumes three source vectors and produces four destination vectors,
/// without bounds checks; the caller is expected to supply whole groups.
/// </remarks>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Pad3Shuffle4(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    // Shuffle table declared elsewhere in SimdUtils; presumably it spreads each 3-byte
    // group over 4 bytes - TODO confirm against its definition.
    Vector128<byte> padMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16));

    // 0xFF in byte 3 of every 4-byte group; OR-ed in before the control shuffle.
    Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();

    // Expand the control byte into a 16-entry byte shuffle table.
    Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref controlBytes, control);
    Vector128<byte> controlMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

    ref Vector128<byte> srcVectors =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector128<byte> dstVectors =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    int vectorCount = source.Length / Vector128<byte>.Count;
    int srcIndex = 0;
    int dstIndex = 0;

    while (srcIndex < vectorCount)
    {
        // Load all 48 input bytes before anything is written back.
        ref Vector128<byte> input = ref Unsafe.Add(ref srcVectors, srcIndex);
        Vector128<byte> a = input;
        Vector128<byte> b = Unsafe.Add(ref input, 1);
        Vector128<byte> c = Unsafe.Add(ref input, 2);

        // Re-align so that each register starts on a 12-byte boundary of the input:
        // a -> byte 0, b -> byte 12, c -> byte 24, d -> byte 36.
        Vector128<byte> d = Sse2.ShiftRightLogical128BitLane(c, 4);
        c = Ssse3.AlignRight(c, b, 8);
        b = Ssse3.AlignRight(b, a, 12);

        // pad -> fill -> apply control.
        a = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(a, padMask), fill), controlMask);
        b = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(b, padMask), fill), controlMask);
        c = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(c, padMask), fill), controlMask);
        d = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(d, padMask), fill), controlMask);

        ref Vector128<byte> output = ref Unsafe.Add(ref dstVectors, dstIndex);
        output = a;
        Unsafe.Add(ref output, 1) = b;
        Unsafe.Add(ref output, 2) = c;
        Unsafe.Add(ref output, 3) = d;

        srcIndex += 3;
        dstIndex += 4;
    }
}
|||
|
|||
/// <summary>
/// SSSE3 kernel that shuffles 4-byte groups read from <paramref name="source"/> using
/// <paramref name="control"/>, then packs the result down and stores it in
/// <paramref name="dest"/>. Does nothing when SSSE3 is unavailable.
/// </summary>
/// <remarks>
/// Each iteration consumes four source vectors and produces three destination vectors,
/// without bounds checks; the caller is expected to supply whole groups.
/// </remarks>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4Slice3(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    // Shuffle table declared elsewhere in SimdUtils; presumably it drops one byte out of
    // every 4-byte group - TODO confirm against its definition.
    Vector128<byte> sliceMaskOdd =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16));

    // The even registers use the same table rotated right by 12 bytes.
    Vector128<byte> sliceMaskEven = Ssse3.AlignRight(sliceMaskOdd, sliceMaskOdd, 12);

    // Expand the control byte into a 16-entry byte shuffle table.
    Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref controlBytes, control);
    Vector128<byte> controlMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

    ref Vector128<byte> srcVectors =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector128<byte> dstVectors =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    int vectorCount = source.Length / Vector128<byte>.Count;
    int srcIndex = 0;
    int dstIndex = 0;

    while (srcIndex < vectorCount)
    {
        // Load all 64 input bytes before anything is written back.
        ref Vector128<byte> input = ref Unsafe.Add(ref srcVectors, srcIndex);
        Vector128<byte> a = input;
        Vector128<byte> b = Unsafe.Add(ref input, 1);
        Vector128<byte> c = Unsafe.Add(ref input, 2);
        Vector128<byte> d = Unsafe.Add(ref input, 3);

        // apply control -> slice, alternating the even/odd slice table.
        a = Ssse3.Shuffle(Ssse3.Shuffle(a, controlMask), sliceMaskEven);
        b = Ssse3.Shuffle(Ssse3.Shuffle(b, controlMask), sliceMaskOdd);
        c = Ssse3.Shuffle(Ssse3.Shuffle(c, controlMask), sliceMaskEven);
        d = Ssse3.Shuffle(Ssse3.Shuffle(d, controlMask), sliceMaskOdd);

        // Stitch the four partial registers back into three full ones.
        Vector128<byte> first = Ssse3.AlignRight(b, a, 4);
        Vector128<byte> second = Ssse3.AlignRight(
            Sse2.ShiftRightLogical128BitLane(c, 4),
            Sse2.ShiftLeftLogical128BitLane(b, 4),
            8);
        Vector128<byte> third = Ssse3.AlignRight(d, c, 12);

        ref Vector128<byte> output = ref Unsafe.Add(ref dstVectors, dstIndex);
        output = first;
        Unsafe.Add(ref output, 1) = second;
        Unsafe.Add(ref output, 2) = third;

        srcIndex += 4;
        dstIndex += 3;
    }
}
|||
|
|||
/// <summary>
/// Computes <c>(vm0 * vm1) + va</c> element-wise on <see cref="Vector256{T}"/> operands.
/// </summary>
/// <remarks>
/// Uses the FMA instruction when the hardware offers it, otherwise a separate
/// AVX multiply followed by an AVX add.
/// </remarks>
/// <param name="va">The vector to add to the intermediate result.</param>
/// <param name="vm0">The first vector to multiply.</param>
/// <param name="vm1">The second vector to multiply.</param>
/// <returns>The <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplyAdd(
    in Vector256<float> va,
    in Vector256<float> vm0,
    in Vector256<float> vm1)
{
    if (Fma.IsSupported)
    {
        return Fma.MultiplyAdd(vm1, vm0, va);
    }

    Vector256<float> product = Avx.Multiply(vm0, vm1);
    return Avx.Add(product, va);
}
|||
|
|||
/// <summary>
/// Runs <see cref="ByteToNormalizedFloat"/> over the longest prefix that fills whole vectors,
/// then advances both spans past that prefix so that only the remainder is left for the caller.
/// </summary>
/// <param name="source">The source bytes; sliced down to the unprocessed tail on return.</param>
/// <param name="dest">The destination floats; sliced down to the unprocessed tail on return.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (!Avx2.IsSupported && !Sse2.IsSupported)
    {
        // No accelerated path: leave everything to the caller.
        return;
    }

    // Prefer the 256-bit step when AVX2 is present, otherwise fall back to 128-bit.
    int step = Avx2.IsSupported ? Vector256<byte>.Count : Vector128<byte>.Count;
    int vectorizable = source.Length - ImageMaths.ModuloP2(source.Length, step);

    if (vectorizable <= 0)
    {
        return;
    }

    ByteToNormalizedFloat(source.Slice(0, vectorizable), dest.Slice(0, vectorizable));

    source = source.Slice(vectorizable);
    dest = dest.Slice(vectorizable);
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// Converts every byte of <paramref name="source"/> to a float scaled by 1/255.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
/// </remarks>
/// <param name="source">The source bytes. The length must satisfy <c>VerifySpanInput</c> for the vector size in use.</param>
/// <param name="dest">The destination floats, one per source byte.</param>
internal static unsafe void ByteToNormalizedFloat(
    ReadOnlySpan<byte> source,
    Span<float> dest)
{
    // The widening loads below take a raw byte*. Unsafe.AsPointer does not pin, so a span
    // backed by a managed array could be relocated by the GC while the pointer is in use.
    // Pin the source for the whole conversion instead.
    fixed (byte* sourceBase = &MemoryMarshal.GetReference(source))
    {
        if (Avx2.IsSupported)
        {
            VerifySpanInput(source, dest, Vector256<byte>.Count);

            // One iteration converts Vector256<byte>.Count bytes into four float vectors.
            int n = dest.Length / Vector256<byte>.Count;

            ref Vector256<float> destBase =
                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));

            var scale = Vector256.Create(1 / (float)byte.MaxValue);

            for (int i = 0; i < n; i++)
            {
                int si = Vector256<byte>.Count * i;

                // Load 8 bytes at a time, widening each to a 32-bit integer.
                Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
                Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
                Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
                Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));

                Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
                Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
                Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
                Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));

                ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);

                d = f0;
                Unsafe.Add(ref d, 1) = f1;
                Unsafe.Add(ref d, 2) = f2;
                Unsafe.Add(ref d, 3) = f3;
            }
        }
        else
        {
            // Sse
            VerifySpanInput(source, dest, Vector128<byte>.Count);

            // One iteration converts Vector128<byte>.Count bytes into four float vectors.
            int n = dest.Length / Vector128<byte>.Count;

            ref Vector128<float> destBase =
                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));

            var scale = Vector128.Create(1 / (float)byte.MaxValue);
            Vector128<byte> zero = Vector128<byte>.Zero;

            for (int i = 0; i < n; i++)
            {
                int si = Vector128<byte>.Count * i;

                Vector128<int> i0, i1, i2, i3;
                if (Sse41.IsSupported)
                {
                    // Load 4 bytes at a time, widening each to a 32-bit integer.
                    i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
                    i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
                    i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
                    i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
                }
                else
                {
                    // SSE2 only: widen byte -> short -> int by interleaving with zeros.
                    Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
                    Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
                    Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();

                    i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
                    i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
                    i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
                    i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
                }

                Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
                Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
                Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
                Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));

                ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);

                d = f0;
                Unsafe.Add(ref d, 1) = f1;
                Unsafe.Add(ref d, 2) = f2;
                Unsafe.Add(ref d, 3) = f3;
            }
        }
    }
}
|||
|
|||
/// <summary>
/// Runs <see cref="NormalizedFloatToByteSaturate"/> over the longest prefix that fills whole vectors,
/// then advances both spans past that prefix so that only the remainder is left for the caller.
/// </summary>
/// <param name="source">The source floats; sliced down to the unprocessed tail on return.</param>
/// <param name="dest">The destination bytes; sliced down to the unprocessed tail on return.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (!Avx2.IsSupported && !Sse2.IsSupported)
    {
        // No accelerated path: leave everything to the caller.
        return;
    }

    // Prefer the 256-bit step when AVX2 is present, otherwise fall back to 128-bit.
    int step = Avx2.IsSupported ? Vector256<byte>.Count : Vector128<byte>.Count;
    int vectorizable = source.Length - ImageMaths.ModuloP2(source.Length, step);

    if (vectorizable <= 0)
    {
        return;
    }

    NormalizedFloatToByteSaturate(
        source.Slice(0, vectorizable),
        dest.Slice(0, vectorizable));

    source = source.Slice(vectorizable);
    dest = dest.Slice(vectorizable);
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// Multiplies every float by 255, converts it to an integer and narrows it to a byte with saturation.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
/// </remarks>
/// <param name="source">The source floats.</param>
/// <param name="dest">The destination bytes, one per source float.</param>
internal static void NormalizedFloatToByteSaturate(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    if (Avx2.IsSupported)
    {
        VerifySpanInput(source, dest, Vector256<byte>.Count);

        // One iteration turns four float vectors into a single byte vector.
        int blockCount = dest.Length / Vector256<byte>.Count;

        ref Vector256<float> srcVectors =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
        ref Vector256<byte> dstVectors =
            ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

        Vector256<float> scale = Vector256.Create((float)byte.MaxValue);

        // The 256-bit pack instructions work per 128-bit lane; this permute table (declared
        // elsewhere) presumably restores sequential element order - TODO confirm.
        Vector256<int> permuteMask =
            Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32));

        for (int block = 0; block < blockCount; block++)
        {
            ref Vector256<float> input = ref Unsafe.Add(ref srcVectors, block * 4);

            Vector256<int> w0 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, input));
            Vector256<int> w1 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, Unsafe.Add(ref input, 1)));
            Vector256<int> w2 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, Unsafe.Add(ref input, 2)));
            Vector256<int> w3 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, Unsafe.Add(ref input, 3)));

            // int -> short (signed saturation) -> byte (unsigned saturation).
            Vector256<short> lowHalf = Avx2.PackSignedSaturate(w0, w1);
            Vector256<short> highHalf = Avx2.PackSignedSaturate(w2, w3);
            Vector256<byte> packed = Avx2.PackUnsignedSaturate(lowHalf, highHalf);

            Unsafe.Add(ref dstVectors, block) = Avx2.PermuteVar8x32(packed.AsInt32(), permuteMask).AsByte();
        }

        return;
    }

    // Sse
    VerifySpanInput(source, dest, Vector128<byte>.Count);

    // One iteration turns four float vectors into a single byte vector.
    int count = dest.Length / Vector128<byte>.Count;

    ref Vector128<float> srcBase =
        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
    ref Vector128<byte> dstBase =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    Vector128<float> factor = Vector128.Create((float)byte.MaxValue);

    for (int block = 0; block < count; block++)
    {
        ref Vector128<float> input = ref Unsafe.Add(ref srcBase, block * 4);

        Vector128<int> w0 = Sse2.ConvertToVector128Int32(Sse.Multiply(factor, input));
        Vector128<int> w1 = Sse2.ConvertToVector128Int32(Sse.Multiply(factor, Unsafe.Add(ref input, 1)));
        Vector128<int> w2 = Sse2.ConvertToVector128Int32(Sse.Multiply(factor, Unsafe.Add(ref input, 2)));
        Vector128<int> w3 = Sse2.ConvertToVector128Int32(Sse.Multiply(factor, Unsafe.Add(ref input, 3)));

        // int -> short (signed saturation) -> byte (unsigned saturation).
        Vector128<short> lowHalf = Sse2.PackSignedSaturate(w0, w1);
        Vector128<short> highHalf = Sse2.PackSignedSaturate(w2, w3);

        Unsafe.Add(ref dstBase, block) = Sse2.PackUnsignedSaturate(lowHalf, highHalf);
    }
}
|||
} |
|||
} |
|||
} |
|||
#endif
|
|||
@ -0,0 +1,275 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Diagnostics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp
{
    /// <content>
    /// Span-based component shuffling: an optional hardware accelerated pass followed by a
    /// scalar fallback for whatever is left over.
    /// </content>
    internal static partial class SimdUtils
    {
        /// <summary>
        /// Re-orders 32-bit floating-point elements, four at a time, from <paramref name="source"/>
        /// into <paramref name="dest"/> as described by <paramref name="control"/>.
        /// </summary>
        /// <param name="source">The source span of floats.</param>
        /// <param name="dest">The destination span of floats.</param>
        /// <param name="control">The byte control.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static void Shuffle4(
            ReadOnlySpan<float> source,
            Span<float> dest,
            byte control)
        {
            VerifyShuffle4SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
            // Takes the spans by reference; what it leaves behind is handled below.
            HwIntrinsics.Shuffle4Reduce(ref source, ref dest, control);
#endif

            if (source.IsEmpty)
            {
                return;
            }

            Shuffle4Remainder(source, dest, control);
        }

        /// <summary>
        /// Re-orders 8-bit integers, four at a time, from <paramref name="source"/>
        /// into <paramref name="dest"/> using the given shuffle.
        /// </summary>
        /// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
        /// <param name="source">The source span of bytes.</param>
        /// <param name="dest">The destination span of bytes.</param>
        /// <param name="shuffle">The type of shuffle to perform.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static void Shuffle4<TShuffle>(
            ReadOnlySpan<byte> source,
            Span<byte> dest,
            TShuffle shuffle)
            where TShuffle : struct, IShuffle4
        {
            VerifyShuffle4SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
            // Takes the spans by reference; what it leaves behind is handled below.
            HwIntrinsics.Shuffle4Reduce(ref source, ref dest, shuffle.Control);
#endif

            if (source.IsEmpty)
            {
                return;
            }

            shuffle.RunFallbackShuffle(source, dest);
        }

        /// <summary>
        /// Re-orders 8-bit integer triplets from <paramref name="source"/>
        /// into <paramref name="dest"/> using the given shuffle.
        /// </summary>
        /// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
        /// <param name="source">The source span of bytes.</param>
        /// <param name="dest">The destination span of bytes.</param>
        /// <param name="shuffle">The type of shuffle to perform.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static void Shuffle3<TShuffle>(
            ReadOnlySpan<byte> source,
            Span<byte> dest,
            TShuffle shuffle)
            where TShuffle : struct, IShuffle3
        {
            VerifyShuffle3SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
            // Takes the spans by reference; what it leaves behind is handled below.
            HwIntrinsics.Shuffle3Reduce(ref source, ref dest, shuffle.Control);
#endif

            if (source.IsEmpty)
            {
                return;
            }

            shuffle.RunFallbackShuffle(source, dest);
        }

        /// <summary>
        /// Pads 8-bit integer triplets from <paramref name="source"/> to four components,
        /// shuffles them and stores the result in <paramref name="dest"/>.
        /// </summary>
        /// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
        /// <param name="source">The source span of bytes.</param>
        /// <param name="dest">The destination span of bytes.</param>
        /// <param name="shuffle">The type of shuffle to perform.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static void Pad3Shuffle4<TShuffle>(
            ReadOnlySpan<byte> source,
            Span<byte> dest,
            TShuffle shuffle)
            where TShuffle : struct, IPad3Shuffle4
        {
            VerifyPad3Shuffle4SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
            // Takes the spans by reference; what it leaves behind is handled below.
            HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, shuffle.Control);
#endif

            if (source.IsEmpty)
            {
                return;
            }

            shuffle.RunFallbackShuffle(source, dest);
        }

        /// <summary>
        /// Shuffles four-component 8-bit integer groups from <paramref name="source"/>, slices
        /// them down to three components and stores the result in <paramref name="dest"/>.
        /// </summary>
        /// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
        /// <param name="source">The source span of bytes.</param>
        /// <param name="dest">The destination span of bytes.</param>
        /// <param name="shuffle">The type of shuffle to perform.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static void Shuffle4Slice3<TShuffle>(
            ReadOnlySpan<byte> source,
            Span<byte> dest,
            TShuffle shuffle)
            where TShuffle : struct, IShuffle4Slice3
        {
            VerifyShuffle4Slice3SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
            // Takes the spans by reference; what it leaves behind is handled below.
            HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, shuffle.Control);
#endif

            if (source.IsEmpty)
            {
                return;
            }

            shuffle.RunFallbackShuffle(source, dest);
        }

        /// <summary>
        /// Scalar shuffle of floats in groups of four: within each group, destination
        /// element k receives the source element selected by bit pair k of <paramref name="control"/>.
        /// </summary>
        private static void Shuffle4Remainder(
            ReadOnlySpan<float> source,
            Span<float> dest,
            byte control)
        {
            Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

            ref float srcStart = ref MemoryMarshal.GetReference(source);
            ref float dstStart = ref MemoryMarshal.GetReference(dest);
            int length = source.Length;

            for (int offset = 0; offset < length; offset += 4)
            {
                ref float src = ref Unsafe.Add(ref srcStart, offset);
                ref float dst = ref Unsafe.Add(ref dstStart, offset);

                dst = Unsafe.Add(ref src, p0);
                Unsafe.Add(ref dst, 1) = Unsafe.Add(ref src, p1);
                Unsafe.Add(ref dst, 2) = Unsafe.Add(ref src, p2);
                Unsafe.Add(ref dst, 3) = Unsafe.Add(ref src, p3);
            }
        }

        /// <summary>
        /// Debug-only check: both spans have the same length and that length is a multiple of 4.
        /// </summary>
        [Conditional("DEBUG")]
        private static void VerifyShuffle4SpanInput<T>(ReadOnlySpan<T> source, Span<T> dest)
            where T : struct
        {
            bool sameLength = source.Length == dest.Length;
            DebugGuard.IsTrue(sameLength, nameof(source), "Input spans must be of same length!");

            bool wholeGroups = source.Length % 4 == 0;
            DebugGuard.IsTrue(wholeGroups, nameof(source), "Input spans must be divisable by 4!");
        }

        /// <summary>
        /// Debug-only check: both spans have the same length and that length is a multiple of 3.
        /// </summary>
        [Conditional("DEBUG")]
        private static void VerifyShuffle3SpanInput<T>(ReadOnlySpan<T> source, Span<T> dest)
            where T : struct
        {
            bool sameLength = source.Length == dest.Length;
            DebugGuard.IsTrue(sameLength, nameof(source), "Input spans must be of same length!");

            bool wholeGroups = source.Length % 3 == 0;
            DebugGuard.IsTrue(wholeGroups, nameof(source), "Input spans must be divisable by 3!");
        }

        /// <summary>
        /// Debug-only check: the input holds whole triplets, the output whole quadruplets,
        /// and the input is exactly 3/4 the length of the output.
        /// </summary>
        [Conditional("DEBUG")]
        private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan<byte> source, Span<byte> dest)
        {
            bool wholeTriplets = source.Length % 3 == 0;
            DebugGuard.IsTrue(wholeTriplets, nameof(source), "Input span must be divisable by 3!");

            bool wholeQuads = dest.Length % 4 == 0;
            DebugGuard.IsTrue(wholeQuads, nameof(dest), "Output span must be divisable by 4!");

            bool lengthsMatch = source.Length == dest.Length * 3 / 4;
            DebugGuard.IsTrue(lengthsMatch, nameof(source), "Input span must be 3/4 the length of the output span!");
        }

        /// <summary>
        /// Debug-only check: the input holds whole quadruplets, the output whole triplets,
        /// and the output is at least 3/4 the length of the input.
        /// </summary>
        [Conditional("DEBUG")]
        private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan<byte> source, Span<byte> dest)
        {
            bool wholeQuads = source.Length % 4 == 0;
            DebugGuard.IsTrue(wholeQuads, nameof(source), "Input span must be divisable by 4!");

            bool wholeTriplets = dest.Length % 3 == 0;
            DebugGuard.IsTrue(wholeTriplets, nameof(dest), "Output span must be divisable by 3!");

            bool destLargeEnough = dest.Length >= source.Length * 3 / 4;
            DebugGuard.IsTrue(destLargeEnough, nameof(source), "Output span must be at least 3/4 the length of the input span!");
        }

        /// <summary>
        /// Helpers for building and decoding 8-bit shuffle control values made of four 2-bit selectors.
        /// </summary>
        public static class Shuffle
        {
            /// <summary>
            /// Packs four selectors into one control byte: <paramref name="p3"/> lands in bits 7-6,
            /// <paramref name="p2"/> in bits 5-4, <paramref name="p1"/> in bits 3-2 and
            /// <paramref name="p0"/> in bits 1-0. The inputs are not masked.
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0)
            {
                return (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0);
            }

            /// <summary>
            /// Fills <paramref name="span"/> with byte indices, four at a time: the group starting
            /// at offset <c>i</c> receives <c>p0 + i, p1 + i, p2 + i, p3 + i</c>, where the
            /// selectors are decoded from <paramref name="control"/>.
            /// </summary>
            /// <remarks>
            /// Writes are unchecked and always cover a whole group of four, so the span length
            /// is expected to be a multiple of 4.
            /// </remarks>
            [MethodImpl(InliningOptions.ShortMethod)]
            public static void MmShuffleSpan(ref Span<byte> span, byte control)
            {
                InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

                ref byte start = ref MemoryMarshal.GetReference(span);
                int length = span.Length;

                for (int offset = 0; offset < length; offset += 4)
                {
                    ref byte group = ref Unsafe.Add(ref start, offset);

                    group = (byte)(p0 + offset);
                    Unsafe.Add(ref group, 1) = (byte)(p1 + offset);
                    Unsafe.Add(ref group, 2) = (byte)(p2 + offset);
                    Unsafe.Add(ref group, 3) = (byte)(p3 + offset);
                }
            }

            /// <summary>
            /// Splits a control byte into its four 2-bit selectors, <paramref name="p0"/> being
            /// the lowest bit pair and <paramref name="p3"/> the highest.
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            public static void InverseMmShuffle(
                byte control,
                out int p3,
                out int p2,
                out int p1,
                out int p0)
            {
                const int SelectorMask = 0x3;

                p0 = control & SelectorMask;
                p1 = (control >> 2) & SelectorMask;
                p2 = (control >> 4) & SelectorMask;
                p3 = (control >> 6) & SelectorMask;
            }
        }
    }
}
|||
@ -0,0 +1,21 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
    /// <summary>
    /// Measures adding a scalar to a default-initialized <see cref="Block8x8F"/> in place.
    /// </summary>
    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
    public class Block8x8F_AddInPlace
    {
        /// <summary>
        /// Adds 42 to every element of a fresh block.
        /// </summary>
        /// <returns>The scalar operand; the block itself is discarded.</returns>
        [Benchmark]
        public float AddInplace()
        {
            var block = default(Block8x8F);
            float addend = 42F;

            block.AddInPlace(addend);

            return addend;
        }
    }
}
|||
@ -0,0 +1,37 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
    /// <summary>
    /// Measures the block-by-block overload of <c>Block8x8F.MultiplyInPlace</c>.
    /// </summary>
    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
    public class Block8x8F_MultiplyInPlaceBlock
    {
        private static readonly Block8x8F Source = Create8x8FloatData();

        /// <summary>
        /// Calls <c>MultiplyInPlace</c> on the prepared source with a default-initialized block.
        /// </summary>
        [Benchmark]
        public void MultiplyInPlaceBlock()
        {
            var other = default(Block8x8F);
            Source.MultiplyInPlace(ref other);
        }

        /// <summary>
        /// Builds an 8x8 block whose element at (row, column) equals <c>row * 10 + column</c>.
        /// </summary>
        private static Block8x8F Create8x8FloatData()
        {
            const int Size = 8;
            float[] values = new float[Size * Size];

            for (int row = 0; row < Size; row++)
            {
                int rowStart = row * Size;
                int rowValue = row * 10;

                for (int column = 0; column < Size; column++)
                {
                    values[rowStart + column] = rowValue + column;
                }
            }

            Block8x8F block = default;
            block.LoadFrom(values);
            return block;
        }
    }
}
|||
@ -0,0 +1,21 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
    /// <summary>
    /// Measures multiplying a default-initialized <see cref="Block8x8F"/> by a scalar in place.
    /// </summary>
    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
    public class Block8x8F_MultiplyInPlaceScalar
    {
        /// <summary>
        /// Multiplies every element of a fresh block by 42.
        /// </summary>
        /// <returns>The scalar operand; the block itself is discarded.</returns>
        [Benchmark]
        public float MultiplyInPlaceScalar()
        {
            var block = default(Block8x8F);
            float factor = 42F;

            block.MultiplyInPlace(factor);

            return factor;
        }
    }
}
|||
@ -0,0 +1,37 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
    /// <summary>
    /// Measures <c>Block8x8F.TransposeInto</c>.
    /// </summary>
    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
    public class Block8x8F_Transpose
    {
        private static readonly Block8x8F Source = Create8x8FloatData();

        /// <summary>
        /// Transposes the prepared source block into a default-initialized destination.
        /// </summary>
        [Benchmark]
        public void TransposeInto()
        {
            Block8x8F target = default;
            Source.TransposeInto(ref target);
        }

        /// <summary>
        /// Builds an 8x8 block whose element at (row, column) equals <c>row * 10 + column</c>.
        /// </summary>
        private static Block8x8F Create8x8FloatData()
        {
            const int Size = 8;
            float[] values = new float[Size * Size];

            for (int row = 0; row < Size; row++)
            {
                int rowStart = row * Size;
                int rowValue = row * 10;

                for (int column = 0; column < Size; column++)
                {
                    values[rowStart + column] = rowValue + column;
                }
            }

            Block8x8F block = default;
            block.LoadFrom(values);
            return block;
        }
    }
}
|||
@ -0,0 +1,55 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.PixelFormats; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
    /// <summary>
    /// Closes the generic <c>FromVector4</c> benchmark over the <see cref="Rgb24"/> pixel type.
    /// Adds no members of its own; everything that runs comes from the base class.
    /// </summary>
    [Config(typeof(Config.ShortClr))]
    public class FromVector4_Rgb24 : FromVector4<Rgb24>
    {
    }
}
|||
|
|||
// 2020-11-02
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet = v0.12.1, OS = Windows 10.0.19041.572(2004 /?/ 20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT
|
|||
// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT
|
|||
// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// IterationCount=3 LaunchCount=1 WarmupCount=3
|
|||
//
|
|||
// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:|
|
|||
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 64 | 343.2 ns | 305.91 ns | 16.77 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 64 | 320.8 ns | 19.93 ns | 1.09 ns | 0.94 | 0.05 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 64 | 234.3 ns | 17.98 ns | 0.99 ns | 1.00 | 0.00 | 0.0052 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 64 | 246.0 ns | 82.34 ns | 4.51 ns | 1.05 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 64 | 222.3 ns | 39.46 ns | 2.16 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 64 | 243.4 ns | 33.58 ns | 1.84 ns | 1.09 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 256 | 824.9 ns | 32.77 ns | 1.80 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 256 | 967.0 ns | 39.09 ns | 2.14 ns | 1.17 | 0.01 | 0.0172 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 256 | 756.9 ns | 94.43 ns | 5.18 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 256 | 1,003.3 ns | 3,192.09 ns | 174.97 ns | 1.32 | 0.22 | 0.0172 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 256 | 748.6 ns | 248.03 ns | 13.60 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 256 | 437.0 ns | 36.48 ns | 2.00 ns | 0.58 | 0.01 | 0.0172 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 2048 | 5,751.6 ns | 704.24 ns | 38.60 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 2048 | 4,391.6 ns | 718.17 ns | 39.37 ns | 0.76 | 0.00 | 0.0153 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 2048 | 6,202.0 ns | 1,815.18 ns | 99.50 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 2048 | 4,225.6 ns | 1,004.03 ns | 55.03 ns | 0.68 | 0.01 | 0.0153 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 2048 | 6,157.1 ns | 2,516.98 ns | 137.96 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 2048 | 1,822.7 ns | 1,764.43 ns | 96.71 ns | 0.30 | 0.02 | 0.0172 | - | - | 72 B |
|
|||
@ -0,0 +1,87 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.HwIntrinsics_SSE_AVX))] |
|||
public class Pad3Shuffle4Channel |
|||
{ |
|||
private static readonly DefaultPad3Shuffle4 Control = new DefaultPad3Shuffle4(1, 0, 3, 2); |
|||
private static readonly XYZWPad3Shuffle4 ControlFast = default; |
|||
private byte[] source; |
|||
private byte[] destination; |
|||
|
|||
[GlobalSetup] |
|||
public void Setup() |
|||
{ |
|||
this.source = new byte[this.Count]; |
|||
new Random(this.Count).NextBytes(this.source); |
|||
this.destination = new byte[this.Count * 4 / 3]; |
|||
} |
|||
|
|||
[Params(96, 384, 768, 1536)] |
|||
public int Count { get; set; } |
|||
|
|||
[Benchmark] |
|||
public void Pad3Shuffle4() |
|||
{ |
|||
SimdUtils.Pad3Shuffle4(this.source, this.destination, Control); |
|||
} |
|||
|
|||
[Benchmark] |
|||
public void Pad3Shuffle4FastFallback() |
|||
{ |
|||
SimdUtils.Pad3Shuffle4(this.source, this.destination, ControlFast); |
|||
} |
|||
} |
|||
|
|||
// 2020-10-30
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |------------------------- |------------------- |-------------------------------------------------- |------ |------------:|----------:|----------:|------------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 120.64 ns | 7.190 ns | 21.200 ns | 114.26 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 96 | 23.63 ns | 0.175 ns | 0.155 ns | 23.65 ns | 0.15 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 25.25 ns | 0.356 ns | 0.298 ns | 25.27 ns | 0.17 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 14.80 ns | 0.358 ns | 1.032 ns | 14.64 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 96 | 24.84 ns | 0.376 ns | 0.333 ns | 24.74 ns | 1.57 | 0.06 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 96 | 24.58 ns | 0.471 ns | 0.704 ns | 24.38 ns | 1.60 | 0.09 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 258.92 ns | 4.873 ns | 4.069 ns | 257.95 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 384 | 41.41 ns | 0.859 ns | 1.204 ns | 41.33 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 40.74 ns | 0.848 ns | 0.793 ns | 40.48 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 74.50 ns | 0.490 ns | 0.383 ns | 74.49 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 384 | 40.74 ns | 0.624 ns | 0.584 ns | 40.72 ns | 0.55 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 384 | 38.28 ns | 0.534 ns | 0.417 ns | 38.22 ns | 0.51 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 503.91 ns | 6.466 ns | 6.048 ns | 501.58 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 768 | 62.86 ns | 0.332 ns | 0.277 ns | 62.80 ns | 0.12 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 64.59 ns | 0.469 ns | 0.415 ns | 64.62 ns | 0.13 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 110.51 ns | 0.592 ns | 0.554 ns | 110.33 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 768 | 64.72 ns | 1.306 ns | 1.090 ns | 64.51 ns | 0.59 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 768 | 62.11 ns | 0.816 ns | 0.682 ns | 61.98 ns | 0.56 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 1,005.84 ns | 13.176 ns | 12.325 ns | 1,004.70 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 110.05 ns | 0.256 ns | 0.214 ns | 110.04 ns | 0.11 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 110.23 ns | 0.545 ns | 0.483 ns | 110.09 ns | 0.11 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 220.37 ns | 1.601 ns | 1.419 ns | 220.13 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 1536 | 111.54 ns | 2.173 ns | 2.901 ns | 111.27 ns | 0.51 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 110.23 ns | 0.456 ns | 0.427 ns | 110.25 ns | 0.50 | 0.00 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,68 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.ShortCore31))] |
|||
public class PremultiplyVector4 |
|||
{ |
|||
private static readonly Vector4[] Vectors = CreateVectors(); |
|||
|
|||
[Benchmark(Baseline = true)] |
|||
public void PremultiplyBaseline() |
|||
{ |
|||
ref Vector4 baseRef = ref MemoryMarshal.GetReference<Vector4>(Vectors); |
|||
|
|||
for (int i = 0; i < Vectors.Length; i++) |
|||
{ |
|||
ref Vector4 v = ref Unsafe.Add(ref baseRef, i); |
|||
Premultiply(ref v); |
|||
} |
|||
} |
|||
|
|||
[Benchmark] |
|||
public void Premultiply() |
|||
{ |
|||
Vector4Utilities.Premultiply(Vectors); |
|||
} |
|||
|
|||
[MethodImpl(InliningOptions.ShortMethod)] |
|||
private static void Premultiply(ref Vector4 source) |
|||
{ |
|||
float w = source.W; |
|||
source *= w; |
|||
source.W = w; |
|||
} |
|||
|
|||
private static Vector4[] CreateVectors() |
|||
{ |
|||
var rnd = new Random(42); |
|||
return GenerateRandomVectorArray(rnd, 2048, 0, 1); |
|||
} |
|||
|
|||
private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal) |
|||
{ |
|||
var values = new Vector4[length]; |
|||
|
|||
for (int i = 0; i < length; i++) |
|||
{ |
|||
ref Vector4 v = ref values[i]; |
|||
v.X = GetRandomFloat(rnd, minVal, maxVal); |
|||
v.Y = GetRandomFloat(rnd, minVal, maxVal); |
|||
v.Z = GetRandomFloat(rnd, minVal, maxVal); |
|||
v.W = GetRandomFloat(rnd, minVal, maxVal); |
|||
} |
|||
|
|||
return values; |
|||
} |
|||
|
|||
private static float GetRandomFloat(Random rnd, float minVal, float maxVal) |
|||
=> ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal; |
|||
} |
|||
} |
|||
@ -0,0 +1,64 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.HwIntrinsics_SSE_AVX))] |
|||
public class Shuffle3Channel |
|||
{ |
|||
private static readonly DefaultShuffle3 Control = new DefaultShuffle3(1, 0, 2); |
|||
private byte[] source; |
|||
private byte[] destination; |
|||
|
|||
[GlobalSetup] |
|||
public void Setup() |
|||
{ |
|||
this.source = new byte[this.Count]; |
|||
new Random(this.Count).NextBytes(this.source); |
|||
this.destination = new byte[this.Count]; |
|||
} |
|||
|
|||
[Params(96, 384, 768, 1536)] |
|||
public int Count { get; set; } |
|||
|
|||
[Benchmark] |
|||
public void Shuffle3() |
|||
{ |
|||
SimdUtils.Shuffle3(this.source, this.destination, Control); |
|||
} |
|||
} |
|||
|
|||
// 2020-11-02
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |--------------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 48.46 ns | 1.034 ns | 2.438 ns | 47.46 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 96 | 32.42 ns | 0.537 ns | 0.476 ns | 32.34 ns | 0.66 | 0.04 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 32.51 ns | 0.373 ns | 0.349 ns | 32.56 ns | 0.66 | 0.03 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 199.04 ns | 1.512 ns | 1.180 ns | 199.17 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 384 | 71.20 ns | 2.654 ns | 7.784 ns | 69.60 ns | 0.41 | 0.02 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 63.23 ns | 0.569 ns | 0.505 ns | 63.21 ns | 0.32 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 391.28 ns | 5.087 ns | 3.972 ns | 391.22 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 768 | 109.12 ns | 2.149 ns | 2.010 ns | 108.66 ns | 0.28 | 0.01 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 106.51 ns | 0.734 ns | 0.613 ns | 106.56 ns | 0.27 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 773.70 ns | 5.516 ns | 4.890 ns | 772.96 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 1536 | 190.41 ns | 1.090 ns | 0.851 ns | 190.38 ns | 0.25 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 190.94 ns | 0.985 ns | 0.769 ns | 190.85 ns | 0.25 | 0.00 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,95 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.HwIntrinsics_SSE_AVX))] |
|||
public class Shuffle4Slice3Channel |
|||
{ |
|||
private static readonly DefaultShuffle4Slice3 Control = new DefaultShuffle4Slice3(1, 0, 3, 2); |
|||
private static readonly XYZWShuffle4Slice3 ControlFast = default; |
|||
private byte[] source; |
|||
private byte[] destination; |
|||
|
|||
[GlobalSetup] |
|||
public void Setup() |
|||
{ |
|||
this.source = new byte[this.Count]; |
|||
new Random(this.Count).NextBytes(this.source); |
|||
this.destination = new byte[(int)(this.Count * (3 / 4F))]; |
|||
} |
|||
|
|||
[Params(128, 256, 512, 1024, 2048)] |
|||
public int Count { get; set; } |
|||
|
|||
[Benchmark] |
|||
public void Shuffle4Slice3() |
|||
{ |
|||
SimdUtils.Shuffle4Slice3(this.source, this.destination, Control); |
|||
} |
|||
|
|||
[Benchmark] |
|||
public void Shuffle4Slice3FastFallback() |
|||
{ |
|||
SimdUtils.Shuffle4Slice3(this.source, this.destination, ControlFast); |
|||
} |
|||
} |
|||
|
|||
// 2020-10-29
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |--------------------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 56.44 ns | 2.843 ns | 8.382 ns | 56.70 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 128 | 27.15 ns | 0.556 ns | 0.762 ns | 27.34 ns | 0.41 | 0.03 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 128 | 26.36 ns | 0.321 ns | 0.268 ns | 26.26 ns | 0.38 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 25.85 ns | 0.494 ns | 0.462 ns | 25.84 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 128 | 26.15 ns | 0.113 ns | 0.106 ns | 26.16 ns | 1.01 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 128 | 25.57 ns | 0.078 ns | 0.061 ns | 25.56 ns | 0.99 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 97.47 ns | 0.327 ns | 0.289 ns | 97.35 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 256 | 32.61 ns | 0.107 ns | 0.095 ns | 32.62 ns | 0.33 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 256 | 33.21 ns | 0.169 ns | 0.150 ns | 33.15 ns | 0.34 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 52.34 ns | 0.779 ns | 0.729 ns | 51.94 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 256 | 32.16 ns | 0.111 ns | 0.104 ns | 32.16 ns | 0.61 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 256 | 33.61 ns | 0.342 ns | 0.319 ns | 33.62 ns | 0.64 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 210.74 ns | 3.825 ns | 5.956 ns | 207.70 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 512 | 51.03 ns | 0.535 ns | 0.501 ns | 51.18 ns | 0.24 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 512 | 66.60 ns | 1.313 ns | 1.613 ns | 65.93 ns | 0.31 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 119.12 ns | 1.905 ns | 1.689 ns | 118.52 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 512 | 50.33 ns | 0.382 ns | 0.339 ns | 50.41 ns | 0.42 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 512 | 49.25 ns | 0.555 ns | 0.492 ns | 49.26 ns | 0.41 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 423.55 ns | 4.891 ns | 4.336 ns | 423.27 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 1024 | 77.13 ns | 1.355 ns | 2.264 ns | 76.19 ns | 0.19 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 79.39 ns | 0.103 ns | 0.086 ns | 79.37 ns | 0.19 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 226.57 ns | 2.930 ns | 2.598 ns | 226.10 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 1024 | 80.25 ns | 1.647 ns | 2.082 ns | 80.98 ns | 0.35 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 84.99 ns | 1.234 ns | 1.155 ns | 85.60 ns | 0.38 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 794.96 ns | 1.735 ns | 1.538 ns | 795.15 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 2048 | 128.41 ns | 0.417 ns | 0.390 ns | 128.24 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 127.24 ns | 0.294 ns | 0.229 ns | 127.23 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 382.97 ns | 1.064 ns | 0.831 ns | 382.87 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 2048 | 126.93 ns | 0.382 ns | 0.339 ns | 126.94 ns | 0.33 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 149.36 ns | 1.875 ns | 1.754 ns | 149.33 ns | 0.39 | 0.00 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,67 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.HwIntrinsics_SSE_AVX))] |
|||
public class ShuffleByte4Channel |
|||
{ |
|||
private byte[] source; |
|||
private byte[] destination; |
|||
|
|||
[GlobalSetup] |
|||
public void Setup() |
|||
{ |
|||
this.source = new byte[this.Count]; |
|||
new Random(this.Count).NextBytes(this.source); |
|||
this.destination = new byte[this.Count]; |
|||
} |
|||
|
|||
[Params(128, 256, 512, 1024, 2048)] |
|||
public int Count { get; set; } |
|||
|
|||
[Benchmark] |
|||
public void Shuffle4Channel() |
|||
{ |
|||
SimdUtils.Shuffle4<WXYZShuffle4>(this.source, this.destination, default); |
|||
} |
|||
} |
|||
|
|||
// 2020-10-29
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 17.39 ns | 0.187 ns | 0.175 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 128 | 21.72 ns | 0.299 ns | 0.279 ns | 1.25 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 18.10 ns | 0.346 ns | 0.289 ns | 1.04 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 35.51 ns | 0.711 ns | 0.790 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 256 | 23.90 ns | 0.508 ns | 0.820 ns | 0.69 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 20.40 ns | 0.133 ns | 0.111 ns | 0.57 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 73.39 ns | 0.310 ns | 0.259 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 512 | 26.10 ns | 0.418 ns | 0.391 ns | 0.36 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 27.59 ns | 0.556 ns | 0.571 ns | 0.38 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 150.64 ns | 2.903 ns | 2.716 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 1024 | 38.67 ns | 0.801 ns | 1.889 ns | 0.24 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 47.13 ns | 0.948 ns | 1.054 ns | 0.31 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 315.29 ns | 5.206 ns | 6.583 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 2048 | 57.37 ns | 1.152 ns | 1.078 ns | 0.18 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 65.75 ns | 1.198 ns | 1.600 ns | 0.21 | 0.01 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,68 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Tests; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.HwIntrinsics_SSE_AVX))] |
|||
public class ShuffleFloat4Channel |
|||
{ |
|||
private static readonly byte Control = default(WXYZShuffle4).Control; |
|||
private float[] source; |
|||
private float[] destination; |
|||
|
|||
[GlobalSetup] |
|||
public void Setup() |
|||
{ |
|||
this.source = new Random(this.Count).GenerateRandomFloatArray(this.Count, 0, 256); |
|||
this.destination = new float[this.Count]; |
|||
} |
|||
|
|||
[Params(128, 256, 512, 1024, 2048)] |
|||
public int Count { get; set; } |
|||
|
|||
[Benchmark] |
|||
public void Shuffle4Channel() |
|||
{ |
|||
SimdUtils.Shuffle4(this.source, this.destination, Control); |
|||
} |
|||
} |
|||
|
|||
// 2020-10-29
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------- |------------------- |-------------------------------------------------- |------ |-----------:|----------:|----------:|------:|------:|------:|------:|----------:|
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.647 ns | 0.5475 ns | 0.4853 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 128 | 9.818 ns | 0.1457 ns | 0.1292 ns | 0.15 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 15.267 ns | 0.1005 ns | 0.0940 ns | 0.24 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 125.586 ns | 1.9312 ns | 1.8064 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 256 | 15.878 ns | 0.1983 ns | 0.1758 ns | 0.13 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 29.170 ns | 0.2925 ns | 0.2442 ns | 0.23 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 263.859 ns | 2.6660 ns | 2.3634 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 512 | 29.452 ns | 0.3334 ns | 0.3118 ns | 0.11 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 52.912 ns | 0.1932 ns | 0.1713 ns | 0.20 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 495.717 ns | 1.9850 ns | 1.8567 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 1024 | 53.757 ns | 0.3212 ns | 0.2847 ns | 0.11 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 107.815 ns | 1.6201 ns | 1.3528 ns | 0.22 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 980.134 ns | 3.7407 ns | 3.1237 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 2048 | 105.120 ns | 0.6140 ns | 0.5443 ns | 0.11 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 216.473 ns | 2.3268 ns | 2.0627 ns | 0.22 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,65 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
using SixLabors.ImageSharp.Memory; |
|||
using SixLabors.ImageSharp.PixelFormats; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.ShortClr))] |
|||
public class ToVector4_Rgb24 : ToVector4<Rgb24> |
|||
{ |
|||
[Benchmark(Baseline = true)] |
|||
public void PixelOperations_Base() |
|||
{ |
|||
new PixelOperations<Rgb24>().ToVector4( |
|||
this.Configuration, |
|||
this.source.GetSpan(), |
|||
this.destination.GetSpan()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
// 2020-11-02
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT
|
|||
// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT
|
|||
// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// IterationCount=3 LaunchCount=1 WarmupCount=3
|
|||
//
|
|||
// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:|
|
|||
// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 64 | 298.4 ns | 33.63 ns | 1.84 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 64 | 355.5 ns | 908.51 ns | 49.80 ns | 1.19 | 0.17 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 64 | 220.1 ns | 13.77 ns | 0.75 ns | 1.00 | 0.00 | 0.0055 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 64 | 228.5 ns | 41.41 ns | 2.27 ns | 1.04 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 64 | 213.6 ns | 12.47 ns | 0.68 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 64 | 217.0 ns | 9.95 ns | 0.55 ns | 1.02 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 256 | 829.0 ns | 242.93 ns | 13.32 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 256 | 448.9 ns | 4.04 ns | 0.22 ns | 0.54 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 256 | 863.0 ns | 1,253.26 ns | 68.70 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 256 | 309.2 ns | 66.16 ns | 3.63 ns | 0.36 | 0.03 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 256 | 737.0 ns | 253.90 ns | 13.92 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 256 | 212.3 ns | 1.07 ns | 0.06 ns | 0.29 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 2048 | 5,625.6 ns | 404.35 ns | 22.16 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 2048 | 1,974.1 ns | 229.84 ns | 12.60 ns | 0.35 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 2048 | 5,467.2 ns | 537.29 ns | 29.45 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 2048 | 1,985.5 ns | 4,714.23 ns | 258.40 ns | 0.36 | 0.05 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 2048 | 5,888.2 ns | 1,622.23 ns | 88.92 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 2048 | 1,165.0 ns | 191.71 ns | 10.51 ns | 0.20 | 0.00 | - | - | - | - |
|
|||
@ -0,0 +1,68 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
[Config(typeof(Config.ShortCore31))] |
|||
public class UnPremultiplyVector4 |
|||
{ |
|||
private static readonly Vector4[] Vectors = CreateVectors(); |
|||
|
|||
[Benchmark(Baseline = true)] |
|||
public void UnPremultiplyBaseline() |
|||
{ |
|||
ref Vector4 baseRef = ref MemoryMarshal.GetReference<Vector4>(Vectors); |
|||
|
|||
for (int i = 0; i < Vectors.Length; i++) |
|||
{ |
|||
ref Vector4 v = ref Unsafe.Add(ref baseRef, i); |
|||
UnPremultiply(ref v); |
|||
} |
|||
} |
|||
|
|||
[Benchmark] |
|||
public void UnPremultiply() |
|||
{ |
|||
Vector4Utilities.UnPremultiply(Vectors); |
|||
} |
|||
|
|||
[MethodImpl(InliningOptions.ShortMethod)] |
|||
private static void UnPremultiply(ref Vector4 source) |
|||
{ |
|||
float w = source.W; |
|||
source /= w; |
|||
source.W = w; |
|||
} |
|||
|
|||
private static Vector4[] CreateVectors() |
|||
{ |
|||
var rnd = new Random(42); |
|||
return GenerateRandomVectorArray(rnd, 2048, 0, 1); |
|||
} |
|||
|
|||
private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal) |
|||
{ |
|||
var values = new Vector4[length]; |
|||
|
|||
for (int i = 0; i < length; i++) |
|||
{ |
|||
ref Vector4 v = ref values[i]; |
|||
v.X = GetRandomFloat(rnd, minVal, maxVal); |
|||
v.Y = GetRandomFloat(rnd, minVal, maxVal); |
|||
v.Z = GetRandomFloat(rnd, minVal, maxVal); |
|||
v.W = GetRandomFloat(rnd, minVal, maxVal); |
|||
} |
|||
|
|||
return values; |
|||
} |
|||
|
|||
private static float GetRandomFloat(Random rnd, float minVal, float maxVal) |
|||
=> ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal; |
|||
} |
|||
} |
|||
@ -0,0 +1,84 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics.X86; |
|||
#endif
|
|||
using BenchmarkDotNet.Environments; |
|||
using BenchmarkDotNet.Jobs; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks |
|||
{ |
|||
public partial class Config |
|||
{ |
|||
private const string On = "1"; |
|||
private const string Off = "0"; |
|||
|
|||
// See https://github.com/SixLabors/ImageSharp/pull/1229#discussion_r440477861
|
|||
// * EnableHWIntrinsic
|
|||
// * EnableSSE
|
|||
// * EnableSSE2
|
|||
// * EnableAES
|
|||
// * EnablePCLMULQDQ
|
|||
// * EnableSSE3
|
|||
// * EnableSSSE3
|
|||
// * EnableSSE41
|
|||
// * EnableSSE42
|
|||
// * EnablePOPCNT
|
|||
// * EnableAVX
|
|||
// * EnableFMA
|
|||
// * EnableAVX2
|
|||
// * EnableBMI1
|
|||
// * EnableBMI2
|
|||
// * EnableLZCNT
|
|||
//
|
|||
// `FeatureSIMD` ends up impacting all SIMD support(including `System.Numerics`) but not things
|
|||
// like `LZCNT`, `BMI1`, or `BMI2`
|
|||
// `EnableSSE3_4` is a legacy switch that exists for compat and is basically the same as `EnableSSE3`
|
|||
private const string EnableAES = "COMPlus_EnableAES"; |
|||
private const string EnableAVX = "COMPlus_EnableAVX"; |
|||
private const string EnableAVX2 = "COMPlus_EnableAVX2"; |
|||
private const string EnableBMI1 = "COMPlus_EnableBMI1"; |
|||
private const string EnableBMI2 = "COMPlus_EnableBMI2"; |
|||
private const string EnableFMA = "COMPlus_EnableFMA"; |
|||
private const string EnableHWIntrinsic = "COMPlus_EnableHWIntrinsic"; |
|||
private const string EnableLZCNT = "COMPlus_EnableLZCNT"; |
|||
private const string EnablePCLMULQDQ = "COMPlus_EnablePCLMULQDQ"; |
|||
private const string EnablePOPCNT = "COMPlus_EnablePOPCNT"; |
|||
private const string EnableSSE = "COMPlus_EnableSSE"; |
|||
private const string EnableSSE2 = "COMPlus_EnableSSE2"; |
|||
private const string EnableSSE3 = "COMPlus_EnableSSE3"; |
|||
private const string EnableSSE3_4 = "COMPlus_EnableSSE3_4"; |
|||
private const string EnableSSE41 = "COMPlus_EnableSSE41"; |
|||
private const string EnableSSE42 = "COMPlus_EnableSSE42"; |
|||
private const string EnableSSSE3 = "COMPlus_EnableSSSE3"; |
|||
private const string FeatureSIMD = "COMPlus_FeatureSIMD"; |
|||
|
|||
public class HwIntrinsics_SSE_AVX : Config |
|||
{ |
|||
public HwIntrinsics_SSE_AVX() |
|||
{ |
|||
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) |
|||
.WithEnvironmentVariables( |
|||
new EnvironmentVariable(EnableHWIntrinsic, Off), |
|||
new EnvironmentVariable(FeatureSIMD, Off)) |
|||
.WithId("1. No HwIntrinsics").AsBaseline()); |
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
if (Avx.IsSupported) |
|||
{ |
|||
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) |
|||
.WithId("2. AVX")); |
|||
} |
|||
|
|||
if (Sse.IsSupported) |
|||
{ |
|||
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) |
|||
.WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) |
|||
.WithId("3. SSE")); |
|||
} |
|||
#endif
|
|||
} |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,399 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using SixLabors.ImageSharp.Tests.TestUtilities; |
|||
using Xunit; |
|||
|
|||
namespace SixLabors.ImageSharp.Tests.Common |
|||
{ |
|||
public partial class SimdUtilsTests |
|||
{ |
|||
[Theory] |
|||
[MemberData(nameof(ArraySizesDivisibleBy4))] |
|||
public void BulkShuffleFloat4Channel(int count) |
|||
{ |
|||
static void RunTest(string serialized) |
|||
{ |
|||
// No need to test multiple shuffle controls as the
|
|||
// pipeline is always the same.
|
|||
int size = FeatureTestRunner.Deserialize<int>(serialized); |
|||
byte control = default(WZYXShuffle4).Control; |
|||
|
|||
TestShuffleFloat4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, control), |
|||
control); |
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
RunTest, |
|||
count, |
|||
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); |
|||
} |
|||
|
|||
[Theory] |
|||
[MemberData(nameof(ArraySizesDivisibleBy4))] |
|||
public void BulkShuffleByte4Channel(int count) |
|||
{ |
|||
static void RunTest(string serialized) |
|||
{ |
|||
int size = FeatureTestRunner.Deserialize<int>(serialized); |
|||
|
|||
// These cannot be expressed as a theory as you cannot
|
|||
// use RemoteExecutor within generic methods nor pass
|
|||
// IShuffle4 to the generic utils method.
|
|||
WXYZShuffle4 wxyz = default; |
|||
TestShuffleByte4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wxyz), |
|||
wxyz.Control); |
|||
|
|||
WZYXShuffle4 wzyx = default; |
|||
TestShuffleByte4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wzyx), |
|||
wzyx.Control); |
|||
|
|||
YZWXShuffle4 yzwx = default; |
|||
TestShuffleByte4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yzwx), |
|||
yzwx.Control); |
|||
|
|||
ZYXWShuffle4 zyxw = default; |
|||
TestShuffleByte4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, zyxw), |
|||
zyxw.Control); |
|||
|
|||
var xwyz = new DefaultShuffle4(2, 1, 3, 0); |
|||
TestShuffleByte4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, xwyz), |
|||
xwyz.Control); |
|||
|
|||
var yyyy = new DefaultShuffle4(1, 1, 1, 1); |
|||
TestShuffleByte4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yyyy), |
|||
yyyy.Control); |
|||
|
|||
var wwww = new DefaultShuffle4(3, 3, 3, 3); |
|||
TestShuffleByte4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wwww), |
|||
wwww.Control); |
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
RunTest, |
|||
count, |
|||
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); |
|||
} |
|||
|
|||
[Theory] |
|||
[MemberData(nameof(ArraySizesDivisibleBy3))] |
|||
public void BulkShuffleByte3Channel(int count) |
|||
{ |
|||
static void RunTest(string serialized) |
|||
{ |
|||
int size = FeatureTestRunner.Deserialize<int>(serialized); |
|||
|
|||
// These cannot be expressed as a theory as you cannot
|
|||
// use RemoteExecutor within generic methods nor pass
|
|||
// IShuffle3 to the generic utils method.
|
|||
var zyx = new DefaultShuffle3(0, 1, 2); |
|||
TestShuffleByte3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle3(s.Span, d.Span, zyx), |
|||
zyx.Control); |
|||
|
|||
var xyz = new DefaultShuffle3(2, 1, 0); |
|||
TestShuffleByte3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle3(s.Span, d.Span, xyz), |
|||
xyz.Control); |
|||
|
|||
var yyy = new DefaultShuffle3(1, 1, 1); |
|||
TestShuffleByte3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle3(s.Span, d.Span, yyy), |
|||
yyy.Control); |
|||
|
|||
var zzz = new DefaultShuffle3(2, 2, 2); |
|||
TestShuffleByte3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle3(s.Span, d.Span, zzz), |
|||
zzz.Control); |
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
RunTest, |
|||
count, |
|||
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); |
|||
} |
|||
|
|||
[Theory] |
|||
[MemberData(nameof(ArraySizesDivisibleBy3))] |
|||
public void BulkPad3Shuffle4Channel(int count) |
|||
{ |
|||
static void RunTest(string serialized) |
|||
{ |
|||
int size = FeatureTestRunner.Deserialize<int>(serialized); |
|||
|
|||
// These cannot be expressed as a theory as you cannot
|
|||
// use RemoteExecutor within generic methods nor pass
|
|||
// IPad3Shuffle4 to the generic utils method.
|
|||
XYZWPad3Shuffle4 xyzw = default; |
|||
TestPad3Shuffle4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, xyzw), |
|||
xyzw.Control); |
|||
|
|||
var xwyz = new DefaultPad3Shuffle4(2, 1, 3, 0); |
|||
TestPad3Shuffle4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, xwyz), |
|||
xwyz.Control); |
|||
|
|||
var yyyy = new DefaultPad3Shuffle4(1, 1, 1, 1); |
|||
TestPad3Shuffle4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, yyyy), |
|||
yyyy.Control); |
|||
|
|||
var wwww = new DefaultPad3Shuffle4(3, 3, 3, 3); |
|||
TestPad3Shuffle4Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, wwww), |
|||
wwww.Control); |
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
RunTest, |
|||
count, |
|||
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); |
|||
} |
|||
|
|||
[Theory] |
|||
[MemberData(nameof(ArraySizesDivisibleBy4))] |
|||
public void BulkShuffle4Slice3Channel(int count) |
|||
{ |
|||
static void RunTest(string serialized) |
|||
{ |
|||
int size = FeatureTestRunner.Deserialize<int>(serialized); |
|||
|
|||
// These cannot be expressed as a theory as you cannot
|
|||
// use RemoteExecutor within generic methods nor pass
|
|||
// IShuffle4Slice3 to the generic utils method.
|
|||
XYZWShuffle4Slice3 xyzw = default; |
|||
TestShuffle4Slice3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, xyzw), |
|||
xyzw.Control); |
|||
|
|||
var xwyz = new DefaultShuffle4Slice3(2, 1, 3, 0); |
|||
TestShuffle4Slice3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, xwyz), |
|||
xwyz.Control); |
|||
|
|||
var yyyy = new DefaultShuffle4Slice3(1, 1, 1, 1); |
|||
TestShuffle4Slice3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, yyyy), |
|||
yyyy.Control); |
|||
|
|||
var wwww = new DefaultShuffle4Slice3(3, 3, 3, 3); |
|||
TestShuffle4Slice3Channel( |
|||
size, |
|||
(s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, wwww), |
|||
wwww.Control); |
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
RunTest, |
|||
count, |
|||
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); |
|||
} |
|||
|
|||
private static void TestShuffleFloat4Channel( |
|||
int count, |
|||
Action<Memory<float>, Memory<float>> convert, |
|||
byte control) |
|||
{ |
|||
float[] source = new Random(count).GenerateRandomFloatArray(count, 0, 256); |
|||
var result = new float[count]; |
|||
|
|||
float[] expected = new float[count]; |
|||
|
|||
SimdUtils.Shuffle.InverseMmShuffle( |
|||
control, |
|||
out int p3, |
|||
out int p2, |
|||
out int p1, |
|||
out int p0); |
|||
|
|||
for (int i = 0; i < expected.Length; i += 4) |
|||
{ |
|||
expected[i] = source[p0 + i]; |
|||
expected[i + 1] = source[p1 + i]; |
|||
expected[i + 2] = source[p2 + i]; |
|||
expected[i + 3] = source[p3 + i]; |
|||
} |
|||
|
|||
convert(source, result); |
|||
|
|||
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5F)); |
|||
} |
|||
|
|||
private static void TestShuffleByte4Channel( |
|||
int count, |
|||
Action<Memory<byte>, Memory<byte>> convert, |
|||
byte control) |
|||
{ |
|||
byte[] source = new byte[count]; |
|||
new Random(count).NextBytes(source); |
|||
var result = new byte[count]; |
|||
|
|||
byte[] expected = new byte[count]; |
|||
|
|||
SimdUtils.Shuffle.InverseMmShuffle( |
|||
control, |
|||
out int p3, |
|||
out int p2, |
|||
out int p1, |
|||
out int p0); |
|||
|
|||
for (int i = 0; i < expected.Length; i += 4) |
|||
{ |
|||
expected[i] = source[p0 + i]; |
|||
expected[i + 1] = source[p1 + i]; |
|||
expected[i + 2] = source[p2 + i]; |
|||
expected[i + 3] = source[p3 + i]; |
|||
} |
|||
|
|||
convert(source, result); |
|||
|
|||
Assert.Equal(expected, result); |
|||
} |
|||
|
|||
private static void TestShuffleByte3Channel( |
|||
int count, |
|||
Action<Memory<byte>, Memory<byte>> convert, |
|||
byte control) |
|||
{ |
|||
byte[] source = new byte[count]; |
|||
new Random(count).NextBytes(source); |
|||
var result = new byte[count]; |
|||
|
|||
byte[] expected = new byte[count]; |
|||
|
|||
SimdUtils.Shuffle.InverseMmShuffle( |
|||
control, |
|||
out int _, |
|||
out int p2, |
|||
out int p1, |
|||
out int p0); |
|||
|
|||
for (int i = 0; i < expected.Length; i += 3) |
|||
{ |
|||
expected[i] = source[p0 + i]; |
|||
expected[i + 1] = source[p1 + i]; |
|||
expected[i + 2] = source[p2 + i]; |
|||
} |
|||
|
|||
convert(source, result); |
|||
|
|||
Assert.Equal(expected, result); |
|||
} |
|||
|
|||
private static void TestPad3Shuffle4Channel( |
|||
int count, |
|||
Action<Memory<byte>, Memory<byte>> convert, |
|||
byte control) |
|||
{ |
|||
byte[] source = new byte[count]; |
|||
new Random(count).NextBytes(source); |
|||
|
|||
var result = new byte[count * 4 / 3]; |
|||
|
|||
byte[] expected = new byte[result.Length]; |
|||
|
|||
SimdUtils.Shuffle.InverseMmShuffle( |
|||
control, |
|||
out int p3, |
|||
out int p2, |
|||
out int p1, |
|||
out int p0); |
|||
|
|||
for (int i = 0, j = 0; i < expected.Length; i += 4, j += 3) |
|||
{ |
|||
expected[p0 + i] = source[j]; |
|||
expected[p1 + i] = source[j + 1]; |
|||
expected[p2 + i] = source[j + 2]; |
|||
expected[p3 + i] = byte.MaxValue; |
|||
} |
|||
|
|||
Span<byte> temp = stackalloc byte[4]; |
|||
for (int i = 0, j = 0; i < expected.Length; i += 4, j += 3) |
|||
{ |
|||
temp[0] = source[j]; |
|||
temp[1] = source[j + 1]; |
|||
temp[2] = source[j + 2]; |
|||
temp[3] = byte.MaxValue; |
|||
|
|||
expected[i] = temp[p0]; |
|||
expected[i + 1] = temp[p1]; |
|||
expected[i + 2] = temp[p2]; |
|||
expected[i + 3] = temp[p3]; |
|||
} |
|||
|
|||
convert(source, result); |
|||
|
|||
for (int i = 0; i < expected.Length; i++) |
|||
{ |
|||
Assert.Equal(expected[i], result[i]); |
|||
} |
|||
|
|||
Assert.Equal(expected, result); |
|||
} |
|||
|
|||
private static void TestShuffle4Slice3Channel( |
|||
int count, |
|||
Action<Memory<byte>, Memory<byte>> convert, |
|||
byte control) |
|||
{ |
|||
byte[] source = new byte[count]; |
|||
new Random(count).NextBytes(source); |
|||
|
|||
var result = new byte[count * 3 / 4]; |
|||
|
|||
byte[] expected = new byte[result.Length]; |
|||
|
|||
SimdUtils.Shuffle.InverseMmShuffle( |
|||
control, |
|||
out int _, |
|||
out int p2, |
|||
out int p1, |
|||
out int p0); |
|||
|
|||
for (int i = 0, j = 0; i < expected.Length; i += 3, j += 4) |
|||
{ |
|||
expected[i] = source[p0 + j]; |
|||
expected[i + 1] = source[p1 + j]; |
|||
expected[i + 2] = source[p2 + j]; |
|||
} |
|||
|
|||
convert(source, result); |
|||
|
|||
for (int i = 0; i < expected.Length; i++) |
|||
{ |
|||
Assert.Equal(expected[i], result[i]); |
|||
} |
|||
|
|||
Assert.Equal(expected, result); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,319 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Collections.Generic; |
|||
using System.Diagnostics; |
|||
using Microsoft.DotNet.RemoteExecutor; |
|||
using Xunit.Abstractions; |
|||
|
|||
namespace SixLabors.ImageSharp.Tests.TestUtilities |
|||
{ |
|||
/// <summary>
|
|||
/// Allows the testing against specific feature sets.
|
|||
/// </summary>
|
|||
public static class FeatureTestRunner |
|||
{ |
|||
private static readonly char[] SplitChars = new[] { ',', ' ' }; |
|||
|
|||
/// <summary>
|
|||
/// Allows the deserialization of parameters passed to the feature test.
|
|||
/// <remark>
|
|||
/// <para>
|
|||
/// This is required because <see cref="RemoteExecutor"/> does not allow
|
|||
/// marshalling of fields so we cannot pass a wrapped <see cref="Action{T}"/>
|
|||
/// allowing automatic deserialization.
|
|||
/// </para>
|
|||
/// </remark>
|
|||
/// </summary>
|
|||
/// <typeparam name="T">The type to deserialize to.</typeparam>
|
|||
/// <param name="value">The string value to deserialize.</param>
|
|||
/// <returns>The <see cref="T"/> value.</returns>
|
|||
public static T DeserializeForXunit<T>(string value) |
|||
where T : IXunitSerializable |
|||
=> BasicSerializer.Deserialize<T>(value); |
|||
|
|||
/// <summary>
|
|||
/// Allows the deserialization of types implementing <see cref="IConvertible"/>
|
|||
/// passed to the feature test.
|
|||
/// </summary>
|
|||
/// <param name="value">The string value to deserialize.</param>
|
|||
/// <returns>The <typeparamref name="T"/> value.</returns>
|
|||
public static T Deserialize<T>(string value) |
|||
where T : IConvertible |
|||
=> (T)Convert.ChangeType(value, typeof(T)); |
|||
|
|||
/// <summary>
|
|||
/// Runs the given test <paramref name="action"/> within an environment
|
|||
/// where the given <paramref name="intrinsics"/> features.
|
|||
/// </summary>
|
|||
/// <param name="action">The test action to run.</param>
|
|||
/// <param name="intrinsics">The intrinsics features.</param>
|
|||
public static void RunWithHwIntrinsicsFeature( |
|||
Action action, |
|||
HwIntrinsics intrinsics) |
|||
{ |
|||
if (!RemoteExecutor.IsSupported) |
|||
{ |
|||
return; |
|||
} |
|||
|
|||
foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection()) |
|||
{ |
|||
var processStartInfo = new ProcessStartInfo(); |
|||
if (intrinsic.Key != HwIntrinsics.AllowAll) |
|||
{ |
|||
processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; |
|||
|
|||
RemoteExecutor.Invoke( |
|||
action, |
|||
new RemoteInvokeOptions |
|||
{ |
|||
StartInfo = processStartInfo |
|||
}) |
|||
.Dispose(); |
|||
} |
|||
else |
|||
{ |
|||
// Since we are running using the default architecture there is no
|
|||
// point creating the overhead of running the action in a separate process.
|
|||
action(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/// <summary>
|
|||
/// Runs the given test <paramref name="action"/> within an environment
|
|||
/// where the given <paramref name="intrinsics"/> features.
|
|||
/// </summary>
|
|||
/// <param name="action">
|
|||
/// The test action to run.
|
|||
/// The parameter passed will be a string representing the currently testing <see cref="HwIntrinsics"/>.</param>
|
|||
/// <param name="intrinsics">The intrinsics features.</param>
|
|||
public static void RunWithHwIntrinsicsFeature( |
|||
Action<string> action, |
|||
HwIntrinsics intrinsics) |
|||
{ |
|||
if (!RemoteExecutor.IsSupported) |
|||
{ |
|||
return; |
|||
} |
|||
|
|||
foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection()) |
|||
{ |
|||
var processStartInfo = new ProcessStartInfo(); |
|||
if (intrinsic.Key != HwIntrinsics.AllowAll) |
|||
{ |
|||
processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; |
|||
|
|||
RemoteExecutor.Invoke( |
|||
action, |
|||
intrinsic.Key.ToString(), |
|||
new RemoteInvokeOptions |
|||
{ |
|||
StartInfo = processStartInfo |
|||
}) |
|||
.Dispose(); |
|||
} |
|||
else |
|||
{ |
|||
// Since we are running using the default architecture there is no
|
|||
// point creating the overhead of running the action in a separate process.
|
|||
action(intrinsic.Key.ToString()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/// <summary>
|
|||
/// Runs the given test <paramref name="action"/> within an environment
|
|||
/// where the given <paramref name="intrinsics"/> features.
|
|||
/// </summary>
|
|||
/// <param name="action">The test action to run.</param>
|
|||
/// <param name="intrinsics">The intrinsics features.</param>
|
|||
/// <param name="serializable">The value to pass as a parameter to the test action.</param>
|
|||
public static void RunWithHwIntrinsicsFeature<T>( |
|||
Action<string> action, |
|||
HwIntrinsics intrinsics, |
|||
T serializable) |
|||
where T : IXunitSerializable |
|||
{ |
|||
if (!RemoteExecutor.IsSupported) |
|||
{ |
|||
return; |
|||
} |
|||
|
|||
foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection()) |
|||
{ |
|||
var processStartInfo = new ProcessStartInfo(); |
|||
if (intrinsic.Key != HwIntrinsics.AllowAll) |
|||
{ |
|||
processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; |
|||
|
|||
RemoteExecutor.Invoke( |
|||
action, |
|||
BasicSerializer.Serialize(serializable), |
|||
new RemoteInvokeOptions |
|||
{ |
|||
StartInfo = processStartInfo |
|||
}) |
|||
.Dispose(); |
|||
} |
|||
else |
|||
{ |
|||
// Since we are running using the default architecture there is no
|
|||
// point creating the overhead of running the action in a separate process.
|
|||
action(BasicSerializer.Serialize(serializable)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/// <summary>
|
|||
/// Runs the given test <paramref name="action"/> within an environment
|
|||
/// where the given <paramref name="intrinsics"/> features.
|
|||
/// </summary>
|
|||
/// <param name="action">The test action to run.</param>
|
|||
/// <param name="intrinsics">The intrinsics features.</param>
|
|||
/// <param name="serializable">The value to pass as a parameter to the test action.</param>
|
|||
public static void RunWithHwIntrinsicsFeature<T>( |
|||
Action<string, string> action, |
|||
HwIntrinsics intrinsics, |
|||
T serializable) |
|||
where T : IXunitSerializable |
|||
{ |
|||
if (!RemoteExecutor.IsSupported) |
|||
{ |
|||
return; |
|||
} |
|||
|
|||
foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection()) |
|||
{ |
|||
var processStartInfo = new ProcessStartInfo(); |
|||
if (intrinsic.Key != HwIntrinsics.AllowAll) |
|||
{ |
|||
processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; |
|||
|
|||
RemoteExecutor.Invoke( |
|||
action, |
|||
BasicSerializer.Serialize(serializable), |
|||
intrinsic.Key.ToString(), |
|||
new RemoteInvokeOptions |
|||
{ |
|||
StartInfo = processStartInfo |
|||
}) |
|||
.Dispose(); |
|||
} |
|||
else |
|||
{ |
|||
// Since we are running using the default architecture there is no
|
|||
// point creating the overhead of running the action in a separate process.
|
|||
action(BasicSerializer.Serialize(serializable), intrinsic.Key.ToString()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/// <summary>
|
|||
/// Runs the given test <paramref name="action"/> within an environment
|
|||
/// where the given <paramref name="intrinsics"/> features.
|
|||
/// </summary>
|
|||
/// <param name="action">The test action to run.</param>
|
|||
/// <param name="serializable">The value to pass as a parameter to the test action.</param>
|
|||
/// <param name="intrinsics">The intrinsics features.</param>
|
|||
public static void RunWithHwIntrinsicsFeature<T>( |
|||
Action<string> action, |
|||
T serializable, |
|||
HwIntrinsics intrinsics) |
|||
where T : IConvertible |
|||
{ |
|||
if (!RemoteExecutor.IsSupported) |
|||
{ |
|||
return; |
|||
} |
|||
|
|||
foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection()) |
|||
{ |
|||
var processStartInfo = new ProcessStartInfo(); |
|||
if (intrinsic.Key != HwIntrinsics.AllowAll) |
|||
{ |
|||
processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; |
|||
|
|||
RemoteExecutor.Invoke( |
|||
action, |
|||
serializable.ToString(), |
|||
new RemoteInvokeOptions |
|||
{ |
|||
StartInfo = processStartInfo |
|||
}) |
|||
.Dispose(); |
|||
} |
|||
else |
|||
{ |
|||
// Since we are running using the default architecture there is no
|
|||
// point creating the overhead of running the action in a separate process.
|
|||
action(serializable.ToString()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
internal static Dictionary<HwIntrinsics, string> ToFeatureKeyValueCollection(this HwIntrinsics intrinsics) |
|||
{ |
|||
// Loop through and translate the given values into COMPlus equivaluents
|
|||
var features = new Dictionary<HwIntrinsics, string>(); |
|||
foreach (string intrinsic in intrinsics.ToString("G").Split(SplitChars, StringSplitOptions.RemoveEmptyEntries)) |
|||
{ |
|||
var key = (HwIntrinsics)Enum.Parse(typeof(HwIntrinsics), intrinsic); |
|||
switch (intrinsic) |
|||
{ |
|||
case nameof(HwIntrinsics.DisableSIMD): |
|||
features.Add(key, "FeatureSIMD"); |
|||
break; |
|||
|
|||
case nameof(HwIntrinsics.AllowAll): |
|||
|
|||
// Not a COMPlus value. We filter in calling method.
|
|||
features.Add(key, nameof(HwIntrinsics.AllowAll)); |
|||
break; |
|||
|
|||
default: |
|||
features.Add(key, intrinsic.Replace("Disable", "Enable")); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
return features; |
|||
} |
|||
} |
|||
|
|||
/// <summary>
|
|||
/// See <see href="https://github.com/dotnet/runtime/blob/50ac454d8d8a1915188b2a4bb3fff3b81bf6c0cf/src/coreclr/src/jit/jitconfigvalues.h#L224"/>
|
|||
/// <remarks>
|
|||
/// <see cref="DisableSIMD"/> ends up impacting all SIMD support(including System.Numerics)
|
|||
/// but not things like <see cref="DisableBMI1"/>, <see cref="DisableBMI2"/>, and <see cref="DisableLZCNT"/>.
|
|||
/// </remarks>
|
|||
/// </summary>
|
|||
[Flags] |
|||
#pragma warning disable RCS1135 // Declare enum member with zero value (when enum has FlagsAttribute).
|
|||
public enum HwIntrinsics |
|||
#pragma warning restore RCS1135 // Declare enum member with zero value (when enum has FlagsAttribute).
|
|||
{ |
|||
// Use flags so we can pass multiple values without using params.
|
|||
// Don't base on 0 or use inverse for All as that doesn't translate to string values.
|
|||
DisableSIMD = 1 << 0, |
|||
DisableHWIntrinsic = 1 << 1, |
|||
DisableSSE = 1 << 2, |
|||
DisableSSE2 = 1 << 3, |
|||
DisableAES = 1 << 4, |
|||
DisablePCLMULQDQ = 1 << 5, |
|||
DisableSSE3 = 1 << 6, |
|||
DisableSSSE3 = 1 << 7, |
|||
DisableSSE41 = 1 << 8, |
|||
DisableSSE42 = 1 << 9, |
|||
DisablePOPCNT = 1 << 10, |
|||
DisableAVX = 1 << 11, |
|||
DisableFMA = 1 << 12, |
|||
DisableAVX2 = 1 << 13, |
|||
DisableBMI1 = 1 << 14, |
|||
DisableBMI2 = 1 << 15, |
|||
DisableLZCNT = 1 << 16, |
|||
AllowAll = 1 << 17 |
|||
} |
|||
} |
|||
@ -1,54 +0,0 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
namespace SixLabors.ImageSharp.Tests |
|||
{ |
|||
public static partial class TestEnvironment |
|||
{ |
|||
internal static class Features |
|||
{ |
|||
public const string On = "1"; |
|||
public const string Off = "0"; |
|||
|
|||
// See https://github.com/SixLabors/ImageSharp/pull/1229#discussion_r440477861
|
|||
// * EnableHWIntrinsic
|
|||
// * EnableSSE
|
|||
// * EnableSSE2
|
|||
// * EnableAES
|
|||
// * EnablePCLMULQDQ
|
|||
// * EnableSSE3
|
|||
// * EnableSSSE3
|
|||
// * EnableSSE41
|
|||
// * EnableSSE42
|
|||
// * EnablePOPCNT
|
|||
// * EnableAVX
|
|||
// * EnableFMA
|
|||
// * EnableAVX2
|
|||
// * EnableBMI1
|
|||
// * EnableBMI2
|
|||
// * EnableLZCNT
|
|||
//
|
|||
// `FeatureSIMD` ends up impacting all SIMD support(including `System.Numerics`) but not things
|
|||
// like `LZCNT`, `BMI1`, or `BMI2`
|
|||
// `EnableSSE3_4` is a legacy switch that exists for compat and is basically the same as `EnableSSE3`
|
|||
public const string EnableAES = "COMPlus_EnableAES"; |
|||
public const string EnableAVX = "COMPlus_EnableAVX"; |
|||
public const string EnableAVX2 = "COMPlus_EnableAVX2"; |
|||
public const string EnableBMI1 = "COMPlus_EnableBMI1"; |
|||
public const string EnableBMI2 = "COMPlus_EnableBMI2"; |
|||
public const string EnableFMA = "COMPlus_EnableFMA"; |
|||
public const string EnableHWIntrinsic = "COMPlus_EnableHWIntrinsic"; |
|||
public const string EnableLZCNT = "COMPlus_EnableLZCNT"; |
|||
public const string EnablePCLMULQDQ = "COMPlus_EnablePCLMULQDQ"; |
|||
public const string EnablePOPCNT = "COMPlus_EnablePOPCNT"; |
|||
public const string EnableSSE = "COMPlus_EnableSSE"; |
|||
public const string EnableSSE2 = "COMPlus_EnableSSE2"; |
|||
public const string EnableSSE3 = "COMPlus_EnableSSE3"; |
|||
public const string EnableSSE3_4 = "COMPlus_EnableSSE3_4"; |
|||
public const string EnableSSE41 = "COMPlus_EnableSSE41"; |
|||
public const string EnableSSE42 = "COMPlus_EnableSSE42"; |
|||
public const string EnableSSSE3 = "COMPlus_EnableSSSE3"; |
|||
public const string FeatureSIMD = "COMPlus_FeatureSIMD"; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,296 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Collections.Generic; |
|||
using System.Linq; |
|||
using System.Numerics; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics.X86; |
|||
#endif
|
|||
using Xunit; |
|||
using Xunit.Abstractions; |
|||
|
|||
namespace SixLabors.ImageSharp.Tests.TestUtilities.Tests |
|||
{ |
|||
public class FeatureTestRunnerTests |
|||
{ |
|||
public static TheoryData<HwIntrinsics, string[]> Intrinsics => |
|||
new TheoryData<HwIntrinsics, string[]> |
|||
{ |
|||
{ HwIntrinsics.DisableAES | HwIntrinsics.AllowAll, new string[] { "EnableAES", "AllowAll" } }, |
|||
{ HwIntrinsics.DisableSIMD | HwIntrinsics.DisableHWIntrinsic, new string[] { "FeatureSIMD", "EnableHWIntrinsic" } }, |
|||
{ HwIntrinsics.DisableSSE42 | HwIntrinsics.DisableAVX, new string[] { "EnableSSE42", "EnableAVX" } } |
|||
}; |
|||
|
|||
[Theory] |
|||
[MemberData(nameof(Intrinsics))] |
|||
public void ToFeatureCollectionReturnsExpectedResult(HwIntrinsics expectedItrinsics, string[] expectedValues) |
|||
{ |
|||
Dictionary<HwIntrinsics, string> features = expectedItrinsics.ToFeatureKeyValueCollection(); |
|||
HwIntrinsics[] keys = features.Keys.ToArray(); |
|||
|
|||
HwIntrinsics actualIntrinsics = keys[0]; |
|||
for (int i = 1; i < keys.Length; i++) |
|||
{ |
|||
actualIntrinsics |= keys[i]; |
|||
} |
|||
|
|||
Assert.Equal(expectedItrinsics, actualIntrinsics); |
|||
|
|||
IEnumerable<string> actualValues = features.Select(x => x.Value); |
|||
Assert.Equal(expectedValues, actualValues); |
|||
} |
|||
|
|||
[Fact] |
|||
public void AllowsAllHwIntrinsicFeatures() |
|||
{ |
|||
if (!Vector.IsHardwareAccelerated) |
|||
{ |
|||
return; |
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
() => Assert.True(Vector.IsHardwareAccelerated), |
|||
HwIntrinsics.AllowAll); |
|||
} |
|||
|
|||
[Fact] |
|||
public void CanLimitHwIntrinsicSIMDFeatures() |
|||
{ |
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
() => Assert.False(Vector.IsHardwareAccelerated), |
|||
HwIntrinsics.DisableSIMD); |
|||
} |
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
[Fact] |
|||
public void CanLimitHwIntrinsicBaseFeatures() |
|||
{ |
|||
static void AssertDisabled() |
|||
{ |
|||
Assert.False(Sse.IsSupported); |
|||
Assert.False(Sse2.IsSupported); |
|||
Assert.False(Aes.IsSupported); |
|||
Assert.False(Pclmulqdq.IsSupported); |
|||
Assert.False(Sse3.IsSupported); |
|||
Assert.False(Ssse3.IsSupported); |
|||
Assert.False(Sse41.IsSupported); |
|||
Assert.False(Sse42.IsSupported); |
|||
Assert.False(Popcnt.IsSupported); |
|||
Assert.False(Avx.IsSupported); |
|||
Assert.False(Fma.IsSupported); |
|||
Assert.False(Avx2.IsSupported); |
|||
Assert.False(Bmi1.IsSupported); |
|||
Assert.False(Bmi2.IsSupported); |
|||
Assert.False(Lzcnt.IsSupported); |
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
AssertDisabled, |
|||
HwIntrinsics.DisableHWIntrinsic); |
|||
} |
|||
#endif
|
|||
|
|||
[Fact] |
|||
public void CanLimitHwIntrinsicFeaturesWithIntrinsicsParam() |
|||
{ |
|||
static void AssertHwIntrinsicsFeatureDisabled(string intrinsic) |
|||
{ |
|||
Assert.NotNull(intrinsic); |
|||
|
|||
switch ((HwIntrinsics)Enum.Parse(typeof(HwIntrinsics), intrinsic)) |
|||
{ |
|||
case HwIntrinsics.DisableSIMD: |
|||
Assert.False(Vector.IsHardwareAccelerated); |
|||
break; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
case HwIntrinsics.DisableHWIntrinsic: |
|||
Assert.False(Sse.IsSupported); |
|||
Assert.False(Sse2.IsSupported); |
|||
Assert.False(Aes.IsSupported); |
|||
Assert.False(Pclmulqdq.IsSupported); |
|||
Assert.False(Sse3.IsSupported); |
|||
Assert.False(Ssse3.IsSupported); |
|||
Assert.False(Sse41.IsSupported); |
|||
Assert.False(Sse42.IsSupported); |
|||
Assert.False(Popcnt.IsSupported); |
|||
Assert.False(Avx.IsSupported); |
|||
Assert.False(Fma.IsSupported); |
|||
Assert.False(Avx2.IsSupported); |
|||
Assert.False(Bmi1.IsSupported); |
|||
Assert.False(Bmi2.IsSupported); |
|||
Assert.False(Lzcnt.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE: |
|||
Assert.False(Sse.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE2: |
|||
Assert.False(Sse2.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableAES: |
|||
Assert.False(Aes.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisablePCLMULQDQ: |
|||
Assert.False(Pclmulqdq.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE3: |
|||
Assert.False(Sse3.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSSE3: |
|||
Assert.False(Ssse3.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE41: |
|||
Assert.False(Sse41.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE42: |
|||
Assert.False(Sse42.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisablePOPCNT: |
|||
Assert.False(Popcnt.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableAVX: |
|||
Assert.False(Avx.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableFMA: |
|||
Assert.False(Fma.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableAVX2: |
|||
Assert.False(Avx2.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableBMI1: |
|||
Assert.False(Bmi1.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableBMI2: |
|||
Assert.False(Bmi2.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableLZCNT: |
|||
Assert.False(Lzcnt.IsSupported); |
|||
break; |
|||
#endif
|
|||
} |
|||
} |
|||
|
|||
foreach (HwIntrinsics intrinsic in (HwIntrinsics[])Enum.GetValues(typeof(HwIntrinsics))) |
|||
{ |
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature(AssertHwIntrinsicsFeatureDisabled, intrinsic); |
|||
} |
|||
} |
|||
|
|||
[Fact] |
|||
public void CanLimitHwIntrinsicFeaturesWithSerializableParam() |
|||
{ |
|||
static void AssertHwIntrinsicsFeatureDisabled(string serializable) |
|||
{ |
|||
Assert.NotNull(serializable); |
|||
Assert.NotNull(FeatureTestRunner.DeserializeForXunit<FakeSerializable>(serializable)); |
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
Assert.False(Sse.IsSupported); |
|||
#endif
|
|||
} |
|||
|
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature( |
|||
AssertHwIntrinsicsFeatureDisabled, |
|||
HwIntrinsics.DisableSSE, |
|||
new FakeSerializable()); |
|||
} |
|||
|
|||
[Fact] |
|||
public void CanLimitHwIntrinsicFeaturesWithSerializableAndIntrinsicsParams() |
|||
{ |
|||
static void AssertHwIntrinsicsFeatureDisabled(string serializable, string intrinsic) |
|||
{ |
|||
Assert.NotNull(serializable); |
|||
Assert.NotNull(FeatureTestRunner.DeserializeForXunit<FakeSerializable>(serializable)); |
|||
|
|||
switch ((HwIntrinsics)Enum.Parse(typeof(HwIntrinsics), intrinsic)) |
|||
{ |
|||
case HwIntrinsics.DisableSIMD: |
|||
Assert.False(Vector.IsHardwareAccelerated, nameof(Vector.IsHardwareAccelerated)); |
|||
break; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
case HwIntrinsics.DisableHWIntrinsic: |
|||
Assert.False(Sse.IsSupported); |
|||
Assert.False(Sse2.IsSupported); |
|||
Assert.False(Aes.IsSupported); |
|||
Assert.False(Pclmulqdq.IsSupported); |
|||
Assert.False(Sse3.IsSupported); |
|||
Assert.False(Ssse3.IsSupported); |
|||
Assert.False(Sse41.IsSupported); |
|||
Assert.False(Sse42.IsSupported); |
|||
Assert.False(Popcnt.IsSupported); |
|||
Assert.False(Avx.IsSupported); |
|||
Assert.False(Fma.IsSupported); |
|||
Assert.False(Avx2.IsSupported); |
|||
Assert.False(Bmi1.IsSupported); |
|||
Assert.False(Bmi2.IsSupported); |
|||
Assert.False(Lzcnt.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE: |
|||
Assert.False(Sse.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE2: |
|||
Assert.False(Sse2.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableAES: |
|||
Assert.False(Aes.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisablePCLMULQDQ: |
|||
Assert.False(Pclmulqdq.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE3: |
|||
Assert.False(Sse3.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSSE3: |
|||
Assert.False(Ssse3.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE41: |
|||
Assert.False(Sse41.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableSSE42: |
|||
Assert.False(Sse42.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisablePOPCNT: |
|||
Assert.False(Popcnt.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableAVX: |
|||
Assert.False(Avx.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableFMA: |
|||
Assert.False(Fma.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableAVX2: |
|||
Assert.False(Avx2.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableBMI1: |
|||
Assert.False(Bmi1.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableBMI2: |
|||
Assert.False(Bmi2.IsSupported); |
|||
break; |
|||
case HwIntrinsics.DisableLZCNT: |
|||
Assert.False(Lzcnt.IsSupported); |
|||
break; |
|||
#endif
|
|||
} |
|||
} |
|||
|
|||
foreach (HwIntrinsics intrinsic in (HwIntrinsics[])Enum.GetValues(typeof(HwIntrinsics))) |
|||
{ |
|||
FeatureTestRunner.RunWithHwIntrinsicsFeature(AssertHwIntrinsicsFeatureDisabled, intrinsic, new FakeSerializable()); |
|||
} |
|||
} |
|||
|
|||
public class FakeSerializable : IXunitSerializable |
|||
{ |
|||
public void Deserialize(IXunitSerializationInfo info) |
|||
{ |
|||
} |
|||
|
|||
public void Serialize(IXunitSerializationInfo info) |
|||
{ |
|||
} |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,3 @@ |
|||
version https://git-lfs.github.com/spec/v1 |
|||
oid sha256:6a9c5cdacc9bedf481c883828de5bfb7902e2bec038fff08830171cf7075e4f9 |
|||
size 870 |
|||
Loading…
Reference in new issue