mirror of https://github.com/SixLabors/ImageSharp
114 changed files with 6571 additions and 1479 deletions
@ -0,0 +1,7 @@ |
|||
<?xml version="1.0" encoding="utf-8" ?> |
|||
<RunSettings> |
|||
<RunConfiguration> |
|||
<!--Used in conjunction with ActiveIssueAttribute to skip tests with known issues--> |
|||
<TestCaseFilter>category!=failing</TestCaseFilter> |
|||
</RunConfiguration> |
|||
</RunSettings> |
|||
@ -0,0 +1,193 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Buffers.Binary; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
// The JIT can detect and optimize rotation idioms ROTL (Rotate Left)
|
|||
// and ROTR (Rotate Right) emitting efficient CPU instructions:
|
|||
// https://github.com/dotnet/coreclr/pull/1830
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Contract for types that reorder (shuffle) the components of packed pixel data.
/// Implementations supply the scalar code path that is used for shuffling on
/// platforms that do not support Hardware Intrinsics.
/// </summary>
internal interface IComponentShuffle
{
    /// <summary>
    /// Gets the shuffle control byte.
    /// </summary>
    byte Control { get; }

    /// <summary>
    /// Shuffles the 8-bit integers read from <paramref name="source"/>
    /// according to the control and stores the results in <paramref name="dest"/>.
    /// </summary>
    /// <param name="source">The source span of bytes.</param>
    /// <param name="dest">The destination span of bytes.</param>
    void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest);
}
|||
|
|||
/// <summary>
/// Marker contract for <see cref="IComponentShuffle"/> implementations that
/// shuffle data in groups of four bytes.
/// </summary>
internal interface IShuffle4 : IComponentShuffle
{
}
|||
|
|||
/// <summary>
/// General purpose 4-component shuffle. Each of the four positions selects which
/// byte (0-3) of every 4-byte source group is written to the matching destination byte.
/// </summary>
internal readonly struct DefaultShuffle4 : IShuffle4
{
    private readonly byte p3;
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultShuffle4"/> struct.
    /// </summary>
    /// <param name="p3">Source index (0-3) written to destination byte 3 of each group.</param>
    /// <param name="p2">Source index (0-3) written to destination byte 2 of each group.</param>
    /// <param name="p1">Source index (0-3) written to destination byte 1 of each group.</param>
    /// <param name="p0">Source index (0-3) written to destination byte 0 of each group.</param>
    public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p3, 0, 3, nameof(p3));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 3, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 3, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 3, nameof(p0));

        this.p3 = p3;
        this.p2 = p2;
        this.p1 = p1;
        this.p0 = p0;
        this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
    }

    /// <inheritdoc/>
    public byte Control { get; }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref byte sourceStart = ref MemoryMarshal.GetReference(source);
        ref byte destStart = ref MemoryMarshal.GetReference(dest);

        // Widen the indices once so the loop body does plain int arithmetic.
        int index0 = this.p0;
        int index1 = this.p1;
        int index2 = this.p2;
        int index3 = this.p3;

        int length = source.Length;

        // No bounds checks here: the caller is expected to pass spans whose
        // length is a multiple of four and a destination at least as long as the source.
        for (int offset = 0; offset < length; offset += 4)
        {
            ref byte s = ref Unsafe.Add(ref sourceStart, offset);
            ref byte d = ref Unsafe.Add(ref destStart, offset);

            d = Unsafe.Add(ref s, index0);
            Unsafe.Add(ref d, 1) = Unsafe.Add(ref s, index1);
            Unsafe.Add(ref d, 2) = Unsafe.Add(ref s, index2);
            Unsafe.Add(ref d, 3) = Unsafe.Add(ref s, index3);
        }
    }
}
|||
|
|||
/// <summary>
/// Fixed 4-component shuffle implemented as an 8-bit left rotation of each 32-bit group.
/// </summary>
internal readonly struct WXYZShuffle4 : IShuffle4
{
    /// <inheritdoc/>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3);
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref uint sourceStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint destStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));
        int groupCount = source.Length / 4;

        for (int i = 0; i < groupCount; i++)
        {
            uint value = Unsafe.Add(ref sourceStart, i);

            // Most significant byte first:
            // value          = [W Z Y X]
            // ROTL(8, value) = [Z Y X W]
            Unsafe.Add(ref destStart, i) = (value >> 24) | (value << 8);
        }
    }
}
|||
|
|||
/// <summary>
/// Fixed 4-component shuffle that reverses the byte order of each 32-bit group.
/// </summary>
internal readonly struct WZYXShuffle4 : IShuffle4
{
    /// <inheritdoc/>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3);
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref uint sourceStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint destStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));
        int groupCount = source.Length / 4;

        for (int i = 0; i < groupCount; i++)
        {
            uint value = Unsafe.Add(ref sourceStart, i);

            // Most significant byte first:
            // value          = [W Z Y X]
            // REVERSE(value) = [X Y Z W]
            uint reversed = BinaryPrimitives.ReverseEndianness(value);
            Unsafe.Add(ref destStart, i) = reversed;
        }
    }
}
|||
|
|||
/// <summary>
/// Fixed 4-component shuffle implemented as an 8-bit right rotation of each 32-bit group.
/// </summary>
internal readonly struct YZWXShuffle4 : IShuffle4
{
    /// <inheritdoc/>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1);
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref uint sourceStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint destStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));
        int groupCount = source.Length / 4;

        for (int i = 0; i < groupCount; i++)
        {
            uint value = Unsafe.Add(ref sourceStart, i);

            // Most significant byte first:
            // value          = [W Z Y X]
            // ROTR(8, value) = [X W Z Y]  (least significant byte first: Y Z W X)
            Unsafe.Add(ref destStart, i) = (value << 24) | (value >> 8);
        }
    }
}
|||
|
|||
/// <summary>
/// Fixed 4-component shuffle that swaps the first and third byte of each 32-bit group
/// and leaves the second and fourth byte in place.
/// </summary>
internal readonly struct ZYXWShuffle4 : IShuffle4
{
    /// <inheritdoc/>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2);
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref uint sourceStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref uint destStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(dest));
        int groupCount = source.Length / 4;

        for (int i = 0; i < groupCount; i++)
        {
            uint value = Unsafe.Add(ref sourceStart, i);

            // Most significant byte first:
            // value                     = [W Z Y X]
            // kept                      = [W 0 Y 0]
            // moved                     = [0 Z 0 X]
            // swapped = ROTL(16, moved) = [0 X 0 Z]
            // kept | swapped            = [W X Y Z]
            uint kept = value & 0xFF00FF00;
            uint moved = value & 0x00FF00FF;
            uint swapped = (moved >> 16) | (moved << 16);

            // The two halves occupy disjoint bytes, so OR combines them without carries.
            Unsafe.Add(ref destStart, i) = kept | swapped;
        }
    }
}
|||
} |
|||
@ -0,0 +1,103 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Marker contract for <see cref="IComponentShuffle"/> implementations that read
/// groups of three bytes, pad each group to four bytes, and shuffle the padded group.
/// </summary>
internal interface IPad3Shuffle4 : IComponentShuffle
{
}
|||
|
|||
/// <summary>
/// General purpose pad-then-shuffle: every 3-byte source group is extended with a
/// fourth byte of <see cref="byte.MaxValue"/> and the resulting four bytes are
/// written to the destination in the order selected by the four position indices.
/// </summary>
internal readonly struct DefaultPad3Shuffle4 : IPad3Shuffle4
{
    private readonly byte p3;
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultPad3Shuffle4"/> struct.
    /// </summary>
    /// <param name="p3">Padded-group index (0-3) written to destination byte 3 of each group.</param>
    /// <param name="p2">Padded-group index (0-3) written to destination byte 2 of each group.</param>
    /// <param name="p1">Padded-group index (0-3) written to destination byte 1 of each group.</param>
    /// <param name="p0">Padded-group index (0-3) written to destination byte 0 of each group.</param>
    public DefaultPad3Shuffle4(byte p3, byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p3, 0, 3, nameof(p3));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 3, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 3, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 3, nameof(p0));

        this.p3 = p3;
        this.p2 = p2;
        this.p1 = p1;
        this.p0 = p0;
        this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
    }

    /// <inheritdoc/>
    public byte Control { get; }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref byte sBase = ref MemoryMarshal.GetReference(source);
        ref byte dBase = ref MemoryMarshal.GetReference(dest);

        int p3 = this.p3;
        int p2 = this.p2;
        int p1 = this.p1;
        int p0 = this.p0;

        // Scratch group holding [s0, s1, s2, 0xFF] for the triple being processed.
        Span<byte> temp = stackalloc byte[4];
        ref byte t = ref MemoryMarshal.GetReference(temp);

        // The padding byte never changes, so it is written once up front.
        Unsafe.Add(ref t, 3) = byte.MaxValue;

        for (int i = 0, j = 0; i < source.Length; i += 3, j += 4)
        {
            ref byte s = ref Unsafe.Add(ref sBase, i);

            // Copy exactly three bytes. A single 32-bit load would read one byte
            // past the end of the source on the final triple, and OR-ing the pad
            // into the top byte of that load is only correct on little-endian hosts.
            t = s;
            Unsafe.Add(ref t, 1) = Unsafe.Add(ref s, 1);
            Unsafe.Add(ref t, 2) = Unsafe.Add(ref s, 2);

            Unsafe.Add(ref dBase, j) = Unsafe.Add(ref t, p0);
            Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1);
            Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2);
            Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3);
        }
    }
}
|||
|
|||
/// <summary>
/// Pad-only variant: copies every 3-byte source group unchanged and appends a
/// fourth byte of <see cref="byte.MaxValue"/>.
/// </summary>
internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4
{
    /// <inheritdoc/>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0);
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref byte src = ref MemoryMarshal.GetReference(source);
        ref byte dst = ref MemoryMarshal.GetReference(dest);

        ref byte srcEnd = ref Unsafe.Add(ref src, source.Length);

        // Four bytes are read per step in the first loop, so it must stop while
        // at least four readable bytes remain in front of the cursor.
        ref byte wideReadLimit = ref Unsafe.Subtract(ref srcEnd, 4);

        while (Unsafe.IsAddressLessThan(ref src, ref wideReadLimit))
        {
            // Load four bytes and overwrite the most significant one with 0xFF.
            // NOTE(review): this places the pad in the 4th byte only on little-endian hosts.
            uint packed = Unsafe.As<byte, uint>(ref src);
            Unsafe.As<byte, uint>(ref dst) = packed | 0xFF000000;

            src = ref Unsafe.Add(ref src, 3);
            dst = ref Unsafe.Add(ref dst, 4);
        }

        // Remaining triples are copied byte by byte so nothing past the source end is read.
        while (Unsafe.IsAddressLessThan(ref src, ref srcEnd))
        {
            dst = src;
            Unsafe.Add(ref dst, 1) = Unsafe.Add(ref src, 1);
            Unsafe.Add(ref dst, 2) = Unsafe.Add(ref src, 2);
            Unsafe.Add(ref dst, 3) = byte.MaxValue;

            src = ref Unsafe.Add(ref src, 3);
            dst = ref Unsafe.Add(ref dst, 4);
        }
    }
}
|||
} |
|||
@ -0,0 +1,53 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Marker contract for <see cref="IComponentShuffle"/> implementations that
/// shuffle data in groups of three bytes.
/// </summary>
internal interface IShuffle3 : IComponentShuffle
{
}
|||
|
|||
/// <summary>
/// General purpose 3-component shuffle. Each of the three positions selects which
/// byte (0-2) of every 3-byte source group is written to the matching destination byte.
/// </summary>
internal readonly struct DefaultShuffle3 : IShuffle3
{
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultShuffle3"/> struct.
    /// </summary>
    /// <param name="p2">Source index (0-2) written to destination byte 2 of each group.</param>
    /// <param name="p1">Source index (0-2) written to destination byte 1 of each group.</param>
    /// <param name="p0">Source index (0-2) written to destination byte 0 of each group.</param>
    public DefaultShuffle3(byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 2, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 2, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 2, nameof(p0));

        this.p2 = p2;
        this.p1 = p1;
        this.p0 = p0;

        // The control always carries 3 in the top position, i.e. a fourth component stays in place.
        this.Control = SimdUtils.Shuffle.MmShuffle(3, p2, p1, p0);
    }

    /// <inheritdoc/>
    public byte Control { get; }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref byte sourceStart = ref MemoryMarshal.GetReference(source);
        ref byte destStart = ref MemoryMarshal.GetReference(dest);

        int index0 = this.p0;
        int index1 = this.p1;
        int index2 = this.p2;

        int length = source.Length;

        // No bounds checks here: the caller is expected to pass a length that is a multiple of three.
        for (int offset = 0; offset < length; offset += 3)
        {
            ref byte s = ref Unsafe.Add(ref sourceStart, offset);
            ref byte d = ref Unsafe.Add(ref destStart, offset);

            d = Unsafe.Add(ref s, index0);
            Unsafe.Add(ref d, 1) = Unsafe.Add(ref s, index1);
            Unsafe.Add(ref d, 2) = Unsafe.Add(ref s, index2);
        }
    }
}
|||
} |
|||
@ -0,0 +1,101 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
/// Marker contract for <see cref="IComponentShuffle"/> implementations that read
/// groups of four bytes and write groups of three bytes.
/// </summary>
internal interface IShuffle4Slice3 : IComponentShuffle
{
}
|||
|
|||
/// <summary>
/// General purpose shuffle-then-slice: for every 4-byte source group, three bytes
/// selected by the position indices are written as one 3-byte destination group.
/// </summary>
internal readonly struct DefaultShuffle4Slice3 : IShuffle4Slice3
{
    private readonly byte p2;
    private readonly byte p1;
    private readonly byte p0;

    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultShuffle4Slice3"/> struct.
    /// </summary>
    /// <param name="p3">Fourth position (0-3); it only contributes to <see cref="Control"/>.</param>
    /// <param name="p2">Source index (0-3) written to destination byte 2 of each group.</param>
    /// <param name="p1">Source index (0-3) written to destination byte 1 of each group.</param>
    /// <param name="p0">Source index (0-3) written to destination byte 0 of each group.</param>
    public DefaultShuffle4Slice3(byte p3, byte p2, byte p1, byte p0)
    {
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p3, 0, 3, nameof(p3));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p2, 0, 3, nameof(p2));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p1, 0, 3, nameof(p1));
        DebugGuard.MustBeBetweenOrEqualTo<byte>(p0, 0, 3, nameof(p0));

        // p3 is not stored: the fallback never writes a fourth destination byte.
        this.p2 = p2;
        this.p1 = p1;
        this.p0 = p0;
        this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
    }

    /// <inheritdoc/>
    public byte Control { get; }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        ref byte sourceStart = ref MemoryMarshal.GetReference(source);
        ref byte destStart = ref MemoryMarshal.GetReference(dest);

        int index0 = this.p0;
        int index1 = this.p1;
        int index2 = this.p2;

        int destLength = dest.Length;

        // The destination length drives the loop: three bytes are written
        // for every four bytes consumed from the source.
        for (int destOffset = 0, sourceOffset = 0; destOffset < destLength; destOffset += 3, sourceOffset += 4)
        {
            ref byte s = ref Unsafe.Add(ref sourceStart, sourceOffset);
            ref byte d = ref Unsafe.Add(ref destStart, destOffset);

            d = Unsafe.Add(ref s, index0);
            Unsafe.Add(ref d, 1) = Unsafe.Add(ref s, index1);
            Unsafe.Add(ref d, 2) = Unsafe.Add(ref s, index2);
        }
    }
}
|||
|
|||
/// <summary>
/// Slice-only variant: copies the first three bytes of every 4-byte source group
/// to the destination and drops the fourth.
/// </summary>
internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3
{
    /// <inheritdoc/>
    public byte Control
    {
        [MethodImpl(InliningOptions.ShortMethod)]
        get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0);
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
    {
        // The source is walked in 4-byte steps (uint) and the destination in 3-byte steps (Byte3).
        ref uint sourceStart = ref Unsafe.As<byte, uint>(ref MemoryMarshal.GetReference(source));
        ref Byte3 destStart = ref Unsafe.As<byte, Byte3>(ref MemoryMarshal.GetReference(dest));

        int groupCount = source.Length / 4;
        int unrolledCount = groupCount - ImageMaths.Modulo4(groupCount);

        int i = 0;

        // Main loop: four groups per iteration.
        for (; i < unrolledCount; i += 4)
        {
            ref uint s = ref Unsafe.Add(ref sourceStart, i);
            ref Byte3 d = ref Unsafe.Add(ref destStart, i);

            // Reinterpreting the uint as Byte3 copies only its first three bytes.
            d = Unsafe.As<uint, Byte3>(ref s);
            Unsafe.Add(ref d, 1) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref s, 1));
            Unsafe.Add(ref d, 2) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref s, 2));
            Unsafe.Add(ref d, 3) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref s, 3));
        }

        // Leftover groups, one at a time, continuing where the main loop stopped.
        for (; i < groupCount; i++)
        {
            Unsafe.Add(ref destStart, i) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sourceStart, i));
        }
    }
}
|||
|
|||
/// <summary>
/// Opaque value type that is exactly three bytes in size. It has no fields and is
/// used purely as a reinterpretation target so three bytes can be copied in one assignment.
/// </summary>
[StructLayout(LayoutKind.Explicit, Size = 3)]
internal readonly struct Byte3
{
}
|||
} |
|||
@ -1,103 +0,0 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
public static class Avx2Intrinsics |
|||
{ |
|||
/// <summary>
/// Gets 32 bytes forming eight 32-bit element indices (0, 4, 1, 5, 2, 6, 3, 7 when read
/// least significant byte first). Used as the control of the 8x32 permute in
/// <see cref="NormalizedFloatToByteSaturate"/>.
/// </summary>
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[]
{
    0, 0, 0, 0,
    4, 0, 0, 0,
    1, 0, 0, 0,
    5, 0, 0, 0,
    2, 0, 0, 0,
    6, 0, 0, 0,
    3, 0, 0, 0,
    7, 0, 0, 0
};
|||
|
|||
/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">
/// The source span of floats. On return it is advanced past the converted elements.
/// </param>
/// <param name="dest">
/// The destination span of bytes. On return it is advanced past the written elements.
/// </param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (Avx2.IsSupported)
    {
        // The bulk routine validates against and strides by Vector256<byte>.Count,
        // so the remainder must be computed with that same width rather than the
        // width of System.Numerics.Vector<byte>, which is a separate quantity.
        int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
        int adjustedCount = source.Length - remainder;

        if (adjustedCount > 0)
        {
            NormalizedFloatToByteSaturate(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount));

            // Leave only the unprocessed tail for the caller.
            source = source.Slice(adjustedCount);
            dest = dest.Slice(adjustedCount);
        }
    }
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
/// </remarks>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of bytes.</param>
internal static void NormalizedFloatToByteSaturate(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    VerifySpanInput(source, dest, Vector256<byte>.Count);

    int blockCount = dest.Length / Vector256<byte>.Count;

    ref Vector256<float> sourceStart =
        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));

    ref Vector256<byte> destStart =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    Vector256<float> scale = Vector256.Create(255f);
    Vector256<int> permuteControl =
        Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32));

    for (int block = 0; block < blockCount; block++)
    {
        // Each output vector of bytes consumes four consecutive vectors of floats.
        ref Vector256<float> s = ref Unsafe.Add(ref sourceStart, block * 4);

        Vector256<int> w0 = ConvertToInt32(s, scale);
        Vector256<int> w1 = ConvertToInt32(Unsafe.Add(ref s, 1), scale);
        Vector256<int> w2 = ConvertToInt32(Unsafe.Add(ref s, 2), scale);
        Vector256<int> w3 = ConvertToInt32(Unsafe.Add(ref s, 3), scale);

        // Narrow int -> short -> byte with saturation.
        Vector256<short> lo = Avx2.PackSignedSaturate(w0, w1);
        Vector256<short> hi = Avx2.PackSignedSaturate(w2, w3);
        Vector256<byte> packed = Avx2.PackUnsignedSaturate(lo, hi);

        // The packs operate per 128-bit lane; the permute puts the 32-bit groups back in source order.
        packed = Avx2.PermuteVar8x32(packed.AsInt32(), permuteControl).AsByte();

        Unsafe.Add(ref destStart, block) = packed;
    }
}
|||
|
|||
/// <summary>
/// Multiplies <paramref name="vf"/> by <paramref name="scale"/> element-wise and
/// converts the products to 32-bit integers.
/// </summary>
/// <param name="vf">The vector of floats to convert.</param>
/// <param name="scale">The per-element multiplier.</param>
/// <returns>The scaled values as 32-bit integers.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
{
    Vector256<float> scaled = Avx.Multiply(vf, scale);
    return Avx.ConvertToVector256Int32(scaled);
}
|||
} |
|||
} |
|||
} |
|||
#endif
|
|||
@ -0,0 +1,795 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
public static class HwIntrinsics |
|||
{ |
|||
/// <summary>
/// Gets 32 bytes forming eight 32-bit element indices
/// (0, 4, 1, 5, 2, 6, 3, 7 when read least significant byte first).
/// </summary>
public static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[]
{
    0, 0, 0, 0,
    4, 0, 0, 0,
    1, 0, 0, 0,
    5, 0, 0, 0,
    2, 0, 0, 0,
    6, 0, 0, 0,
    3, 0, 0, 0,
    7, 0, 0, 0
};

/// <summary>
/// Gets 32 bytes forming eight 32-bit element indices
/// (0, 2, 4, 6, 1, 3, 5, 7 when read least significant byte first).
/// </summary>
public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[]
{
    0, 0, 0, 0,
    2, 0, 0, 0,
    4, 0, 0, 0,
    6, 0, 0, 0,
    1, 0, 0, 0,
    3, 0, 0, 0,
    5, 0, 0, 0,
    7, 0, 0, 0
};

/// <summary>
/// Gets a 16-byte shuffle mask that spreads the first twelve source bytes into four
/// 4-byte groups; the fourth entry of every group is 0x80.
/// </summary>
private static ReadOnlySpan<byte> ShuffleMaskPad4Nx16 => new byte[]
{
    0, 1, 2, 0x80,
    3, 4, 5, 0x80,
    6, 7, 8, 0x80,
    9, 10, 11, 0x80
};

/// <summary>
/// Gets a 16-byte shuffle mask that gathers the first three bytes of each of four
/// 4-byte groups into twelve contiguous bytes; the last four entries are 0x80.
/// </summary>
private static ReadOnlySpan<byte> ShuffleMaskSlice4Nx16 => new byte[]
{
    0, 1, 2,
    4, 5, 6,
    8, 9, 10,
    12, 13, 14,
    0x80, 0x80, 0x80, 0x80
};
|||
|
|||
/// <summary>
/// Shuffles as many single-precision (32-bit) floating-point elements of
/// <paramref name="source"/> as fit whole vectors, using the control, and stores the
/// results in <paramref name="dest"/>. Both spans are then advanced past the
/// processed elements so only the remainder is left.
/// </summary>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Reduce(
    ref ReadOnlySpan<float> source,
    ref Span<float> dest,
    byte control)
{
    if (!Avx.IsSupported && !Sse.IsSupported)
    {
        // No usable instruction set: leave both spans untouched.
        return;
    }

    int length = source.Length;
    int remainder;

    if (Avx.IsSupported)
    {
        remainder = ImageMaths.ModuloP2(length, Vector256<float>.Count);
    }
    else
    {
        remainder = ImageMaths.ModuloP2(length, Vector128<float>.Count);
    }

    int adjustedCount = length - remainder;

    if (adjustedCount <= 0)
    {
        return;
    }

    Shuffle4(
        source.Slice(0, adjustedCount),
        dest.Slice(0, adjustedCount),
        control);

    source = source.Slice(adjustedCount);
    dest = dest.Slice(adjustedCount);
}
|||
|
|||
/// <summary>
/// Shuffles as many 8-bit integers of <paramref name="source"/> as fit whole vectors,
/// within 128-bit lanes and using the control, and stores the results in
/// <paramref name="dest"/>. Both spans are then advanced past the processed
/// elements so only the remainder is left.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (!Avx2.IsSupported && !Ssse3.IsSupported)
    {
        // No usable instruction set: leave both spans untouched.
        return;
    }

    int length = source.Length;
    int remainder;

    if (Avx2.IsSupported)
    {
        remainder = ImageMaths.ModuloP2(length, Vector256<byte>.Count);
    }
    else
    {
        remainder = ImageMaths.ModuloP2(length, Vector128<byte>.Count);
    }

    int adjustedCount = length - remainder;

    if (adjustedCount <= 0)
    {
        return;
    }

    Shuffle4(
        source.Slice(0, adjustedCount),
        dest.Slice(0, adjustedCount),
        control);

    source = source.Slice(adjustedCount);
    dest = dest.Slice(adjustedCount);
}
|||
|
|||
/// <summary>
/// Shuffles as many 8-bit integer triplets of <paramref name="source"/> as fit whole
/// blocks of three 128-bit vectors, using the control, and stores the results in
/// <paramref name="dest"/>. Both spans are then advanced past the processed
/// elements so only the remainder is left.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle3Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    // One block is three vectors, i.e. a whole number of 3-byte triplets.
    int blockSize = Vector128<byte>.Count * 3;
    int adjustedCount = source.Length - (source.Length % blockSize);

    if (adjustedCount <= 0)
    {
        return;
    }

    Shuffle3(
        source.Slice(0, adjustedCount),
        dest.Slice(0, adjustedCount),
        control);

    source = source.Slice(adjustedCount);
    dest = dest.Slice(adjustedCount);
}
|||
|
|||
/// <summary>
/// Pads then shuffles as many 8-bit integers of <paramref name="source"/> as fit whole
/// blocks of three 128-bit vectors, using the control, and stores the results in
/// <paramref name="dest"/>. The source is advanced by the bytes consumed and the
/// destination by four thirds of that amount.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Pad3Shuffle4Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    int blockSize = Vector128<byte>.Count * 3;
    int sourceCount = source.Length - (source.Length % blockSize);

    // Every three source bytes produce four destination bytes.
    int destCount = sourceCount * 4 / 3;

    if (sourceCount <= 0)
    {
        return;
    }

    Pad3Shuffle4(
        source.Slice(0, sourceCount),
        dest.Slice(0, destCount),
        control);

    source = source.Slice(sourceCount);
    dest = dest.Slice(destCount);
}
|||
|
|||
/// <summary>
/// Shuffles then slices as many 8-bit integers of <paramref name="source"/> as fit
/// whole blocks of four 128-bit vectors, using the control, and stores the results in
/// <paramref name="dest"/>. The source is advanced by the bytes consumed and the
/// destination by three quarters of that amount.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Slice3Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    int blockSize = Vector128<byte>.Count * 4;
    int sourceCount = source.Length - (source.Length % blockSize);

    // Every four source bytes produce three destination bytes.
    int destCount = sourceCount * 3 / 4;

    if (sourceCount <= 0)
    {
        return;
    }

    Shuffle4Slice3(
        source.Slice(0, sourceCount),
        dest.Slice(0, destCount),
        control);

    source = source.Slice(sourceCount);
    dest = dest.Slice(destCount);
}
|||
|
|||
/// <summary>
/// Shuffles the floats of <paramref name="source"/> into <paramref name="dest"/> one
/// whole vector at a time, using 256-bit vectors when AVX is supported and 128-bit
/// vectors otherwise. The loop is unrolled four vectors at a time.
/// </summary>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats; its length determines the vector count.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    if (Avx.IsSupported)
    {
        ref Vector256<float> src =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));

        ref Vector256<float> dst =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));

        int vectorCount = dest.Length / Vector256<float>.Count;
        int unrolledCount = vectorCount - ImageMaths.Modulo4(vectorCount);

        for (int i = 0; i < unrolledCount; i += 4)
        {
            ref Vector256<float> d = ref Unsafe.Add(ref dst, i);
            ref Vector256<float> s = ref Unsafe.Add(ref src, i);

            d = Avx.Permute(s, control);
            Unsafe.Add(ref d, 1) = Avx.Permute(Unsafe.Add(ref s, 1), control);
            Unsafe.Add(ref d, 2) = Avx.Permute(Unsafe.Add(ref s, 2), control);
            Unsafe.Add(ref d, 3) = Avx.Permute(Unsafe.Add(ref s, 3), control);
        }

        // Vectors left over after the unrolled loop.
        for (int i = unrolledCount; i < vectorCount; i++)
        {
            Unsafe.Add(ref dst, i) = Avx.Permute(Unsafe.Add(ref src, i), control);
        }
    }
    else
    {
        // Sse
        ref Vector128<float> src =
            ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));

        ref Vector128<float> dst =
            ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));

        int vectorCount = dest.Length / Vector128<float>.Count;
        int unrolledCount = vectorCount - ImageMaths.Modulo4(vectorCount);

        for (int i = 0; i < unrolledCount; i += 4)
        {
            ref Vector128<float> d = ref Unsafe.Add(ref dst, i);
            ref Vector128<float> s = ref Unsafe.Add(ref src, i);

            // Sse.Shuffle takes two inputs; the same vector is passed for both.
            d = Sse.Shuffle(s, s, control);

            Vector128<float> s1 = Unsafe.Add(ref s, 1);
            Unsafe.Add(ref d, 1) = Sse.Shuffle(s1, s1, control);

            Vector128<float> s2 = Unsafe.Add(ref s, 2);
            Unsafe.Add(ref d, 2) = Sse.Shuffle(s2, s2, control);

            Vector128<float> s3 = Unsafe.Add(ref s, 3);
            Unsafe.Add(ref d, 3) = Sse.Shuffle(s3, s3, control);
        }

        // Vectors left over after the unrolled loop.
        for (int i = unrolledCount; i < vectorCount; i++)
        {
            Vector128<float> v = Unsafe.Add(ref src, i);
            Unsafe.Add(ref dst, i) = Sse.Shuffle(v, v, control);
        }
    }
}
|||
|
|||
/// <summary>
/// Shuffles the bytes of <paramref name="source"/> into <paramref name="dest"/> one
/// whole vector at a time, using 256-bit vectors when AVX2 is supported and 128-bit
/// vectors otherwise. The loop is unrolled four vectors at a time.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes; its length determines the vector count.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (Avx2.IsSupported)
    {
        // The per-byte shuffle vector is built on the stack from the control for
        // convenience while the set of shuffle controls in the library is still
        // being decided. Static ReadOnlySpan instances can be added later if needed.
        Span<byte> controlBytes = stackalloc byte[Vector256<byte>.Count];
        Shuffle.MmShuffleSpan(ref controlBytes, control);
        Vector256<byte> shuffleMask =
            Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(controlBytes));

        ref Vector256<byte> src =
            ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(source));

        ref Vector256<byte> dst =
            ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

        int vectorCount = dest.Length / Vector256<byte>.Count;
        int unrolledCount = vectorCount - ImageMaths.Modulo4(vectorCount);

        for (int i = 0; i < unrolledCount; i += 4)
        {
            ref Vector256<byte> s = ref Unsafe.Add(ref src, i);
            ref Vector256<byte> d = ref Unsafe.Add(ref dst, i);

            d = Avx2.Shuffle(s, shuffleMask);
            Unsafe.Add(ref d, 1) = Avx2.Shuffle(Unsafe.Add(ref s, 1), shuffleMask);
            Unsafe.Add(ref d, 2) = Avx2.Shuffle(Unsafe.Add(ref s, 2), shuffleMask);
            Unsafe.Add(ref d, 3) = Avx2.Shuffle(Unsafe.Add(ref s, 3), shuffleMask);
        }

        // Vectors left over after the unrolled loop.
        for (int i = unrolledCount; i < vectorCount; i++)
        {
            Unsafe.Add(ref dst, i) = Avx2.Shuffle(Unsafe.Add(ref src, i), shuffleMask);
        }
    }
    else
    {
        // Ssse3
        Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
        Shuffle.MmShuffleSpan(ref controlBytes, control);
        Vector128<byte> shuffleMask =
            Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

        ref Vector128<byte> src =
            ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));

        ref Vector128<byte> dst =
            ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

        int vectorCount = dest.Length / Vector128<byte>.Count;
        int unrolledCount = vectorCount - ImageMaths.Modulo4(vectorCount);

        for (int i = 0; i < unrolledCount; i += 4)
        {
            ref Vector128<byte> s = ref Unsafe.Add(ref src, i);
            ref Vector128<byte> d = ref Unsafe.Add(ref dst, i);

            d = Ssse3.Shuffle(s, shuffleMask);
            Unsafe.Add(ref d, 1) = Ssse3.Shuffle(Unsafe.Add(ref s, 1), shuffleMask);
            Unsafe.Add(ref d, 2) = Ssse3.Shuffle(Unsafe.Add(ref s, 2), shuffleMask);
            Unsafe.Add(ref d, 3) = Ssse3.Shuffle(Unsafe.Add(ref s, 3), shuffleMask);
        }

        // Vectors left over after the unrolled loop.
        for (int i = unrolledCount; i < vectorCount; i++)
        {
            Unsafe.Add(ref dst, i) = Ssse3.Shuffle(Unsafe.Add(ref src, i), shuffleMask);
        }
    }
}
|||
|
|||
/// <summary>
/// Shuffles 3-byte groups of <paramref name="source"/> into <paramref name="dest"/>
/// using SSSE3. Each iteration loads three 128-bit vectors, splits them into four
/// 12-byte chunks, pads every chunk to 4-byte groups, applies the control shuffle,
/// slices the groups back to three bytes and reassembles three output vectors.
/// Does nothing when SSSE3 is not supported.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle3(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    Vector128<byte> padMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16));

    Vector128<byte> sliceMaskOdd =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16));

    // Rotating the slice mask by 12 bytes yields the variant applied to the even chunks.
    Vector128<byte> sliceMaskEven = Ssse3.AlignRight(sliceMaskOdd, sliceMaskOdd, 12);

    Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref controlBytes, control);
    Vector128<byte> shuffleMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

    ref Vector128<byte> src =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));

    ref Vector128<byte> dst =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    int vectorCount = source.Length / Vector128<byte>.Count;

    for (int i = 0; i < vectorCount; i += 3)
    {
        ref Vector128<byte> s = ref Unsafe.Add(ref src, i);

        Vector128<byte> c0 = s;
        Vector128<byte> c1 = Unsafe.Add(ref s, 1);
        Vector128<byte> c2 = Unsafe.Add(ref s, 2);

        // Realign so that every chunk starts on its own run of twelve source bytes.
        // Order matters: each line reads values that a later line overwrites.
        Vector128<byte> c3 = Sse2.ShiftRightLogical128BitLane(c2, 4);
        c2 = Ssse3.AlignRight(c2, c1, 8);
        c1 = Ssse3.AlignRight(c1, c0, 12);

        // Pad each triplet to four bytes, then apply the requested shuffle.
        c0 = Ssse3.Shuffle(Ssse3.Shuffle(c0, padMask), shuffleMask);
        c1 = Ssse3.Shuffle(Ssse3.Shuffle(c1, padMask), shuffleMask);
        c2 = Ssse3.Shuffle(Ssse3.Shuffle(c2, padMask), shuffleMask);
        c3 = Ssse3.Shuffle(Ssse3.Shuffle(c3, padMask), shuffleMask);

        // Drop the padding byte of every group again.
        c0 = Ssse3.Shuffle(c0, sliceMaskEven);
        c1 = Ssse3.Shuffle(c1, sliceMaskOdd);
        c2 = Ssse3.Shuffle(c2, sliceMaskEven);
        c3 = Ssse3.Shuffle(c3, sliceMaskOdd);

        // Stitch the four 12-byte results back into three full vectors.
        c0 = Ssse3.AlignRight(c1, c0, 4);
        c3 = Ssse3.AlignRight(c3, c2, 12);

        c1 = Sse2.ShiftLeftLogical128BitLane(c1, 4);
        c2 = Sse2.ShiftRightLogical128BitLane(c2, 4);

        c1 = Ssse3.AlignRight(c2, c1, 8);

        ref Vector128<byte> d = ref Unsafe.Add(ref dst, i);

        d = c0;
        Unsafe.Add(ref d, 1) = c1;
        Unsafe.Add(ref d, 2) = c3;
    }
}
|||
|
|||
/// <summary>
/// Expands packed 3-byte elements from <paramref name="source"/> to 4-byte elements, shuffles them
/// and stores the result in <paramref name="dest"/> using SSSE3 instructions.
/// Does nothing when SSSE3 is not supported.
/// </summary>
/// <remarks>
/// Every iteration reads three 128-bit vectors (48 bytes) and writes four (64 bytes).
/// The fourth byte of each expanded element is forced to 0xFF before the shuffle is applied.
/// No length validation happens here; the caller is presumably expected to pass a source length
/// that is a multiple of 48 bytes (TODO confirm against the calling reduce method).
/// </remarks>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The shuffle control byte, expanded via <c>Shuffle.MmShuffleSpan</c>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Pad3Shuffle4(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    Vector128<byte> padMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16));

    // 0xFF in every fourth byte; OR-ing it in sets the padding byte of each element.
    Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();

    // Expand the 8-bit control into a full 16-byte shuffle mask.
    Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref controlBytes, control);
    Vector128<byte> shuffleMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

    ref Vector128<byte> sourceBase =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector128<byte> destBase =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    int vectorCount = source.Length / Vector128<byte>.Count;
    int srcIndex = 0;
    int dstIndex = 0;

    while (srcIndex < vectorCount)
    {
        ref Vector128<byte> src = ref Unsafe.Add(ref sourceBase, srcIndex);

        Vector128<byte> a = src;
        Vector128<byte> b = Unsafe.Add(ref src, 1);
        Vector128<byte> c = Unsafe.Add(ref src, 2);

        // Re-align the 48 input bytes so each of the four registers starts with its own 12-byte run.
        Vector128<byte> d = Sse2.ShiftRightLogical128BitLane(c, 4);
        c = Ssse3.AlignRight(c, b, 8);
        b = Ssse3.AlignRight(b, a, 12);

        ref Vector128<byte> dst = ref Unsafe.Add(ref destBase, dstIndex);

        // Per register: pad to 4-byte elements, fill the padding byte, then shuffle.
        dst = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(a, padMask), fill), shuffleMask);
        Unsafe.Add(ref dst, 1) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(b, padMask), fill), shuffleMask);
        Unsafe.Add(ref dst, 2) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(c, padMask), fill), shuffleMask);
        Unsafe.Add(ref dst, 3) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(d, padMask), fill), shuffleMask);

        srcIndex += 3;
        dstIndex += 4;
    }
}
|||
|
|||
/// <summary>
/// Shuffles 4-byte elements from <paramref name="source"/>, drops one byte per element and stores
/// the packed 3-byte results in <paramref name="dest"/> using SSSE3 instructions.
/// Does nothing when SSSE3 is not supported.
/// </summary>
/// <remarks>
/// Every iteration reads four 128-bit vectors (64 bytes) and writes three (48 bytes).
/// No length validation happens here; the caller is presumably expected to pass a source length
/// that is a multiple of 64 bytes (TODO confirm against the calling reduce method).
/// </remarks>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The shuffle control byte, expanded via <c>Shuffle.MmShuffleSpan</c>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4Slice3(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Ssse3.IsSupported)
    {
        return;
    }

    Vector128<byte> sliceMaskOdd =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16));

    // The mask for even registers is the odd mask rotated by 12 bytes.
    Vector128<byte> sliceMaskEven = Ssse3.AlignRight(sliceMaskOdd, sliceMaskOdd, 12);

    // Expand the 8-bit control into a full 16-byte shuffle mask.
    Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref controlBytes, control);
    Vector128<byte> shuffleMask =
        Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

    ref Vector128<byte> sourceBase =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector128<byte> destBase =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    int vectorCount = source.Length / Vector128<byte>.Count;
    int srcIndex = 0;
    int dstIndex = 0;

    while (srcIndex < vectorCount)
    {
        ref Vector128<byte> src = ref Unsafe.Add(ref sourceBase, srcIndex);

        // Per register: apply the requested shuffle, then the slice mask.
        Vector128<byte> a = Ssse3.Shuffle(Ssse3.Shuffle(src, shuffleMask), sliceMaskEven);
        Vector128<byte> b = Ssse3.Shuffle(Ssse3.Shuffle(Unsafe.Add(ref src, 1), shuffleMask), sliceMaskOdd);
        Vector128<byte> c = Ssse3.Shuffle(Ssse3.Shuffle(Unsafe.Add(ref src, 2), shuffleMask), sliceMaskEven);
        Vector128<byte> d = Ssse3.Shuffle(Ssse3.Shuffle(Unsafe.Add(ref src, 3), shuffleMask), sliceMaskOdd);

        // Stitch the four partially filled registers into three full ones.
        Vector128<byte> out0 = Ssse3.AlignRight(b, a, 4);
        Vector128<byte> out2 = Ssse3.AlignRight(d, c, 12);
        Vector128<byte> out1 = Ssse3.AlignRight(
            Sse2.ShiftRightLogical128BitLane(c, 4),
            Sse2.ShiftLeftLogical128BitLane(b, 4),
            8);

        ref Vector128<byte> dst = ref Unsafe.Add(ref destBase, dstIndex);

        dst = out0;
        Unsafe.Add(ref dst, 1) = out1;
        Unsafe.Add(ref dst, 2) = out2;

        srcIndex += 4;
        dstIndex += 3;
    }
}
|||
|
|||
/// <summary>
/// Computes <c>(vm0 * vm1) + va</c> element-wise on <see cref="Vector256{T}"/> operands,
/// using the FMA instruction set when it is supported and AVX multiply/add otherwise.
/// </summary>
/// <param name="va">The vector to add to the intermediate result.</param>
/// <param name="vm0">The first vector to multiply.</param>
/// <param name="vm1">The second vector to multiply.</param>
/// <returns>The <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplyAdd(
    in Vector256<float> va,
    in Vector256<float> vm0,
    in Vector256<float> vm1)
{
    if (!Fma.IsSupported)
    {
        // No fused instruction available: multiply and add as two separate AVX operations.
        Vector256<float> product = Avx.Multiply(vm0, vm1);
        return Avx.Add(product, va);
    }

    return Fma.MultiplyAdd(vm1, vm0, va);
}
|||
|
|||
/// <summary>
/// Runs <see cref="ByteToNormalizedFloat"/> over the largest prefix of the spans whose length is a
/// multiple of the vector width, then advances both spans so that only the unprocessed remainder is left.
/// </summary>
/// <param name="source">The source bytes; sliced down to the remainder on return.</param>
/// <param name="dest">The destination floats; sliced down to the remainder on return.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (!Avx2.IsSupported && !Sse2.IsSupported)
    {
        return;
    }

    // AVX2 works on 32-byte blocks, the SSE fallback on 16-byte blocks.
    int blockSize = Avx2.IsSupported ? Vector256<byte>.Count : Vector128<byte>.Count;
    int vectorizable = source.Length - ImageMaths.ModuloP2(source.Length, blockSize);

    if (vectorizable <= 0)
    {
        return;
    }

    ByteToNormalizedFloat(source.Slice(0, vectorizable), dest.Slice(0, vectorizable));

    source = source.Slice(vectorizable);
    dest = dest.Slice(vectorizable);
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// Converts every source byte to a float scaled by <c>1 / 255</c>.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
/// </remarks>
/// <param name="source">The source bytes. Processed in blocks of 32 (AVX2) or 16 (SSE) elements.</param>
/// <param name="dest">The destination floats.</param>
internal static unsafe void ByteToNormalizedFloat(
    ReadOnlySpan<byte> source,
    Span<float> dest)
{
    // The pointer-based load intrinsics below need a stable address. The span may be backed by a
    // managed array, so it has to be pinned for as long as the raw pointer is in use; a pointer
    // obtained through Unsafe.AsPointer alone is not tracked by the GC.
    fixed (byte* sourceBase = source)
    {
        if (Avx2.IsSupported)
        {
            VerifySpanInput(source, dest, Vector256<byte>.Count);

            int n = dest.Length / Vector256<byte>.Count;

            ref Vector256<float> destBase =
                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));

            var scale = Vector256.Create(1 / (float)byte.MaxValue);

            for (int i = 0; i < n; i++)
            {
                // Each iteration widens 32 bytes into four vectors of 8 ints.
                int si = Vector256<byte>.Count * i;
                Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
                Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
                Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
                Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));

                Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
                Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
                Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
                Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));

                ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);

                d = f0;
                Unsafe.Add(ref d, 1) = f1;
                Unsafe.Add(ref d, 2) = f2;
                Unsafe.Add(ref d, 3) = f3;
            }
        }
        else
        {
            // Sse
            VerifySpanInput(source, dest, Vector128<byte>.Count);

            int n = dest.Length / Vector128<byte>.Count;

            ref Vector128<float> destBase =
                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));

            var scale = Vector128.Create(1 / (float)byte.MaxValue);
            Vector128<byte> zero = Vector128<byte>.Zero;

            for (int i = 0; i < n; i++)
            {
                // Each iteration widens 16 bytes into four vectors of 4 ints.
                int si = Vector128<byte>.Count * i;

                Vector128<int> i0, i1, i2, i3;
                if (Sse41.IsSupported)
                {
                    i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
                    i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
                    i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
                    i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
                }
                else
                {
                    // Without SSE4.1 the widening is done by interleaving with zero twice:
                    // byte -> 16-bit -> 32-bit.
                    Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
                    Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
                    Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();

                    i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
                    i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
                    i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
                    i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
                }

                Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
                Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
                Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
                Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));

                ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);

                d = f0;
                Unsafe.Add(ref d, 1) = f1;
                Unsafe.Add(ref d, 2) = f2;
                Unsafe.Add(ref d, 3) = f3;
            }
        }
    }
}
|||
|
|||
/// <summary>
/// Runs <see cref="NormalizedFloatToByteSaturate"/> over the largest prefix of the spans whose length
/// is a multiple of the vector width, then advances both spans so that only the unprocessed remainder is left.
/// </summary>
/// <param name="source">The source floats; sliced down to the remainder on return.</param>
/// <param name="dest">The destination bytes; sliced down to the remainder on return.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (!Avx2.IsSupported && !Sse2.IsSupported)
    {
        return;
    }

    // AVX2 works on 32-element blocks, the SSE fallback on 16-element blocks.
    int blockSize = Avx2.IsSupported ? Vector256<byte>.Count : Vector128<byte>.Count;
    int vectorizable = source.Length - ImageMaths.ModuloP2(source.Length, blockSize);

    if (vectorizable <= 0)
    {
        return;
    }

    NormalizedFloatToByteSaturate(
        source.Slice(0, vectorizable),
        dest.Slice(0, vectorizable));

    source = source.Slice(vectorizable);
    dest = dest.Slice(vectorizable);
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// Multiplies every source float by 255, converts it to an integer and packs it to a byte with saturation.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
/// </remarks>
/// <param name="source">The source floats. Processed in blocks of 32 (AVX2) or 16 (SSE) elements.</param>
/// <param name="dest">The destination bytes.</param>
internal static void NormalizedFloatToByteSaturate(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    if (Avx2.IsSupported)
    {
        VerifySpanInput(source, dest, Vector256<byte>.Count);

        int blockCount = dest.Length / Vector256<byte>.Count;

        ref Vector256<float> sourceBase =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
        ref Vector256<byte> destBase =
            ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

        var scale = Vector256.Create((float)byte.MaxValue);

        // The 256-bit pack instructions operate per 128-bit lane, so the packed result is
        // interleaved; this permutation restores the element order.
        Vector256<int> mask =
            Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32));

        for (int block = 0; block < blockCount; block++)
        {
            ref Vector256<float> src = ref Unsafe.Add(ref sourceBase, block * 4);

            Vector256<int> w0 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, src));
            Vector256<int> w1 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, Unsafe.Add(ref src, 1)));
            Vector256<int> w2 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, Unsafe.Add(ref src, 2)));
            Vector256<int> w3 = Avx.ConvertToVector256Int32(Avx.Multiply(scale, Unsafe.Add(ref src, 3)));

            // 32-bit -> 16-bit -> 8-bit, saturating at each step.
            Vector256<short> lower = Avx2.PackSignedSaturate(w0, w1);
            Vector256<short> upper = Avx2.PackSignedSaturate(w2, w3);
            Vector256<byte> packed = Avx2.PackUnsignedSaturate(lower, upper);

            Unsafe.Add(ref destBase, block) = Avx2.PermuteVar8x32(packed.AsInt32(), mask).AsByte();
        }

        return;
    }

    // Sse
    VerifySpanInput(source, dest, Vector128<byte>.Count);

    int count = dest.Length / Vector128<byte>.Count;

    ref Vector128<float> sourceBase128 =
        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
    ref Vector128<byte> destBase128 =
        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

    var scale128 = Vector128.Create((float)byte.MaxValue);

    for (int block = 0; block < count; block++)
    {
        ref Vector128<float> src = ref Unsafe.Add(ref sourceBase128, block * 4);

        Vector128<int> w0 = Sse2.ConvertToVector128Int32(Sse.Multiply(scale128, src));
        Vector128<int> w1 = Sse2.ConvertToVector128Int32(Sse.Multiply(scale128, Unsafe.Add(ref src, 1)));
        Vector128<int> w2 = Sse2.ConvertToVector128Int32(Sse.Multiply(scale128, Unsafe.Add(ref src, 2)));
        Vector128<int> w3 = Sse2.ConvertToVector128Int32(Sse.Multiply(scale128, Unsafe.Add(ref src, 3)));

        // 32-bit -> 16-bit -> 8-bit, saturating at each step.
        Vector128<short> lower = Sse2.PackSignedSaturate(w0, w1);
        Vector128<short> upper = Sse2.PackSignedSaturate(w2, w3);

        Unsafe.Add(ref destBase128, block) = Sse2.PackUnsignedSaturate(lower, upper);
    }
}
|||
} |
|||
} |
|||
} |
|||
#endif
|
|||
@ -0,0 +1,275 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Diagnostics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
/// <summary>
/// Reorders the single-precision floats in <paramref name="source"/> in groups of four according to
/// <paramref name="control"/> and writes the result to <paramref name="dest"/>.
/// </summary>
/// <remarks>
/// When runtime intrinsics are compiled in, the hardware-accelerated reducer runs first and receives
/// both spans by reference; whatever is left in them afterwards is handled by the scalar remainder loop.
/// </remarks>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    VerifyShuffle4SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
    HwIntrinsics.Shuffle4Reduce(ref source, ref dest, control);
#endif

    if (source.IsEmpty)
    {
        // Nothing left for the scalar path.
        return;
    }

    Shuffle4Remainder(source, dest, control);
}
|||
|
|||
/// <summary>
/// Reorders the bytes in <paramref name="source"/> in groups of four as described by
/// <paramref name="shuffle"/> and writes the result to <paramref name="dest"/>.
/// </summary>
/// <remarks>
/// When runtime intrinsics are compiled in, the hardware-accelerated reducer runs first with the
/// shuffle's control byte and receives both spans by reference; whatever is left in them afterwards
/// is handled by the shuffle's own fallback implementation.
/// </remarks>
/// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="shuffle">The type of shuffle to perform.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4<TShuffle>(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    TShuffle shuffle)
    where TShuffle : struct, IShuffle4
{
    VerifyShuffle4SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
    HwIntrinsics.Shuffle4Reduce(ref source, ref dest, shuffle.Control);
#endif

    if (source.IsEmpty)
    {
        // Nothing left for the fallback path.
        return;
    }

    shuffle.RunFallbackShuffle(source, dest);
}
|||
|
|||
/// <summary>
/// Reorders the bytes in <paramref name="source"/> in groups of three as described by
/// <paramref name="shuffle"/> and writes the result to <paramref name="dest"/>.
/// </summary>
/// <remarks>
/// When runtime intrinsics are compiled in, the hardware-accelerated reducer runs first with the
/// shuffle's control byte and receives both spans by reference; whatever is left in them afterwards
/// is handled by the shuffle's own fallback implementation.
/// </remarks>
/// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="shuffle">The type of shuffle to perform.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle3<TShuffle>(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    TShuffle shuffle)
    where TShuffle : struct, IShuffle3
{
    VerifyShuffle3SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
    HwIntrinsics.Shuffle3Reduce(ref source, ref dest, shuffle.Control);
#endif

    if (source.IsEmpty)
    {
        // Nothing left for the fallback path.
        return;
    }

    shuffle.RunFallbackShuffle(source, dest);
}
|||
|
|||
/// <summary>
/// Pads the 3-byte groups in <paramref name="source"/> to 4 bytes, reorders them as described by
/// <paramref name="shuffle"/> and writes the result to <paramref name="dest"/>.
/// </summary>
/// <remarks>
/// When runtime intrinsics are compiled in, the hardware-accelerated reducer runs first with the
/// shuffle's control byte and receives both spans by reference; whatever is left in them afterwards
/// is handled by the shuffle's own fallback implementation.
/// </remarks>
/// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="shuffle">The type of shuffle to perform.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Pad3Shuffle4<TShuffle>(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    TShuffle shuffle)
    where TShuffle : struct, IPad3Shuffle4
{
    VerifyPad3Shuffle4SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
    HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, shuffle.Control);
#endif

    if (source.IsEmpty)
    {
        // Nothing left for the fallback path.
        return;
    }

    shuffle.RunFallbackShuffle(source, dest);
}
|||
|
|||
/// <summary>
/// Reorders the 4-byte groups in <paramref name="source"/> as described by <paramref name="shuffle"/>,
/// slices each group down to 3 bytes and writes the result to <paramref name="dest"/>.
/// </summary>
/// <remarks>
/// When runtime intrinsics are compiled in, the hardware-accelerated reducer runs first with the
/// shuffle's control byte and receives both spans by reference; whatever is left in them afterwards
/// is handled by the shuffle's own fallback implementation.
/// </remarks>
/// <typeparam name="TShuffle">The shuffle implementation.</typeparam>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="shuffle">The type of shuffle to perform.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Slice3<TShuffle>(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    TShuffle shuffle)
    where TShuffle : struct, IShuffle4Slice3
{
    VerifyShuffle4Slice3SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
    HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, shuffle.Control);
#endif

    if (source.IsEmpty)
    {
        // Nothing left for the fallback path.
        return;
    }

    shuffle.RunFallbackShuffle(source, dest);
}
|||
|
|||
/// <summary>
/// Scalar shuffle of floats in groups of four: for every group, destination element <c>k</c> receives
/// the source element selected by the <c>k</c>-th 2-bit field of <paramref name="control"/>.
/// </summary>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
private static void Shuffle4Remainder(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

    ref float src = ref MemoryMarshal.GetReference(source);
    ref float dst = ref MemoryMarshal.GetReference(dest);

    int length = source.Length;
    int offset = 0;

    while (offset < length)
    {
        // p0..p3 are offsets (0-3) inside the current group of four.
        Unsafe.Add(ref dst, offset) = Unsafe.Add(ref src, p0 + offset);
        Unsafe.Add(ref dst, offset + 1) = Unsafe.Add(ref src, p1 + offset);
        Unsafe.Add(ref dst, offset + 2) = Unsafe.Add(ref src, p2 + offset);
        Unsafe.Add(ref dst, offset + 3) = Unsafe.Add(ref src, p3 + offset);

        offset += 4;
    }
}
|||
|
|||
/// <summary>
/// Debug-only check that both spans have the same length and that the length is a multiple of 4.
/// </summary>
/// <typeparam name="T">The element type of the spans.</typeparam>
/// <param name="source">The source span.</param>
/// <param name="dest">The destination span.</param>
[Conditional("DEBUG")]
private static void VerifyShuffle4SpanInput<T>(ReadOnlySpan<T> source, Span<T> dest)
    where T : struct
{
    int sourceLength = source.Length;

    bool sameLength = sourceLength == dest.Length;
    DebugGuard.IsTrue(sameLength, nameof(source), "Input spans must be of same length!");

    bool wholeGroups = sourceLength % 4 == 0;
    DebugGuard.IsTrue(wholeGroups, nameof(source), "Input spans must be divisable by 4!");
}
|||
|
|||
/// <summary>
/// Debug-only check that both spans have the same length and that the length is a multiple of 3.
/// </summary>
/// <typeparam name="T">The element type of the spans.</typeparam>
/// <param name="source">The source span.</param>
/// <param name="dest">The destination span.</param>
[Conditional("DEBUG")]
private static void VerifyShuffle3SpanInput<T>(ReadOnlySpan<T> source, Span<T> dest)
    where T : struct
{
    int sourceLength = source.Length;

    bool sameLength = sourceLength == dest.Length;
    DebugGuard.IsTrue(sameLength, nameof(source), "Input spans must be of same length!");

    bool wholeGroups = sourceLength % 3 == 0;
    DebugGuard.IsTrue(wholeGroups, nameof(source), "Input spans must be divisable by 3!");
}
|||
|
|||
/// <summary>
/// Debug-only check of the span lengths for a pad-3-to-4 shuffle: the source length must be a
/// multiple of 3, the destination length a multiple of 4, and the source exactly 3/4 of the destination.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
[Conditional("DEBUG")]
private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan<byte> source, Span<byte> dest)
{
    int sourceLength = source.Length;
    int destLength = dest.Length;

    bool sourceInTriplets = sourceLength % 3 == 0;
    DebugGuard.IsTrue(sourceInTriplets, nameof(source), "Input span must be divisable by 3!");

    bool destInQuads = destLength % 4 == 0;
    DebugGuard.IsTrue(destInQuads, nameof(dest), "Output span must be divisable by 4!");

    bool lengthsMatch = sourceLength == destLength * 3 / 4;
    DebugGuard.IsTrue(lengthsMatch, nameof(source), "Input span must be 3/4 the length of the output span!");
}
|||
|
|||
/// <summary>
/// Debug-only check of the span lengths for a shuffle-4-slice-3 operation: the source length must be
/// a multiple of 4, the destination length a multiple of 3, and the destination at least 3/4 of the source.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
[Conditional("DEBUG")]
private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan<byte> source, Span<byte> dest)
{
    int sourceLength = source.Length;
    int destLength = dest.Length;

    bool sourceInQuads = sourceLength % 4 == 0;
    DebugGuard.IsTrue(sourceInQuads, nameof(source), "Input span must be divisable by 4!");

    bool destInTriplets = destLength % 3 == 0;
    DebugGuard.IsTrue(destInTriplets, nameof(dest), "Output span must be divisable by 3!");

    bool destLargeEnough = destLength >= sourceLength * 3 / 4;
    DebugGuard.IsTrue(destLargeEnough, nameof(source), "Output span must be at least 3/4 the length of the input span!");
}
|||
|
|||
/// <summary>
/// Helpers for building and decoding 8-bit shuffle control values made of four 2-bit selectors.
/// </summary>
public static class Shuffle
{
    /// <summary>
    /// Packs four 2-bit selectors into a single control byte, with <paramref name="p3"/> in the
    /// two most significant bits and <paramref name="p0"/> in the two least significant bits.
    /// </summary>
    /// <param name="p3">The selector stored in bits 6-7.</param>
    /// <param name="p2">The selector stored in bits 4-5.</param>
    /// <param name="p1">The selector stored in bits 2-3.</param>
    /// <param name="p0">The selector stored in bits 0-1.</param>
    /// <returns>The combined control byte.</returns>
    [MethodImpl(InliningOptions.ShortMethod)]
    public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0)
    {
        int packed = (p3 << 6) | (p2 << 4) | (p1 << 2) | p0;
        return (byte)packed;
    }

    /// <summary>
    /// Fills <paramref name="span"/> with byte indices that apply <paramref name="control"/> to every
    /// consecutive group of four bytes: group <c>g</c> receives <c>p0 + 4g, p1 + 4g, p2 + 4g, p3 + 4g</c>.
    /// </summary>
    /// <param name="span">The span to fill. Written in steps of four elements.</param>
    /// <param name="control">The control byte to expand.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public static void MmShuffleSpan(ref Span<byte> span, byte control)
    {
        InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

        ref byte first = ref MemoryMarshal.GetReference(span);
        int length = span.Length;

        for (int group = 0; group < length; group += 4)
        {
            Unsafe.Add(ref first, group) = (byte)(p0 + group);
            Unsafe.Add(ref first, group + 1) = (byte)(p1 + group);
            Unsafe.Add(ref first, group + 2) = (byte)(p2 + group);
            Unsafe.Add(ref first, group + 3) = (byte)(p3 + group);
        }
    }

    /// <summary>
    /// Splits a control byte back into its four 2-bit selectors.
    /// </summary>
    /// <param name="control">The control byte.</param>
    /// <param name="p3">Receives bits 6-7.</param>
    /// <param name="p2">Receives bits 4-5.</param>
    /// <param name="p1">Receives bits 2-3.</param>
    /// <param name="p0">Receives bits 0-1.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public static void InverseMmShuffle(
        byte control,
        out int p3,
        out int p2,
        out int p1,
        out int p0)
    {
        const int SelectorMask = 0x3;

        p3 = (control >> 6) & SelectorMask;
        p2 = (control >> 4) & SelectorMask;
        p1 = (control >> 2) & SelectorMask;
        p0 = control & SelectorMask;
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,18 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// Base class for colour converters that are only available when <see cref="SimdUtils.HasAvx2"/> is true.
/// </summary>
internal abstract class Avx2JpegColorConverter : VectorizedJpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="Avx2JpegColorConverter"/> class.
    /// </summary>
    /// <param name="colorSpace">The colour space handled by the converter.</param>
    /// <param name="precision">The precision, forwarded to the base class.</param>
    protected Avx2JpegColorConverter(JpegColorSpace colorSpace, int precision)
        : base(colorSpace, precision, 8) // NOTE(review): 8 is presumably the vector size in floats; confirm against the base class
    {
    }

    /// <inheritdoc/>
    protected sealed override bool IsAvailable
    {
        get { return SimdUtils.HasAvx2; }
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,18 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// Base class for colour converters that report themselves as always available.
/// </summary>
internal abstract class BasicJpegColorConverter : JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="BasicJpegColorConverter"/> class.
    /// </summary>
    /// <param name="colorSpace">The colour space handled by the converter.</param>
    /// <param name="precision">The precision, forwarded to the base class.</param>
    protected BasicJpegColorConverter(JpegColorSpace colorSpace, int precision)
        : base(colorSpace, precision)
    {
    }

    /// <inheritdoc/>
    protected override bool IsAvailable
    {
        get { return true; }
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,81 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
using static SixLabors.ImageSharp.SimdUtils; |
|||
#endif
|
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// CMYK colour converter that uses AVX2 intrinsics for the vectorized path and delegates the
/// scalar path to <c>FromCmykBasic</c>.
/// </summary>
internal sealed class FromCmykAvx2 : Avx2JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromCmykAvx2"/> class.
    /// </summary>
    /// <param name="precision">The precision, forwarded to the base class.</param>
    public FromCmykAvx2(int precision)
        : base(JpegColorSpace.Cmyk, precision)
    {
    }

    /// <summary>
    /// Converts 8 pixels per iteration: every colour component is multiplied by the scaled K component
    /// and by <c>1 / MaximumValue</c>, then interleaved with a constant 1 into four-component results.
    /// The body is empty when runtime intrinsics are not compiled in.
    /// </summary>
    /// <param name="values">The component planes (C, M, Y, K in components 0-3).</param>
    /// <param name="result">The destination vectors.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
#if SUPPORTS_RUNTIME_INTRINSICS
        ref Vector256<float> cyanBase =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector256<float> magentaBase =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
        ref Vector256<float> yellowBase =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
        ref Vector256<float> keyBase =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));

        ref Vector256<float> resultBase =
            ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

        // Constants for the colour conversion.
        var scale = Vector256.Create(1 / this.MaximumValue);
        var one = Vector256.Create(1F);

        // Permutation applied to every plane before the unpack/shuffle interleave below.
        Vector256<int> evenOdd =
            Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32));

        int blocks = result.Length / 8;
        for (int block = 0; block < blocks; block++)
        {
            Vector256<float> k = Avx.Multiply(Avx2.PermuteVar8x32(Unsafe.Add(ref keyBase, block), evenOdd), scale);

            Vector256<float> c = Avx2.PermuteVar8x32(Unsafe.Add(ref cyanBase, block), evenOdd);
            Vector256<float> m = Avx2.PermuteVar8x32(Unsafe.Add(ref magentaBase, block), evenOdd);
            Vector256<float> y = Avx2.PermuteVar8x32(Unsafe.Add(ref yellowBase, block), evenOdd);

            c = Avx.Multiply(Avx.Multiply(c, k), scale);
            m = Avx.Multiply(Avx.Multiply(m, k), scale);
            y = Avx.Multiply(Avx.Multiply(y, k), scale);

            // Interleave (c, m) and (y, 1) pairs, then combine them into four-component groups.
            Vector256<float> cmLow = Avx.UnpackLow(c, m);
            Vector256<float> y1Low = Avx.UnpackLow(y, one);
            Vector256<float> cmHigh = Avx.UnpackHigh(c, m);
            Vector256<float> y1High = Avx.UnpackHigh(y, one);

            ref Vector256<float> output = ref Unsafe.Add(ref resultBase, block * 4);

            output = Avx.Shuffle(cmLow, y1Low, 0b01_00_01_00);
            Unsafe.Add(ref output, 1) = Avx.Shuffle(cmLow, y1Low, 0b11_10_11_10);
            Unsafe.Add(ref output, 2) = Avx.Shuffle(cmHigh, y1High, 0b01_00_01_00);
            Unsafe.Add(ref output, 3) = Avx.Shuffle(cmHigh, y1High, 0b11_10_11_10);
        }
#endif
    }

    /// <summary>
    /// Scalar conversion, delegated to <c>FromCmykBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component planes.</param>
    /// <param name="result">The destination vectors.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result)
    {
        FromCmykBasic.ConvertCore(values, result, this.MaximumValue);
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,71 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using SixLabors.ImageSharp.Tuples; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// CMYK colour converter that uses <see cref="Vector{T}"/> for the vectorized path and delegates
/// the scalar path to <c>FromCmykBasic</c>.
/// </summary>
internal sealed class FromCmykVector8 : Vector8JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromCmykVector8"/> class.
    /// </summary>
    /// <param name="precision">The precision, forwarded to the base class.</param>
    public FromCmykVector8(int precision)
        : base(JpegColorSpace.Cmyk, precision)
    {
    }

    /// <summary>
    /// Converts 8 pixels per iteration: every colour component is multiplied by the scaled K component
    /// and by <c>1 / MaximumValue</c>, then the three planes are packed into the destination.
    /// </summary>
    /// <param name="values">The component planes (C, M, Y, K in components 0-3).</param>
    /// <param name="result">The destination vectors.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
        ref Vector<float> cyanBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector<float> magentaBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
        ref Vector<float> yellowBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
        ref Vector<float> keyBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component3));

        ref Vector4Octet resultBase =
            ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

        // Scratch pairs that are written through Vector<float> views and then handed to Pack.
        Vector4Pair cyanPair = default;
        Vector4Pair magentaPair = default;
        Vector4Pair yellowPair = default;
        ref Vector<float> cyanView = ref Unsafe.As<Vector4Pair, Vector<float>>(ref cyanPair);
        ref Vector<float> magentaView = ref Unsafe.As<Vector4Pair, Vector<float>>(ref magentaPair);
        ref Vector<float> yellowView = ref Unsafe.As<Vector4Pair, Vector<float>>(ref yellowPair);

        var scale = new Vector<float>(1 / this.MaximumValue);

        // Each step consumes 8 elements per plane.
        int blocks = result.Length / 8;
        for (int block = 0; block < blocks; block++)
        {
            Vector<float> k = Unsafe.Add(ref keyBase, block) * scale;

            cyanView = (Unsafe.Add(ref cyanBase, block) * k) * scale;
            magentaView = (Unsafe.Add(ref magentaBase, block) * k) * scale;
            yellowView = (Unsafe.Add(ref yellowBase, block) * k) * scale;

            // Pack the (c0..c7), (m0..m7), (y0..y7) plane values into the per-pixel layout of the
            // destination octet (presumably (r, g, b, 1) per pixel; see Vector4Octet.Pack).
            ref Vector4Octet output = ref Unsafe.Add(ref resultBase, block);
            output.Pack(ref cyanPair, ref magentaPair, ref yellowPair);
        }
    }

    /// <summary>
    /// Scalar conversion, delegated to <c>FromCmykBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component planes.</param>
    /// <param name="result">The destination vectors.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result)
    {
        FromCmykBasic.ConvertCore(values, result, this.MaximumValue);
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,63 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
using static SixLabors.ImageSharp.SimdUtils; |
|||
#endif
|
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// Grayscale to RGBA <see cref="Vector4"/> converter built on AVX/AVX2 intrinsics.
/// The vectorized path handles 8 samples per loop iteration.
/// </summary>
internal sealed class FromGrayscaleAvx2 : Avx2JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromGrayscaleAvx2"/> class.
    /// </summary>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    public FromGrayscaleAvx2(int precision)
        : base(JpegColorSpace.Grayscale, precision)
    {
    }

    /// <summary>
    /// Vectorized conversion: reads <c>result.Length / 8</c> groups of 8 floats from
    /// <c>values.Component0</c> and emits four 256-bit stores (8 pixels) per group.
    /// </summary>
    /// <param name="values">The component values; only <c>Component0</c> is read.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
#if SUPPORTS_RUNTIME_INTRINSICS
        ref Vector256<float> sourceStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector256<float> targetStart =
            ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

        // Constants for the conversion itself.
        Vector256<float> alpha = Vector256.Create(1F);
        Vector256<float> normalize = Vector256.Create(1 / this.MaximumValue);

        // Lane permutation applied before packing (mask is defined in HwIntrinsics;
        // by its name it presumably separates even and odd lanes - TODO confirm).
        ref byte maskStart = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
        Vector256<int> laneOrder = Unsafe.As<byte, Vector256<int>>(ref maskStart);

        int groups = result.Length / 8;
        for (int group = 0; group < groups; group++)
        {
            Vector256<float> gray = Unsafe.Add(ref sourceStart, group);
            gray = Avx.Multiply(gray, normalize);
            gray = Avx2.PermuteVar8x32(gray, laneOrder);

            // Broadcast one element of each 128-bit half across that half.
            Vector256<float> first = Avx.Permute(gray, 0b00_00_00_00);
            Vector256<float> second = Avx.Shuffle(gray, gray, 0b01_01_01_01);
            Vector256<float> third = Avx.Shuffle(gray, gray, 0b10_10_10_10);
            Vector256<float> fourth = Avx.Shuffle(gray, gray, 0b11_11_11_11);

            // Each iteration fills four consecutive 256-bit slots of the output.
            ref Vector256<float> target = ref Unsafe.Add(ref targetStart, group * 4);

            // The blend mask replaces the 4th element of each half with 1 (alpha).
            target = Avx.Blend(first, alpha, 0b1000_1000);
            Unsafe.Add(ref target, 1) = Avx.Blend(second, alpha, 0b1000_1000);
            Unsafe.Add(ref target, 2) = Avx.Blend(third, alpha, 0b1000_1000);
            Unsafe.Add(ref target, 3) = Avx.Blend(fourth, alpha, 0b1000_1000);
        }
#endif
    }

    /// <summary>
    /// Non-vectorized conversion path: forwards to <c>FromGrayscaleBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result)
    {
        FromGrayscaleBasic.ConvertCore(values, result, this.MaximumValue);
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,72 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
using static SixLabors.ImageSharp.SimdUtils; |
|||
#endif
|
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// RGB to RGBA <see cref="Vector4"/> converter built on AVX/AVX2 intrinsics.
/// The vectorized path handles 8 pixels per loop iteration.
/// </summary>
internal sealed class FromRgbAvx2 : Avx2JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromRgbAvx2"/> class.
    /// </summary>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    public FromRgbAvx2(int precision)
        : base(JpegColorSpace.RGB, precision)
    {
    }

    /// <summary>
    /// Vectorized conversion: scales the three planar components by <c>1 / MaximumValue</c>
    /// and interleaves them with a constant 1 into the destination.
    /// </summary>
    /// <param name="values">The component values; components 0, 1 and 2 are read.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
#if SUPPORTS_RUNTIME_INTRINSICS
        ref Vector256<float> redStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector256<float> greenStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
        ref Vector256<float> blueStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
        ref Vector256<float> targetStart =
            ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

        // Constants for the conversion itself.
        Vector256<float> alpha = Vector256.Create(1F);
        Vector256<float> normalize = Vector256.Create(1 / this.MaximumValue);

        // Lane permutation applied before packing (mask is defined in HwIntrinsics).
        ref byte maskStart = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
        Vector256<int> laneOrder = Unsafe.As<byte, Vector256<int>>(ref maskStart);

        int groups = result.Length / 8;
        for (int group = 0; group < groups; group++)
        {
            // Permute first, then scale - same order as the per-component formula requires.
            Vector256<float> red = Avx2.PermuteVar8x32(Unsafe.Add(ref redStart, group), laneOrder);
            Vector256<float> green = Avx2.PermuteVar8x32(Unsafe.Add(ref greenStart, group), laneOrder);
            Vector256<float> blue = Avx2.PermuteVar8x32(Unsafe.Add(ref blueStart, group), laneOrder);
            red = Avx.Multiply(red, normalize);
            green = Avx.Multiply(green, normalize);
            blue = Avx.Multiply(blue, normalize);

            // Pair up (r,g) and (b,1) so that a final shuffle yields (r,g,b,1) per pixel.
            Vector256<float> redGreenLow = Avx.UnpackLow(red, green);
            Vector256<float> blueAlphaLow = Avx.UnpackLow(blue, alpha);
            Vector256<float> redGreenHigh = Avx.UnpackHigh(red, green);
            Vector256<float> blueAlphaHigh = Avx.UnpackHigh(blue, alpha);

            // Each iteration fills four consecutive 256-bit slots of the output.
            ref Vector256<float> target = ref Unsafe.Add(ref targetStart, group * 4);

            target = Avx.Shuffle(redGreenLow, blueAlphaLow, 0b01_00_01_00);
            Unsafe.Add(ref target, 1) = Avx.Shuffle(redGreenLow, blueAlphaLow, 0b11_10_11_10);
            Unsafe.Add(ref target, 2) = Avx.Shuffle(redGreenHigh, blueAlphaHigh, 0b01_00_01_00);
            Unsafe.Add(ref target, 3) = Avx.Shuffle(redGreenHigh, blueAlphaHigh, 0b11_10_11_10);
        }
#endif
    }

    /// <summary>
    /// Non-vectorized conversion path: forwards to <c>FromRgbBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result)
    {
        FromRgbBasic.ConvertCore(values, result, this.MaximumValue);
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,67 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using SixLabors.ImageSharp.Tuples; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// RGB to RGBA <see cref="Vector4"/> converter built on <see cref="Vector{T}"/>.
/// The vectorized path advances 8 pixels per loop iteration.
/// </summary>
internal sealed class FromRgbVector8 : Vector8JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromRgbVector8"/> class.
    /// </summary>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    public FromRgbVector8(int precision)
        : base(JpegColorSpace.RGB, precision)
    {
    }

    /// <summary>
    /// Vectorized conversion: scales each planar component by <c>1 / MaximumValue</c>
    /// and packs the three planes into the destination via <c>Vector4Octet.Pack</c>.
    /// </summary>
    /// <param name="values">The component values; components 0, 1 and 2 are read.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
        ref Vector<float> redStart =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector<float> greenStart =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
        ref Vector<float> blueStart =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
        ref Vector4Octet targetStart =
            ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

        // Stack scratch space; each pair is also addressed as a Vector<float> so that
        // a whole vector can be stored into it at once.
        Vector4Pair redPair = default;
        Vector4Pair greenPair = default;
        Vector4Pair bluePair = default;
        ref Vector<float> redScratch = ref Unsafe.As<Vector4Pair, Vector<float>>(ref redPair);
        ref Vector<float> greenScratch = ref Unsafe.As<Vector4Pair, Vector<float>>(ref greenPair);
        ref Vector<float> blueScratch = ref Unsafe.As<Vector4Pair, Vector<float>>(ref bluePair);

        var normalize = new Vector<float>(1 / this.MaximumValue);

        // 8 elements are consumed per step.
        int groups = result.Length / 8;
        for (int group = 0; group < groups; group++)
        {
            redScratch = Unsafe.Add(ref redStart, group) * normalize;
            greenScratch = Unsafe.Add(ref greenStart, group) * normalize;
            blueScratch = Unsafe.Add(ref blueStart, group) * normalize;

            // Turn the planar (r0..r7) (g0..g7) (b0..b7) values into the expected
            // (r0,g0,b0,1), (r1,g1,b1,1) ... pixel order.
            ref Vector4Octet target = ref Unsafe.Add(ref targetStart, group);
            target.Pack(ref redPair, ref greenPair, ref bluePair);
        }
    }

    /// <summary>
    /// Non-vectorized conversion path: forwards to <c>FromRgbBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result)
    {
        FromRgbBasic.ConvertCore(values, result, this.MaximumValue);
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,101 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
using static SixLabors.ImageSharp.SimdUtils; |
|||
#endif
|
|||
|
|||
// ReSharper disable ImpureMethodCallOnReadonlyValueField
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// YCbCr to RGBA <see cref="Vector4"/> converter built on AVX/AVX2 intrinsics.
/// The vectorized path handles 8 pixels per loop iteration.
/// </summary>
internal sealed class FromYCbCrAvx2 : Avx2JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromYCbCrAvx2"/> class.
    /// </summary>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    public FromYCbCrAvx2(int precision)
        : base(JpegColorSpace.YCbCr, precision)
    {
    }

    /// <summary>
    /// Vectorized conversion of planar Y, Cb and Cr components into interleaved pixels.
    /// </summary>
    /// <param name="values">The component values; components 0 (Y), 1 (Cb) and 2 (Cr) are read.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
#if SUPPORTS_RUNTIME_INTRINSICS
        ref Vector256<float> lumaStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector256<float> chromaBlueStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
        ref Vector256<float> chromaRedStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
        ref Vector256<float> targetStart =
            ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

        // Constants for the color conversion.
        Vector256<float> halfShift = Vector256.Create(-this.HalfValue);
        Vector256<float> normalize = Vector256.Create(1 / this.MaximumValue);
        Vector256<float> crToRed = Vector256.Create(1.402F);
        Vector256<float> cbToGreen = Vector256.Create(-0.344136F);
        Vector256<float> crToGreen = Vector256.Create(-0.714136F);
        Vector256<float> cbToBlue = Vector256.Create(1.772F);

        // Constants for packing.
        Vector256<float> alpha = Vector256.Create(1F);
        ref byte maskStart = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
        Vector256<int> laneOrder = Unsafe.As<byte, Vector256<int>>(ref maskStart);

        // 8 elements are consumed per step.
        int groups = result.Length / 8;
        for (int group = 0; group < groups; group++)
        {
            // y  = yVals[i]
            // cb = cbVals[i] - HalfValue
            // cr = crVals[i] - HalfValue
            // followed by the lane permutation needed for the packing below.
            Vector256<float> luma = Avx2.PermuteVar8x32(Unsafe.Add(ref lumaStart, group), laneOrder);
            Vector256<float> chromaBlue = Avx2.PermuteVar8x32(
                Avx.Add(Unsafe.Add(ref chromaBlueStart, group), halfShift),
                laneOrder);
            Vector256<float> chromaRed = Avx2.PermuteVar8x32(
                Avx.Add(Unsafe.Add(ref chromaRedStart, group), halfShift),
                laneOrder);

            // Intended formulas (the multipliers for green are stored negated):
            // r = y + (1.402F * cr)
            // g = y - (0.344136F * cb) - (0.714136F * cr)
            // b = y + (1.772F * cb)
            Vector256<float> red = HwIntrinsics.MultiplyAdd(luma, chromaRed, crToRed);
            Vector256<float> green = HwIntrinsics.MultiplyAdd(
                HwIntrinsics.MultiplyAdd(luma, chromaBlue, cbToGreen),
                chromaRed,
                crToGreen);
            Vector256<float> blue = HwIntrinsics.MultiplyAdd(luma, chromaBlue, cbToBlue);

            // TODO: We should be saving to RGBA not Vector4
            red = Avx.Multiply(Avx.RoundToNearestInteger(red), normalize);
            green = Avx.Multiply(Avx.RoundToNearestInteger(green), normalize);
            blue = Avx.Multiply(Avx.RoundToNearestInteger(blue), normalize);

            // Interleave (r,b) with (g,1) so that every 4 floats form (r,g,b,1).
            Vector256<float> redBlueLow = Avx.UnpackLow(red, blue);
            Vector256<float> greenAlphaLow = Avx.UnpackLow(green, alpha);
            Vector256<float> redBlueHigh = Avx.UnpackHigh(red, blue);
            Vector256<float> greenAlphaHigh = Avx.UnpackHigh(green, alpha);

            // Each iteration fills four consecutive 256-bit slots of the output.
            ref Vector256<float> target = ref Unsafe.Add(ref targetStart, group * 4);

            target = Avx.UnpackLow(redBlueLow, greenAlphaLow);
            Unsafe.Add(ref target, 1) = Avx.UnpackHigh(redBlueLow, greenAlphaLow);
            Unsafe.Add(ref target, 2) = Avx.UnpackLow(redBlueHigh, greenAlphaHigh);
            Unsafe.Add(ref target, 3) = Avx.UnpackHigh(redBlueHigh, greenAlphaHigh);
        }
#endif
    }

    /// <summary>
    /// Non-vectorized conversion path: forwards to <c>FromYCbCrBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result)
    {
        FromYCbCrBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,110 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
using static SixLabors.ImageSharp.SimdUtils; |
|||
#endif
|
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// YccK to RGBA <see cref="Vector4"/> converter built on AVX/AVX2 intrinsics.
/// The vectorized path handles 8 pixels per loop iteration.
/// </summary>
internal sealed class FromYccKAvx2 : Avx2JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromYccKAvx2"/> class.
    /// </summary>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    public FromYccKAvx2(int precision)
        : base(JpegColorSpace.Ycck, precision)
    {
    }

    /// <summary>
    /// Vectorized conversion of planar Y, Cb, Cr and K components into interleaved pixels.
    /// </summary>
    /// <param name="values">The component values; components 0 (Y), 1 (Cb), 2 (Cr) and 3 (K) are read.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
#if SUPPORTS_RUNTIME_INTRINSICS
        ref Vector256<float> lumaStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector256<float> chromaBlueStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
        ref Vector256<float> chromaRedStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
        ref Vector256<float> keyStart =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
        ref Vector256<float> targetStart =
            ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

        // Constants for the color conversion.
        Vector256<float> halfShift = Vector256.Create(-this.HalfValue);
        Vector256<float> normalize = Vector256.Create(1 / this.MaximumValue);
        Vector256<float> ceiling = Vector256.Create(this.MaximumValue);
        Vector256<float> crToRed = Vector256.Create(1.402F);
        Vector256<float> cbToGreen = Vector256.Create(-0.344136F);
        Vector256<float> crToGreen = Vector256.Create(-0.714136F);
        Vector256<float> cbToBlue = Vector256.Create(1.772F);

        // Constants for packing.
        Vector256<float> alpha = Vector256.Create(1F);
        ref byte maskStart = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
        Vector256<int> laneOrder = Unsafe.As<byte, Vector256<int>>(ref maskStart);

        // 8 elements are consumed per step.
        int groups = result.Length / 8;
        for (int group = 0; group < groups; group++)
        {
            // y  = yVals[i]
            // cb = cbVals[i] - HalfValue
            // cr = crVals[i] - HalfValue
            // k  = kVals[i] / MaximumValue
            // followed by the lane permutation needed for the packing below.
            Vector256<float> luma = Avx2.PermuteVar8x32(Unsafe.Add(ref lumaStart, group), laneOrder);
            Vector256<float> chromaBlue = Avx2.PermuteVar8x32(
                Avx.Add(Unsafe.Add(ref chromaBlueStart, group), halfShift),
                laneOrder);
            Vector256<float> chromaRed = Avx2.PermuteVar8x32(
                Avx.Add(Unsafe.Add(ref chromaRedStart, group), halfShift),
                laneOrder);
            Vector256<float> key = Avx2.PermuteVar8x32(
                Avx.Divide(Unsafe.Add(ref keyStart, group), ceiling),
                laneOrder);

            // Intended formulas (the multipliers for green are stored negated):
            // r = y + (1.402F * cr)
            // g = y - (0.344136F * cb) - (0.714136F * cr)
            // b = y + (1.772F * cb)
            Vector256<float> red = HwIntrinsics.MultiplyAdd(luma, chromaRed, crToRed);
            Vector256<float> green = HwIntrinsics.MultiplyAdd(
                HwIntrinsics.MultiplyAdd(luma, chromaBlue, cbToGreen),
                chromaRed,
                crToGreen);
            Vector256<float> blue = HwIntrinsics.MultiplyAdd(luma, chromaBlue, cbToBlue);

            // Invert against the maximum value after rounding.
            red = Avx.Subtract(ceiling, Avx.RoundToNearestInteger(red));
            green = Avx.Subtract(ceiling, Avx.RoundToNearestInteger(green));
            blue = Avx.Subtract(ceiling, Avx.RoundToNearestInteger(blue));

            // Apply K, then scale by 1 / MaximumValue.
            red = Avx.Multiply(Avx.Multiply(red, key), normalize);
            green = Avx.Multiply(Avx.Multiply(green, key), normalize);
            blue = Avx.Multiply(Avx.Multiply(blue, key), normalize);

            // Interleave (r,b) with (g,1) so that every 4 floats form (r,g,b,1).
            Vector256<float> redBlueLow = Avx.UnpackLow(red, blue);
            Vector256<float> greenAlphaLow = Avx.UnpackLow(green, alpha);
            Vector256<float> redBlueHigh = Avx.UnpackHigh(red, blue);
            Vector256<float> greenAlphaHigh = Avx.UnpackHigh(green, alpha);

            // Each iteration fills four consecutive 256-bit slots of the output.
            ref Vector256<float> target = ref Unsafe.Add(ref targetStart, group * 4);

            target = Avx.UnpackLow(redBlueLow, greenAlphaLow);
            Unsafe.Add(ref target, 1) = Avx.UnpackHigh(redBlueLow, greenAlphaLow);
            Unsafe.Add(ref target, 2) = Avx.UnpackLow(redBlueHigh, greenAlphaHigh);
            Unsafe.Add(ref target, 3) = Avx.UnpackHigh(redBlueHigh, greenAlphaHigh);
        }
#endif
    }

    /// <summary>
    /// Non-vectorized conversion path: forwards to <c>FromYccKBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result)
    {
        FromYccKBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,91 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using SixLabors.ImageSharp.Tuples; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// YccK to RGBA <see cref="Vector4"/> converter built on <see cref="Vector{T}"/>.
/// The vectorized path advances 8 pixels per loop iteration.
/// </summary>
internal sealed class FromYccKVector8 : Vector8JpegColorConverter
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FromYccKVector8"/> class.
    /// </summary>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    public FromYccKVector8(int precision)
        : base(JpegColorSpace.Ycck, precision)
    {
    }

    /// <summary>
    /// Vectorized conversion of planar Y, Cb, Cr and K components into interleaved pixels.
    /// </summary>
    /// <param name="values">The component values; components 0 (Y), 1 (Cb), 2 (Cr) and 3 (K) are read.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
    {
        ref Vector<float> yBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
        ref Vector<float> cbBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
        ref Vector<float> crBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
        ref Vector<float> kBase =
            ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component3));

        ref Vector4Octet resultBase =
            ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

        // Loop-invariant constants, built once instead of on every iteration.
        var chromaOffset = new Vector<float>(-this.HalfValue);
        var scale = new Vector<float>(1 / this.MaximumValue);
        var max = new Vector<float>(this.MaximumValue);
        var rCrMult = new Vector<float>(1.402F);
        var gCbMult = new Vector<float>(0.344136F);
        var gCrMult = new Vector<float>(0.714136F);
        var bCbMult = new Vector<float>(1.772F);

        // Stack scratch space; each pair is also addressed as a Vector<float> so that
        // a whole vector can be stored into it at once.
        Vector4Pair rr = default;
        Vector4Pair gg = default;
        Vector4Pair bb = default;

        ref Vector<float> rrRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref rr);
        ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);
        ref Vector<float> bbRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref bb);

        // Walking 8 elements at one step:
        int n = result.Length / 8;

        for (int i = 0; i < n; i++)
        {
            // y  = yVals[i];
            // cb = cbVals[i] - HalfValue;
            // cr = crVals[i] - HalfValue;
            // k  = kVals[i] / MaximumValue;
            Vector<float> y = Unsafe.Add(ref yBase, i);
            Vector<float> cb = Unsafe.Add(ref cbBase, i) + chromaOffset;
            Vector<float> cr = Unsafe.Add(ref crBase, i) + chromaOffset;
            Vector<float> k = Unsafe.Add(ref kBase, i) / max;

            // r = y + (1.402F * cr);
            // g = y - (0.344136F * cb) - (0.714136F * cr);
            // b = y + (1.772F * cb);
            // Adding & multiplying 8 elements at one time:
            Vector<float> r = y + (cr * rCrMult);
            Vector<float> g = y - (cb * gCbMult) - (cr * gCrMult);
            Vector<float> b = y + (cb * bCbMult);

            // Invert against the maximum value after rounding, apply K, then scale.
            r = (max - r.FastRound()) * k;
            g = (max - g.FastRound()) * k;
            b = (max - b.FastRound()) * k;
            r *= scale;
            g *= scale;
            b *= scale;

            rrRefAsVector = r;
            ggRefAsVector = g;
            bbRefAsVector = b;

            // Collect (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values in the expected
            // (r0,g0,b0,1), (r1,g1,b1,1) ... order:
            ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
            destination.Pack(ref rr, ref gg, ref bb);
        }
    }

    /// <summary>
    /// Non-vectorized conversion path: forwards to <c>FromYccKBasic.ConvertCore</c>.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
        FromYccKBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
}
|||
} |
|||
} |
|||
@ -0,0 +1,18 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// Base class for vectorized converters that work on 8 elements per step.
/// Availability is reported through <c>SimdUtils.HasVector8</c>.
/// </summary>
internal abstract class Vector8JpegColorConverter : VectorizedJpegColorConverter
{
    // Vector size handed to the vectorized base class.
    private const int ElementsPerVector = 8;

    /// <summary>
    /// Initializes a new instance of the <see cref="Vector8JpegColorConverter"/> class.
    /// </summary>
    /// <param name="colorSpace">The color space forwarded to the base converter.</param>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    protected Vector8JpegColorConverter(JpegColorSpace colorSpace, int precision)
        : base(colorSpace, precision, ElementsPerVector)
    {
    }

    /// <inheritdoc/>
    protected sealed override bool IsAvailable
    {
        get { return SimdUtils.HasVector8; }
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,46 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters |
|||
{ |
|||
internal abstract partial class JpegColorConverter |
|||
{ |
|||
/// <summary>
/// Base class for converters that process the bulk of a row with a vectorized routine
/// and the leftover elements with a scalar routine.
/// </summary>
internal abstract class VectorizedJpegColorConverter : JpegColorConverter
{
    // Number of elements the vectorized routine consumes per step.
    private readonly int vectorSize;

    /// <summary>
    /// Initializes a new instance of the <see cref="VectorizedJpegColorConverter"/> class.
    /// </summary>
    /// <param name="colorSpace">The color space forwarded to the base converter.</param>
    /// <param name="precision">The precision value forwarded to the base converter.</param>
    /// <param name="vectorSize">The number of elements processed per vectorized step.</param>
    protected VectorizedJpegColorConverter(JpegColorSpace colorSpace, int precision, int vectorSize)
        : base(colorSpace, precision)
    {
        this.vectorSize = vectorSize;
    }

    /// <summary>
    /// Converts <paramref name="values"/> into <paramref name="result"/>. The largest prefix whose
    /// length is a multiple of the vector size goes through <see cref="ConvertCoreVectorized"/>;
    /// any remaining elements go through <see cref="ConvertCore"/>.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    /// <exception cref="InvalidOperationException">
    /// Thrown when vectorized work is required but <c>IsAvailable</c> is <see langword="false"/>.
    /// </exception>
    public sealed override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
    {
        int remainder = result.Length % this.vectorSize;
        int simdCount = result.Length - remainder;
        if (simdCount > 0)
        {
            // The vectorized routine must only run where the derived converter reports support.
            if (!this.IsAvailable)
            {
                throw new InvalidOperationException(
                    "This converter can be used only on architecture having 256 bit floating point SIMD registers!");
            }

            this.ConvertCoreVectorized(values.Slice(0, simdCount), result.Slice(0, simdCount));
        }

        // Scalar tail; skipped entirely when the length is an exact multiple of the vector size.
        if (remainder > 0)
        {
            this.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder));
        }
    }

    /// <summary>
    /// Converts elements using the vectorized implementation.
    /// Called with a length that is a non-zero multiple of the vector size.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected abstract void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result);

    /// <summary>
    /// Converts elements using the scalar implementation.
    /// Called for the elements left over after the vectorized part.
    /// </summary>
    /// <param name="values">The component values to convert.</param>
    /// <param name="result">The destination pixels.</param>
    protected abstract void ConvertCore(in ComponentValues values, Span<Vector4> result);
}
|||
} |
|||
} |
|||
@ -0,0 +1,21 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|||
{ |
|||
/// <summary>
/// Benchmarks <c>Block8x8F.AddInPlace</c> with a scalar operand on a default-initialized block.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Block8x8F_AddInPlace
{
    /// <summary>
    /// Adds a constant to every element of a fresh block.
    /// </summary>
    /// <returns>The scalar operand that was added.</returns>
    [Benchmark]
    public float AddInplace()
    {
        Block8x8F block = default;
        float addend = 42F;

        block.AddInPlace(addend);

        return addend;
    }
}
|||
} |
|||
@ -0,0 +1,37 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|||
{ |
|||
/// <summary>
/// Benchmarks the block-by-block overload of <c>Block8x8F.MultiplyInPlace</c>.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Block8x8F_MultiplyInPlaceBlock
{
    private static readonly Block8x8F Source = Create8x8FloatData();

    /// <summary>
    /// Invokes <c>MultiplyInPlace</c> on the prepared source block with a default block as operand.
    /// </summary>
    [Benchmark]
    public void MultiplyInPlaceBlock()
    {
        var dest = default(Block8x8F);
        Source.MultiplyInPlace(ref dest);
    }

    /// <summary>
    /// Builds a block whose element at row <c>r</c>, column <c>c</c> holds <c>(r * 10) + c</c>.
    /// </summary>
    /// <returns>The populated block.</returns>
    private static Block8x8F Create8x8FloatData()
    {
        float[] data = new float[64];
        for (int index = 0; index < data.Length; index++)
        {
            int row = index / 8;
            int column = index % 8;
            data[index] = (row * 10) + column;
        }

        Block8x8F block = default;
        block.LoadFrom(data);
        return block;
    }
}
|||
} |
|||
@ -0,0 +1,21 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|||
{ |
|||
/// <summary>
/// Benchmarks the scalar overload of <c>Block8x8F.MultiplyInPlace</c> on a default-initialized block.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Block8x8F_MultiplyInPlaceScalar
{
    /// <summary>
    /// Multiplies every element of a fresh block by a constant.
    /// </summary>
    /// <returns>The scalar operand that was used.</returns>
    [Benchmark]
    public float MultiplyInPlaceScalar()
    {
        Block8x8F block = default;
        float factor = 42F;

        block.MultiplyInPlace(factor);

        return factor;
    }
}
|||
} |
|||
@ -0,0 +1,37 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|||
{ |
|||
/// <summary>
/// Benchmarks <c>Block8x8F.TransposeInto</c>.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Block8x8F_Transpose
{
    private static readonly Block8x8F Source = Create8x8FloatData();

    /// <summary>
    /// Transposes the prepared source block into a fresh destination block.
    /// </summary>
    [Benchmark]
    public void TransposeInto()
    {
        Block8x8F dest = default;
        Source.TransposeInto(ref dest);
    }

    /// <summary>
    /// Builds a block whose element at row <c>r</c>, column <c>c</c> holds <c>(r * 10) + c</c>.
    /// </summary>
    /// <returns>The populated block.</returns>
    private static Block8x8F Create8x8FloatData()
    {
        float[] data = new float[64];
        for (int index = 0; index < data.Length; index++)
        {
            int row = index / 8;
            int column = index % 8;
            data[index] = (row * 10) + column;
        }

        Block8x8F block = default;
        block.LoadFrom(data);
        return block;
    }
}
|||
} |
|||
@ -0,0 +1,41 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg |
|||
{ |
|||
/// <summary>
/// Compares the scalar, Vector8 and AVX2 CMYK color converters on 4-component input.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class CmykColorConversion : ColorConversionBenchmark
{
    /// <summary>
    /// Initializes a new instance of the <see cref="CmykColorConversion"/> class.
    /// </summary>
    public CmykColorConversion()
        : base(4)
    {
    }

    /// <summary>
    /// Baseline: the scalar converter.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void Scalar()
    {
        var converter = new JpegColorConverter.FromCmykBasic(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }

    /// <summary>
    /// The <c>Vector&lt;float&gt;</c> based converter.
    /// </summary>
    [Benchmark]
    public void SimdVector8()
    {
        var converter = new JpegColorConverter.FromCmykVector8(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }

    /// <summary>
    /// The AVX2 intrinsics based converter.
    /// </summary>
    [Benchmark]
    public void SimdVectorAvx2()
    {
        var converter = new JpegColorConverter.FromCmykAvx2(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }
}
|||
} |
|||
@ -0,0 +1,64 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Memory; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg |
|||
{ |
|||
/// <summary>
/// Shared setup for the JPEG color conversion benchmarks: allocates one input buffer per
/// color component, fills it with reproducible pseudo-random values and allocates the output.
/// </summary>
public abstract class ColorConversionBenchmark
{
    /// <summary>
    /// The number of pixels converted per benchmark invocation.
    /// </summary>
    public const int Count = 128;

    private readonly int componentCount;
    protected Buffer2D<float>[] input;
    protected Vector4[] output;

    /// <summary>
    /// Initializes a new instance of the <see cref="ColorConversionBenchmark"/> class.
    /// </summary>
    /// <param name="componentCount">The number of input component buffers to create.</param>
    protected ColorConversionBenchmark(int componentCount)
    {
        this.componentCount = componentCount;
    }

    /// <summary>
    /// Creates the input buffers and the output array.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        this.input = CreateRandomValues(this.componentCount, Count);
        this.output = new Vector4[Count];
    }

    /// <summary>
    /// Releases the allocator-owned input buffers.
    /// </summary>
    [GlobalCleanup]
    public void Cleanup()
    {
        foreach (Buffer2D<float> buffer in this.input)
        {
            buffer.Dispose();
        }
    }

    /// <summary>
    /// Allocates <paramref name="componentCount"/> single-row buffers of
    /// <paramref name="inputBufferLength"/> floats and fills each with values drawn from a
    /// fixed-seed generator, scaled into the [<paramref name="minVal"/>, <paramref name="maxVal"/>) range.
    /// </summary>
    /// <param name="componentCount">The number of buffers to create.</param>
    /// <param name="inputBufferLength">The width of each buffer.</param>
    /// <param name="minVal">The lower bound of the generated values.</param>
    /// <param name="maxVal">The upper bound of the generated values.</param>
    /// <returns>The populated buffers; the caller is responsible for disposing them.</returns>
    private static Buffer2D<float>[] CreateRandomValues(
        int componentCount,
        int inputBufferLength,
        float minVal = 0f,
        float maxVal = 255f)
    {
        // Fixed seed keeps the benchmark input identical between runs.
        var rnd = new Random(42);
        var buffers = new Buffer2D<float>[componentCount];
        for (int i = 0; i < componentCount; i++)
        {
            Buffer2D<float> buffer = Configuration.Default.MemoryAllocator.Allocate2D<float>(inputBufferLength, 1);

            // Write the random values into the buffer itself; generating them into a
            // separate array would leave the benchmark input unpopulated.
            for (int j = 0; j < inputBufferLength; j++)
            {
                buffer[j, 0] = ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
            }

            buffers[i] = buffer;
        }

        return buffers;
    }
}
|||
} |
|||
@ -0,0 +1,33 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg |
|||
{ |
|||
/// <summary>
/// Compares the scalar and AVX2 grayscale color converters on single-component input.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class GrayscaleColorConversion : ColorConversionBenchmark
{
    /// <summary>
    /// Initializes a new instance of the <see cref="GrayscaleColorConversion"/> class.
    /// </summary>
    public GrayscaleColorConversion()
        : base(1)
    {
    }

    /// <summary>
    /// Baseline: the scalar converter.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void Scalar()
    {
        var converter = new JpegColorConverter.FromGrayscaleBasic(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }

    /// <summary>
    /// The AVX2 intrinsics based converter.
    /// </summary>
    [Benchmark]
    public void SimdVectorAvx2()
    {
        var converter = new JpegColorConverter.FromGrayscaleAvx2(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }
}
|||
} |
|||
@ -0,0 +1,41 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg |
|||
{ |
|||
/// <summary>
/// Compares the scalar, Vector8 and AVX2 RGB color converters on 3-component input.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class RgbColorConversion : ColorConversionBenchmark
{
    /// <summary>
    /// Initializes a new instance of the <see cref="RgbColorConversion"/> class.
    /// </summary>
    public RgbColorConversion()
        : base(3)
    {
    }

    /// <summary>
    /// Baseline: the scalar converter.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void Scalar()
    {
        var converter = new JpegColorConverter.FromRgbBasic(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }

    /// <summary>
    /// The <c>Vector&lt;float&gt;</c> based converter.
    /// </summary>
    [Benchmark]
    public void SimdVector8()
    {
        var converter = new JpegColorConverter.FromRgbVector8(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }

    /// <summary>
    /// The AVX2 intrinsics based converter.
    /// </summary>
    [Benchmark]
    public void SimdVectorAvx2()
    {
        var converter = new JpegColorConverter.FromRgbAvx2(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }
}
|||
} |
|||
@ -0,0 +1,41 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg |
|||
{ |
|||
/// <summary>
/// Compares the scalar, Vector8 and AVX2 YccK color converters on 4-component input.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class YccKColorConverter : ColorConversionBenchmark
{
    /// <summary>
    /// Initializes a new instance of the <see cref="YccKColorConverter"/> class.
    /// </summary>
    public YccKColorConverter()
        : base(4)
    {
    }

    /// <summary>
    /// Baseline: the scalar converter.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void Scalar()
    {
        var converter = new JpegColorConverter.FromYccKBasic(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }

    /// <summary>
    /// The <c>Vector&lt;float&gt;</c> based converter.
    /// </summary>
    [Benchmark]
    public void SimdVector8()
    {
        var converter = new JpegColorConverter.FromYccKVector8(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }

    /// <summary>
    /// The AVX2 intrinsics based converter.
    /// </summary>
    [Benchmark]
    public void SimdVectorAvx2()
    {
        var converter = new JpegColorConverter.FromYccKAvx2(8);
        var values = new JpegColorConverter.ComponentValues(this.input, 0);

        converter.ConvertToRgba(values, this.output);
    }
}
|||
} |
|||
@ -0,0 +1,55 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.PixelFormats; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Closes the generic <c>FromVector4&lt;TPixel&gt;</c> benchmark over <see cref="Rgb24"/>.
/// Declares no members of its own; everything that runs is inherited from the base class.
/// </summary>
[Config(typeof(Config.ShortClr))] |
|||
public class FromVector4_Rgb24 : FromVector4<Rgb24> |
|||
{ |
|||
} |
|||
} |
|||
|
|||
// 2020-11-02
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT
|
|||
// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT
|
|||
// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// IterationCount=3 LaunchCount=1 WarmupCount=3
|
|||
//
|
|||
// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:|
|
|||
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 64 | 343.2 ns | 305.91 ns | 16.77 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 64 | 320.8 ns | 19.93 ns | 1.09 ns | 0.94 | 0.05 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 64 | 234.3 ns | 17.98 ns | 0.99 ns | 1.00 | 0.00 | 0.0052 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 64 | 246.0 ns | 82.34 ns | 4.51 ns | 1.05 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 64 | 222.3 ns | 39.46 ns | 2.16 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 64 | 243.4 ns | 33.58 ns | 1.84 ns | 1.09 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 256 | 824.9 ns | 32.77 ns | 1.80 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 256 | 967.0 ns | 39.09 ns | 2.14 ns | 1.17 | 0.01 | 0.0172 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 256 | 756.9 ns | 94.43 ns | 5.18 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 256 | 1,003.3 ns | 3,192.09 ns | 174.97 ns | 1.32 | 0.22 | 0.0172 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 256 | 748.6 ns | 248.03 ns | 13.60 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 256 | 437.0 ns | 36.48 ns | 2.00 ns | 0.58 | 0.01 | 0.0172 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 2048 | 5,751.6 ns | 704.24 ns | 38.60 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 2048 | 4,391.6 ns | 718.17 ns | 39.37 ns | 0.76 | 0.00 | 0.0153 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 2048 | 6,202.0 ns | 1,815.18 ns | 99.50 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 2048 | 4,225.6 ns | 1,004.03 ns | 55.03 ns | 0.68 | 0.01 | 0.0153 | - | - | 72 B |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 2048 | 6,157.1 ns | 2,516.98 ns | 137.96 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 2048 | 1,822.7 ns | 1,764.43 ns | 96.71 ns | 0.30 | 0.02 | 0.0172 | - | - | 72 B |
|
|||
@ -0,0 +1,87 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Benchmarks <c>SimdUtils.Pad3Shuffle4</c> with a general-purpose control
/// (<c>DefaultPad3Shuffle4</c>) and with the dedicated <c>XYZWPad3Shuffle4</c> control.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Pad3Shuffle4Channel
{
    private static readonly DefaultPad3Shuffle4 Control = new DefaultPad3Shuffle4(1, 0, 3, 2);
    private static readonly XYZWPad3Shuffle4 ControlFast = default;
    private byte[] source;
    private byte[] destination;

    /// <summary>
    /// Gets or sets the length, in bytes, of the source buffer.
    /// </summary>
    [Params(96, 384, 768, 1536)]
    public int Count { get; set; }

    /// <summary>
    /// Fills the source with pseudo-random bytes (seeded by <see cref="Count"/>, so runs are
    /// repeatable) and allocates a destination four thirds the size of the source.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        int sourceLength = this.Count;
        byte[] input = new byte[sourceLength];
        var rng = new Random(sourceLength);
        rng.NextBytes(input);

        this.source = input;
        this.destination = new byte[sourceLength * 4 / 3];
    }

    /// <summary>
    /// Runs the shuffle with the <c>DefaultPad3Shuffle4</c> control.
    /// </summary>
    [Benchmark]
    public void Pad3Shuffle4()
        => SimdUtils.Pad3Shuffle4(this.source, this.destination, Control);

    /// <summary>
    /// Runs the shuffle with the <c>XYZWPad3Shuffle4</c> control.
    /// </summary>
    [Benchmark]
    public void Pad3Shuffle4FastFallback()
        => SimdUtils.Pad3Shuffle4(this.source, this.destination, ControlFast);
}
|||
|
|||
// 2020-10-30
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |------------------------- |------------------- |-------------------------------------------------- |------ |------------:|----------:|----------:|------------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 120.64 ns | 7.190 ns | 21.200 ns | 114.26 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 96 | 23.63 ns | 0.175 ns | 0.155 ns | 23.65 ns | 0.15 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 25.25 ns | 0.356 ns | 0.298 ns | 25.27 ns | 0.17 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 14.80 ns | 0.358 ns | 1.032 ns | 14.64 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 96 | 24.84 ns | 0.376 ns | 0.333 ns | 24.74 ns | 1.57 | 0.06 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 96 | 24.58 ns | 0.471 ns | 0.704 ns | 24.38 ns | 1.60 | 0.09 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 258.92 ns | 4.873 ns | 4.069 ns | 257.95 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 384 | 41.41 ns | 0.859 ns | 1.204 ns | 41.33 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 40.74 ns | 0.848 ns | 0.793 ns | 40.48 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 74.50 ns | 0.490 ns | 0.383 ns | 74.49 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 384 | 40.74 ns | 0.624 ns | 0.584 ns | 40.72 ns | 0.55 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 384 | 38.28 ns | 0.534 ns | 0.417 ns | 38.22 ns | 0.51 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 503.91 ns | 6.466 ns | 6.048 ns | 501.58 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 768 | 62.86 ns | 0.332 ns | 0.277 ns | 62.80 ns | 0.12 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 64.59 ns | 0.469 ns | 0.415 ns | 64.62 ns | 0.13 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 110.51 ns | 0.592 ns | 0.554 ns | 110.33 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 768 | 64.72 ns | 1.306 ns | 1.090 ns | 64.51 ns | 0.59 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 768 | 62.11 ns | 0.816 ns | 0.682 ns | 61.98 ns | 0.56 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 1,005.84 ns | 13.176 ns | 12.325 ns | 1,004.70 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 110.05 ns | 0.256 ns | 0.214 ns | 110.04 ns | 0.11 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 110.23 ns | 0.545 ns | 0.483 ns | 110.09 ns | 0.11 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 220.37 ns | 1.601 ns | 1.419 ns | 220.13 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 2. AVX | Empty | 1536 | 111.54 ns | 2.173 ns | 2.901 ns | 111.27 ns | 0.51 | 0.01 | - | - | - | - |
|
|||
// | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 110.23 ns | 0.456 ns | 0.427 ns | 110.25 ns | 0.50 | 0.00 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,68 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Compares a per-element premultiply loop (the baseline) with
/// <c>Vector4Utilities.Premultiply</c> over the same 2048 pseudo-random vectors.
/// </summary>
/// <remarks>
/// NOTE(review): the baseline rewrites the shared static array in place and every W is generated
/// between 0 and 1, so X, Y and Z shrink on each invocation. Confirm this does not skew timings.
/// </remarks>
[Config(typeof(Config.ShortCore31))]
public class PremultiplyVector4
{
    private static readonly Vector4[] Vectors = CreateVectors();

    /// <summary>
    /// Baseline: walks the array by reference and premultiplies one vector at a time.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void PremultiplyBaseline()
    {
        ref Vector4 first = ref MemoryMarshal.GetReference<Vector4>(Vectors);

        for (int index = 0; index < Vectors.Length; index++)
        {
            Premultiply(ref Unsafe.Add(ref first, index));
        }
    }

    /// <summary>
    /// Hands the whole array to <c>Vector4Utilities.Premultiply</c>.
    /// </summary>
    [Benchmark]
    public void Premultiply()
    {
        Vector4Utilities.Premultiply(Vectors);
    }

    /// <summary>
    /// Multiplies every component of <paramref name="source"/> by its W component,
    /// then restores W to its original value.
    /// </summary>
    /// <param name="source">The vector to update in place.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static void Premultiply(ref Vector4 source)
    {
        float originalW = source.W;
        source = source * originalW;

        // The multiplication above also scaled W; put the original value back.
        source.W = originalW;
    }

    /// <summary>
    /// Builds the benchmark data: 2048 vectors from a generator seeded with 42, range 0 to 1.
    /// </summary>
    /// <returns>The generated vectors.</returns>
    private static Vector4[] CreateVectors()
    {
        return GenerateRandomVectorArray(new Random(42), 2048, 0, 1);
    }

    /// <summary>
    /// Creates <paramref name="length"/> vectors whose components are drawn from
    /// <paramref name="rnd"/> in X, Y, Z, W order.
    /// </summary>
    /// <param name="rnd">The random number source.</param>
    /// <param name="length">The number of vectors to create.</param>
    /// <param name="minVal">The lower bound passed to the component generator.</param>
    /// <param name="maxVal">The upper bound passed to the component generator.</param>
    /// <returns>The generated vectors.</returns>
    private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
    {
        var result = new Vector4[length];

        for (int index = 0; index < length; index++)
        {
            // Draw order matters for reproducibility: X, then Y, then Z, then W.
            float x = GetRandomFloat(rnd, minVal, maxVal);
            float y = GetRandomFloat(rnd, minVal, maxVal);
            float z = GetRandomFloat(rnd, minVal, maxVal);
            float w = GetRandomFloat(rnd, minVal, maxVal);

            result[index] = new Vector4(x, y, z, w);
        }

        return result;
    }

    /// <summary>
    /// Scales the next double from <paramref name="rnd"/> into the range that starts at
    /// <paramref name="minVal"/> and spans <c>maxVal - minVal</c>.
    /// </summary>
    /// <param name="rnd">The random number source.</param>
    /// <param name="minVal">The offset added to the scaled value.</param>
    /// <param name="maxVal">The upper end used to compute the span.</param>
    /// <returns>The scaled value.</returns>
    private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
    {
        return ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
    }
}
|||
} |
|||
@ -0,0 +1,64 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Benchmarks <c>SimdUtils.Shuffle3</c> using a <c>DefaultShuffle3</c> control built from (1, 0, 2).
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Shuffle3Channel
{
    private static readonly DefaultShuffle3 Control = new DefaultShuffle3(1, 0, 2);
    private byte[] source;
    private byte[] destination;

    /// <summary>
    /// Gets or sets the length, in bytes, of the source and destination buffers.
    /// </summary>
    [Params(96, 384, 768, 1536)]
    public int Count { get; set; }

    /// <summary>
    /// Fills the source with pseudo-random bytes (seeded by <see cref="Count"/>, so runs are
    /// repeatable) and allocates a destination of the same length.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        int length = this.Count;
        byte[] input = new byte[length];
        var rng = new Random(length);
        rng.NextBytes(input);

        this.source = input;
        this.destination = new byte[length];
    }

    /// <summary>
    /// Runs the shuffle from the source buffer into the destination buffer.
    /// </summary>
    [Benchmark]
    public void Shuffle3()
        => SimdUtils.Shuffle3(this.source, this.destination, Control);
}
|||
|
|||
// 2020-11-02
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |--------------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 48.46 ns | 1.034 ns | 2.438 ns | 47.46 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 96 | 32.42 ns | 0.537 ns | 0.476 ns | 32.34 ns | 0.66 | 0.04 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 32.51 ns | 0.373 ns | 0.349 ns | 32.56 ns | 0.66 | 0.03 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 199.04 ns | 1.512 ns | 1.180 ns | 199.17 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 384 | 71.20 ns | 2.654 ns | 7.784 ns | 69.60 ns | 0.41 | 0.02 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 63.23 ns | 0.569 ns | 0.505 ns | 63.21 ns | 0.32 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 391.28 ns | 5.087 ns | 3.972 ns | 391.22 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 768 | 109.12 ns | 2.149 ns | 2.010 ns | 108.66 ns | 0.28 | 0.01 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 106.51 ns | 0.734 ns | 0.613 ns | 106.56 ns | 0.27 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 773.70 ns | 5.516 ns | 4.890 ns | 772.96 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 2. AVX | Empty | 1536 | 190.41 ns | 1.090 ns | 0.851 ns | 190.38 ns | 0.25 | 0.00 | - | - | - | - |
|
|||
// | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 190.94 ns | 0.985 ns | 0.769 ns | 190.85 ns | 0.25 | 0.00 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,95 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Benchmarks <c>SimdUtils.Shuffle4Slice3</c> with a general-purpose control
/// (<c>DefaultShuffle4Slice3</c>) and with the dedicated <c>XYZWShuffle4Slice3</c> control.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Shuffle4Slice3Channel
{
    private static readonly DefaultShuffle4Slice3 Control = new DefaultShuffle4Slice3(1, 0, 3, 2);
    private static readonly XYZWShuffle4Slice3 ControlFast = default;
    private byte[] source;
    private byte[] destination;

    /// <summary>
    /// Fills the source with pseudo-random bytes (seeded by <see cref="Count"/>, so runs are
    /// repeatable) and allocates a destination three quarters the size of the source.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        this.source = new byte[this.Count];
        new Random(this.Count).NextBytes(this.source);

        // Integer arithmetic keeps the size exact; a float round trip can round up before
        // truncation once the value no longer fits in the float mantissa.
        this.destination = new byte[this.Count * 3 / 4];
    }

    /// <summary>
    /// Gets or sets the length, in bytes, of the source buffer.
    /// </summary>
    [Params(128, 256, 512, 1024, 2048)]
    public int Count { get; set; }

    /// <summary>
    /// Runs the shuffle with the <c>DefaultShuffle4Slice3</c> control.
    /// </summary>
    [Benchmark]
    public void Shuffle4Slice3()
    {
        SimdUtils.Shuffle4Slice3(this.source, this.destination, Control);
    }

    /// <summary>
    /// Runs the shuffle with the <c>XYZWShuffle4Slice3</c> control.
    /// </summary>
    [Benchmark]
    public void Shuffle4Slice3FastFallback()
    {
        SimdUtils.Shuffle4Slice3(this.source, this.destination, ControlFast);
    }
}
|||
|
|||
// 2020-10-29
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |--------------------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 56.44 ns | 2.843 ns | 8.382 ns | 56.70 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 128 | 27.15 ns | 0.556 ns | 0.762 ns | 27.34 ns | 0.41 | 0.03 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 128 | 26.36 ns | 0.321 ns | 0.268 ns | 26.26 ns | 0.38 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 25.85 ns | 0.494 ns | 0.462 ns | 25.84 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 128 | 26.15 ns | 0.113 ns | 0.106 ns | 26.16 ns | 1.01 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 128 | 25.57 ns | 0.078 ns | 0.061 ns | 25.56 ns | 0.99 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 97.47 ns | 0.327 ns | 0.289 ns | 97.35 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 256 | 32.61 ns | 0.107 ns | 0.095 ns | 32.62 ns | 0.33 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 256 | 33.21 ns | 0.169 ns | 0.150 ns | 33.15 ns | 0.34 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 52.34 ns | 0.779 ns | 0.729 ns | 51.94 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 256 | 32.16 ns | 0.111 ns | 0.104 ns | 32.16 ns | 0.61 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 256 | 33.61 ns | 0.342 ns | 0.319 ns | 33.62 ns | 0.64 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 210.74 ns | 3.825 ns | 5.956 ns | 207.70 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 512 | 51.03 ns | 0.535 ns | 0.501 ns | 51.18 ns | 0.24 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 512 | 66.60 ns | 1.313 ns | 1.613 ns | 65.93 ns | 0.31 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 119.12 ns | 1.905 ns | 1.689 ns | 118.52 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 512 | 50.33 ns | 0.382 ns | 0.339 ns | 50.41 ns | 0.42 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 512 | 49.25 ns | 0.555 ns | 0.492 ns | 49.26 ns | 0.41 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 423.55 ns | 4.891 ns | 4.336 ns | 423.27 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 1024 | 77.13 ns | 1.355 ns | 2.264 ns | 76.19 ns | 0.19 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 79.39 ns | 0.103 ns | 0.086 ns | 79.37 ns | 0.19 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 226.57 ns | 2.930 ns | 2.598 ns | 226.10 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 1024 | 80.25 ns | 1.647 ns | 2.082 ns | 80.98 ns | 0.35 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 84.99 ns | 1.234 ns | 1.155 ns | 85.60 ns | 0.38 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 794.96 ns | 1.735 ns | 1.538 ns | 795.15 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 2. AVX | Empty | 2048 | 128.41 ns | 0.417 ns | 0.390 ns | 128.24 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 127.24 ns | 0.294 ns | 0.229 ns | 127.23 ns | 0.16 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | | |
|
|||
// | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 382.97 ns | 1.064 ns | 0.831 ns | 382.87 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 2. AVX | Empty | 2048 | 126.93 ns | 0.382 ns | 0.339 ns | 126.94 ns | 0.33 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 149.36 ns | 1.875 ns | 1.754 ns | 149.33 ns | 0.39 | 0.00 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,67 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Benchmarks the byte overload of <c>SimdUtils.Shuffle4</c> using a default <c>WXYZShuffle4</c>.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class ShuffleByte4Channel
{
    private byte[] source;
    private byte[] destination;

    /// <summary>
    /// Gets or sets the length, in bytes, of the source and destination buffers.
    /// </summary>
    [Params(128, 256, 512, 1024, 2048)]
    public int Count { get; set; }

    /// <summary>
    /// Fills the source with pseudo-random bytes (seeded by <see cref="Count"/>, so runs are
    /// repeatable) and allocates a destination of the same length.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        int length = this.Count;
        byte[] input = new byte[length];
        var rng = new Random(length);
        rng.NextBytes(input);

        this.source = input;
        this.destination = new byte[length];
    }

    /// <summary>
    /// Runs the shuffle from the source buffer into the destination buffer.
    /// </summary>
    [Benchmark]
    public void Shuffle4Channel()
    {
        WXYZShuffle4 shuffle = default;
        SimdUtils.Shuffle4<WXYZShuffle4>(this.source, this.destination, shuffle);
    }
}
|||
|
|||
// 2020-10-29
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 17.39 ns | 0.187 ns | 0.175 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 128 | 21.72 ns | 0.299 ns | 0.279 ns | 1.25 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 18.10 ns | 0.346 ns | 0.289 ns | 1.04 | 0.02 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 35.51 ns | 0.711 ns | 0.790 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 256 | 23.90 ns | 0.508 ns | 0.820 ns | 0.69 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 20.40 ns | 0.133 ns | 0.111 ns | 0.57 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 73.39 ns | 0.310 ns | 0.259 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 512 | 26.10 ns | 0.418 ns | 0.391 ns | 0.36 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 27.59 ns | 0.556 ns | 0.571 ns | 0.38 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 150.64 ns | 2.903 ns | 2.716 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 1024 | 38.67 ns | 0.801 ns | 1.889 ns | 0.24 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 47.13 ns | 0.948 ns | 1.054 ns | 0.31 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 315.29 ns | 5.206 ns | 6.583 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 2048 | 57.37 ns | 1.152 ns | 1.078 ns | 0.18 | 0.01 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 65.75 ns | 1.198 ns | 1.600 ns | 0.21 | 0.01 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,68 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Tests; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Benchmarks the float overload of <c>SimdUtils.Shuffle4</c>, driven by the control byte
/// exposed by a default <c>WXYZShuffle4</c>.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class ShuffleFloat4Channel
{
    private static readonly byte Control = default(WXYZShuffle4).Control;
    private float[] source;
    private float[] destination;

    /// <summary>
    /// Gets or sets the number of floats in the source and destination buffers.
    /// </summary>
    [Params(128, 256, 512, 1024, 2048)]
    public int Count { get; set; }

    /// <summary>
    /// Generates the source from a generator seeded by <see cref="Count"/> (arguments 0 and 256
    /// are forwarded to <c>GenerateRandomFloatArray</c>) and allocates a same-length destination.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        int length = this.Count;
        var rng = new Random(length);

        this.source = rng.GenerateRandomFloatArray(length, 0, 256);
        this.destination = new float[length];
    }

    /// <summary>
    /// Runs the shuffle from the source buffer into the destination buffer.
    /// </summary>
    [Benchmark]
    public void Shuffle4Channel()
        => SimdUtils.Shuffle4(this.source, this.destination, Control);
}
|||
|
|||
// 2020-10-29
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------- |------------------- |-------------------------------------------------- |------ |-----------:|----------:|----------:|------:|------:|------:|------:|----------:|
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.647 ns | 0.5475 ns | 0.4853 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 128 | 9.818 ns | 0.1457 ns | 0.1292 ns | 0.15 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 15.267 ns | 0.1005 ns | 0.0940 ns | 0.24 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 125.586 ns | 1.9312 ns | 1.8064 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 256 | 15.878 ns | 0.1983 ns | 0.1758 ns | 0.13 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 29.170 ns | 0.2925 ns | 0.2442 ns | 0.23 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 263.859 ns | 2.6660 ns | 2.3634 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 512 | 29.452 ns | 0.3334 ns | 0.3118 ns | 0.11 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 52.912 ns | 0.1932 ns | 0.1713 ns | 0.20 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 495.717 ns | 1.9850 ns | 1.8567 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 1024 | 53.757 ns | 0.3212 ns | 0.2847 ns | 0.11 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 107.815 ns | 1.6201 ns | 1.3528 ns | 0.22 | - | - | - | - |
|
|||
// | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 980.134 ns | 3.7407 ns | 3.1237 ns | 1.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | 2. AVX | Empty | 2048 | 105.120 ns | 0.6140 ns | 0.5443 ns | 0.11 | - | - | - | - |
|
|||
// | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 216.473 ns | 2.3268 ns | 2.0627 ns | 0.22 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,65 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
using SixLabors.ImageSharp.Memory; |
|||
using SixLabors.ImageSharp.PixelFormats; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Benchmark that converts <see cref="Rgb24"/> pixels to vectors using a
/// directly constructed <see cref="PixelOperations{TPixel}"/> instance.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class ToVector4_Rgb24 : ToVector4<Rgb24>
{
    /// <summary>
    /// Baseline measurement: creates a new <see cref="PixelOperations{TPixel}"/> and
    /// calls its <c>ToVector4</c> with the benchmark's configuration, source and destination spans.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void PixelOperations_Base()
    {
        var operations = new PixelOperations<Rgb24>();

        operations.ToVector4(this.Configuration, this.source.GetSpan(), this.destination.GetSpan());
    }
}
|||
} |
|||
|
|||
// 2020-11-02
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet = v0.12.1, OS = Windows 10.0.19041.572(2004 /?/ 20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK=3.1.403
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT
|
|||
// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT
|
|||
// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// IterationCount=3 LaunchCount=1 WarmupCount=3
|
|||
//
|
|||
// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:|
|
|||
// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 64 | 298.4 ns | 33.63 ns | 1.84 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 64 | 355.5 ns | 908.51 ns | 49.80 ns | 1.19 | 0.17 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 64 | 220.1 ns | 13.77 ns | 0.75 ns | 1.00 | 0.00 | 0.0055 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 64 | 228.5 ns | 41.41 ns | 2.27 ns | 1.04 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 64 | 213.6 ns | 12.47 ns | 0.68 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 64 | 217.0 ns | 9.95 ns | 0.55 ns | 1.02 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 256 | 829.0 ns | 242.93 ns | 13.32 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 256 | 448.9 ns | 4.04 ns | 0.22 ns | 0.54 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 256 | 863.0 ns | 1,253.26 ns | 68.70 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 256 | 309.2 ns | 66.16 ns | 3.63 ns | 0.36 | 0.03 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 256 | 737.0 ns | 253.90 ns | 13.92 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 256 | 212.3 ns | 1.07 ns | 0.06 ns | 0.29 | 0.01 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 2048 | 5,625.6 ns | 404.35 ns | 22.16 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 2048 | 1,974.1 ns | 229.84 ns | 12.60 ns | 0.35 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 2048 | 5,467.2 ns | 537.29 ns | 29.45 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 2048 | 1,985.5 ns | 4,714.23 ns | 258.40 ns | 0.36 | 0.05 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 2048 | 5,888.2 ns | 1,622.23 ns | 88.92 ns | 1.00 | 0.00 | - | - | - | 24 B |
|
|||
// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 2048 | 1,165.0 ns | 191.71 ns | 10.51 ns | 0.20 | 0.00 | - | - | - | - |
|
|||
@ -0,0 +1,68 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Compares a hand-written, per-element un-premultiply loop (baseline) against
/// <c>Vector4Utilities.UnPremultiply</c> on the same array of 2048 pseudo-random vectors.
/// </summary>
[Config(typeof(Config.ShortCore31))]
public class UnPremultiplyVector4
{
    // Shared input; the baseline loop rewrites its elements in place through refs.
    private static readonly Vector4[] Vectors = CreateVectors();

    /// <summary>
    /// Baseline: walks the array by reference and un-premultiplies one vector at a time.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void UnPremultiplyBaseline()
    {
        ref Vector4 start = ref MemoryMarshal.GetReference<Vector4>(Vectors);

        for (int index = 0; index < Vectors.Length; index++)
        {
            ref Vector4 current = ref Unsafe.Add(ref start, index);
            UnPremultiply(ref current);
        }
    }

    /// <summary>
    /// Candidate: hands the whole array to <c>Vector4Utilities.UnPremultiply</c>.
    /// </summary>
    [Benchmark]
    public void UnPremultiply()
    {
        Vector4Utilities.UnPremultiply(Vectors);
    }

    /// <summary>
    /// Divides every component of <paramref name="source"/> by its W component and
    /// then restores W to its initial value.
    /// </summary>
    /// <param name="source">The vector to modify in place.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static void UnPremultiply(ref Vector4 source)
    {
        float alpha = source.W;
        source /= alpha;

        // The division above also touched W; put the untouched alpha back.
        source.W = alpha;
    }

    /// <summary>
    /// Creates the benchmark input: 2048 vectors from a <see cref="Random"/> seeded with 42,
    /// generated with bounds 0 and 1.
    /// </summary>
    private static Vector4[] CreateVectors()
        => GenerateRandomVectorArray(new Random(42), 2048, 0, 1);

    /// <summary>
    /// Fills a new array with vectors whose X, Y, Z and W components are drawn, in that order,
    /// from <paramref name="rnd"/> using <see cref="GetRandomFloat"/>.
    /// </summary>
    /// <param name="rnd">The random number source.</param>
    /// <param name="length">The number of vectors to create.</param>
    /// <param name="minVal">The lower bound passed to <see cref="GetRandomFloat"/>.</param>
    /// <param name="maxVal">The upper bound passed to <see cref="GetRandomFloat"/>.</param>
    /// <returns>The generated array.</returns>
    private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
    {
        var vectors = new Vector4[length];

        for (int index = 0; index < length; index++)
        {
            // Constructor arguments are evaluated left to right, so the draw order is X, Y, Z, W.
            vectors[index] = new Vector4(
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal),
                GetRandomFloat(rnd, minVal, maxVal));
        }

        return vectors;
    }

    /// <summary>
    /// Scales the next sample of <paramref name="rnd"/> by the width of the requested range
    /// and offsets it by <paramref name="minVal"/>.
    /// </summary>
    private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
    {
        return ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
    }
}
|||
} |
|||
@ -0,0 +1,84 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Runtime.Intrinsics.X86; |
|||
#endif
|
|||
using BenchmarkDotNet.Environments; |
|||
using BenchmarkDotNet.Jobs; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks |
|||
{ |
|||
public partial class Config
{
    // Values assigned to the runtime switches declared below.
    private const string On = "1";
    private const string Off = "0";

    // Names of the runtime environment switches that control hardware intrinsics.
    // See https://github.com/SixLabors/ImageSharp/pull/1229#discussion_r440477861
    // The switches discussed there are:
    // * EnableHWIntrinsic
    // * EnableSSE
    // * EnableSSE2
    // * EnableAES
    // * EnablePCLMULQDQ
    // * EnableSSE3
    // * EnableSSSE3
    // * EnableSSE41
    // * EnableSSE42
    // * EnablePOPCNT
    // * EnableAVX
    // * EnableFMA
    // * EnableAVX2
    // * EnableBMI1
    // * EnableBMI2
    // * EnableLZCNT
    //
    // `FeatureSIMD` ends up impacting all SIMD support (including `System.Numerics`) but not things
    // like `LZCNT`, `BMI1`, or `BMI2`.
    // `EnableSSE3_4` is a legacy switch that exists for compat and is basically the same as `EnableSSE3`.
    private const string EnableAES = "COMPlus_EnableAES";
    private const string EnableAVX = "COMPlus_EnableAVX";
    private const string EnableAVX2 = "COMPlus_EnableAVX2";
    private const string EnableBMI1 = "COMPlus_EnableBMI1";
    private const string EnableBMI2 = "COMPlus_EnableBMI2";
    private const string EnableFMA = "COMPlus_EnableFMA";
    private const string EnableHWIntrinsic = "COMPlus_EnableHWIntrinsic";
    private const string EnableLZCNT = "COMPlus_EnableLZCNT";
    private const string EnablePCLMULQDQ = "COMPlus_EnablePCLMULQDQ";
    private const string EnablePOPCNT = "COMPlus_EnablePOPCNT";
    private const string EnableSSE = "COMPlus_EnableSSE";
    private const string EnableSSE2 = "COMPlus_EnableSSE2";
    private const string EnableSSE3 = "COMPlus_EnableSSE3";
    private const string EnableSSE3_4 = "COMPlus_EnableSSE3_4";
    private const string EnableSSE41 = "COMPlus_EnableSSE41";
    private const string EnableSSE42 = "COMPlus_EnableSSE42";
    private const string EnableSSSE3 = "COMPlus_EnableSSSE3";
    private const string FeatureSIMD = "COMPlus_FeatureSIMD";

    /// <summary>
    /// Configuration that registers up to three .NET Core 3.1 jobs:
    /// a baseline with hardware intrinsics and SIMD switched off, a job with no
    /// overrides (added when AVX is supported) and a job with AVX switched off
    /// (added when SSE is supported).
    /// </summary>
    public class HwIntrinsics_SSE_AVX : Config
    {
        public HwIntrinsics_SSE_AVX()
        {
            // Baseline: both the intrinsics switch and the SIMD feature switch are set to "0".
            Job noIntrinsics = Job.Default
                .WithRuntime(CoreRuntime.Core31)
                .WithEnvironmentVariables(
                    new EnvironmentVariable(EnableHWIntrinsic, Off),
                    new EnvironmentVariable(FeatureSIMD, Off))
                .WithId("1. No HwIntrinsics")
                .AsBaseline();

            this.AddJob(noIntrinsics);

#if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx.IsSupported)
            {
                // No environment overrides for this job.
                Job avx = Job.Default
                    .WithRuntime(CoreRuntime.Core31)
                    .WithId("2. AVX");

                this.AddJob(avx);
            }

            if (Sse.IsSupported)
            {
                // Only the AVX switch is turned off for this job.
                Job sse = Job.Default
                    .WithRuntime(CoreRuntime.Core31)
                    .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
                    .WithId("3. SSE");

                this.AddJob(sse);
            }
#endif
        }
    }
}
|||
} |
|||
@ -0,0 +1,53 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System.IO; |
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.PixelFormats; |
|||
using SixLabors.ImageSharp.Processing; |
|||
using SixLabors.ImageSharp.Processing.Processors.Normalization; |
|||
using SixLabors.ImageSharp.Tests; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Processing |
|||
{ |
|||
/// <summary>
/// Benchmarks global and adaptive (tile interpolation) histogram equalization
/// on a single test image that is loaded once during global setup.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class HistogramEqualization : BenchmarkBase
{
    private Image<Rgba32> image;

    /// <summary>
    /// Loads the benchmark image unless it is already loaded.
    /// </summary>
    [GlobalSetup]
    public void ReadImages()
    {
        if (this.image == null)
        {
            string path = Path.Combine(
                TestEnvironment.InputImagesDirectoryFullPath,
                TestImages.Jpeg.Baseline.HistogramEqImage);

            // Release the file handle as soon as the image has been loaded
            // instead of leaving the stream open for the lifetime of the process.
            using (FileStream stream = File.OpenRead(path))
            {
                this.image = Image.Load<Rgba32>(stream);
            }
        }
    }

    /// <summary>
    /// Disposes the benchmark image, if one was loaded, and clears the field so that
    /// a later <see cref="ReadImages"/> call loads a fresh instance.
    /// </summary>
    [GlobalCleanup]
    public void Cleanup()
    {
        this.image?.Dispose();
        this.image = null;
    }

    /// <summary>
    /// Applies histogram equalization with the <see cref="HistogramEqualizationMethod.Global"/>
    /// method and 256 luminance levels to the loaded image (mutating it in place).
    /// </summary>
    [Benchmark(Description = "Global Histogram Equalization")]
    public void GlobalHistogramEqualization()
    {
        this.image.Mutate(img => img.HistogramEqualization(new HistogramEqualizationOptions()
        {
            LuminanceLevels = 256,
            Method = HistogramEqualizationMethod.Global
        }));
    }

    /// <summary>
    /// Applies histogram equalization with the
    /// <see cref="HistogramEqualizationMethod.AdaptiveTileInterpolation"/> method and
    /// 256 luminance levels to the loaded image (mutating it in place).
    /// </summary>
    [Benchmark(Description = "AdaptiveHistogramEqualization (Tile interpolation)")]
    public void AdaptiveHistogramEqualization()
    {
        this.image.Mutate(img => img.HistogramEqualization(new HistogramEqualizationOptions()
        {
            LuminanceLevels = 256,
            Method = HistogramEqualizationMethod.AdaptiveTileInterpolation
        }));
    }
}
|||
} |
|||
@ -0,0 +1,399 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using SixLabors.ImageSharp.Tests.TestUtilities; |
|||
using Xunit; |
|||
|
|||
namespace SixLabors.ImageSharp.Tests.Common |
|||
{ |
|||
public partial class SimdUtilsTests |
|||
{ |
|||
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void BulkShuffleFloat4Channel(int count)
{
    static void RunTest(string serialized)
    {
        // One shuffle control is enough here: the pipeline is
        // always the same regardless of the control used.
        int length = FeatureTestRunner.Deserialize<int>(serialized);
        byte wzyx = default(WZYXShuffle4).Control;

        TestShuffleFloat4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, wzyx),
            wzyx);
    }

    HwIntrinsics intrinsics = HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE;

    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTest, count, intrinsics);
}
|||
|
|||
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void BulkShuffleByte4Channel(int count)
{
    static void RunTest(string serialized)
    {
        int length = FeatureTestRunner.Deserialize<int>(serialized);

        // The cases below are spelled out one by one rather than written as a
        // theory: RemoteExecutor cannot be used within generic methods, and an
        // IShuffle4 cannot be passed to the generic utils method.
        var wxyz = default(WXYZShuffle4);
        TestShuffleByte4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, wxyz),
            wxyz.Control);

        var wzyx = default(WZYXShuffle4);
        TestShuffleByte4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, wzyx),
            wzyx.Control);

        var yzwx = default(YZWXShuffle4);
        TestShuffleByte4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, yzwx),
            yzwx.Control);

        var zyxw = default(ZYXWShuffle4);
        TestShuffleByte4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, zyxw),
            zyxw.Control);

        DefaultShuffle4 xwyz = new DefaultShuffle4(2, 1, 3, 0);
        TestShuffleByte4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, xwyz),
            xwyz.Control);

        DefaultShuffle4 yyyy = new DefaultShuffle4(1, 1, 1, 1);
        TestShuffleByte4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, yyyy),
            yyyy.Control);

        DefaultShuffle4 wwww = new DefaultShuffle4(3, 3, 3, 3);
        TestShuffleByte4Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4(src.Span, dst.Span, wwww),
            wwww.Control);
    }

    HwIntrinsics intrinsics = HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE;

    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTest, count, intrinsics);
}
|||
|
|||
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy3))]
public void BulkShuffleByte3Channel(int count)
{
    static void RunTest(string serialized)
    {
        int length = FeatureTestRunner.Deserialize<int>(serialized);

        // The cases below are spelled out one by one rather than written as a
        // theory: RemoteExecutor cannot be used within generic methods, and an
        // IShuffle3 cannot be passed to the generic utils method.
        DefaultShuffle3 zyx = new DefaultShuffle3(0, 1, 2);
        TestShuffleByte3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle3(src.Span, dst.Span, zyx),
            zyx.Control);

        DefaultShuffle3 xyz = new DefaultShuffle3(2, 1, 0);
        TestShuffleByte3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle3(src.Span, dst.Span, xyz),
            xyz.Control);

        DefaultShuffle3 yyy = new DefaultShuffle3(1, 1, 1);
        TestShuffleByte3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle3(src.Span, dst.Span, yyy),
            yyy.Control);

        DefaultShuffle3 zzz = new DefaultShuffle3(2, 2, 2);
        TestShuffleByte3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle3(src.Span, dst.Span, zzz),
            zzz.Control);
    }

    HwIntrinsics intrinsics = HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE;

    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTest, count, intrinsics);
}
|||
|
|||
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy3))]
public void BulkPad3Shuffle4Channel(int count)
{
    static void RunTest(string serialized)
    {
        int length = FeatureTestRunner.Deserialize<int>(serialized);

        // The cases below are spelled out one by one rather than written as a
        // theory: RemoteExecutor cannot be used within generic methods, and an
        // IPad3Shuffle4 cannot be passed to the generic utils method.
        var xyzw = default(XYZWPad3Shuffle4);
        TestPad3Shuffle4Channel(
            length,
            (src, dst) => SimdUtils.Pad3Shuffle4(src.Span, dst.Span, xyzw),
            xyzw.Control);

        DefaultPad3Shuffle4 xwyz = new DefaultPad3Shuffle4(2, 1, 3, 0);
        TestPad3Shuffle4Channel(
            length,
            (src, dst) => SimdUtils.Pad3Shuffle4(src.Span, dst.Span, xwyz),
            xwyz.Control);

        DefaultPad3Shuffle4 yyyy = new DefaultPad3Shuffle4(1, 1, 1, 1);
        TestPad3Shuffle4Channel(
            length,
            (src, dst) => SimdUtils.Pad3Shuffle4(src.Span, dst.Span, yyyy),
            yyyy.Control);

        DefaultPad3Shuffle4 wwww = new DefaultPad3Shuffle4(3, 3, 3, 3);
        TestPad3Shuffle4Channel(
            length,
            (src, dst) => SimdUtils.Pad3Shuffle4(src.Span, dst.Span, wwww),
            wwww.Control);
    }

    HwIntrinsics intrinsics = HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE;

    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTest, count, intrinsics);
}
|||
|
|||
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void BulkShuffle4Slice3Channel(int count)
{
    static void RunTest(string serialized)
    {
        int length = FeatureTestRunner.Deserialize<int>(serialized);

        // The cases below are spelled out one by one rather than written as a
        // theory: RemoteExecutor cannot be used within generic methods, and an
        // IShuffle4Slice3 cannot be passed to the generic utils method.
        var xyzw = default(XYZWShuffle4Slice3);
        TestShuffle4Slice3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4Slice3(src.Span, dst.Span, xyzw),
            xyzw.Control);

        DefaultShuffle4Slice3 xwyz = new DefaultShuffle4Slice3(2, 1, 3, 0);
        TestShuffle4Slice3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4Slice3(src.Span, dst.Span, xwyz),
            xwyz.Control);

        DefaultShuffle4Slice3 yyyy = new DefaultShuffle4Slice3(1, 1, 1, 1);
        TestShuffle4Slice3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4Slice3(src.Span, dst.Span, yyyy),
            yyyy.Control);

        DefaultShuffle4Slice3 wwww = new DefaultShuffle4Slice3(3, 3, 3, 3);
        TestShuffle4Slice3Channel(
            length,
            (src, dst) => SimdUtils.Shuffle4Slice3(src.Span, dst.Span, wwww),
            wwww.Control);
    }

    HwIntrinsics intrinsics = HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE;

    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTest, count, intrinsics);
}
|||
|
|||
/// <summary>
/// Runs <paramref name="convert"/> on <paramref name="count"/> pseudo-random floats and
/// compares its output with a reference shuffle computed element by element.
/// </summary>
/// <param name="count">The number of floats to generate; also used as the random seed.</param>
/// <param name="convert">The shuffle under test; receives the source and the destination buffer.</param>
/// <param name="control">The shuffle control the reference result is derived from.</param>
private static void TestShuffleFloat4Channel(
    int count,
    Action<Memory<float>, Memory<float>> convert,
    byte control)
{
    float[] input = new Random(count).GenerateRandomFloatArray(count, 0, 256);
    float[] actual = new float[count];
    float[] expected = new float[count];

    // Split the control into the source index used for each of the four output slots.
    SimdUtils.Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

    // Reference result: within every group of four, output slot k reads input slot pk.
    for (int offset = 0; offset < expected.Length; offset += 4)
    {
        expected[offset] = input[offset + p0];
        expected[offset + 1] = input[offset + p1];
        expected[offset + 2] = input[offset + p2];
        expected[offset + 3] = input[offset + p3];
    }

    convert(input, actual);

    Assert.Equal(expected, actual, new ApproximateFloatComparer(1e-5F));
}
|||
|
|||
/// <summary>
/// Runs <paramref name="convert"/> on <paramref name="count"/> pseudo-random bytes and
/// compares its output with a reference shuffle computed element by element.
/// </summary>
/// <param name="count">The number of bytes to generate; also used as the random seed.</param>
/// <param name="convert">The shuffle under test; receives the source and the destination buffer.</param>
/// <param name="control">The shuffle control the reference result is derived from.</param>
private static void TestShuffleByte4Channel(
    int count,
    Action<Memory<byte>, Memory<byte>> convert,
    byte control)
{
    byte[] input = new byte[count];
    new Random(count).NextBytes(input);

    byte[] actual = new byte[count];
    byte[] expected = new byte[count];

    // Split the control into the source index used for each of the four output slots.
    SimdUtils.Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

    // Reference result: within every group of four, output slot k reads input slot pk.
    for (int offset = 0; offset < expected.Length; offset += 4)
    {
        expected[offset] = input[offset + p0];
        expected[offset + 1] = input[offset + p1];
        expected[offset + 2] = input[offset + p2];
        expected[offset + 3] = input[offset + p3];
    }

    convert(input, actual);

    Assert.Equal(expected, actual);
}
|||
|
|||
/// <summary>
/// Runs <paramref name="convert"/> on <paramref name="count"/> pseudo-random bytes and
/// compares its output with a reference three-channel shuffle computed element by element.
/// </summary>
/// <param name="count">The number of bytes to generate; also used as the random seed.</param>
/// <param name="convert">The shuffle under test; receives the source and the destination buffer.</param>
/// <param name="control">The shuffle control the reference result is derived from.</param>
private static void TestShuffleByte3Channel(
    int count,
    Action<Memory<byte>, Memory<byte>> convert,
    byte control)
{
    byte[] input = new byte[count];
    new Random(count).NextBytes(input);

    byte[] actual = new byte[count];
    byte[] expected = new byte[count];

    // Only the three lowest indices are needed for three-channel data; the fourth is discarded.
    SimdUtils.Shuffle.InverseMmShuffle(control, out int _, out int p2, out int p1, out int p0);

    // Reference result: within every group of three, output slot k reads input slot pk.
    for (int offset = 0; offset < expected.Length; offset += 3)
    {
        expected[offset] = input[offset + p0];
        expected[offset + 1] = input[offset + p1];
        expected[offset + 2] = input[offset + p2];
    }

    convert(input, actual);

    Assert.Equal(expected, actual);
}
|||
|
|||
/// <summary>
/// Runs <paramref name="convert"/> on <paramref name="count"/> pseudo-random bytes and
/// compares its output with a reference result in which every group of three source
/// bytes is padded with <see cref="byte.MaxValue"/> and then shuffled.
/// </summary>
/// <param name="count">The number of source bytes to generate; also used as the random seed.</param>
/// <param name="convert">The operation under test; receives the source and the destination buffer.</param>
/// <param name="control">The shuffle control the reference result is derived from.</param>
private static void TestPad3Shuffle4Channel(
    int count,
    Action<Memory<byte>, Memory<byte>> convert,
    byte control)
{
    byte[] source = new byte[count];
    new Random(count).NextBytes(source);

    // Every three source bytes produce four destination bytes.
    var result = new byte[count * 4 / 3];

    byte[] expected = new byte[result.Length];

    SimdUtils.Shuffle.InverseMmShuffle(
        control,
        out int p3,
        out int p2,
        out int p1,
        out int p0);

    // Reference result: pad each source triple to four bytes, then gather
    // the output slots from the padded group using the decoded indices.
    Span<byte> temp = stackalloc byte[4];
    for (int i = 0, j = 0; i < expected.Length; i += 4, j += 3)
    {
        temp[0] = source[j];
        temp[1] = source[j + 1];
        temp[2] = source[j + 2];
        temp[3] = byte.MaxValue;

        expected[i] = temp[p0];
        expected[i + 1] = temp[p1];
        expected[i + 2] = temp[p2];
        expected[i + 3] = temp[p3];
    }

    convert(source, result);

    Assert.Equal(expected, result);
}
|||
|
|||
/// <summary>
/// Runs <paramref name="convert"/> on <paramref name="count"/> pseudo-random bytes and
/// compares its output with a reference result in which every group of four source
/// bytes is shuffled and only the first three shuffled bytes are kept.
/// </summary>
/// <param name="count">The number of source bytes to generate; also used as the random seed.</param>
/// <param name="convert">The operation under test; receives the source and the destination buffer.</param>
/// <param name="control">The shuffle control the reference result is derived from.</param>
private static void TestShuffle4Slice3Channel(
    int count,
    Action<Memory<byte>, Memory<byte>> convert,
    byte control)
{
    byte[] source = new byte[count];
    new Random(count).NextBytes(source);

    // Every four source bytes produce three destination bytes.
    var result = new byte[count * 3 / 4];

    byte[] expected = new byte[result.Length];

    // The index of the fourth output slot is not needed because that slot is sliced away.
    SimdUtils.Shuffle.InverseMmShuffle(
        control,
        out int _,
        out int p2,
        out int p1,
        out int p0);

    for (int i = 0, j = 0; i < expected.Length; i += 3, j += 4)
    {
        expected[i] = source[p0 + j];
        expected[i + 1] = source[p1 + j];
        expected[i + 2] = source[p2 + j];
    }

    convert(source, result);

    Assert.Equal(expected, result);
}
|||
} |
|||
} |
|||
Some files were not shown because too many files changed in this diff
Loading…
Reference in new issue