|
|
@ -4,6 +4,8 @@ |
|
|
using System.Numerics; |
|
|
using System.Numerics; |
|
|
using System.Runtime.CompilerServices; |
|
|
using System.Runtime.CompilerServices; |
|
|
using System.Runtime.InteropServices; |
|
|
using System.Runtime.InteropServices; |
|
|
|
|
|
using System.Runtime.Intrinsics; |
|
|
|
|
|
using System.Runtime.Intrinsics.X86; |
|
|
using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces; |
|
|
using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces; |
|
|
|
|
|
|
|
|
namespace SixLabors.ImageSharp.ColorProfiles; |
|
|
namespace SixLabors.ImageSharp.ColorProfiles; |
|
|
@ -105,10 +107,87 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz> |
|
|
{ |
|
|
{ |
|
|
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); |
|
|
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); |
|
|
|
|
|
|
|
|
// TODO: Optimize via SIMD
|
|
|
int length = source.Length; |
|
|
for (int i = 0; i < source.Length; i++) |
|
|
if (length == 0) |
|
|
{ |
|
|
{ |
|
|
destination[i] = source[i].ToScaledVector4(); |
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
ref Rgb srcRgb = ref MemoryMarshal.GetReference(source); |
|
|
|
|
|
ref Vector4 dstV4 = ref MemoryMarshal.GetReference(destination); |
|
|
|
|
|
|
|
|
|
|
|
// Float streams:
|
|
|
|
|
|
// src: r0 g0 b0 r1 g1 b1 ...
|
|
|
|
|
|
// dst: r0 g0 b0 a0 r1 g1 b1 a1 ...
|
|
|
|
|
|
ref float src = ref Unsafe.As<Rgb, float>(ref srcRgb); |
|
|
|
|
|
ref float dst = ref Unsafe.As<Vector4, float>(ref dstV4); |
|
|
|
|
|
|
|
|
|
|
|
int i = 0; |
|
|
|
|
|
|
|
|
|
|
|
if (Avx512F.IsSupported) |
|
|
|
|
|
{ |
|
|
|
|
|
// 4 pixels per iteration. Using overlapped 16-float loads.
|
|
|
|
|
|
Vector512<int> perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0); |
|
|
|
|
|
Vector512<float> ones = Vector512.Create(1F); |
|
|
|
|
|
|
|
|
|
|
|
// BlendVariable selects from 'ones' where the sign-bit of mask lane is set.
|
|
|
|
|
|
// Using -0f sets only the sign bit, producing an efficient "select lane" mask.
|
|
|
|
|
|
Vector512<float> alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F); |
|
|
|
|
|
|
|
|
|
|
|
int quads = length >> 2; |
|
|
|
|
|
|
|
|
|
|
|
// Leave the last quad (4 pixels) for the scalar tail.
|
|
|
|
|
|
int simdQuads = quads - 1; |
|
|
|
|
|
|
|
|
|
|
|
for (int q = 0; q < simdQuads; q++) |
|
|
|
|
|
{ |
|
|
|
|
|
Vector512<float> v = ReadVector512(ref src); |
|
|
|
|
|
Vector512<float> rgbx = Avx512F.PermuteVar16x32(v, perm); |
|
|
|
|
|
Vector512<float> rgba = Avx512F.BlendVariable(rgbx, ones, alphaSelect); |
|
|
|
|
|
|
|
|
|
|
|
WriteVector512(ref dst, rgba); |
|
|
|
|
|
|
|
|
|
|
|
src = ref Unsafe.Add(ref src, 12); |
|
|
|
|
|
dst = ref Unsafe.Add(ref dst, 16); |
|
|
|
|
|
|
|
|
|
|
|
i += 4; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
else if (Avx2.IsSupported) |
|
|
|
|
|
{ |
|
|
|
|
|
// 2 pixels per iteration. Using overlapped 8-float loads.
|
|
|
|
|
|
Vector256<int> perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0); |
|
|
|
|
|
|
|
|
|
|
|
Vector256<float> ones = Vector256.Create(1F); |
|
|
|
|
|
|
|
|
|
|
|
// vblendps mask: bit i selects lane i from 'ones' when set.
|
|
|
|
|
|
// We want lanes 3 and 7 -> 0b10001000 = 0x88.
|
|
|
|
|
|
const byte alphaMask = 0x88; |
|
|
|
|
|
|
|
|
|
|
|
int pairs = length >> 1; |
|
|
|
|
|
|
|
|
|
|
|
// Leave the last pair (2 pixels) for the scalar tail.
|
|
|
|
|
|
int simdPairs = pairs - 1; |
|
|
|
|
|
|
|
|
|
|
|
for (int p = 0; p < simdPairs; p++) |
|
|
|
|
|
{ |
|
|
|
|
|
Vector256<float> v = ReadVector256(ref src); |
|
|
|
|
|
Vector256<float> rgbx = Avx2.PermuteVar8x32(v, perm); |
|
|
|
|
|
Vector256<float> rgba = Avx.Blend(rgbx, ones, alphaMask); |
|
|
|
|
|
|
|
|
|
|
|
WriteVector256(ref dst, rgba); |
|
|
|
|
|
|
|
|
|
|
|
src = ref Unsafe.Add(ref src, 6); |
|
|
|
|
|
dst = ref Unsafe.Add(ref dst, 8); |
|
|
|
|
|
|
|
|
|
|
|
i += 2; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Tail (and non-AVX paths)
|
|
|
|
|
|
for (; i < length; i++) |
|
|
|
|
|
{ |
|
|
|
|
|
Unsafe.Add(ref dstV4, i) = Unsafe.Add(ref srcRgb, i).ToScaledVector4(); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@ -117,10 +196,75 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz> |
|
|
{ |
|
|
{ |
|
|
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); |
|
|
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); |
|
|
|
|
|
|
|
|
// TODO: Optimize via SIMD
|
|
|
int length = source.Length; |
|
|
for (int i = 0; i < source.Length; i++) |
|
|
if (length == 0) |
|
|
{ |
|
|
{ |
|
|
destination[i] = FromScaledVector4(source[i]); |
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
ref Vector4 srcV4 = ref MemoryMarshal.GetReference(source); |
|
|
|
|
|
ref Rgb dstRgb = ref MemoryMarshal.GetReference(destination); |
|
|
|
|
|
|
|
|
|
|
|
// Float streams:
|
|
|
|
|
|
// src: r0 g0 b0 a0 r1 g1 b1 a1 ...
|
|
|
|
|
|
// dst: r0 g0 b0 r1 g1 b1 ...
|
|
|
|
|
|
ref float src = ref Unsafe.As<Vector4, float>(ref srcV4); |
|
|
|
|
|
ref float dst = ref Unsafe.As<Rgb, float>(ref dstRgb); |
|
|
|
|
|
|
|
|
|
|
|
int i = 0; |
|
|
|
|
|
|
|
|
|
|
|
if (Avx512F.IsSupported) |
|
|
|
|
|
{ |
|
|
|
|
|
// 4 pixels per iteration. Using overlapped 16-float stores:
|
|
|
|
|
|
Vector512<int> idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); |
|
|
|
|
|
|
|
|
|
|
|
// Number of 4-pixel groups in the input.
|
|
|
|
|
|
int quads = length >> 2; |
|
|
|
|
|
|
|
|
|
|
|
// Leave the last quad (4 pixels) for the scalar tail.
|
|
|
|
|
|
int simdQuads = quads - 1; |
|
|
|
|
|
|
|
|
|
|
|
for (int q = 0; q < simdQuads; q++) |
|
|
|
|
|
{ |
|
|
|
|
|
Vector512<float> v = ReadVector512(ref src); |
|
|
|
|
|
Vector512<float> packed = Avx512F.PermuteVar16x32(v, idx); |
|
|
|
|
|
|
|
|
|
|
|
WriteVector512(ref dst, packed); |
|
|
|
|
|
|
|
|
|
|
|
src = ref Unsafe.Add(ref src, 16); |
|
|
|
|
|
dst = ref Unsafe.Add(ref dst, 12); |
|
|
|
|
|
i += 4; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
else if (Avx2.IsSupported) |
|
|
|
|
|
{ |
|
|
|
|
|
// 2 pixels per iteration, using overlapped 8-float stores:
|
|
|
|
|
|
Vector256<int> idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0); |
|
|
|
|
|
|
|
|
|
|
|
int pairs = length >> 1; |
|
|
|
|
|
|
|
|
|
|
|
// Leave the last pair (2 pixels) for the scalar tail.
|
|
|
|
|
|
int simdPairs = pairs - 1; |
|
|
|
|
|
|
|
|
|
|
|
int pairIndex = 0; |
|
|
|
|
|
for (; pairIndex < simdPairs; pairIndex++) |
|
|
|
|
|
{ |
|
|
|
|
|
Vector256<float> v = ReadVector256(ref src); |
|
|
|
|
|
Vector256<float> packed = Avx2.PermuteVar8x32(v, idx); |
|
|
|
|
|
|
|
|
|
|
|
WriteVector256(ref dst, packed); |
|
|
|
|
|
|
|
|
|
|
|
src = ref Unsafe.Add(ref src, 8); |
|
|
|
|
|
dst = ref Unsafe.Add(ref dst, 6); |
|
|
|
|
|
i += 2; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Tail (and non-AVX paths)
|
|
|
|
|
|
for (; i < length; i++) |
|
|
|
|
|
{ |
|
|
|
|
|
Vector4 v = Unsafe.Add(ref srcV4, i); |
|
|
|
|
|
Unsafe.Add(ref dstRgb, i) = FromScaledVector4(v); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@ -288,4 +432,32 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz> |
|
|
M44 = 1F |
|
|
M44 = 1F |
|
|
}; |
|
|
}; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Loads a <see cref="Vector512{T}"/> of 16 consecutive <see cref="float"/> values
/// beginning at <paramref name="src"/>, with no alignment requirement on the address.
/// </summary>
/// <param name="src">Reference to the first float to load.</param>
/// <returns>The 16 floats starting at <paramref name="src"/>, in memory order.</returns>
/// <remarks>
/// No bounds checking is performed; the caller must ensure that 16 floats are readable
/// from <paramref name="src"/> onwards.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<float> ReadVector512(ref float src)
    => Vector512.LoadUnsafe(ref src);
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Loads a <see cref="Vector256{T}"/> of 8 consecutive <see cref="float"/> values
/// beginning at <paramref name="src"/>, with no alignment requirement on the address.
/// </summary>
/// <param name="src">Reference to the first float to load.</param>
/// <returns>The 8 floats starting at <paramref name="src"/>, in memory order.</returns>
/// <remarks>
/// No bounds checking is performed; the caller must ensure that 8 floats are readable
/// from <paramref name="src"/> onwards.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<float> ReadVector256(ref float src)
    => Vector256.LoadUnsafe(ref src);
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Stores all 16 <see cref="float"/> lanes of <paramref name="value"/> to memory
/// beginning at <paramref name="dst"/>, with no alignment requirement on the address.
/// </summary>
/// <param name="dst">Reference to the first float to overwrite.</param>
/// <param name="value">The vector whose lanes are written in order.</param>
/// <remarks>
/// No bounds checking is performed; the caller must ensure that 16 floats are writable
/// from <paramref name="dst"/> onwards.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector512(ref float dst, Vector512<float> value)
    => value.StoreUnsafe(ref dst);
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Stores all 8 <see cref="float"/> lanes of <paramref name="value"/> to memory
/// beginning at <paramref name="dst"/>, with no alignment requirement on the address.
/// </summary>
/// <param name="dst">Reference to the first float to overwrite.</param>
/// <param name="value">The vector whose lanes are written in order.</param>
/// <remarks>
/// No bounds checking is performed; the caller must ensure that 8 floats are writable
/// from <paramref name="dst"/> onwards.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector256(ref float dst, Vector256<float> value)
    => value.StoreUnsafe(ref dst);
|
|
} |
|
|
} |
|
|
|