
Optimize conversion with Avx

pull/3051/head
James Jackson-South, 1 week ago
commit b8d288874c
Changed files:
  1. src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs (122 changed lines)
  2. src/ImageSharp/ColorProfiles/Rgb.cs (184 changed lines)

src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs (122 changed lines)

@@ -5,6 +5,8 @@ using System.Buffers;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Processing;
@@ -60,8 +62,126 @@ internal static class ColorProfileConverterExtensionsPixelCompatible
converter.ConvertUsingIccProfile<Rgb, Rgb>(rgbSpan, rgbSpan);
// Copy the converted Rgb pixels back to the row as TPixel.
// Important: Preserve alpha from the existing row Vector4 values.
// We merge RGB from rgbSpan into row, leaving W untouched.
ref float srcRgb = ref Unsafe.As<Rgb, float>(ref MemoryMarshal.GetReference(rgbSpan));
ref float dstRow = ref Unsafe.As<Vector4, float>(ref MemoryMarshal.GetReference(row));
int count = rgbSpan.Length;
int i = 0;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector512<float> ReadVector512(ref float f)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
return Unsafe.ReadUnaligned<Vector512<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void WriteVector512(ref float f, Vector512<float> v)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
Unsafe.WriteUnaligned(ref b, v);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector256<float> ReadVector256(ref float f)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
return Unsafe.ReadUnaligned<Vector256<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void WriteVector256(ref float f, Vector256<float> v)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
Unsafe.WriteUnaligned(ref b, v);
}
if (Avx512F.IsSupported)
{
// 4 pixels per iteration.
//
// Source layout (Rgb float stream, 12 floats):
// [r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3]
//
// Destination layout (row Vector4 float stream, 16 floats):
// [r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3]
//
// We use an overlapped load (16 floats) from the 3-float stride source.
// The permute selects the RGB we need and inserts placeholders for alpha lanes.
//
// Then we blend RGB lanes into the existing destination, preserving alpha lanes.
Vector512<int> rgbPerm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
// BlendVariable selects from the second operand where the sign bit of the mask lane is set.
// We want to overwrite lanes 0,1,2 then 4,5,6 then 8,9,10 then 12,13,14, and preserve lanes 3,7,11,15 (alpha).
Vector512<float> rgbSelect = Vector512.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
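// Worked example for one iteration (lane values are illustrative):
// src    = [r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5]  (overlapped 16-float load)
// rgbx   = [r0 g0 b0 r0 | r1 g1 b1 r0 | r2 g2 b2 r0 | r3 g3 b3 r0]  (placeholder lanes 3, 7, 11, 15)
// merged = [r0 g0 b0 a0 | r1 g1 b1 a1 | r2 g2 b2 a2 | r3 g3 b3 a3]  (alpha lanes taken from dst)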
int quads = count >> 2;
int simdQuads = quads - 1; // Leave the last quad for the scalar tail to avoid the final overlapped load reading past the end.
for (int q = 0; q < simdQuads; q++)
{
Vector512<float> dst = ReadVector512(ref dstRow);
Vector512<float> src = ReadVector512(ref srcRgb);
Vector512<float> rgbx = Avx512F.PermuteVar16x32(src, rgbPerm);
Vector512<float> merged = Avx512F.BlendVariable(dst, rgbx, rgbSelect);
WriteVector512(ref dstRow, merged);
// Advance input by 4 pixels (4 * 3 = 12 floats)
srcRgb = ref Unsafe.Add(ref srcRgb, 12);
// Advance output by 4 pixels (4 * 4 = 16 floats)
dstRow = ref Unsafe.Add(ref dstRow, 16);
i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration.
//
// Same idea as AVX-512, but on 256-bit vectors.
// We permute packed RGB into rgbx layout and blend into the existing destination,
// preserving alpha lanes.
Vector256<int> rgbPerm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
Vector256<float> rgbSelect = Vector256.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
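// Worked example for one iteration (lane values are illustrative):
// src    = [r0 g0 b0 r1 | g1 b1 r2 g2]  (overlapped 8-float load)
// rgbx   = [r0 g0 b0 r0 | r1 g1 b1 r0]  (placeholder lanes 3 and 7)
// merged = [r0 g0 b0 a0 | r1 g1 b1 a1]  (alpha lanes taken from dst)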
int pairs = count >> 1;
int simdPairs = pairs - 1; // Leave the last pair for the scalar tail to avoid the final overlapped load reading past the end.
for (int p = 0; p < simdPairs; p++)
{
Vector256<float> dst = ReadVector256(ref dstRow);
Vector256<float> src = ReadVector256(ref srcRgb);
Vector256<float> rgbx = Avx2.PermuteVar8x32(src, rgbPerm);
Vector256<float> merged = Avx.BlendVariable(dst, rgbx, rgbSelect);
WriteVector256(ref dstRow, merged);
// Advance input by 2 pixels (2 * 3 = 6 floats)
srcRgb = ref Unsafe.Add(ref srcRgb, 6);
// Advance output by 2 pixels (2 * 4 = 8 floats)
dstRow = ref Unsafe.Add(ref dstRow, 8);
i += 2;
}
}
// Scalar tail.
// Handles:
// - the last skipped SIMD block (quad or pair)
// - any remainder
//
// Preserve alpha by writing Vector3 into the Vector4 storage.
ref Vector4 rowRef = ref MemoryMarshal.GetReference(row);
- for (int i = 0; i < rgbSpan.Length; i++)
for (; i < count; i++)
{
Vector3 rgb = rgbSpan[i].AsVector3Unsafe();
Unsafe.As<Vector4, Vector3>(ref Unsafe.Add(ref rowRef, (uint)i)) = rgb;

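For reference, here is a minimal scalar sketch of the merge the vector paths above implement. The helper is hypothetical and not part of this commit; it assumes rgbSpan holds the converted Rgb values and row holds the original Vector4 pixels, as in the diff.

static void MergeRgbPreservingAlpha(ReadOnlySpan<Rgb> rgbSpan, Span<Vector4> row)
{
    for (int i = 0; i < rgbSpan.Length; i++)
    {
        // Take R, G, B from the converted value; keep the row's existing alpha (W).
        Vector3 rgb = rgbSpan[i].AsVector3Unsafe();
        row[i] = new Vector4(rgb, row[i].W);
    }
}

The SIMD paths compute exactly this, four pixels (AVX-512) or two pixels (AVX2) per iteration, by permuting packed RGB into RGBA lane positions and blending the destination's alpha lanes back in.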
src/ImageSharp/ColorProfiles/Rgb.cs (184 changed lines)

@@ -4,6 +4,8 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces;
namespace SixLabors.ImageSharp.ColorProfiles;
@@ -105,10 +107,87 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz>
{
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
- // TODO: Optimize via SIMD
- for (int i = 0; i < source.Length; i++)
int length = source.Length;
if (length == 0)
{
- destination[i] = source[i].ToScaledVector4();
return;
}
ref Rgb srcRgb = ref MemoryMarshal.GetReference(source);
ref Vector4 dstV4 = ref MemoryMarshal.GetReference(destination);
// Float streams:
// src: r0 g0 b0 r1 g1 b1 ...
// dst: r0 g0 b0 a0 r1 g1 b1 a1 ...
ref float src = ref Unsafe.As<Rgb, float>(ref srcRgb);
ref float dst = ref Unsafe.As<Vector4, float>(ref dstV4);
int i = 0;
if (Avx512F.IsSupported)
{
// 4 pixels per iteration. Using overlapped 16-float loads.
Vector512<int> perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
Vector512<float> ones = Vector512.Create(1F);
// BlendVariable selects from 'ones' where the sign-bit of mask lane is set.
// Using -0f sets only the sign bit, producing an efficient "select lane" mask.
Vector512<float> alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F);
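// Worked example for one iteration (lane values are illustrative):
// v    = [r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5]  (overlapped 16-float load)
// rgbx = [r0 g0 b0 r0 | r1 g1 b1 r0 | r2 g2 b2 r0 | r3 g3 b3 r0]
// rgba = [r0 g0 b0 1  | r1 g1 b1 1  | r2 g2 b2 1  | r3 g3 b3 1 ]  (alpha lanes taken from 'ones')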
int quads = length >> 2;
// Leave the last quad (4 pixels) for the scalar tail.
int simdQuads = quads - 1;
for (int q = 0; q < simdQuads; q++)
{
Vector512<float> v = ReadVector512(ref src);
Vector512<float> rgbx = Avx512F.PermuteVar16x32(v, perm);
Vector512<float> rgba = Avx512F.BlendVariable(rgbx, ones, alphaSelect);
WriteVector512(ref dst, rgba);
src = ref Unsafe.Add(ref src, 12);
dst = ref Unsafe.Add(ref dst, 16);
i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration. Using overlapped 8-float loads.
Vector256<int> perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
Vector256<float> ones = Vector256.Create(1F);
// vblendps mask: bit i selects lane i from 'ones' when set.
// We want lanes 3 and 7 -> 0b10001000 = 0x88.
const byte alphaMask = 0x88;
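// Worked example: rgbx = [r0 g0 b0 r0 | r1 g1 b1 r0]; with control 0x88 (bits 3 and 7 set),
// Blend(rgbx, ones, 0x88) = [r0 g0 b0 1 | r1 g1 b1 1].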
int pairs = length >> 1;
// Leave the last pair (2 pixels) for the scalar tail.
int simdPairs = pairs - 1;
for (int p = 0; p < simdPairs; p++)
{
Vector256<float> v = ReadVector256(ref src);
Vector256<float> rgbx = Avx2.PermuteVar8x32(v, perm);
Vector256<float> rgba = Avx.Blend(rgbx, ones, alphaMask);
WriteVector256(ref dst, rgba);
src = ref Unsafe.Add(ref src, 6);
dst = ref Unsafe.Add(ref dst, 8);
i += 2;
}
}
// Tail (and non-AVX paths)
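// Worked example (assuming the AVX-512 path): length = 10 gives quads = 2 and simdQuads = 1,
// so the vector loop above converts pixels 0..3 and this loop converts pixels 4..9.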
for (; i < length; i++)
{
Unsafe.Add(ref dstV4, i) = Unsafe.Add(ref srcRgb, i).ToScaledVector4();
}
}
@@ -117,10 +196,75 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz>
{
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
- // TODO: Optimize via SIMD
- for (int i = 0; i < source.Length; i++)
int length = source.Length;
if (length == 0)
{
- destination[i] = FromScaledVector4(source[i]);
return;
}
ref Vector4 srcV4 = ref MemoryMarshal.GetReference(source);
ref Rgb dstRgb = ref MemoryMarshal.GetReference(destination);
// Float streams:
// src: r0 g0 b0 a0 r1 g1 b1 a1 ...
// dst: r0 g0 b0 r1 g1 b1 ...
ref float src = ref Unsafe.As<Vector4, float>(ref srcV4);
ref float dst = ref Unsafe.As<Rgb, float>(ref dstRgb);
int i = 0;
if (Avx512F.IsSupported)
{
// 4 pixels per iteration. Using overlapped 16-float stores:
Vector512<int> idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
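// Worked example for one iteration (lane values are illustrative):
// v      = [r0 g0 b0 a0 | r1 g1 b1 a1 | r2 g2 b2 a2 | r3 g3 b3 a3]
// packed = [r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | a0 a1 a2 a3]
// Only the first 12 floats are meaningful; the trailing 4 lanes are overwritten by the
// next overlapped store (or by the scalar tail).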
// Number of 4-pixel groups in the input.
int quads = length >> 2;
// Leave the last quad (4 pixels) for the scalar tail.
int simdQuads = quads - 1;
for (int q = 0; q < simdQuads; q++)
{
Vector512<float> v = ReadVector512(ref src);
Vector512<float> packed = Avx512F.PermuteVar16x32(v, idx);
WriteVector512(ref dst, packed);
src = ref Unsafe.Add(ref src, 16);
dst = ref Unsafe.Add(ref dst, 12);
i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration, using overlapped 8-float stores:
Vector256<int> idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0);
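// Worked example for one iteration (lane values are illustrative):
// v      = [r0 g0 b0 a0 | r1 g1 b1 a1]
// packed = [r0 g0 b0 r1 | g1 b1 r0 r0]
// Only the first 6 floats are meaningful; the trailing 2 lanes are overwritten by the
// next overlapped store (or by the scalar tail).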
int pairs = length >> 1;
// Leave the last pair (2 pixels) for the scalar tail.
int simdPairs = pairs - 1;
int pairIndex = 0;
for (; pairIndex < simdPairs; pairIndex++)
{
Vector256<float> v = ReadVector256(ref src);
Vector256<float> packed = Avx2.PermuteVar8x32(v, idx);
WriteVector256(ref dst, packed);
src = ref Unsafe.Add(ref src, 8);
dst = ref Unsafe.Add(ref dst, 6);
i += 2;
}
}
// Tail (and non-AVX paths)
for (; i < length; i++)
{
Vector4 v = Unsafe.Add(ref srcV4, i);
Unsafe.Add(ref dstRgb, i) = FromScaledVector4(v);
}
}
@@ -288,4 +432,32 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz>
M44 = 1F
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<float> ReadVector512(ref float src)
{
ref byte b = ref Unsafe.As<float, byte>(ref src);
return Unsafe.ReadUnaligned<Vector512<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<float> ReadVector256(ref float src)
{
ref byte b = ref Unsafe.As<float, byte>(ref src);
return Unsafe.ReadUnaligned<Vector256<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector512(ref float dst, Vector512<float> value)
{
ref byte b = ref Unsafe.As<float, byte>(ref dst);
Unsafe.WriteUnaligned(ref b, value);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector256(ref float dst, Vector256<float> value)
{
ref byte b = ref Unsafe.As<float, byte>(ref dst);
Unsafe.WriteUnaligned(ref b, value);
}
}
