
Optimize conversion with Avx

pull/3051/head
James Jackson-South, 1 week ago
commit b8d288874c
Changed files:
  1. src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs (122 changed lines)
  2. src/ImageSharp/ColorProfiles/Rgb.cs (184 changed lines)

src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs (122 changed lines)

@@ -5,6 +5,8 @@ using System.Buffers;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Processing;
@@ -60,8 +62,126 @@ internal static class ColorProfileConverterExtensionsPixelCompatible
converter.ConvertUsingIccProfile<Rgb, Rgb>(rgbSpan, rgbSpan);
// Copy the converted Rgb pixels back to the row as TPixel.
// Important: Preserve alpha from the existing row Vector4 values.
// We merge RGB from rgbSpan into row, leaving W untouched.
ref float srcRgb = ref Unsafe.As<Rgb, float>(ref MemoryMarshal.GetReference(rgbSpan));
ref float dstRow = ref Unsafe.As<Vector4, float>(ref MemoryMarshal.GetReference(row));
int count = rgbSpan.Length;
int i = 0;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector512<float> ReadVector512(ref float f)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
return Unsafe.ReadUnaligned<Vector512<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void WriteVector512(ref float f, Vector512<float> v)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
Unsafe.WriteUnaligned(ref b, v);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector256<float> ReadVector256(ref float f)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
return Unsafe.ReadUnaligned<Vector256<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void WriteVector256(ref float f, Vector256<float> v)
{
ref byte b = ref Unsafe.As<float, byte>(ref f);
Unsafe.WriteUnaligned(ref b, v);
}
if (Avx512F.IsSupported)
{
// 4 pixels per iteration.
//
// Source layout (Rgb float stream, 12 floats):
// [r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3]
//
// Destination layout (row Vector4 float stream, 16 floats):
// [r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3]
//
// We use an overlapped load (16 floats) from the 3-float stride source.
// The permute selects the RGB we need and inserts placeholders for alpha lanes.
//
// Then we blend RGB lanes into the existing destination, preserving alpha lanes.
Vector512<int> rgbPerm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
// BlendVariable selects from the second operand where the sign bit of the mask lane is set.
// We want to overwrite lanes 0,1,2 then 4,5,6 then 8,9,10 then 12,13,14, and preserve lanes 3,7,11,15 (alpha).
Vector512<float> rgbSelect = Vector512.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
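// Worked example for one iteration (lane values are illustrative):
// src    = [r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5]  (overlapped 16-float load)
// rgbx   = [r0 g0 b0 r0 | r1 g1 b1 r0 | r2 g2 b2 r0 | r3 g3 b3 r0]  (placeholder lanes 3, 7, 11, 15)
// merged = [r0 g0 b0 a0 | r1 g1 b1 a1 | r2 g2 b2 a2 | r3 g3 b3 a3]  (alpha lanes taken from dst)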
int quads = count >> 2;
int simdQuads = quads - 1; // Leave the last quad for the scalar tail to avoid the final overlapped load reading past the end.
for (int q = 0; q < simdQuads; q++)
{
Vector512<float> dst = ReadVector512(ref dstRow);
Vector512<float> src = ReadVector512(ref srcRgb);
Vector512<float> rgbx = Avx512F.PermuteVar16x32(src, rgbPerm);
Vector512<float> merged = Avx512F.BlendVariable(dst, rgbx, rgbSelect);
WriteVector512(ref dstRow, merged);
// Advance input by 4 pixels (4 * 3 = 12 floats)
srcRgb = ref Unsafe.Add(ref srcRgb, 12);
// Advance output by 4 pixels (4 * 4 = 16 floats)
dstRow = ref Unsafe.Add(ref dstRow, 16);
i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration.
//
// Same idea as AVX-512, but on 256-bit vectors.
// We permute packed RGB into rgbx layout and blend into the existing destination,
// preserving alpha lanes.
Vector256<int> rgbPerm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
Vector256<float> rgbSelect = Vector256.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
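// Worked example for one iteration (lane values are illustrative):
// src    = [r0 g0 b0 r1 | g1 b1 r2 g2]  (overlapped 8-float load)
// rgbx   = [r0 g0 b0 r0 | r1 g1 b1 r0]  (placeholder lanes 3 and 7)
// merged = [r0 g0 b0 a0 | r1 g1 b1 a1]  (alpha lanes taken from dst)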
int pairs = count >> 1;
int simdPairs = pairs - 1; // Leave the last pair for the scalar tail to avoid the final overlapped load reading past the end.
for (int p = 0; p < simdPairs; p++)
{
Vector256<float> dst = ReadVector256(ref dstRow);
Vector256<float> src = ReadVector256(ref srcRgb);
Vector256<float> rgbx = Avx2.PermuteVar8x32(src, rgbPerm);
Vector256<float> merged = Avx.BlendVariable(dst, rgbx, rgbSelect);
WriteVector256(ref dstRow, merged);
// Advance input by 2 pixels (2 * 3 = 6 floats)
srcRgb = ref Unsafe.Add(ref srcRgb, 6);
// Advance output by 2 pixels (2 * 4 = 8 floats)
dstRow = ref Unsafe.Add(ref dstRow, 8);
i += 2;
}
}
// Scalar tail.
// Handles:
// - the last skipped SIMD block (quad or pair)
// - any remainder
//
// Preserve alpha by writing Vector3 into the Vector4 storage.
ref Vector4 rowRef = ref MemoryMarshal.GetReference(row);
- for (int i = 0; i < rgbSpan.Length; i++)
for (; i < count; i++)
{
Vector3 rgb = rgbSpan[i].AsVector3Unsafe();
Unsafe.As<Vector4, Vector3>(ref Unsafe.Add(ref rowRef, (uint)i)) = rgb;

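For reference, here is a minimal scalar sketch of the merge the vector paths above implement. The helper is hypothetical and not part of this commit; it assumes rgbSpan holds the converted Rgb values and row holds the original Vector4 pixels, as in the diff.

static void MergeRgbPreservingAlpha(ReadOnlySpan<Rgb> rgbSpan, Span<Vector4> row)
{
    for (int i = 0; i < rgbSpan.Length; i++)
    {
        // Take R, G, B from the converted value; keep the row's existing alpha (W).
        Vector3 rgb = rgbSpan[i].AsVector3Unsafe();
        row[i] = new Vector4(rgb, row[i].W);
    }
}

The SIMD paths compute exactly this, four pixels (AVX-512) or two pixels (AVX2) per iteration, by permuting packed RGB into RGBA lane positions and blending the destination's alpha lanes back in.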
src/ImageSharp/ColorProfiles/Rgb.cs (184 changed lines)

@@ -4,6 +4,8 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces;
namespace SixLabors.ImageSharp.ColorProfiles;
@@ -105,10 +107,87 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz>
{
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
- // TODO: Optimize via SIMD
- for (int i = 0; i < source.Length; i++)
int length = source.Length;
if (length == 0)
{
- destination[i] = source[i].ToScaledVector4();
return;
}
ref Rgb srcRgb = ref MemoryMarshal.GetReference(source);
ref Vector4 dstV4 = ref MemoryMarshal.GetReference(destination);
// Float streams:
// src: r0 g0 b0 r1 g1 b1 ...
// dst: r0 g0 b0 a0 r1 g1 b1 a1 ...
ref float src = ref Unsafe.As<Rgb, float>(ref srcRgb);
ref float dst = ref Unsafe.As<Vector4, float>(ref dstV4);
int i = 0;
if (Avx512F.IsSupported)
{
// 4 pixels per iteration. Using overlapped 16-float loads.
Vector512<int> perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
Vector512<float> ones = Vector512.Create(1F);
// BlendVariable selects from 'ones' where the sign-bit of mask lane is set.
// Using -0f sets only the sign bit, producing an efficient "select lane" mask.
Vector512<float> alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F);
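// Worked example for one iteration (lane values are illustrative):
// v    = [r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5]  (overlapped 16-float load)
// rgbx = [r0 g0 b0 r0 | r1 g1 b1 r0 | r2 g2 b2 r0 | r3 g3 b3 r0]
// rgba = [r0 g0 b0 1  | r1 g1 b1 1  | r2 g2 b2 1  | r3 g3 b3 1 ]  (alpha lanes taken from 'ones')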
int quads = length >> 2;
// Leave the last quad (4 pixels) for the scalar tail.
int simdQuads = quads - 1;
for (int q = 0; q < simdQuads; q++)
{
Vector512<float> v = ReadVector512(ref src);
Vector512<float> rgbx = Avx512F.PermuteVar16x32(v, perm);
Vector512<float> rgba = Avx512F.BlendVariable(rgbx, ones, alphaSelect);
WriteVector512(ref dst, rgba);
src = ref Unsafe.Add(ref src, 12);
dst = ref Unsafe.Add(ref dst, 16);
i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration. Using overlapped 8-float loads.
Vector256<int> perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
Vector256<float> ones = Vector256.Create(1F);
// vblendps mask: bit i selects lane i from 'ones' when set.
// We want lanes 3 and 7 -> 0b10001000 = 0x88.
const byte alphaMask = 0x88;
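// Worked example: rgbx = [r0 g0 b0 r0 | r1 g1 b1 r0]; with control 0x88 (bits 3 and 7 set),
// Blend(rgbx, ones, 0x88) = [r0 g0 b0 1 | r1 g1 b1 1].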
int pairs = length >> 1;
// Leave the last pair (2 pixels) for the scalar tail.
int simdPairs = pairs - 1;
for (int p = 0; p < simdPairs; p++)
{
Vector256<float> v = ReadVector256(ref src);
Vector256<float> rgbx = Avx2.PermuteVar8x32(v, perm);
Vector256<float> rgba = Avx.Blend(rgbx, ones, alphaMask);
WriteVector256(ref dst, rgba);
src = ref Unsafe.Add(ref src, 6);
dst = ref Unsafe.Add(ref dst, 8);
i += 2;
}
}
// Tail (and non-AVX paths)
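// Worked example (assuming the AVX-512 path): length = 10 gives quads = 2 and simdQuads = 1,
// so the vector loop above converts pixels 0..3 and this loop converts pixels 4..9.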
for (; i < length; i++)
{
Unsafe.Add(ref dstV4, i) = Unsafe.Add(ref srcRgb, i).ToScaledVector4();
}
}
@@ -117,10 +196,75 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz>
{
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
- // TODO: Optimize via SIMD
- for (int i = 0; i < source.Length; i++)
int length = source.Length;
if (length == 0)
{
- destination[i] = FromScaledVector4(source[i]);
return;
}
ref Vector4 srcV4 = ref MemoryMarshal.GetReference(source);
ref Rgb dstRgb = ref MemoryMarshal.GetReference(destination);
// Float streams:
// src: r0 g0 b0 a0 r1 g1 b1 a1 ...
// dst: r0 g0 b0 r1 g1 b1 ...
ref float src = ref Unsafe.As<Vector4, float>(ref srcV4);
ref float dst = ref Unsafe.As<Rgb, float>(ref dstRgb);
int i = 0;
if (Avx512F.IsSupported)
{
// 4 pixels per iteration. Using overlapped 16-float stores:
Vector512<int> idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
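// Worked example for one iteration (lane values are illustrative):
// v      = [r0 g0 b0 a0 | r1 g1 b1 a1 | r2 g2 b2 a2 | r3 g3 b3 a3]
// packed = [r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | a0 a1 a2 a3]
// Only the first 12 floats are meaningful; the trailing 4 lanes are overwritten by the
// next overlapped store (or by the scalar tail).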
// Number of 4-pixel groups in the input.
int quads = length >> 2;
// Leave the last quad (4 pixels) for the scalar tail.
int simdQuads = quads - 1;
for (int q = 0; q < simdQuads; q++)
{
Vector512<float> v = ReadVector512(ref src);
Vector512<float> packed = Avx512F.PermuteVar16x32(v, idx);
WriteVector512(ref dst, packed);
src = ref Unsafe.Add(ref src, 16);
dst = ref Unsafe.Add(ref dst, 12);
i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration, using overlapped 8-float stores:
Vector256<int> idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0);
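// Worked example for one iteration (lane values are illustrative):
// v      = [r0 g0 b0 a0 | r1 g1 b1 a1]
// packed = [r0 g0 b0 r1 | g1 b1 r0 r0]
// Only the first 6 floats are meaningful; the trailing 2 lanes are overwritten by the
// next overlapped store (or by the scalar tail).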
int pairs = length >> 1;
// Leave the last pair (2 pixels) for the scalar tail.
int simdPairs = pairs - 1;
int pairIndex = 0;
for (; pairIndex < simdPairs; pairIndex++)
{
Vector256<float> v = ReadVector256(ref src);
Vector256<float> packed = Avx2.PermuteVar8x32(v, idx);
WriteVector256(ref dst, packed);
src = ref Unsafe.Add(ref src, 8);
dst = ref Unsafe.Add(ref dst, 6);
i += 2;
}
}
// Tail (and non-AVX paths)
for (; i < length; i++)
{
Vector4 v = Unsafe.Add(ref srcV4, i);
Unsafe.Add(ref dstRgb, i) = FromScaledVector4(v);
}
}
@@ -288,4 +432,32 @@ public readonly struct Rgb : IProfileConnectingSpace<Rgb, CieXyz>
M44 = 1F
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<float> ReadVector512(ref float src)
{
ref byte b = ref Unsafe.As<float, byte>(ref src);
return Unsafe.ReadUnaligned<Vector512<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<float> ReadVector256(ref float src)
{
ref byte b = ref Unsafe.As<float, byte>(ref src);
return Unsafe.ReadUnaligned<Vector256<float>>(ref b);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector512(ref float dst, Vector512<float> value)
{
ref byte b = ref Unsafe.As<float, byte>(ref dst);
Unsafe.WriteUnaligned(ref b, value);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector256(ref float dst, Vector256<float> value)
{
ref byte b = ref Unsafe.As<float, byte>(ref dst);
Unsafe.WriteUnaligned(ref b, value);
}
}
