diff --git a/src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs b/src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs index 2780f04ba..3c6cdba4a 100644 --- a/src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs +++ b/src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs @@ -5,6 +5,8 @@ using System.Buffers; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.PixelFormats; using SixLabors.ImageSharp.Processing; @@ -60,8 +62,126 @@ internal static class ColorProfileConverterExtensionsPixelCompatible converter.ConvertUsingIccProfile(rgbSpan, rgbSpan); // Copy the converted Rgb pixels back to the row as TPixel. + // Important: Preserve alpha from the existing row Vector4 values. + // We merge RGB from rgbSpan into row, leaving W untouched. + ref float srcRgb = ref Unsafe.As(ref MemoryMarshal.GetReference(rgbSpan)); + ref float dstRow = ref Unsafe.As(ref MemoryMarshal.GetReference(row)); + + int count = rgbSpan.Length; + int i = 0; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector512 ReadVector512(ref float f) + { + ref byte b = ref Unsafe.As(ref f); + return Unsafe.ReadUnaligned>(ref b); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void WriteVector512(ref float f, Vector512 v) + { + ref byte b = ref Unsafe.As(ref f); + Unsafe.WriteUnaligned(ref b, v); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector256 ReadVector256(ref float f) + { + ref byte b = ref Unsafe.As(ref f); + return Unsafe.ReadUnaligned>(ref b); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void WriteVector256(ref float f, Vector256 v) + { + ref byte b = ref Unsafe.As(ref f); + Unsafe.WriteUnaligned(ref b, v); + } + + if (Avx512F.IsSupported) + { + // 4 pixels per iteration. + // + // Source layout (Rgb float stream, 12 floats): + // [r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3] + // + // Destination layout (row Vector4 float stream, 16 floats): + // [r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3] + // + // We use an overlapped load (16 floats) from the 3-float stride source. + // The permute selects the RGB we need and inserts placeholders for alpha lanes. + // + // Then we blend RGB lanes into the existing destination, preserving alpha lanes. + Vector512 rgbPerm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0); + + // BlendVariable selects from the second operand where the sign bit of the mask lane is set. + // We want to overwrite lanes 0,1,2 then 4,5,6 then 8,9,10 then 12,13,14, and preserve lanes 3,7,11,15 (alpha). + Vector512 rgbSelect = Vector512.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F); + + int quads = count >> 2; + int simdQuads = quads - 1; // Leave the last quad for the scalar tail to avoid the final overlapped load reading past the end. + + for (int q = 0; q < simdQuads; q++) + { + Vector512 dst = ReadVector512(ref dstRow); + Vector512 src = ReadVector512(ref srcRgb); + + Vector512 rgbx = Avx512F.PermuteVar16x32(src, rgbPerm); + Vector512 merged = Avx512F.BlendVariable(dst, rgbx, rgbSelect); + + WriteVector512(ref dstRow, merged); + + // Advance input by 4 pixels (4 * 3 = 12 floats) + srcRgb = ref Unsafe.Add(ref srcRgb, 12); + + // Advance output by 4 pixels (4 * 4 = 16 floats) + dstRow = ref Unsafe.Add(ref dstRow, 16); + + i += 4; + } + } + else if (Avx2.IsSupported) + { + // 2 pixels per iteration. + // + // Same idea as AVX-512, but on 256-bit vectors. + // We permute packed RGB into rgbx layout and blend into the existing destination, + // preserving alpha lanes. + Vector256 rgbPerm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0); + + Vector256 rgbSelect = Vector256.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F); + + int pairs = count >> 1; + int simdPairs = pairs - 1; // Leave the last pair for the scalar tail to avoid the final overlapped load reading past the end. + + for (int p = 0; p < simdPairs; p++) + { + Vector256 dst = ReadVector256(ref dstRow); + Vector256 src = ReadVector256(ref srcRgb); + + Vector256 rgbx = Avx2.PermuteVar8x32(src, rgbPerm); + Vector256 merged = Avx.BlendVariable(dst, rgbx, rgbSelect); + + WriteVector256(ref dstRow, merged); + + // Advance input by 2 pixels (2 * 3 = 6 floats) + srcRgb = ref Unsafe.Add(ref srcRgb, 6); + + // Advance output by 2 pixels (2 * 4 = 8 floats) + dstRow = ref Unsafe.Add(ref dstRow, 8); + + i += 2; + } + } + + // Scalar tail. + // Handles: + // - the last skipped SIMD block (quad or pair) + // - any remainder + // + // Preserve alpha by writing Vector3 into the Vector4 storage. ref Vector4 rowRef = ref MemoryMarshal.GetReference(row); - for (int i = 0; i < rgbSpan.Length; i++) + for (; i < count; i++) { Vector3 rgb = rgbSpan[i].AsVector3Unsafe(); Unsafe.As(ref Unsafe.Add(ref rowRef, (uint)i)) = rgb; diff --git a/src/ImageSharp/ColorProfiles/Rgb.cs b/src/ImageSharp/ColorProfiles/Rgb.cs index 42e502592..c95e54192 100644 --- a/src/ImageSharp/ColorProfiles/Rgb.cs +++ b/src/ImageSharp/ColorProfiles/Rgb.cs @@ -4,6 +4,8 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces; namespace SixLabors.ImageSharp.ColorProfiles; @@ -105,10 +107,87 @@ public readonly struct Rgb : IProfileConnectingSpace { Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); - // TODO: Optimize via SIMD - for (int i = 0; i < source.Length; i++) + int length = source.Length; + if (length == 0) { - destination[i] = source[i].ToScaledVector4(); + return; + } + + ref Rgb srcRgb = ref MemoryMarshal.GetReference(source); + ref Vector4 dstV4 = ref MemoryMarshal.GetReference(destination); + + // Float streams: + // src: r0 g0 b0 r1 g1 b1 ... + // dst: r0 g0 b0 a0 r1 g1 b1 a1 ... + ref float src = ref Unsafe.As(ref srcRgb); + ref float dst = ref Unsafe.As(ref dstV4); + + int i = 0; + + if (Avx512F.IsSupported) + { + // 4 pixels per iteration. Using overlapped 16-float loads. + Vector512 perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0); + Vector512 ones = Vector512.Create(1F); + + // BlendVariable selects from 'ones' where the sign-bit of mask lane is set. + // Using -0f sets only the sign bit, producing an efficient "select lane" mask. + Vector512 alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F); + + int quads = length >> 2; + + // Leave the last quad (4 pixels) for the scalar tail. + int simdQuads = quads - 1; + + for (int q = 0; q < simdQuads; q++) + { + Vector512 v = ReadVector512(ref src); + Vector512 rgbx = Avx512F.PermuteVar16x32(v, perm); + Vector512 rgba = Avx512F.BlendVariable(rgbx, ones, alphaSelect); + + WriteVector512(ref dst, rgba); + + src = ref Unsafe.Add(ref src, 12); + dst = ref Unsafe.Add(ref dst, 16); + + i += 4; + } + } + else if (Avx2.IsSupported) + { + // 2 pixels per iteration. Using overlapped 8-float loads. + Vector256 perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0); + + Vector256 ones = Vector256.Create(1F); + + // vblendps mask: bit i selects lane i from 'ones' when set. + // We want lanes 3 and 7 -> 0b10001000 = 0x88. + const byte alphaMask = 0x88; + + int pairs = length >> 1; + + // Leave the last pair (2 pixels) for the scalar tail. + int simdPairs = pairs - 1; + + for (int p = 0; p < simdPairs; p++) + { + Vector256 v = ReadVector256(ref src); + Vector256 rgbx = Avx2.PermuteVar8x32(v, perm); + Vector256 rgba = Avx.Blend(rgbx, ones, alphaMask); + + WriteVector256(ref dst, rgba); + + src = ref Unsafe.Add(ref src, 6); + dst = ref Unsafe.Add(ref dst, 8); + + i += 2; + } + } + + // Tail (and non-AVX paths) + for (; i < length; i++) + { + Unsafe.Add(ref dstV4, i) = Unsafe.Add(ref srcRgb, i).ToScaledVector4(); } } @@ -117,10 +196,75 @@ public readonly struct Rgb : IProfileConnectingSpace { Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); - // TODO: Optimize via SIMD - for (int i = 0; i < source.Length; i++) + int length = source.Length; + if (length == 0) { - destination[i] = FromScaledVector4(source[i]); + return; + } + + ref Vector4 srcV4 = ref MemoryMarshal.GetReference(source); + ref Rgb dstRgb = ref MemoryMarshal.GetReference(destination); + + // Float streams: + // src: r0 g0 b0 a0 r1 g1 b1 a1 ... + // dst: r0 g0 b0 r1 g1 b1 ... + ref float src = ref Unsafe.As(ref srcV4); + ref float dst = ref Unsafe.As(ref dstRgb); + + int i = 0; + + if (Avx512F.IsSupported) + { + // 4 pixels per iteration. Using overlapped 16-float stores: + Vector512 idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); + + // Number of 4-pixel groups in the input. + int quads = length >> 2; + + // Leave the last quad (4 pixels) for the scalar tail. + int simdQuads = quads - 1; + + for (int q = 0; q < simdQuads; q++) + { + Vector512 v = ReadVector512(ref src); + Vector512 packed = Avx512F.PermuteVar16x32(v, idx); + + WriteVector512(ref dst, packed); + + src = ref Unsafe.Add(ref src, 16); + dst = ref Unsafe.Add(ref dst, 12); + i += 4; + } + } + else if (Avx2.IsSupported) + { + // 2 pixels per iteration, using overlapped 8-float stores: + Vector256 idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0); + + int pairs = length >> 1; + + // Leave the last pair (2 pixels) for the scalar tail. + int simdPairs = pairs - 1; + + int pairIndex = 0; + for (; pairIndex < simdPairs; pairIndex++) + { + Vector256 v = ReadVector256(ref src); + Vector256 packed = Avx2.PermuteVar8x32(v, idx); + + WriteVector256(ref dst, packed); + + src = ref Unsafe.Add(ref src, 8); + dst = ref Unsafe.Add(ref dst, 6); + i += 2; + } + } + + // Tail (and non-AVX paths) + for (; i < length; i++) + { + Vector4 v = Unsafe.Add(ref srcV4, i); + Unsafe.Add(ref dstRgb, i) = FromScaledVector4(v); } } @@ -288,4 +432,32 @@ public readonly struct Rgb : IProfileConnectingSpace M44 = 1F }; } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector512 ReadVector512(ref float src) + { + ref byte b = ref Unsafe.As(ref src); + return Unsafe.ReadUnaligned>(ref b); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 ReadVector256(ref float src) + { + ref byte b = ref Unsafe.As(ref src); + return Unsafe.ReadUnaligned>(ref b); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteVector512(ref float dst, Vector512 value) + { + ref byte b = ref Unsafe.As(ref dst); + Unsafe.WriteUnaligned(ref b, value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteVector256(ref float dst, Vector256 value) + { + ref byte b = ref Unsafe.As(ref dst); + Unsafe.WriteUnaligned(ref b, value); + } } diff --git a/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs b/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs index fd31a7fad..d6a07eeca 100644 --- a/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs +++ b/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs @@ -122,6 +122,7 @@ internal sealed class WebpDecoderCore : ImageDecoderCore, IDisposable this.ParseOptionalChunks(stream, metadata, this.webImageInfo.Features, buffer); } + _ = this.TryConvertIccProfile(image); return image; } } diff --git a/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs b/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs index c0abed214..a3e3b81cf 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs @@ -608,4 +608,17 @@ public class WebpDecoderTests image.DebugSave(provider); image.CompareToOriginal(provider, ReferenceDecoder); } + + [Theory] + [WithFile(Icc.Perceptual, PixelTypes.Rgba32)] + [WithFile(Icc.PerceptualcLUTOnly, PixelTypes.Rgba32)] + public void Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile(TestImageProvider provider) + where TPixel : unmanaged, IPixel + { + using Image image = provider.GetImage(WebpDecoder.Instance, new DecoderOptions { ColorProfileHandling = ColorProfileHandling.Convert }); + + image.DebugSave(provider); + image.CompareToReferenceOutput(provider); + Assert.Null(image.Metadata.IccProfile); + } } diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs index 6b4a86666..dc3275999 100644 --- a/tests/ImageSharp.Tests/TestImages.cs +++ b/tests/ImageSharp.Tests/TestImages.cs @@ -901,6 +901,12 @@ public static class TestImages public const string AlphaBlend2 = "Webp/alpha-blend-2.webp"; public const string AlphaBlend3 = "Webp/alpha-blend-3.webp"; public const string AlphaBlend4 = "Webp/alpha-blend-4.webp"; + + public static class Icc + { + public const string Perceptual = "Webp/icc-profiles/Perceptual.webp"; + public const string PerceptualcLUTOnly = "Webp/icc-profiles/Perceptual-cLUT-only.webp"; + } } public static class Tiff diff --git a/tests/Images/External/ReferenceOutput/WebpDecoderTests/Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual-cLUT-only.png b/tests/Images/External/ReferenceOutput/WebpDecoderTests/Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual-cLUT-only.png new file mode 100644 index 000000000..4a01423ee --- /dev/null +++ b/tests/Images/External/ReferenceOutput/WebpDecoderTests/Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual-cLUT-only.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cbd483eed31cf7b410ef1ee45ae78e77b36e5b9c31f87764a1dfdb9c4e5c8 +size 79768 diff --git a/tests/Images/External/ReferenceOutput/WebpDecoderTests/Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual.png b/tests/Images/External/ReferenceOutput/WebpDecoderTests/Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual.png new file mode 100644 index 000000000..c46b369ef --- /dev/null +++ b/tests/Images/External/ReferenceOutput/WebpDecoderTests/Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547f3ddb5bdb3a3cb5c87440b65728be295c2b5f3c10a1b0b44299bb3d80e8d7 +size 79651 diff --git a/tests/Images/Input/Webp/icc-profiles/Perceptual-cLUT-only.webp b/tests/Images/Input/Webp/icc-profiles/Perceptual-cLUT-only.webp new file mode 100644 index 000000000..4787b792a --- /dev/null +++ b/tests/Images/Input/Webp/icc-profiles/Perceptual-cLUT-only.webp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b7ef9eedd1b2a4b93f11ac52807c071f7d0a24513008c3e69b0d4a0fd9b70db +size 186596 diff --git a/tests/Images/Input/Webp/icc-profiles/Perceptual.webp b/tests/Images/Input/Webp/icc-profiles/Perceptual.webp new file mode 100644 index 000000000..b78504f43 --- /dev/null +++ b/tests/Images/Input/Webp/icc-profiles/Perceptual.webp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aad13221b103b60c21a19e7ecfbab047404f25269485d26fc5dee4be11188865 +size 189800