From 415836fdcca9edbe244ceedcbf897a23504c31e2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 1 Jul 2021 20:52:52 +0200 Subject: [PATCH] Add Avx2 version of CheckNonOpaque --- .../Formats/WebP/Lossy/YuvConversion.cs | 129 ++++++++++++++---- .../Formats/WebP/YuvConversionTests.cs | 66 ++++++++- 2 files changed, 169 insertions(+), 26 deletions(-) diff --git a/src/ImageSharp/Formats/WebP/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/WebP/Lossy/YuvConversion.cs index 4bd70add6..3c67bfb57 100644 --- a/src/ImageSharp/Formats/WebP/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/WebP/Lossy/YuvConversion.cs @@ -101,13 +101,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// The row to check. /// Returns true if alpha has non-0xff values. - [MethodImpl(InliningOptions.ShortMethod)] public static unsafe bool CheckNonOpaque(Span row) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Avx2.IsSupported) { ReadOnlySpan rowBytes = MemoryMarshal.AsBytes(row); + var alphaMaskVector256 = Vector256.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255); + Vector256 all0x80Vector256 = Vector256.Create((byte)0x80).AsByte(); var alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255); Vector128 all0x80 = Vector128.Create((byte)0x80).AsByte(); @@ -115,22 +116,30 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int length = (row.Length * 4) - 3; fixed (byte* src = rowBytes) { + for (; i + 128 <= length; i += 128) + { + Vector256 a0 = Avx.LoadVector256(src + i).AsByte(); + Vector256 a1 = Avx.LoadVector256(src + i + 32).AsByte(); + Vector256 a2 = Avx.LoadVector256(src + i + 64).AsByte(); + Vector256 a3 = Avx.LoadVector256(src + i + 96).AsByte(); + Vector256 b0 = Avx2.And(a0, alphaMaskVector256).AsInt32(); + Vector256 b1 = Avx2.And(a1, alphaMaskVector256).AsInt32(); + Vector256 b2 = Avx2.And(a2, alphaMaskVector256).AsInt32(); + Vector256 b3 = Avx2.And(a3, alphaMaskVector256).AsInt32(); + Vector256 c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16(); + Vector256 c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16(); + Vector256 d = Avx2.PackSignedSaturate(c0, c1).AsByte(); + Vector256 bits = Avx2.CompareEqual(d, all0x80Vector256); + int mask = Avx2.MoveMask(bits); + if (mask != -1) + { + return true; + } + } + for (; i + 64 <= length; i += 64) { - Vector128 a0 = Sse2.LoadVector128(src + i).AsByte(); - Vector128 a1 = Sse2.LoadVector128(src + i + 16).AsByte(); - Vector128 a2 = Sse2.LoadVector128(src + i + 32).AsByte(); - Vector128 a3 = Sse2.LoadVector128(src + i + 48).AsByte(); - Vector128 b0 = Sse2.And(a0, alphaMask).AsInt32(); - Vector128 b1 = Sse2.And(a1, alphaMask).AsInt32(); - Vector128 b2 = Sse2.And(a2, alphaMask).AsInt32(); - Vector128 b3 = Sse2.And(a3, alphaMask).AsInt32(); - Vector128 c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16(); - Vector128 c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16(); - Vector128 d = Sse2.PackSignedSaturate(c0, c1).AsByte(); - Vector128 bits = Sse2.CompareEqual(d, all0x80); - int mask = Sse2.MoveMask(bits); - if (mask != 0xFFFF) + if (IsNoneOpaque64Bytes(src, i, alphaMask, all0x80)) { return true; } @@ -138,15 +147,42 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (; i + 32 <= length; i += 32) { - Vector128 a0 = Sse2.LoadVector128(src + i).AsByte(); - Vector128 a1 = Sse2.LoadVector128(src + i + 16).AsByte(); - Vector128 b0 = Sse2.And(a0, alphaMask).AsInt32(); - Vector128 b1 = Sse2.And(a1, alphaMask).AsInt32(); - Vector128 c = Sse2.PackSignedSaturate(b0, b1).AsInt16(); - Vector128 d = Sse2.PackSignedSaturate(c, c).AsByte(); - Vector128 bits = Sse2.CompareEqual(d, all0x80); - int mask = Sse2.MoveMask(bits); - if (mask != 0xFFFF) + if (IsNoneOpaque32Bytes(src, i, alphaMask, all0x80)) + { + return true; + } + } + + for (; i <= length; i += 4) + { + if (src[i + 3] != 0xFF) + { + return true; + } + } + } + } + else if (Sse2.IsSupported) + { + ReadOnlySpan rowBytes = MemoryMarshal.AsBytes(row); + var alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255); + Vector128 all0x80 = Vector128.Create((byte)0x80).AsByte(); + + int i = 0; + int length = (row.Length * 4) - 3; + fixed (byte* src = rowBytes) + { + for (; i + 64 <= length; i += 64) + { + if (IsNoneOpaque64Bytes(src, i, alphaMask, all0x80)) + { + return true; + } + } + + for (; i + 32 <= length; i += 32) + { + if (IsNoneOpaque32Bytes(src, i, alphaMask, all0x80)) { return true; } @@ -176,6 +212,49 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy return false; } +#if SUPPORTS_RUNTIME_INTRINSICS + private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i, Vector128 alphaMask, Vector128 all0x80) + { + Vector128 a0 = Sse2.LoadVector128(src + i).AsByte(); + Vector128 a1 = Sse2.LoadVector128(src + i + 16).AsByte(); + Vector128 a2 = Sse2.LoadVector128(src + i + 32).AsByte(); + Vector128 a3 = Sse2.LoadVector128(src + i + 48).AsByte(); + Vector128 b0 = Sse2.And(a0, alphaMask).AsInt32(); + Vector128 b1 = Sse2.And(a1, alphaMask).AsInt32(); + Vector128 b2 = Sse2.And(a2, alphaMask).AsInt32(); + Vector128 b3 = Sse2.And(a3, alphaMask).AsInt32(); + Vector128 c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16(); + Vector128 c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16(); + Vector128 d = Sse2.PackSignedSaturate(c0, c1).AsByte(); + Vector128 bits = Sse2.CompareEqual(d, all0x80); + int mask = Sse2.MoveMask(bits); + if (mask != 0xFFFF) + { + return true; + } + + return false; + } + + private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i, Vector128 alphaMask, Vector128 all0x80) + { + Vector128 a0 = Sse2.LoadVector128(src + i).AsByte(); + Vector128 a1 = Sse2.LoadVector128(src + i + 16).AsByte(); + Vector128 b0 = Sse2.And(a0, alphaMask).AsInt32(); + Vector128 b1 = Sse2.And(a1, alphaMask).AsInt32(); + Vector128 c = Sse2.PackSignedSaturate(b0, b1).AsInt16(); + Vector128 d = Sse2.PackSignedSaturate(c, c).AsByte(); + Vector128 bits = Sse2.CompareEqual(d, all0x80); + int mask = Sse2.MoveMask(bits); + if (mask != 0xFFFF) + { + return true; + } + + return false; + } +#endif + /// /// Converts a rgba pixel row to Y. /// diff --git a/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs b/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs index 79b2315a2..59ef221bf 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs @@ -287,13 +287,45 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp 148, 158, 158, 255, 171, 165, 151, 255, 209, 208, 210, 255, - 174, 183, 189, 100, + 174, 183, 189, 255, + 148, 158, 158, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, 148, 158, 158, 255, 171, 165, 151, 255, 209, 208, 210, 255, 174, 183, 189, 255, 148, 158, 158, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 148, 158, 158, 100, + 171, 165, 151, 0, + 209, 208, 210, 100, + 174, 183, 189, 255, + 148, 158, 158, 255, }; Span row = MemoryMarshal.Cast(rowBytes); @@ -334,6 +366,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp 148, 158, 158, 255, 171, 165, 151, 255, 209, 208, 210, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, + 174, 183, 189, 255, + 148, 158, 158, 255, + 148, 158, 158, 255, + 171, 165, 151, 255, + 209, 208, 210, 255, 174, 183, 189, 255, 148, 158, 158, 255, };