diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index f9b97c6c44..defa65b4be 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -744,6 +744,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return (float)retVal; } + [MethodImpl(InliningOptions.ShortMethod)] public static byte TransformColorRed(sbyte greenToRed, uint argb) { sbyte green = U32ToS8(argb >> 8); @@ -752,6 +753,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return (byte)(newRed & 0xff); } + [MethodImpl(InliningOptions.ShortMethod)] public static byte TransformColorBlue(sbyte greenToBlue, sbyte redToBlue, uint argb) { sbyte green = U32ToS8(argb >> 8); diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index 99504dd488..3d4696d8dd 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -48,6 +48,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static readonly Vector128 CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); private static readonly Vector128 CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); + + private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); // Avx2.Shuffle selects bytes within each 128-bit lane, so both lanes use in-lane indices 2/6/10/14 (byte 2 of each 32-bit pixel), written to the high byte of words 0-3 of that lane. + + private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); // Same in-lane selection, written to words 4-7 of each lane so the result lines up with the per-lane output order of Avx2.PackUnsignedSaturate.
+ + private static readonly Vector256 CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + + private static readonly Vector256 CollectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + + private static readonly Vector256 CollectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + #endif // This uses C#'s compiler optimization to refer to assembly's static data directly. @@ -1128,7 +1139,54 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse41.IsSupported) + if (Avx2.IsSupported && tileWidth > 16) + { + const int span = 16; + Span values = stackalloc ushort[span]; + var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); + var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); + for (int y = 0; y < tileHeight; y++) + { + Span srcSpan = bgra.Slice(y * stride); +#pragma warning disable SA1503 // Braces should not be omitted + fixed (uint* src = srcSpan) + fixed (ushort* dst = values) + { + for (int x = 0; x + span <= tileWidth; x += span) + { + uint* input0Idx = src + x; + uint* input1Idx = src + x + (span / 2); + Vector256 input0 = Avx.LoadVector256(input0Idx).AsByte(); + Vector256 input1 = Avx.LoadVector256(input1Idx).AsByte(); + Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); + Vector256 r = Avx2.Or(r0, r1); + Vector256 gb0 = Avx2.And(input0, 
CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector256 g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); + Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); + Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); + Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); + Vector256 d = Avx2.Subtract(c, a.AsByte()); + Vector256 e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); + Avx.Store(dst, e.AsUInt16()); + for (int i = 0; i < span; i++) + { + ++histo[values[i]]; + } + } + } +#pragma warning restore SA1503 // Braces should not be omitted + } + + int leftOver = tileWidth & (span - 1); // Columns not covered by the 16-pixel steps; handled once after the row loop because the scalar helper is given stride and tileHeight and the un-offset bgra slice. + if (leftOver > 0) + { + CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + } + } + else if (Sse41.IsSupported) { const int span = 8; Span values = stackalloc ushort[span];