From e12fd7ba9e11a91cdebc45dee1e2beab53f087a8 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 23 Dec 2020 14:37:25 +0100 Subject: [PATCH] Add SSE4 version of CollectColorBlueTransforms --- .../Formats/WebP/Lossless/PredictorEncoder.cs | 64 ++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/WebP/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/WebP/Lossless/PredictorEncoder.cs index d846708892..d2f810949f 100644 --- a/src/ImageSharp/Formats/WebP/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/WebP/Lossless/PredictorEncoder.cs @@ -936,9 +936,8 @@ namespace SixLabors.ImageSharp.Formats.Experimental.Webp.Lossless var mask = Vector128.Create((short)0xff); const int span = 8; - int y; Span values = stackalloc ushort[span]; - for (y = 0; y < tileHeight; ++y) + for (int y = 0; y < tileHeight; ++y) { Span srcSpan = bgra.Slice(y * stride); fixed (uint* src = srcSpan) @@ -998,6 +997,67 @@ namespace SixLabors.ImageSharp.Formats.Experimental.Webp.Lossless private static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, int[] histo) { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse41.IsSupported) + { + const int span = 8; + Span values = stackalloc ushort[span]; + var multsr = Vector128.Create((short)((redToBlue << 8) >> 5)); + var multsg = Vector128.Create((short)((greenToBlue << 8) >> 5)); + var maskgreen = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + var maskgreenblue = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + Vector128 maskblue = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + var shufflerLow = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); + var shufflerHigh = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); + + for (int y = 0; y < tileHeight; ++y) + { + Span srcSpan = bgra.Slice(y * stride); + fixed (uint* src = srcSpan) + fixed (ushort* dst = values) + { + for (int x = 0; x + span <= tileWidth; x += span) + { + uint* input0Idx = src + x; + uint* input1Idx = src + x + (span / 2); + Vector128 input0 = Sse2.LoadVector128((ushort*)input0Idx).AsByte(); + Vector128 input1 = Sse2.LoadVector128((ushort*)input1Idx).AsByte(); + Vector128 r0 = Ssse3.Shuffle(input0, shufflerLow); + Vector128 r1 = Ssse3.Shuffle(input1, shufflerHigh); + Vector128 r = Sse2.Or(r0, r1); + Vector128 gb0 = Sse2.And(input0, maskgreenblue); + Vector128 gb1 = Sse2.And(input1, maskgreenblue); + Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector128 g = Sse2.And(gb.AsByte(), maskgreen); + Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); + Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); + Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); + Vector128 d = Sse2.Subtract(c, a.AsByte()); + Vector128 e = Sse2.And(d, maskblue); + Sse2.Store(dst, e.AsUInt16()); + for (int i = 0; i < span; ++i) + { + ++histo[values[i]]; + } + } + } + } + + int leftOver = tileWidth & (span - 1); + if (leftOver > 0) + { + CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + } + } + else +#endif + { + CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); + } + } + + private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, int[] histo) + { int pos = 0; while (tileHeight-- > 0) {