From 7191acaf34f535bc883b99114eb811708ddb5064 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 17 Nov 2021 10:58:52 +0100 Subject: [PATCH 01/20] Move UpSample to YuvConversion class --- .../Formats/Webp/Lossy/WebpLossyDecoder.cs | 62 ++----------------- .../Formats/Webp/Lossy/YuvConversion.cs | 54 ++++++++++++++++ 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index 2f78842c6..b27ef88fb 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -696,12 +696,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (y == 0) { // First line is special cased. We mirror the u/v samples at boundary. - this.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw); + YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw); } else { // We can finish the left-over line from previous call. - this.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw); + YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw); numLinesOut++; } @@ -714,7 +714,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy topV = curV; curU = curU.Slice(io.UvStride); curV = curV.Slice(io.UvStride); - this.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw); + YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw); curY = curY.Slice(ioStride2); dst = dst.Slice(bufferStride2); } @@ -736,67 +736,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Process the very last row of even-sized picture. 
if ((yEnd & 1) == 0) { - this.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw); + YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw); } } return numLinesOut; } - private void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) - { - int xStep = 3; - int lastPixelPair = (len - 1) >> 1; - uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample - uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample - uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); - - if (bottomY != null) - { - uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); - } - - for (int x = 1; x <= lastPixelPair; x++) - { - uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample - uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample - - // Precompute invariant values associated with first and second diagonals. 
- uint avg = tluv + tuv + luv + uv + 0x00080008u; - uint diag12 = (avg + (2 * (tuv + luv))) >> 3; - uint diag03 = (avg + (2 * (tluv + uv))) >> 3; - uv0 = (diag12 + tluv) >> 1; - uint uv1 = (diag03 + tuv) >> 1; - int xMul2 = x * 2; - YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); - YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); - - if (bottomY != null) - { - uv0 = (diag03 + luv) >> 1; - uv1 = (diag12 + uv) >> 1; - YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); - YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); - } - - tluv = tuv; - luv = uv; - } - - if ((len & 1) == 0) - { - uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); - if (bottomY != null) - { - uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); - } - } - } - private void DoTransform(uint bits, Span src, Span dst, Span scratch) { switch (bits >> 30) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index a9cf876c8..182437e54 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -18,6 +18,60 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private const int YuvHalf = 1 << (YuvFix - 1); + public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + { + int xStep = 3; + int lastPixelPair = (len - 1) >> 1; + uint tluv = LoadUv(topU[0], topV[0]); // top-left sample + uint luv = LoadUv(curU[0], curV[0]); // left-sample + 
uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; + YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); + + if (bottomY != null) + { + uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; + YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); + } + + for (int x = 1; x <= lastPixelPair; x++) + { + uint tuv = LoadUv(topU[x], topV[x]); // top sample + uint uv = LoadUv(curU[x], curV[x]); // sample + + // Precompute invariant values associated with first and second diagonals. + uint avg = tluv + tuv + luv + uv + 0x00080008u; + uint diag12 = (avg + (2 * (tuv + luv))) >> 3; + uint diag03 = (avg + (2 * (tluv + uv))) >> 3; + uv0 = (diag12 + tluv) >> 1; + uint uv1 = (diag03 + tuv) >> 1; + int xMul2 = x * 2; + YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); + YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); + + if (bottomY != null) + { + uv0 = (diag03 + luv) >> 1; + uv1 = (diag12 + uv) >> 1; + YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); + YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); + } + + tluv = tuv; + luv = uv; + } + + if ((len & 1) == 0) + { + uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; + YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); + if (bottomY != null) + { + uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; + YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); + } + } + } + /// /// Converts the RGB values of the image to YUV. 
/// From 59a11bf9011729ad5cf8f30f3ef21adf616bf0b9 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 17 Nov 2021 21:45:32 +0100 Subject: [PATCH 02/20] Add SSE41 version of UpSample --- .../Formats/Webp/Lossy/YuvConversion.cs | 332 ++++++++++++++++++ 1 file changed, 332 insertions(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 182437e54..0f5c56c74 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -4,6 +4,11 @@ using System; using System.Buffers; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -18,8 +23,66 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private const int YuvHalf = 1 << (YuvFix - 1); +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector128 One = Vector128.Create((byte)1); + + // These constants are 14b fixed-point version of ITU-R BT.601 constants. 
+ // R = (19077 * y + 26149 * v - 14234) >> 6 + // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 + // B = (19077 * y + 33050 * u - 17685) >> 6 + private static readonly Vector128 K19077 = Vector128.Create((short)19077).AsByte(); + + private static readonly Vector128 K26149 = Vector128.Create((short)26149).AsByte(); + + private static readonly Vector128 K14234 = Vector128.Create((short)14234).AsByte(); + + // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic + private static readonly Vector128 K33050 = Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129); + + private static readonly Vector128 K17685 = Vector128.Create((short)17685).AsByte(); + + private static readonly Vector128 K6419 = Vector128.Create((short)6419).AsByte(); + + private static readonly Vector128 K13320 = Vector128.Create((short)13320).AsByte(); + + private static readonly Vector128 K8708 = Vector128.Create((short)8708).AsByte(); + + private static readonly Vector128 PlanarTo24Shuffle0 = Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5); + + private static readonly Vector128 PlanarTo24Shuffle1 = Vector128.Create(255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255); + + private static readonly Vector128 PlanarTo24Shuffle2 = Vector128.Create(255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255); + + private static readonly Vector128 PlanarTo24Shuffle3 = Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255); + + private static readonly Vector128 PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10); + + private static readonly Vector128 PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15, 255); + + private static readonly Vector128 PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 
255, 4, 255); + + private static readonly Vector128 PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255); + + private static readonly Vector128 PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15); +#endif + + // UpSample from YUV to RGB. public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse41.IsSupported) + { + UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len); + } + else +#endif + { + UpSampleScalar(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len); + } + } + + public static void UpSampleScalar(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + { int xStep = 3; int lastPixelPair = (len - 1) >> 1; uint tluv = LoadUv(topU[0], topV[0]); // top-left sample @@ -72,6 +135,106 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } +#if SUPPORTS_RUNTIME_INTRINSICS + // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows + // u = (9*a + 3*b + 3*c + d + 8) / 16 + // = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2 + // = (a + m + 1) / 2 + // where m = (a + 3*b + 3*c + d) / 8 + // = ((a + b + c + d) / 2 + b + c) / 4 + // + // Let's say k = (a + b + c + d) / 4. + // We can compute k as + // k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1 + // where s = (a + d + 1) / 2 and t = (b + c + 1) / 2 + // + // Then m can be written as + // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 + public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + { + const int xStep = 3; + byte[] uvBuffer = new byte[(14 * 32) + 15]; + Span ru = uvBuffer.AsSpan(15); + Span rv = ru.Slice(32); + + // Treat the first pixel in regular way. 
+ int uDiag = ((topU[0] + curU[0]) >> 1) + 1; + int vDiag = ((topV[0] + curV[0]) >> 1) + 1; + int u0t = (topU[0] + uDiag) >> 1; + int v0t = (topV[0] + vDiag) >> 1; + YuvToBgr(topY[0], u0t, v0t, topDst); + if (bottomY != null) + { + int u0b = (curU[0] + uDiag) >> 1; + int v0b = (curV[0] + vDiag) >> 1; + YuvToBgr(bottomY[0], u0b, v0b, bottomDst); + } + + // For UpSample32Pixels, 17 u/v values must be read-able for each block. + for (int pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + { + UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); + UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); + ConvertToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + } + } + + // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. + public static void UpSample32Pixels(Span r1, Span r2, Span output) + { + // Load inputs. + Vector128 a = Unsafe.As>(ref MemoryMarshal.GetReference(r1)); + Vector128 b = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r1), 1)); + Vector128 c = Unsafe.As>(ref MemoryMarshal.GetReference(r2)); + Vector128 d = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r2), 1)); + + Vector128 s = Sse2.Average(a, d); // s = (a + d + 1) / 2 + Vector128 t = Sse2.Average(b, c); // t = (b + c + 1) / 2 + Vector128 st = Sse2.Xor(s, t); // st = s^t + + Vector128 ad = Sse2.Xor(a, d); // ad = a^d + Vector128 bc = Sse2.Xor(b, c); // bc = b^c + + Vector128 t1 = Sse2.Or(ad, bc); // (a^d) | (b^c) + Vector128 t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t) + Vector128 t3 = Sse2.And(t2, One); // (a^d) | (b^c) | (s^t) & 1 + Vector128 t4 = Sse2.Average(s, t); + Vector128 k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4 + + Vector128 diag1 = GetM(k, st, bc, t); + Vector128 diag2 = GetM(k, st, ad, s); + + // Pack the alternate pixels. + PackAndStore(a, b, diag1, diag2, output); // store top. 
+ PackAndStore(c, d, diag2, diag1, output.Slice(2 * 32)); + } + + // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 + private static Vector128 GetM(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) + { + Vector128 tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2 + Vector128 tmp1 = Sse2.And(ij, st); // (ij) & (s^t) + Vector128 tmp2 = Sse2.Xor(k, input); // (k^in) + Vector128 tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in) + Vector128 tmp4 = Sse2.And(tmp3, One); // & 1 -> lsb_correction + + return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction + } + + private static void PackAndStore(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output) + { + Vector128 ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16 + Vector128 tb = Sse2.Average(b, db); // (3a + 9b + c + 3d + 8) / 16 + Vector128 t1 = Sse2.UnpackLow(ta, tb); + Vector128 t2 = Sse2.UnpackHigh(ta, tb); + + ref byte output0Ref = ref MemoryMarshal.GetReference(output); + ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16); + Unsafe.As>(ref output0Ref) = t1; + Unsafe.As>(ref output1Ref) = t2; + } +#endif + /// /// Converts the RGB values of the image to YUV. 
/// @@ -366,6 +529,175 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy bgr[0] = (byte)YuvToB(y, u); } +#if SUPPORTS_RUNTIME_INTRINSICS + + private static void ConvertToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + { + YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); + + if (bottomY != null) + { + YuvToBgrSse41(bottomY.Slice(curX), ru.Slice(64), rv.Slice(64), bottomDst.Slice(curX * step)); + } + } + + public static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) + { + ConvertYuv444ToRgbSse41(y, u, v, out Vector128 r0, out Vector128 g0, out Vector128 b0); + ConvertYuv444ToRgbSse41(y.Slice(8), u.Slice(8), v.Slice(8), out Vector128 r1, out Vector128 g1, out Vector128 b1); + ConvertYuv444ToRgbSse41(y.Slice(16), u.Slice(16), v.Slice(16), out Vector128 r2, out Vector128 g2, out Vector128 b2); + ConvertYuv444ToRgbSse41(y.Slice(24), u.Slice(24), v.Slice(24), out Vector128 r3, out Vector128 g3, out Vector128 b3); + + // Cast to 8b and store as BBBBGGGGRRRR. + Vector128 bgr0 = Sse2.PackUnsignedSaturate(b0, b1); + Vector128 bgr1 = Sse2.PackUnsignedSaturate(b2, b3); + Vector128 bgr2 = Sse2.PackUnsignedSaturate(g0, g1); + Vector128 bgr3 = Sse2.PackUnsignedSaturate(g2, g3); + Vector128 bgr4 = Sse2.PackUnsignedSaturate(r0, r1); + Vector128 bgr5 = Sse2.PackUnsignedSaturate(r2, r3); + + // Pack as BGRBGRBGRBGR. + PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst); + } + + // Pack the planar buffers + // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... + // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... + private static void PlanarTo24bSse41(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb) + { + // The input is 6 registers of sixteen 8b but for the sake of explanation, + // let's take 6 registers of four 8b values. 
+ // To pack, we will keep taking one every two 8b integer and move it + // around as follows: + // Input: + // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 + // Split the 6 registers in two sets of 3 registers: the first set as the even + // 8b bytes, the second the odd ones: + // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 + // Repeat the same permutations twice more: + // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 + // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 + + // Process R. + ChannelMixing( + input0, + input1, + PlanarTo24Shuffle0, + PlanarTo24Shuffle1, + PlanarTo24Shuffle2, + out Vector128 r0, + out Vector128 r1, + out Vector128 r2, + out Vector128 r3, + out Vector128 r4, + out Vector128 r5); + + // Process G. + // Same as before, just shifted to the left by one and including the right padding. + ChannelMixing( + input2, + input3, + PlanarTo24Shuffle3, + PlanarTo24Shuffle4, + PlanarTo24Shuffle5, + out Vector128 g0, + out Vector128 g1, + out Vector128 g2, + out Vector128 g3, + out Vector128 g4, + out Vector128 g5); + + // Process B. + ChannelMixing( + input4, + input5, + PlanarTo24Shuffle6, + PlanarTo24Shuffle7, + PlanarTo24Shuffle8, + out Vector128 b0, + out Vector128 b1, + out Vector128 b2, + out Vector128 b3, + out Vector128 b4, + out Vector128 b5); + + // OR the different channels. 
+ Vector128 rg0 = Sse2.Or(r0, g0); + Vector128 rg1 = Sse2.Or(r1, g1); + Vector128 rg2 = Sse2.Or(r2, g2); + Vector128 rg3 = Sse2.Or(r3, g3); + Vector128 rg4 = Sse2.Or(r4, g4); + Vector128 rg5 = Sse2.Or(r5, g5); + + ref byte outputRef = ref MemoryMarshal.GetReference(rgb); + Unsafe.As>(ref outputRef) = Sse2.Or(rg0, b0); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5); + } + + // Shuffles the input buffer as A0 0 0 A1 0 0 A2 + private static void ChannelMixing( + Vector128 input0, + Vector128 input1, + Vector128 shuffle0, + Vector128 shuffle1, + Vector128 shuffle2, + out Vector128 output0, + out Vector128 output1, + out Vector128 output2, + out Vector128 output3, + out Vector128 output4, + out Vector128 output5) + { + output0 = Ssse3.Shuffle(input0, shuffle0); + output1 = Ssse3.Shuffle(input0, shuffle1); + output2 = Ssse3.Shuffle(input0, shuffle2); + output3 = Ssse3.Shuffle(input1, shuffle0); + output4 = Ssse3.Shuffle(input1, shuffle1); + output5 = Ssse3.Shuffle(input1, shuffle2); + } + + // Convert 8 samples of YUV444 to R/G/B (the caller invokes this four times to cover a block of 32 samples). + private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) + { + Vector128 y0 = LoadHigh(y); + Vector128 u0 = LoadHigh(u); + Vector128 v0 = LoadHigh(v); + + Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); + + Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); + Vector128 r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16()); + Vector128 r2 = Sse2.Add(r1, r0); + + Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); + Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); + Vector128 g2 = Sse2.Add(y1.AsUInt16(), K8708.AsUInt16()); + 
Vector128 g3 = Sse2.Add(g0, g1); + Vector128 g4 = Sse2.Subtract(g2, g3); + + Vector128 b0 = Sse2.MultiplyHigh(u0.AsUInt16(), K33050.AsUInt16()); + Vector128 b1 = Sse2.AddSaturate(b0, y1); + Vector128 b2 = Sse2.SubtractSaturate(b1, K17685.AsUInt16()); + + // use logical shift for B2, which can be larger than 32767 + r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815] + g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710] + b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] + } + + // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. + private static Vector128 LoadHigh(Span src) + { + Vector64 tmp = Unsafe.As>(ref MemoryMarshal.GetReference(src)); + Vector128 tmp2 = Unsafe.As, Vector128>(ref tmp); + return Sse2.UnpackLow(Vector128.Zero, tmp2); + } +#endif + [MethodImpl(InliningOptions.ShortMethod)] public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); From 2a03d00c680da4f0f112eea1401421c94dd7e96e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 18 Nov 2021 11:21:25 +0100 Subject: [PATCH 03/20] Upsample last block --- .../Formats/Webp/Lossy/YuvConversion.cs | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 0f5c56c74..1c2208732 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -67,6 +67,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #endif // UpSample from YUV to RGB. 
+ // Given samples laid out in a square as: + // [a b] + // [c d] + // we interpolate u/v as: + // ([9*a + 3*b + 3*c + d, 3*a + 9*b + c + 3*d] + [8, 8]) / 16 + // ([3*a + b + 9*c + 3*d, a + 3*b + 3*c + 9*d] + [8, 8]) / 16 public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -171,12 +177,33 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // For UpSample32Pixels, 17 u/v values must be read-able for each block. - for (int pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + int pos; + int uvPos; + for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) { UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); ConvertToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); } + + // Process last block. + if (len > 1) + { + int leftOver = ((len + 1) >> 1) - (pos >> 1); + Span tmpTopDst = ru.Slice(4 * 32); + Span tmpBottomDst = tmpTopDst.Slice(4 * 32); + Span tmpTop = tmpBottomDst.Slice(4 * 32); + Span tmpBottom = (bottomY == null) ? null : tmpTop.Slice(32); + UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru); + UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); + topY.Slice(pos, len - pos).CopyTo(tmpTop); + ConvertToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); + if (bottomY != null) + { + tmpBottomDst.Slice(0, (len - pos) * xStep).CopyTo(bottomDst.Slice(pos * xStep)); + } + } } // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. 
@@ -209,6 +236,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy PackAndStore(c, d, diag2, diag1, output.Slice(2 * 32)); } + private static void UpSampleLastBlock(Span tb, Span bb, int numPixels, Span output) + { + Span r1 = stackalloc byte[17]; + Span r2 = stackalloc byte[17]; + tb.Slice(0, numPixels).CopyTo(r1); + bb.Slice(0, numPixels).CopyTo(r2); + + // Replicate last byte. + int length = 17 - numPixels; + if (length > 0) + { + r1.Slice(numPixels, length).Fill(r1[numPixels - 1]); + r2.Slice(numPixels, length).Fill(r2[numPixels - 1]); + } + + UpSample32Pixels(r1, r2, output); + } + // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 private static Vector128 GetM(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) { From 3f4388323b3d8ac4efc50606ddb19128cc96b6cf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 18 Nov 2021 13:28:35 +0100 Subject: [PATCH 04/20] Fix shuffle masks --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 1c2208732..080739db9 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -57,13 +57,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private static readonly Vector128 PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10); - private static readonly Vector128 PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15, 255); + private static readonly Vector128 PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255); private static readonly Vector128 PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255); private static readonly Vector128 
PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255); - private static readonly Vector128 PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15); + private static readonly Vector128 PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15); #endif // UpSample from YUV to RGB. @@ -183,7 +183,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); - ConvertToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + ConvertYuvToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); } // Process last block. @@ -197,7 +197,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru); UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); topY.Slice(pos, len - pos).CopyTo(tmpTop); - ConvertToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); if (bottomY != null) { @@ -576,7 +576,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS - private static void ConvertToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + private static void ConvertYuvToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) { YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); From ec18321a814739bfd5ab93423c562e17168f6364 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 18 Nov 2021 14:56:21 +0100 Subject: [PATCH 05/20] Fix last block --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 6 
++++++ 1 file changed, 6 insertions(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 080739db9..3413f6f18 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -196,7 +196,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Span tmpBottom = (bottomY == null) ? null : tmpTop.Slice(32); UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru); UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); + topY.Slice(pos, len - pos).CopyTo(tmpTop); + if (bottomY != null) + { + bottomY.Slice(pos, len - pos).CopyTo(tmpBottom); + } + ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); if (bottomY != null) From c223d2eadbc0be25d6a493251a765cd6944719ac Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 19 Nov 2021 22:04:03 +1100 Subject: [PATCH 06/20] Avoid implicit casting --- .../Formats/Webp/Lossy/WebpLossyDecoder.cs | 4 ++-- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index b27ef88fb..4d21333e6 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -696,7 +696,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (y == 0) { // First line is special cased. We mirror the u/v samples at boundary. - YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw); } else { @@ -736,7 +736,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Process the very last row of even-sized picture. 
if ((yEnd & 1) == 0) { - YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw); } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 3413f6f18..342fc330c 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -96,7 +96,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); - if (bottomY != null) + if (bottomY != default) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); @@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); - if (bottomY != null) + if (bottomY != default) { uv0 = (diag03 + luv) >> 1; uv1 = (diag12 + uv) >> 1; @@ -133,7 +133,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); - if (bottomY != null) + if (bottomY != default) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); @@ -169,7 +169,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int u0t = (topU[0] + uDiag) >> 1; int v0t = (topV[0] + vDiag) >> 1; YuvToBgr(topY[0], u0t, v0t, topDst); - if (bottomY != null) + if (bottomY != default) { int u0b = (curU[0] + uDiag) >> 1; int v0b = (curV[0] + vDiag) >> 1; @@ -198,14 +198,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy 
UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); topY.Slice(pos, len - pos).CopyTo(tmpTop); - if (bottomY != null) + if (bottomY != default) { bottomY.Slice(pos, len - pos).CopyTo(tmpBottom); } ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); - if (bottomY != null) + if (bottomY != default) { tmpBottomDst.Slice(0, (len - pos) * xStep).CopyTo(bottomDst.Slice(pos * xStep)); } From 595492491e54ecfc760efa745ce170002ac3a3c0 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 13:10:50 +0100 Subject: [PATCH 07/20] Add upsample tests --- .../Formats/WebP/YuvConversionTests.cs | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs b/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs index 65b4b987e..76dd207fc 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs @@ -2,10 +2,14 @@ // Licensed under the Apache License, Version 2.0. 
using System; +using System.IO; using SixLabors.ImageSharp.Advanced; +using SixLabors.ImageSharp.Formats.Webp; using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tests.TestUtilities; +using SixLabors.ImageSharp.Tests.TestUtilities.ReferenceCodecs; using Xunit; namespace SixLabors.ImageSharp.Tests.Formats.Webp @@ -13,6 +17,34 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class YuvConversionTests { + private static WebpDecoder WebpDecoder => new(); + + private static MagickReferenceDecoder ReferenceDecoder => new(); + + private static string TestImageLossyFullPath => Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, TestImages.Webp.Lossy.NoFilter06); + + public static void RunUpSampleYuvToRgbTest() + { + var provider = TestImageProvider.File(TestImageLossyFullPath); + using (Image image = provider.GetImage(WebpDecoder)) + { + image.DebugSave(provider); + image.CompareToOriginal(provider, ReferenceDecoder); + } + } + + [Fact] + public void UpSampleYuvToRgb_Works() => RunUpSampleYuvToRgbTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void UpSampleYuvToRgb_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunUpSampleYuvToRgbTest, HwIntrinsics.AllowAll); + + [Fact] + public void UpSampleYuvToRgb_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunUpSampleYuvToRgbTest, HwIntrinsics.DisableSSE2); + +#endif + [Theory] [WithFile(TestImages.Webp.Yuv, PixelTypes.Rgba32)] public void ConvertRgbToYuv_Works(TestImageProvider provider) From 1eb1e82a2f608c24b7d1ca40a7ab7f579dcdfe8b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 13:25:01 +0100 Subject: [PATCH 08/20] Avoid allocating uvBuffer on each upscale call --- src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs | 9 +++++---- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 8 ++++---- 2 files 
changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index 4d21333e6..202df9039 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -692,16 +692,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int mbw = io.MbW; int uvw = (mbw + 1) / 2; int y = io.MbY; + byte[] uvBuffer = new byte[(14 * 32) + 15]; if (y == 0) { // First line is special cased. We mirror the u/v samples at boundary. - YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw, uvBuffer); } else { // We can finish the left-over line from previous call. - YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw); + YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw, uvBuffer); numLinesOut++; } @@ -714,7 +715,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy topV = curV; curU = curU.Slice(io.UvStride); curV = curV.Slice(io.UvStride); - YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw); + YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw, uvBuffer); curY = curY.Slice(ioStride2); dst = dst.Slice(bufferStride2); } @@ -736,7 +737,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Process the very last row of even-sized picture. 
if ((yEnd & 1) == 0) { - YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw, uvBuffer); } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 342fc330c..54d7ed65d 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -73,12 +73,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // we interpolate u/v as: // ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16 // ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16 - public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse41.IsSupported) { - UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len); + UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer); } else #endif @@ -156,10 +156,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // // Then m can be written as // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 - public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { const int xStep = 3; - byte[] uvBuffer = new byte[(14 * 32) + 15]; + Array.Clear(uvBuffer, 0, uvBuffer.Length); Span ru = uvBuffer.AsSpan(15); Span rv = ru.Slice(32); From c59ae02e64ce3c905e566837e48a66ceac3b3459 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 
14:02:45 +0100 Subject: [PATCH 09/20] Change some methods to be private --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 54d7ed65d..18cff1578 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -87,7 +87,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } - public static void UpSampleScalar(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + private static void UpSampleScalar(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) { int xStep = 3; int lastPixelPair = (len - 1) >> 1; @@ -156,7 +156,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // // Then m can be written as // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 - public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) + private static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { const int xStep = 3; Array.Clear(uvBuffer, 0, uvBuffer.Length); @@ -213,7 +213,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. - public static void UpSample32Pixels(Span r1, Span r2, Span output) + private static void UpSample32Pixels(Span r1, Span r2, Span output) { // Load inputs. 
Vector128 a = Unsafe.As>(ref MemoryMarshal.GetReference(r1)); @@ -592,7 +592,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } - public static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) + private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) { ConvertYuv444ToRgbSse41(y, u, v, out Vector128 r0, out Vector128 g0, out Vector128 b0); ConvertYuv444ToRgbSse41(y.Slice(8), u.Slice(8), v.Slice(8), out Vector128 r1, out Vector128 g1, out Vector128 b1); From c5170f950418c7ced1d9bf5fd75ada5a51180e0a Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Fri, 19 Nov 2021 15:50:03 +0100 Subject: [PATCH 10/20] Re-grouping the code to do identical operations Co-authored-by: Anton Firszov --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 18cff1578..251060cee 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -719,13 +719,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 v0 = LoadHigh(v); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); - Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); + Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); + Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); + Vector128 r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16()); Vector128 r2 = Sse2.Add(r1, r0); - Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); - Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); Vector128 g2 = Sse2.Add(y1.AsUInt16(), K8708.AsUInt16()); Vector128 g3 = Sse2.Add(g0, g1); Vector128 g4 = Sse2.Subtract(g2, g3); From 0c057278fdac4ceec2568b1e6d6faa3dd2ce2945 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 15:52:06 
+0100 Subject: [PATCH 11/20] Add InliningOptions.ShortMethod to LoadHigh --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 251060cee..d40b674e6 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -722,7 +722,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); - + Vector128 r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16()); Vector128 r2 = Sse2.Add(r1, r0); @@ -734,13 +734,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 b1 = Sse2.AddSaturate(b0, y1); Vector128 b2 = Sse2.SubtractSaturate(b1, K17685.AsUInt16()); - // use logical shift for B2, which can be larger than 32767 + // Use logical shift for B2, which can be larger than 32767. r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815] g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710] b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] } // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
+ [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 LoadHigh(Span src) { Vector64 tmp = Unsafe.As>(ref MemoryMarshal.GetReference(src)); From d58dde006067a56ba289b6ce5bb93a502fb5ec30 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 16:02:23 +0100 Subject: [PATCH 12/20] Group load uv vectors together --- .../Formats/Webp/Lossy/YuvConversion.cs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index d40b674e6..75a9963a5 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -714,9 +714,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Convert 32 samples of YUV444 to R/G/B private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector128 y0 = LoadHigh(y); - Vector128 u0 = LoadHigh(u); - Vector128 v0 = LoadHigh(v); + Vector64 yTmp = Unsafe.As>(ref MemoryMarshal.GetReference(y)); + Vector64 uTmp = Unsafe.As>(ref MemoryMarshal.GetReference(u)); + Vector64 vTmp = Unsafe.As>(ref MemoryMarshal.GetReference(v)); + Vector128 y0 = LoadHigh(yTmp); + Vector128 u0 = LoadHigh(uTmp); + Vector128 v0 = LoadHigh(vTmp); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); @@ -742,11 +745,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
[MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadHigh(Span src) + private static Vector128 LoadHigh(Vector64 src) { - Vector64 tmp = Unsafe.As>(ref MemoryMarshal.GetReference(src)); - Vector128 tmp2 = Unsafe.As, Vector128>(ref tmp); - return Sse2.UnpackLow(Vector128.Zero, tmp2); + Vector128 tmp = Unsafe.As, Vector128>(ref src); + return Sse2.UnpackLow(Vector128.Zero, tmp); } #endif From 7cf0c32e9f02677d95399f00f3f4735364ccbaec Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 16:51:31 +0100 Subject: [PATCH 13/20] Pass in parameters as ref to UpSample32Pixels --- .../Formats/Webp/Lossy/YuvConversion.cs | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 75a9963a5..8e3b15389 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -179,10 +179,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // For UpSample32Pixels, 17 u/v values must be read-able for each block. int pos; int uvPos; + ref byte topURef = ref MemoryMarshal.GetReference(topU); + ref byte topVRef = ref MemoryMarshal.GetReference(topV); + ref byte curURef = ref MemoryMarshal.GetReference(curU); + ref byte curVRef = ref MemoryMarshal.GetReference(curV); for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) { - UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); - UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); + UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru); + UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); ConvertYuvToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); } @@ -213,13 +217,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. 
- private static void UpSample32Pixels(Span r1, Span r2, Span output) + private static void UpSample32Pixels(ref byte r1, ref byte r2, Span output) { // Load inputs. - Vector128 a = Unsafe.As>(ref MemoryMarshal.GetReference(r1)); - Vector128 b = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r1), 1)); - Vector128 c = Unsafe.As>(ref MemoryMarshal.GetReference(r2)); - Vector128 d = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r2), 1)); + Vector128 a = Unsafe.As>(ref r1); + Vector128 b = Unsafe.As>(ref Unsafe.Add(ref r1, 1)); + Vector128 c = Unsafe.As>(ref r2); + Vector128 d = Unsafe.As>(ref Unsafe.Add(ref r2, 1)); Vector128 s = Sse2.Average(a, d); // s = (a + d + 1) / 2 Vector128 t = Sse2.Average(b, c); // t = (b + c + 1) / 2 @@ -257,7 +261,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy r2.Slice(numPixels, length).Fill(r2[numPixels - 1]); } - UpSample32Pixels(r1, r2, output); + ref byte r1Ref = ref MemoryMarshal.GetReference(r1); + ref byte r2Ref = ref MemoryMarshal.GetReference(r2); + UpSample32Pixels(ref r1Ref, ref r2Ref, output); } // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 From cc5f7af71c2fc42f317a5244f0ea660aa3123636 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 12:50:39 +0100 Subject: [PATCH 14/20] Better version of LoadHigh --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 8e3b15389..d0a14db33 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -720,12 +720,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Convert 32 samples of YUV444 to R/G/B private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector64 yTmp = Unsafe.As>(ref 
MemoryMarshal.GetReference(y)); - Vector64 uTmp = Unsafe.As>(ref MemoryMarshal.GetReference(u)); - Vector64 vTmp = Unsafe.As>(ref MemoryMarshal.GetReference(v)); - Vector128 y0 = LoadHigh(yTmp); - Vector128 u0 = LoadHigh(uTmp); - Vector128 v0 = LoadHigh(vTmp); + Vector128 y0 = LoadHigh(ref MemoryMarshal.GetReference(y)); + Vector128 u0 = LoadHigh(ref MemoryMarshal.GetReference(u)); + Vector128 v0 = LoadHigh(ref MemoryMarshal.GetReference(v)); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); @@ -751,9 +748,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadHigh(Vector64 src) + private static Vector128 LoadHigh(ref byte src) { - Vector128 tmp = Unsafe.As, Vector128>(ref src); + Vector128 tmp = Unsafe.As>(ref src); return Sse2.UnpackLow(Vector128.Zero, tmp); } #endif From 65870b96f429d03beb75173447fc56a01fd6b305 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 13:14:38 +0100 Subject: [PATCH 15/20] Avoid branching inside loop --- .../Formats/Webp/Lossy/YuvConversion.cs | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index d0a14db33..a8286037b 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -183,11 +183,23 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy ref byte topVRef = ref MemoryMarshal.GetReference(topV); ref byte curURef = ref MemoryMarshal.GetReference(curU); ref byte curVRef = ref MemoryMarshal.GetReference(curV); - for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + if (bottomY != null) { - UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref 
curURef, uvPos), ru); - UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); - ConvertYuvToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + { + UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru); + UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); + ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + } + } + else + { + for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + { + UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru); + UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); + ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep); + } } // Process last block. @@ -205,9 +217,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (bottomY != default) { bottomY.Slice(pos, len - pos).CopyTo(tmpBottom); + ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + } + else + { + ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep); } - ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); if (bottomY != default) { @@ -588,14 +604,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS - private static void ConvertYuvToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + [MethodImpl(InliningOptions.ShortMethod)] + private static void ConvertYuvToBgrSse41(Span topY, Span topDst, Span ru, Span rv, int curX, int step) => YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); + + [MethodImpl(InliningOptions.ShortMethod)] + private static void ConvertYuvToBgrWithBottomYSse41(Span 
topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) { YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); - - if (bottomY != null) - { - YuvToBgrSse41(bottomY.Slice(curX), ru.Slice(64), rv.Slice(64), bottomDst.Slice(curX * step)); - } + YuvToBgrSse41(bottomY.Slice(curX), ru.Slice(64), rv.Slice(64), bottomDst.Slice(curX * step)); } private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) From 6293f72c809ab18ba844ba33ba0801474f6e81c5 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 13:57:23 +0100 Subject: [PATCH 16/20] Use ref parameters in ConvertYuv444ToBgrSse41 --- .../Formats/Webp/Lossy/YuvConversion.cs | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index a8286037b..cf211c16e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -616,10 +616,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) { - ConvertYuv444ToRgbSse41(y, u, v, out Vector128 r0, out Vector128 g0, out Vector128 b0); - ConvertYuv444ToRgbSse41(y.Slice(8), u.Slice(8), v.Slice(8), out Vector128 r1, out Vector128 g1, out Vector128 b1); - ConvertYuv444ToRgbSse41(y.Slice(16), u.Slice(16), v.Slice(16), out Vector128 r2, out Vector128 g2, out Vector128 b2); - ConvertYuv444ToRgbSse41(y.Slice(24), u.Slice(24), v.Slice(24), out Vector128 r3, out Vector128 g3, out Vector128 b3); + ref byte yRef = ref MemoryMarshal.GetReference(y); + ref byte uRef = ref MemoryMarshal.GetReference(u); + ref byte vRef = ref MemoryMarshal.GetReference(v); + ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0); + ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 
8), out Vector128 r1, out Vector128 g1, out Vector128 b1); + ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2); + ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3); // Cast to 8b and store as BBBBGGGGRRRR. Vector128 bgr0 = Sse2.PackUnsignedSaturate(b0, b1); @@ -733,12 +736,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy output5 = Ssse3.Shuffle(input1, shuffle2); } - // Convert 32 samples of YUV444 to R/G/B - private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) + // Convert 32 samples of YUV444 to B/G/R + private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector128 y0 = LoadHigh(ref MemoryMarshal.GetReference(y)); - Vector128 u0 = LoadHigh(ref MemoryMarshal.GetReference(u)); - Vector128 v0 = LoadHigh(ref MemoryMarshal.GetReference(v)); + Vector128 y0 = LoadHigh(ref y); + Vector128 u0 = LoadHigh(ref u); + Vector128 v0 = LoadHigh(ref v); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); From 2ca81aec3c83b36060bdd021a5f52688778fab6e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 14:07:57 +0100 Subject: [PATCH 17/20] Fill buffers with default values only in Debug mode --- src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs index d62d23e17..14bc19e8a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs @@ -76,10 +76,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy 
this.TmpVBuffer = memoryAllocator.Allocate((int)width); this.Pixels = memoryAllocator.Allocate((int)(width * height * 4)); +#if DEBUG + // Filling those buffers with 205, is only useful for debugging, + // so the default values are the same as the reference libwebp implementation. this.YuvBuffer.Memory.Span.Fill(205); this.CacheY.Memory.Span.Fill(205); this.CacheU.Memory.Span.Fill(205); this.CacheV.Memory.Span.Fill(205); +#endif this.Vp8BitReaders = new Vp8BitReader[WebpConstants.MaxNumPartitions]; } From cded607d5cd0d30bc381f08170f9e7f9dc8d91e8 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 14:27:41 +0100 Subject: [PATCH 18/20] Allocate clean buffers --- src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs index 14bc19e8a..003bdc268 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs @@ -66,15 +66,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int extraRows = WebpConstants.FilterExtraRows[(int)LoopFilter.Complex]; // assuming worst case: complex filter int extraY = extraRows * this.CacheYStride; int extraUv = extraRows / 2 * this.CacheUvStride; - this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY); - this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY); + this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY, AllocationOptions.Clean); + this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY, AllocationOptions.Clean); int cacheUvSize = (16 * this.CacheUvStride) + extraUv; - this.CacheU = memoryAllocator.Allocate(cacheUvSize); - this.CacheV = memoryAllocator.Allocate(cacheUvSize); - this.TmpYBuffer = memoryAllocator.Allocate((int)width); - this.TmpUBuffer = 
memoryAllocator.Allocate((int)width, AllocationOptions.Clean); + this.TmpVBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); + this.Pixels = memoryAllocator.Allocate((int)(width * height * 4), AllocationOptions.Clean); #if DEBUG // Filling those buffers with 205, is only useful for debugging, From 22537b226b6c9f7517ec1cee670157f756abbecf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 15:31:40 +0100 Subject: [PATCH 19/20] Revert "Allocate clean buffers": the tmp buffers do not need to be clean, they will be overwritten anyway This reverts commit cded607d5cd0d30bc381f08170f9e7f9dc8d91e8. 
--- src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs index 003bdc268..14bc19e8a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs @@ -66,15 +66,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int extraRows = WebpConstants.FilterExtraRows[(int)LoopFilter.Complex]; // assuming worst case: complex filter int extraY = extraRows * this.CacheYStride; int extraUv = extraRows / 2 * this.CacheUvStride; - this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY, AllocationOptions.Clean); - this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY, AllocationOptions.Clean); + this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY); + this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY); int cacheUvSize = (16 * this.CacheUvStride) + extraUv; - this.CacheU = memoryAllocator.Allocate(cacheUvSize, AllocationOptions.Clean); - this.CacheV = memoryAllocator.Allocate(cacheUvSize, AllocationOptions.Clean); - this.TmpYBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); - this.TmpUBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); - this.TmpVBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); - this.Pixels = memoryAllocator.Allocate((int)(width * height * 4), AllocationOptions.Clean); + this.CacheU = memoryAllocator.Allocate(cacheUvSize); + this.CacheV = memoryAllocator.Allocate(cacheUvSize); + this.TmpYBuffer = memoryAllocator.Allocate((int)width); + this.TmpUBuffer = memoryAllocator.Allocate((int)width); + this.TmpVBuffer = memoryAllocator.Allocate((int)width); + this.Pixels = memoryAllocator.Allocate((int)(width * height * 4)); #if DEBUG // Filling 
those buffers with 205, is only useful for debugging, From 7775c343049e1640dfd699aff0d005355081f042 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 25 Nov 2021 14:18:53 +0100 Subject: [PATCH 20/20] Group loading y, u, v together --- .../Formats/Webp/Lossy/YuvConversion.cs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index cf211c16e..16d458ed8 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -739,9 +739,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Convert 32 samples of YUV444 to B/G/R private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector128 y0 = LoadHigh(ref y); - Vector128 u0 = LoadHigh(ref u); - Vector128 v0 = LoadHigh(ref v); + // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. + Vector128 y0 = Unsafe.As>(ref y); + Vector128 u0 = Unsafe.As>(ref u); + Vector128 v0 = Unsafe.As>(ref v); + y0 = Sse2.UnpackLow(Vector128.Zero, y0); + u0 = Sse2.UnpackLow(Vector128.Zero, u0); + v0 = Sse2.UnpackLow(Vector128.Zero, v0); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); @@ -765,13 +769,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] } - // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadHigh(ref byte src) - { - Vector128 tmp = Unsafe.As>(ref src); - return Sse2.UnpackLow(Vector128.Zero, tmp); - } #endif [MethodImpl(InliningOptions.ShortMethod)]