From 7191acaf34f535bc883b99114eb811708ddb5064 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 17 Nov 2021 10:58:52 +0100 Subject: [PATCH 01/47] Move UpSample to YuvConversion class --- .../Formats/Webp/Lossy/WebpLossyDecoder.cs | 62 ++----------------- .../Formats/Webp/Lossy/YuvConversion.cs | 54 ++++++++++++++++ 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index 2f78842c63..b27ef88fbc 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -696,12 +696,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (y == 0) { // First line is special cased. We mirror the u/v samples at boundary. - this.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw); + YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw); } else { // We can finish the left-over line from previous call. - this.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw); + YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw); numLinesOut++; } @@ -714,7 +714,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy topV = curV; curU = curU.Slice(io.UvStride); curV = curV.Slice(io.UvStride); - this.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw); + YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw); curY = curY.Slice(ioStride2); dst = dst.Slice(bufferStride2); } @@ -736,67 +736,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Process the very last row of even-sized picture. 
if ((yEnd & 1) == 0) { - this.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw); + YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw); } } return numLinesOut; } - private void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) - { - int xStep = 3; - int lastPixelPair = (len - 1) >> 1; - uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample - uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample - uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); - - if (bottomY != null) - { - uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); - } - - for (int x = 1; x <= lastPixelPair; x++) - { - uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample - uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample - - // Precompute invariant values associated with first and second diagonals. 
- uint avg = tluv + tuv + luv + uv + 0x00080008u; - uint diag12 = (avg + (2 * (tuv + luv))) >> 3; - uint diag03 = (avg + (2 * (tluv + uv))) >> 3; - uv0 = (diag12 + tluv) >> 1; - uint uv1 = (diag03 + tuv) >> 1; - int xMul2 = x * 2; - YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); - YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); - - if (bottomY != null) - { - uv0 = (diag03 + luv) >> 1; - uv1 = (diag12 + uv) >> 1; - YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); - YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); - } - - tluv = tuv; - luv = uv; - } - - if ((len & 1) == 0) - { - uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); - if (bottomY != null) - { - uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); - } - } - } - private void DoTransform(uint bits, Span src, Span dst, Span scratch) { switch (bits >> 30) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index a9cf876c80..182437e54f 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -18,6 +18,60 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private const int YuvHalf = 1 << (YuvFix - 1); + public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + { + int xStep = 3; + int lastPixelPair = (len - 1) >> 1; + uint tluv = LoadUv(topU[0], topV[0]); // top-left sample + uint luv = LoadUv(curU[0], curV[0]); // left-sample 
+ uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; + YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); + + if (bottomY != null) + { + uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; + YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); + } + + for (int x = 1; x <= lastPixelPair; x++) + { + uint tuv = LoadUv(topU[x], topV[x]); // top sample + uint uv = LoadUv(curU[x], curV[x]); // sample + + // Precompute invariant values associated with first and second diagonals. + uint avg = tluv + tuv + luv + uv + 0x00080008u; + uint diag12 = (avg + (2 * (tuv + luv))) >> 3; + uint diag03 = (avg + (2 * (tluv + uv))) >> 3; + uv0 = (diag12 + tluv) >> 1; + uint uv1 = (diag03 + tuv) >> 1; + int xMul2 = x * 2; + YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); + YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); + + if (bottomY != null) + { + uv0 = (diag03 + luv) >> 1; + uv1 = (diag12 + uv) >> 1; + YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); + YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); + } + + tluv = tuv; + luv = uv; + } + + if ((len & 1) == 0) + { + uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; + YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); + if (bottomY != null) + { + uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; + YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); + } + } + } + /// /// Converts the RGB values of the image to YUV. 
/// From 59a11bf9011729ad5cf8f30f3ef21adf616bf0b9 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 17 Nov 2021 21:45:32 +0100 Subject: [PATCH 02/47] Add SSE41 version of UpSample --- .../Formats/Webp/Lossy/YuvConversion.cs | 332 ++++++++++++++++++ 1 file changed, 332 insertions(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 182437e54f..0f5c56c746 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -4,6 +4,11 @@ using System; using System.Buffers; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -18,8 +23,66 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private const int YuvHalf = 1 << (YuvFix - 1); +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector128 One = Vector128.Create((byte)1); + + // These constants are 14b fixed-point version of ITU-R BT.601 constants. 
+ // R = (19077 * y + 26149 * v - 14234) >> 6 + // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 + // B = (19077 * y + 33050 * u - 17685) >> 6 + private static readonly Vector128 K19077 = Vector128.Create((short)19077).AsByte(); + + private static readonly Vector128 K26149 = Vector128.Create((short)26149).AsByte(); + + private static readonly Vector128 K14234 = Vector128.Create((short)14234).AsByte(); + + // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic + private static readonly Vector128 K33050 = Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129); + + private static readonly Vector128 K17685 = Vector128.Create((short)17685).AsByte(); + + private static readonly Vector128 K6419 = Vector128.Create((short)6419).AsByte(); + + private static readonly Vector128 K13320 = Vector128.Create((short)13320).AsByte(); + + private static readonly Vector128 K8708 = Vector128.Create((short)8708).AsByte(); + + private static readonly Vector128 PlanarTo24Shuffle0 = Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5); + + private static readonly Vector128 PlanarTo24Shuffle1 = Vector128.Create(255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255); + + private static readonly Vector128 PlanarTo24Shuffle2 = Vector128.Create(255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255); + + private static readonly Vector128 PlanarTo24Shuffle3 = Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255); + + private static readonly Vector128 PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10); + + private static readonly Vector128 PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15, 255); + + private static readonly Vector128 PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 
255, 4, 255); + + private static readonly Vector128 PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255); + + private static readonly Vector128 PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15); +#endif + + // UpSample from YUV to RGB. public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse41.IsSupported) + { + UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len); + } + else +#endif + { + UpSampleScalar(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len); + } + } + + public static void UpSampleScalar(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + { int xStep = 3; int lastPixelPair = (len - 1) >> 1; uint tluv = LoadUv(topU[0], topV[0]); // top-left sample @@ -72,6 +135,106 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } +#if SUPPORTS_RUNTIME_INTRINSICS + // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows + // u = (9*a + 3*b + 3*c + d + 8) / 16 + // = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2 + // = (a + m + 1) / 2 + // where m = (a + 3*b + 3*c + d) / 8 + // = ((a + b + c + d) / 2 + b + c) / 4 + // + // Let's say k = (a + b + c + d) / 4. + // We can compute k as + // k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1 + // where s = (a + d + 1) / 2 and t = (b + c + 1) / 2 + // + // Then m can be written as + // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 + public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + { + const int xStep = 3; + byte[] uvBuffer = new byte[(14 * 32) + 15]; + Span ru = uvBuffer.AsSpan(15); + Span rv = ru.Slice(32); + + // Treat the first pixel in regular way. 
+ int uDiag = ((topU[0] + curU[0]) >> 1) + 1; + int vDiag = ((topV[0] + curV[0]) >> 1) + 1; + int u0t = (topU[0] + uDiag) >> 1; + int v0t = (topV[0] + vDiag) >> 1; + YuvToBgr(topY[0], u0t, v0t, topDst); + if (bottomY != null) + { + int u0b = (curU[0] + uDiag) >> 1; + int v0b = (curV[0] + vDiag) >> 1; + YuvToBgr(bottomY[0], u0b, v0b, bottomDst); + } + + // For UpSample32Pixels, 17 u/v values must be read-able for each block. + for (int pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + { + UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); + UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); + ConvertToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + } + } + + // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. + public static void UpSample32Pixels(Span r1, Span r2, Span output) + { + // Load inputs. + Vector128 a = Unsafe.As>(ref MemoryMarshal.GetReference(r1)); + Vector128 b = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r1), 1)); + Vector128 c = Unsafe.As>(ref MemoryMarshal.GetReference(r2)); + Vector128 d = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r2), 1)); + + Vector128 s = Sse2.Average(a, d); // s = (a + d + 1) / 2 + Vector128 t = Sse2.Average(b, c); // t = (b + c + 1) / 2 + Vector128 st = Sse2.Xor(s, t); // st = s^t + + Vector128 ad = Sse2.Xor(a, d); // ad = a^d + Vector128 bc = Sse2.Xor(b, c); // bc = b^c + + Vector128 t1 = Sse2.Or(ad, bc); // (a^d) | (b^c) + Vector128 t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t) + Vector128 t3 = Sse2.And(t2, One); // (a^d) | (b^c) | (s^t) & 1 + Vector128 t4 = Sse2.Average(s, t); + Vector128 k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4 + + Vector128 diag1 = GetM(k, st, bc, t); + Vector128 diag2 = GetM(k, st, ad, s); + + // Pack the alternate pixels. + PackAndStore(a, b, diag1, diag2, output); // store top. 
+ PackAndStore(c, d, diag2, diag1, output.Slice(2 * 32)); + } + + // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 + private static Vector128 GetM(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) + { + Vector128 tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2 + Vector128 tmp1 = Sse2.And(ij, st); // (ij) & (s^t) + Vector128 tmp2 = Sse2.Xor(k, input); // (k^in) + Vector128 tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in) + Vector128 tmp4 = Sse2.And(tmp3, One); // & 1 -> lsb_correction + + return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction + } + + private static void PackAndStore(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output) + { + Vector128 ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16 + Vector128 tb = Sse2.Average(b, db); // (3a + 9b + c + 3d + 8) / 16 + Vector128 t1 = Sse2.UnpackLow(ta, tb); + Vector128 t2 = Sse2.UnpackHigh(ta, tb); + + ref byte output0Ref = ref MemoryMarshal.GetReference(output); + ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16); + Unsafe.As>(ref output0Ref) = t1; + Unsafe.As>(ref output1Ref) = t2; + } +#endif + /// /// Converts the RGB values of the image to YUV. 
/// @@ -366,6 +529,175 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy bgr[0] = (byte)YuvToB(y, u); } +#if SUPPORTS_RUNTIME_INTRINSICS + + private static void ConvertToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + { + YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); + + if (bottomY != null) + { + YuvToBgrSse41(bottomY.Slice(curX), ru.Slice(64), rv.Slice(64), bottomDst.Slice(curX * step)); + } + } + + public static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) + { + ConvertYuv444ToRgbSse41(y, u, v, out Vector128 r0, out Vector128 g0, out Vector128 b0); + ConvertYuv444ToRgbSse41(y.Slice(8), u.Slice(8), v.Slice(8), out Vector128 r1, out Vector128 g1, out Vector128 b1); + ConvertYuv444ToRgbSse41(y.Slice(16), u.Slice(16), v.Slice(16), out Vector128 r2, out Vector128 g2, out Vector128 b2); + ConvertYuv444ToRgbSse41(y.Slice(24), u.Slice(24), v.Slice(24), out Vector128 r3, out Vector128 g3, out Vector128 b3); + + // Cast to 8b and store as BBBBGGGGRRRR. + Vector128 bgr0 = Sse2.PackUnsignedSaturate(b0, b1); + Vector128 bgr1 = Sse2.PackUnsignedSaturate(b2, b3); + Vector128 bgr2 = Sse2.PackUnsignedSaturate(g0, g1); + Vector128 bgr3 = Sse2.PackUnsignedSaturate(g2, g3); + Vector128 bgr4 = Sse2.PackUnsignedSaturate(r0, r1); + Vector128 bgr5 = Sse2.PackUnsignedSaturate(r2, r3); + + // Pack as BGRBGRBGRBGR. + PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst); + } + + // Pack the planar buffers + // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... + // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... + private static void PlanarTo24bSse41(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb) + { + // The input is 6 registers of sixteen 8b but for the sake of explanation, + // let's take 6 registers of four 8b values. 
+ // To pack, we will keep taking one every two 8b integer and move it + // around as follows: + // Input: + // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 + // Split the 6 registers in two sets of 3 registers: the first set as the even + // 8b bytes, the second the odd ones: + // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 + // Repeat the same permutations twice more: + // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 + // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 + + // Process R. + ChannelMixing( + input0, + input1, + PlanarTo24Shuffle0, + PlanarTo24Shuffle1, + PlanarTo24Shuffle2, + out Vector128 r0, + out Vector128 r1, + out Vector128 r2, + out Vector128 r3, + out Vector128 r4, + out Vector128 r5); + + // Process G. + // Same as before, just shifted to the left by one and including the right padding. + ChannelMixing( + input2, + input3, + PlanarTo24Shuffle3, + PlanarTo24Shuffle4, + PlanarTo24Shuffle5, + out Vector128 g0, + out Vector128 g1, + out Vector128 g2, + out Vector128 g3, + out Vector128 g4, + out Vector128 g5); + + // Process B. + ChannelMixing( + input4, + input5, + PlanarTo24Shuffle6, + PlanarTo24Shuffle7, + PlanarTo24Shuffle8, + out Vector128 b0, + out Vector128 b1, + out Vector128 b2, + out Vector128 b3, + out Vector128 b4, + out Vector128 b5); + + // OR the different channels. 
+ Vector128 rg0 = Sse2.Or(r0, g0); + Vector128 rg1 = Sse2.Or(r1, g1); + Vector128 rg2 = Sse2.Or(r2, g2); + Vector128 rg3 = Sse2.Or(r3, g3); + Vector128 rg4 = Sse2.Or(r4, g4); + Vector128 rg5 = Sse2.Or(r5, g5); + + ref byte outputRef = ref MemoryMarshal.GetReference(rgb); + Unsafe.As>(ref outputRef) = Sse2.Or(rg0, b0); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5); + } + + // Shuffles the input buffer as A0 0 0 A1 0 0 A2 + private static void ChannelMixing( + Vector128 input0, + Vector128 input1, + Vector128 shuffle0, + Vector128 shuffle1, + Vector128 shuffle2, + out Vector128 output0, + out Vector128 output1, + out Vector128 output2, + out Vector128 output3, + out Vector128 output4, + out Vector128 output5) + { + output0 = Ssse3.Shuffle(input0, shuffle0); + output1 = Ssse3.Shuffle(input0, shuffle1); + output2 = Ssse3.Shuffle(input0, shuffle2); + output3 = Ssse3.Shuffle(input1, shuffle0); + output4 = Ssse3.Shuffle(input1, shuffle1); + output5 = Ssse3.Shuffle(input1, shuffle2); + } + + // Convert 8 samples of YUV444 to R/G/B (called four times to cover a 32 sample block) + private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) + { + Vector128 y0 = LoadHigh(y); + Vector128 u0 = LoadHigh(u); + Vector128 v0 = LoadHigh(v); + + Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); + + Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); + Vector128 r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16()); + Vector128 r2 = Sse2.Add(r1, r0); + + Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); + Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); + Vector128 g2 = Sse2.Add(y1.AsUInt16(), K8708.AsUInt16()); +
Vector128 g3 = Sse2.Add(g0, g1); + Vector128 g4 = Sse2.Subtract(g2, g3); + + Vector128 b0 = Sse2.MultiplyHigh(u0.AsUInt16(), K33050.AsUInt16()); + Vector128 b1 = Sse2.AddSaturate(b0, y1); + Vector128 b2 = Sse2.SubtractSaturate(b1, K17685.AsUInt16()); + + // use logical shift for B2, which can be larger than 32767 + r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815] + g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710] + b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] + } + + // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. + private static Vector128 LoadHigh(Span src) + { + Vector64 tmp = Unsafe.As>(ref MemoryMarshal.GetReference(src)); + Vector128 tmp2 = Unsafe.As, Vector128>(ref tmp); + return Sse2.UnpackLow(Vector128.Zero, tmp2); + } +#endif + [MethodImpl(InliningOptions.ShortMethod)] public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); From 2a03d00c680da4f0f112eea1401421c94dd7e96e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 18 Nov 2021 11:21:25 +0100 Subject: [PATCH 03/47] Upsample last block --- .../Formats/Webp/Lossy/YuvConversion.cs | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 0f5c56c746..1c22087320 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -67,6 +67,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #endif // UpSample from YUV to RGB. 
+ // Given samples laid out in a square as: + // [a b] + // [c d] + // we interpolate u/v as: + // ([9*a + 3*b + 3*c + d 3*a + 9*b + c + 3*d] + [8 8]) / 16 + // ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] + [8 8]) / 16 public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -171,12 +177,33 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // For UpSample32Pixels, 17 u/v values must be read-able for each block. - for (int pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + int pos; + int uvPos; + for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) { UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); ConvertToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); } + + // Process last block. + if (len > 1) + { + int leftOver = ((len + 1) >> 1) - (pos >> 1); + Span tmpTopDst = ru.Slice(4 * 32); + Span tmpBottomDst = tmpTopDst.Slice(4 * 32); + Span tmpTop = tmpBottomDst.Slice(4 * 32); + Span tmpBottom = (bottomY == null) ? null : tmpTop.Slice(32); + UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru); + UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); + topY.Slice(pos, len - pos).CopyTo(tmpTop); + ConvertToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); + if (bottomY != null) + { + tmpBottomDst.Slice(0, (len - pos) * xStep).CopyTo(bottomDst.Slice(pos * xStep)); + } + } } // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
@@ -209,6 +236,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy PackAndStore(c, d, diag2, diag1, output.Slice(2 * 32)); } + private static void UpSampleLastBlock(Span tb, Span bb, int numPixels, Span output) + { + Span r1 = stackalloc byte[17]; + Span r2 = stackalloc byte[17]; + tb.Slice(0, numPixels).CopyTo(r1); + bb.Slice(0, numPixels).CopyTo(r2); + + // Replicate last byte. + int length = 17 - numPixels; + if (length > 0) + { + r1.Slice(numPixels, length).Fill(r1[numPixels - 1]); + r2.Slice(numPixels, length).Fill(r2[numPixels - 1]); + } + + UpSample32Pixels(r1, r2, output); + } + // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 private static Vector128 GetM(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) { From 3f4388323b3d8ac4efc50606ddb19128cc96b6cf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 18 Nov 2021 13:28:35 +0100 Subject: [PATCH 04/47] Fix shuffle masks --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 1c22087320..080739db9a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -57,13 +57,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private static readonly Vector128 PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10); - private static readonly Vector128 PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15, 255); + private static readonly Vector128 PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255); private static readonly Vector128 PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255); private static readonly Vector128 
PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255); - private static readonly Vector128 PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15); + private static readonly Vector128 PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15); #endif // UpSample from YUV to RGB. @@ -183,7 +183,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); - ConvertToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + ConvertYuvToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); } // Process last block. @@ -197,7 +197,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru); UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); topY.Slice(pos, len - pos).CopyTo(tmpTop); - ConvertToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); if (bottomY != null) { @@ -576,7 +576,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS - private static void ConvertToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + private static void ConvertYuvToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) { YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); From ec18321a814739bfd5ab93423c562e17168f6364 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 18 Nov 2021 14:56:21 +0100 Subject: [PATCH 05/47] Fix last block --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 6 
++++++ 1 file changed, 6 insertions(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 080739db9a..3413f6f183 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -196,7 +196,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Span tmpBottom = (bottomY == null) ? null : tmpTop.Slice(32); UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru); UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); + topY.Slice(pos, len - pos).CopyTo(tmpTop); + if (bottomY != null) + { + bottomY.Slice(pos, len - pos).CopyTo(tmpBottom); + } + ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); if (bottomY != null) From c223d2eadbc0be25d6a493251a765cd6944719ac Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 19 Nov 2021 22:04:03 +1100 Subject: [PATCH 06/47] Avoid implicit casting --- .../Formats/Webp/Lossy/WebpLossyDecoder.cs | 4 ++-- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index b27ef88fbc..4d21333e6e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -696,7 +696,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (y == 0) { // First line is special cased. We mirror the u/v samples at boundary. - YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw); } else { @@ -736,7 +736,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Process the very last row of even-sized picture. 
if ((yEnd & 1) == 0) { - YuvConversion.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw); } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 3413f6f183..342fc330c9 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -96,7 +96,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); - if (bottomY != null) + if (bottomY != default) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); @@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); - if (bottomY != null) + if (bottomY != default) { uv0 = (diag03 + luv) >> 1; uv1 = (diag12 + uv) >> 1; @@ -133,7 +133,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); - if (bottomY != null) + if (bottomY != default) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); @@ -169,7 +169,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int u0t = (topU[0] + uDiag) >> 1; int v0t = (topV[0] + vDiag) >> 1; YuvToBgr(topY[0], u0t, v0t, topDst); - if (bottomY != null) + if (bottomY != default) { int u0b = (curU[0] + uDiag) >> 1; int v0b = (curV[0] + vDiag) >> 1; @@ -198,14 +198,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy 
UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv); topY.Slice(pos, len - pos).CopyTo(tmpTop); - if (bottomY != null) + if (bottomY != default) { bottomY.Slice(pos, len - pos).CopyTo(tmpBottom); } ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); - if (bottomY != null) + if (bottomY != default) { tmpBottomDst.Slice(0, (len - pos) * xStep).CopyTo(bottomDst.Slice(pos * xStep)); } From 595492491e54ecfc760efa745ce170002ac3a3c0 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 13:10:50 +0100 Subject: [PATCH 07/47] Add upsample tests --- .../Formats/WebP/YuvConversionTests.cs | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs b/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs index 65b4b987e3..76dd207fce 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs @@ -2,10 +2,14 @@ // Licensed under the Apache License, Version 2.0. 
using System; +using System.IO; using SixLabors.ImageSharp.Advanced; +using SixLabors.ImageSharp.Formats.Webp; using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tests.TestUtilities; +using SixLabors.ImageSharp.Tests.TestUtilities.ReferenceCodecs; using Xunit; namespace SixLabors.ImageSharp.Tests.Formats.Webp @@ -13,6 +17,34 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class YuvConversionTests { + private static WebpDecoder WebpDecoder => new(); + + private static MagickReferenceDecoder ReferenceDecoder => new(); + + private static string TestImageLossyFullPath => Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, TestImages.Webp.Lossy.NoFilter06); + + public static void RunUpSampleYuvToRgbTest() + { + var provider = TestImageProvider.File(TestImageLossyFullPath); + using (Image image = provider.GetImage(WebpDecoder)) + { + image.DebugSave(provider); + image.CompareToOriginal(provider, ReferenceDecoder); + } + } + + [Fact] + public void UpSampleYuvToRgb_Works() => RunUpSampleYuvToRgbTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void UpSampleYuvToRgb_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunUpSampleYuvToRgbTest, HwIntrinsics.AllowAll); + + [Fact] + public void UpSampleYuvToRgb_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunUpSampleYuvToRgbTest, HwIntrinsics.DisableSSE2); + +#endif + [Theory] [WithFile(TestImages.Webp.Yuv, PixelTypes.Rgba32)] public void ConvertRgbToYuv_Works(TestImageProvider provider) From 1eb1e82a2f608c24b7d1ca40a7ab7f579dcdfe8b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 13:25:01 +0100 Subject: [PATCH 08/47] Avoid allocating uvBuffer on each upscale call --- src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs | 9 +++++---- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 8 ++++---- 2 files 
changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index 4d21333e6e..202df9039e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -692,16 +692,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int mbw = io.MbW; int uvw = (mbw + 1) / 2; int y = io.MbY; + byte[] uvBuffer = new byte[(14 * 32) + 15]; if (y == 0) { // First line is special cased. We mirror the u/v samples at boundary. - YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw, uvBuffer); } else { // We can finish the left-over line from previous call. - YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw); + YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw, uvBuffer); numLinesOut++; } @@ -714,7 +715,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy topV = curV; curU = curU.Slice(io.UvStride); curV = curV.Slice(io.UvStride); - YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw); + YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw, uvBuffer); curY = curY.Slice(ioStride2); dst = dst.Slice(bufferStride2); } @@ -736,7 +737,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Process the very last row of even-sized picture. 
if ((yEnd & 1) == 0) { - YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw); + YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw, uvBuffer); } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 342fc330c9..54d7ed65da 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -73,12 +73,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // we interpolate u/v as: // ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16 // ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16 - public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse41.IsSupported) { - UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len); + UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer); } else #endif @@ -156,10 +156,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // // Then m can be written as // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 - public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { const int xStep = 3; - byte[] uvBuffer = new byte[(14 * 32) + 15]; + Array.Clear(uvBuffer, 0, uvBuffer.Length); Span ru = uvBuffer.AsSpan(15); Span rv = ru.Slice(32); From c59ae02e64ce3c905e566837e48a66ceac3b3459 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 
14:02:45 +0100 Subject: [PATCH 09/47] Change some methods to be private --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 54d7ed65da..18cff15781 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -87,7 +87,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } - public static void UpSampleScalar(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) + private static void UpSampleScalar(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len) { int xStep = 3; int lastPixelPair = (len - 1) >> 1; @@ -156,7 +156,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // // Then m can be written as // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 - public static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) + private static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { const int xStep = 3; Array.Clear(uvBuffer, 0, uvBuffer.Length); @@ -213,7 +213,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. - public static void UpSample32Pixels(Span r1, Span r2, Span output) + private static void UpSample32Pixels(Span r1, Span r2, Span output) { // Load inputs. 
Vector128 a = Unsafe.As>(ref MemoryMarshal.GetReference(r1)); @@ -592,7 +592,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } - public static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) + private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) { ConvertYuv444ToRgbSse41(y, u, v, out Vector128 r0, out Vector128 g0, out Vector128 b0); ConvertYuv444ToRgbSse41(y.Slice(8), u.Slice(8), v.Slice(8), out Vector128 r1, out Vector128 g1, out Vector128 b1); From c5170f950418c7ced1d9bf5fd75ada5a51180e0a Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Fri, 19 Nov 2021 15:50:03 +0100 Subject: [PATCH 10/47] Re-grouping the code to do identical operations Co-authored-by: Anton Firszov --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 18cff15781..251060ceef 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -719,13 +719,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 v0 = LoadHigh(v); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); - Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); + Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); + Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); + Vector128 r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16()); Vector128 r2 = Sse2.Add(r1, r0); - Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); - Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); Vector128 g2 = Sse2.Add(y1.AsUInt16(), K8708.AsUInt16()); Vector128 g3 = Sse2.Add(g0, g1); Vector128 g4 = Sse2.Subtract(g2, g3); From 0c057278fdac4ceec2568b1e6d6faa3dd2ce2945 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 
15:52:06 +0100 Subject: [PATCH 11/47] Add InliningOptions.ShortMethod to LoadHigh --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 251060ceef..d40b674e6e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -722,7 +722,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16()); Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16()); - + Vector128 r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16()); Vector128 r2 = Sse2.Add(r1, r0); @@ -734,13 +734,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 b1 = Sse2.AddSaturate(b0, y1); Vector128 b2 = Sse2.SubtractSaturate(b1, K17685.AsUInt16()); - // use logical shift for B2, which can be larger than 32767 + // Use logical shift for B2, which can be larger than 32767. r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815] g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710] b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] } // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
+ [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 LoadHigh(Span src) { Vector64 tmp = Unsafe.As>(ref MemoryMarshal.GetReference(src)); From d58dde006067a56ba289b6ce5bb93a502fb5ec30 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 16:02:23 +0100 Subject: [PATCH 12/47] Group load uv vectors together --- .../Formats/Webp/Lossy/YuvConversion.cs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index d40b674e6e..75a9963a52 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -714,9 +714,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Convert 32 samples of YUV444 to R/G/B private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector128 y0 = LoadHigh(y); - Vector128 u0 = LoadHigh(u); - Vector128 v0 = LoadHigh(v); + Vector64 yTmp = Unsafe.As>(ref MemoryMarshal.GetReference(y)); + Vector64 uTmp = Unsafe.As>(ref MemoryMarshal.GetReference(u)); + Vector64 vTmp = Unsafe.As>(ref MemoryMarshal.GetReference(v)); + Vector128 y0 = LoadHigh(yTmp); + Vector128 u0 = LoadHigh(uTmp); + Vector128 v0 = LoadHigh(vTmp); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); @@ -742,11 +745,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
[MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadHigh(Span src) + private static Vector128 LoadHigh(Vector64 src) { - Vector64 tmp = Unsafe.As>(ref MemoryMarshal.GetReference(src)); - Vector128 tmp2 = Unsafe.As, Vector128>(ref tmp); - return Sse2.UnpackLow(Vector128.Zero, tmp2); + Vector128 tmp = Unsafe.As, Vector128>(ref src); + return Sse2.UnpackLow(Vector128.Zero, tmp); } #endif From 7cf0c32e9f02677d95399f00f3f4735364ccbaec Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 19 Nov 2021 16:51:31 +0100 Subject: [PATCH 13/47] Pass in parameters as ref to UpSample32Pixels --- .../Formats/Webp/Lossy/YuvConversion.cs | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 75a9963a52..8e3b153891 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -179,10 +179,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // For UpSample32Pixels, 17 u/v values must be read-able for each block. int pos; int uvPos; + ref byte topURef = ref MemoryMarshal.GetReference(topU); + ref byte topVRef = ref MemoryMarshal.GetReference(topV); + ref byte curURef = ref MemoryMarshal.GetReference(curU); + ref byte curVRef = ref MemoryMarshal.GetReference(curV); for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) { - UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru); - UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv); + UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru); + UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); ConvertYuvToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); } @@ -213,13 +217,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. 
- private static void UpSample32Pixels(Span r1, Span r2, Span output) + private static void UpSample32Pixels(ref byte r1, ref byte r2, Span output) { // Load inputs. - Vector128 a = Unsafe.As>(ref MemoryMarshal.GetReference(r1)); - Vector128 b = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r1), 1)); - Vector128 c = Unsafe.As>(ref MemoryMarshal.GetReference(r2)); - Vector128 d = Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r2), 1)); + Vector128 a = Unsafe.As>(ref r1); + Vector128 b = Unsafe.As>(ref Unsafe.Add(ref r1, 1)); + Vector128 c = Unsafe.As>(ref r2); + Vector128 d = Unsafe.As>(ref Unsafe.Add(ref r2, 1)); Vector128 s = Sse2.Average(a, d); // s = (a + d + 1) / 2 Vector128 t = Sse2.Average(b, c); // t = (b + c + 1) / 2 @@ -257,7 +261,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy r2.Slice(numPixels, length).Fill(r2[numPixels - 1]); } - UpSample32Pixels(r1, r2, output); + ref byte r1Ref = ref MemoryMarshal.GetReference(r1); + ref byte r2Ref = ref MemoryMarshal.GetReference(r2); + UpSample32Pixels(ref r1Ref, ref r2Ref, output); } // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 From bab85d4372ee7cc784acc7d743ffd2c6886ea460 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 22:17:12 +0100 Subject: [PATCH 14/47] Add SSE version of CombinedShannonEntropy --- .../Formats/Webp/Lossless/LosslessUtils.cs | 154 ++++++++++++++++-- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 471c083cda..52453c77fb 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -759,28 +759,147 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// Shanon entropy. 
public static float CombinedShannonEntropy(Span x, Span y) { - double retVal = 0.0d; - uint sumX = 0, sumXY = 0; - for (int i = 0; i < 256; i++) - { - uint xi = (uint)x[i]; - if (xi != 0) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse41.IsSupported) + { + double retVal = 0.0d; + Span tmp = stackalloc int[4]; + ref int xRef = ref MemoryMarshal.GetReference(x); + ref int yRef = ref MemoryMarshal.GetReference(y); + Vector128 sumXY128 = Vector128.Zero; + Vector128 sumX128 = Vector128.Zero; + ref int tmpRef = ref MemoryMarshal.GetReference(tmp); + for (int i = 0; i < 256; i += 4) { - uint xy = xi + (uint)y[i]; - sumX += xi; - retVal -= FastSLog2(xi); - sumXY += xy; - retVal -= FastSLog2(xy); + Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + + // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. + if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) + { + Vector128 xy128 = Sse2.Add(xVec, yVec); + sumXY128 = Sse2.Add(sumXY128, xy128); + sumX128 = Sse2.Add(sumX128, xVec); + + // Analyze the different X + Y. + Unsafe.As>(ref tmpRef) = xy128; + if (tmp[0] != 0) + { + retVal -= FastSLog2((uint)tmp[0]); + if (x[i + 0] != 0) + { + retVal -= FastSLog2((uint)x[i + 0]); + } + } + + if (tmp[1] != 0) + { + retVal -= FastSLog2((uint)tmp[1]); + if (x[i + 1] != 0) + { + retVal -= FastSLog2((uint)x[i + 1]); + } + } + + if (tmp[2] != 0) + { + retVal -= FastSLog2((uint)tmp[2]); + if (x[i + 2] != 0) + { + retVal -= FastSLog2((uint)x[i + 2]); + } + } + + if (tmp[3] != 0) + { + retVal -= FastSLog2((uint)tmp[3]); + if (x[i + 3] != 0) + { + retVal -= FastSLog2((uint)x[i + 3]); + } + } + } + else + { + // X is fully 0, so only deal with Y. 
+ sumXY128 = Sse2.Add(sumXY128, yVec); + + if (y[i] != 0) + { + retVal -= FastSLog2((uint)y[i]); + } + + if (y[i + 1] != 0) + { + retVal -= FastSLog2((uint)y[i + 1]); + } + + if (y[i + 2] != 0) + { + retVal -= FastSLog2((uint)y[i + 2]); + } + + if (y[i + 3] != 0) + { + retVal -= FastSLog2((uint)y[i + 3]); + } + } } - else if (y[i] != 0) + + // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. + // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. + Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); + Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); + Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); + Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); + Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); + Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); + int sumX = Sse2.ConvertToInt32(tmpSumX); + int sumXY = Sse2.ConvertToInt32(tmpSumXY); + + retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); + + return (float)retVal; + } + else +#endif + { + double retVal = 0.0d; + uint sumX = 0, sumXY = 0; + for (int i = 0; i < 256; i++) { - sumXY += (uint)y[i]; - retVal -= FastSLog2((uint)y[i]); + uint xi = (uint)x[i]; + if (xi != 0) + { + uint xy = xi + (uint)y[i]; + sumX += xi; + retVal -= FastSLog2(xi); + sumXY += xy; + retVal -= FastSLog2(xy); + } + else if (y[i] != 0) + { + sumXY += (uint)y[i]; + retVal -= FastSLog2((uint)y[i]); + } } + + retVal += FastSLog2(sumX) + FastSLog2(sumXY); + return (float)retVal; } + } - retVal += FastSLog2(sumX) + FastSLog2(sumXY); - return (float)retVal; + [MethodImpl(InliningOptions.ShortMethod)] + private static void AnalyzeXy(Span tmp, Span x, int i, int pos, ref double retVal) + { + if (tmp[pos] != 0) + { + retVal -= FastSLog2((uint)tmp[pos]); + if (x[i + pos] != 0) + { + retVal -= FastSLog2((uint)x[i + pos]); + } + } } [MethodImpl(InliningOptions.ShortMethod)] @@ -836,6 +955,7 @@ namespace 
SixLabors.ImageSharp.Formats.Webp.Lossless private static float FastSLog2Slow(uint v) { DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + if (v < ApproxLogWithCorrectionMax) { int logCnt = 0; @@ -865,7 +985,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static float FastLog2Slow(uint v) { - Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); if (v < ApproxLogWithCorrectionMax) { From cc430cc84626edf63c187f97fe37f6d4ad2ca0da Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 12:56:45 +0100 Subject: [PATCH 15/47] Avoid bounds checks --- .../Formats/Webp/Lossless/LosslessUtils.cs | 61 ++++++++----------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 52453c77fb..0f24e8e8f3 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -783,39 +783,39 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless // Analyze the different X + Y. 
Unsafe.As>(ref tmpRef) = xy128; - if (tmp[0] != 0) + if (tmpRef != 0) { - retVal -= FastSLog2((uint)tmp[0]); - if (x[i + 0] != 0) + retVal -= FastSLog2((uint)tmpRef); + if (Unsafe.Add(ref xRef, i) != 0) { - retVal -= FastSLog2((uint)x[i + 0]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i)); } } - if (tmp[1] != 0) + if (Unsafe.Add(ref tmpRef, 1) != 0) { - retVal -= FastSLog2((uint)tmp[1]); - if (x[i + 1] != 0) + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1)); + if (Unsafe.Add(ref xRef, i + 1) != 0) { - retVal -= FastSLog2((uint)x[i + 1]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1)); } } - if (tmp[2] != 0) + if (Unsafe.Add(ref tmpRef, 2) != 0) { - retVal -= FastSLog2((uint)tmp[2]); - if (x[i + 2] != 0) + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2)); + if (Unsafe.Add(ref xRef, i + 2) != 0) { - retVal -= FastSLog2((uint)x[i + 2]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2)); } } - if (tmp[3] != 0) + if (Unsafe.Add(ref tmpRef, 3) != 0) { - retVal -= FastSLog2((uint)tmp[3]); - if (x[i + 3] != 0) + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3)); + if (Unsafe.Add(ref xRef, i + 3) != 0) { - retVal -= FastSLog2((uint)x[i + 3]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); } } } @@ -824,24 +824,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless // X is fully 0, so only deal with Y. 
sumXY128 = Sse2.Add(sumXY128, yVec); - if (y[i] != 0) + if (Unsafe.Add(ref yRef, i) != 0) { - retVal -= FastSLog2((uint)y[i]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i)); } - if (y[i + 1] != 0) + if (Unsafe.Add(ref yRef, i + 1) != 0) { - retVal -= FastSLog2((uint)y[i + 1]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1)); } - if (y[i + 2] != 0) + if (Unsafe.Add(ref yRef, i + 2) != 0) { - retVal -= FastSLog2((uint)y[i + 2]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2)); } - if (y[i + 3] != 0) + if (Unsafe.Add(ref yRef, i + 3) != 0) { - retVal -= FastSLog2((uint)y[i + 3]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); } } } @@ -889,19 +889,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } } - [MethodImpl(InliningOptions.ShortMethod)] - private static void AnalyzeXy(Span tmp, Span x, int i, int pos, ref double retVal) - { - if (tmp[pos] != 0) - { - retVal -= FastSLog2((uint)tmp[pos]); - if (x[i + pos] != 0) - { - retVal -= FastSLog2((uint)x[i + pos]); - } - } - } - [MethodImpl(InliningOptions.ShortMethod)] public static byte TransformColorRed(sbyte greenToRed, uint argb) { From ed8bd615f2be3cafd1a23782e9f7d07c6375d967 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 15:38:46 +0100 Subject: [PATCH 16/47] Faster SSE2 version of ShanonEntropy --- .../Formats/Webp/Lossless/LosslessUtils.cs | 115 +++++------------- 1 file changed, 29 insertions(+), 86 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 0f24e8e8f3..68004275bd 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -6,6 +6,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using SixLabors.ImageSharp.Memory; #if SUPPORTS_RUNTIME_INTRINSICS +using System.Numerics; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -706,7 
+707,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int colorMapLength4 = 4 * newColorMap.Length; for (; i < colorMapLength4; i++) { - newData[i] = 0; // black tail. + newData[i] = 0; // black tail. } } @@ -760,103 +761,45 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static float CombinedShannonEntropy(Span x, Span y) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse41.IsSupported) + if (Sse2.IsSupported) { double retVal = 0.0d; - Span tmp = stackalloc int[4]; ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); - Vector128 sumXY128 = Vector128.Zero; - Vector128 sumX128 = Vector128.Zero; - ref int tmpRef = ref MemoryMarshal.GetReference(tmp); - for (int i = 0; i < 256; i += 4) - { - Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); - Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); - - // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. - if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) - { - Vector128 xy128 = Sse2.Add(xVec, yVec); - sumXY128 = Sse2.Add(sumXY128, xy128); - sumX128 = Sse2.Add(sumX128, xVec); - - // Analyze the different X + Y. 
- Unsafe.As>(ref tmpRef) = xy128; - if (tmpRef != 0) - { - retVal -= FastSLog2((uint)tmpRef); - if (Unsafe.Add(ref xRef, i) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i)); - } - } - if (Unsafe.Add(ref tmpRef, 1) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1)); - if (Unsafe.Add(ref xRef, i + 1) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1)); - } - } - - if (Unsafe.Add(ref tmpRef, 2) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2)); - if (Unsafe.Add(ref xRef, i + 2) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2)); - } - } - - if (Unsafe.Add(ref tmpRef, 3) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3)); - if (Unsafe.Add(ref xRef, i + 3) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); - } - } - } - else + int sumXY = 0; + int sumX = 0; + for (int i = 0; i < 256; i += 16) + { + Vector128 x0 = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector128 y0 = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + Vector128 x1 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 4)); + Vector128 y1 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 4)); + Vector128 x2 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 8)); + Vector128 y2 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 8)); + Vector128 x3 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 12)); + Vector128 y3 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 12)); + Vector128 x4 = Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(x0, x1), Sse2.PackSignedSaturate(x2, x3)); + Vector128 y4 = Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(y0, y1), Sse2.PackSignedSaturate(y2, y3)); + int mx = Sse2.MoveMask(Sse2.CompareGreaterThan(x4, Vector128.Zero).AsByte()); + int my = Sse2.MoveMask(Sse2.CompareGreaterThan(y4, Vector128.Zero).AsByte()) | mx; + while (my != 0) { - // X is fully 0, so only deal with Y. 
- sumXY128 = Sse2.Add(sumXY128, yVec); - - if (Unsafe.Add(ref yRef, i) != 0) + int j = BitOperations.TrailingZeroCount(my); + if (((mx >> j) & 1) != 0) { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i)); + int xij = Unsafe.Add(ref xRef, i + j); + sumXY += xij; + retVal -= FastSLog2((uint)xij); } - if (Unsafe.Add(ref yRef, i + 1) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1)); - } - - if (Unsafe.Add(ref yRef, i + 2) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2)); - } - - if (Unsafe.Add(ref yRef, i + 3) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); - } + int xy = Unsafe.Add(ref xRef, i + j) + Unsafe.Add(ref yRef, i + j); + sumX += xy; + retVal -= FastSLog2((uint)xy); + my &= my - 1; } } - // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. - // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. - Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); - Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); - Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); - Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); - Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); - Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); - int sumX = Sse2.ConvertToInt32(tmpSumX); - int sumXY = Sse2.ConvertToInt32(tmpSumXY); - retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); return (float)retVal; From b1df6a97487f1d8ae68da60eaf3953fe6727f523 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 20:34:22 +0100 Subject: [PATCH 17/47] Revert "Faster SSE2 version of ShanonEntropy" Profiling does not prove that this version is actually faster. 
--- .../Formats/Webp/Lossless/LosslessUtils.cs | 115 +++++++++++++----- 1 file changed, 86 insertions(+), 29 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 68004275bd..0f24e8e8f3 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -6,7 +6,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using SixLabors.ImageSharp.Memory; #if SUPPORTS_RUNTIME_INTRINSICS -using System.Numerics; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -707,7 +706,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int colorMapLength4 = 4 * newColorMap.Length; for (; i < colorMapLength4; i++) { - newData[i] = 0; // black tail. + newData[i] = 0; // black tail. } } @@ -761,45 +760,103 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static float CombinedShannonEntropy(Span x, Span y) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Sse41.IsSupported) { double retVal = 0.0d; + Span tmp = stackalloc int[4]; ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); - - int sumXY = 0; - int sumX = 0; - for (int i = 0; i < 256; i += 16) + Vector128 sumXY128 = Vector128.Zero; + Vector128 sumX128 = Vector128.Zero; + ref int tmpRef = ref MemoryMarshal.GetReference(tmp); + for (int i = 0; i < 256; i += 4) { - Vector128 x0 = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); - Vector128 y0 = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); - Vector128 x1 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 4)); - Vector128 y1 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 4)); - Vector128 x2 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 8)); - Vector128 y2 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 8)); - Vector128 x3 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 12)); - Vector128 y3 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 12)); - 
Vector128 x4 = Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(x0, x1), Sse2.PackSignedSaturate(x2, x3)); - Vector128 y4 = Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(y0, y1), Sse2.PackSignedSaturate(y2, y3)); - int mx = Sse2.MoveMask(Sse2.CompareGreaterThan(x4, Vector128.Zero).AsByte()); - int my = Sse2.MoveMask(Sse2.CompareGreaterThan(y4, Vector128.Zero).AsByte()) | mx; - while (my != 0) + Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + + // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. + if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) { - int j = BitOperations.TrailingZeroCount(my); - if (((mx >> j) & 1) != 0) + Vector128 xy128 = Sse2.Add(xVec, yVec); + sumXY128 = Sse2.Add(sumXY128, xy128); + sumX128 = Sse2.Add(sumX128, xVec); + + // Analyze the different X + Y. + Unsafe.As>(ref tmpRef) = xy128; + if (tmpRef != 0) + { + retVal -= FastSLog2((uint)tmpRef); + if (Unsafe.Add(ref xRef, i) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i)); + } + } + + if (Unsafe.Add(ref tmpRef, 1) != 0) { - int xij = Unsafe.Add(ref xRef, i + j); - sumXY += xij; - retVal -= FastSLog2((uint)xij); + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1)); + if (Unsafe.Add(ref xRef, i + 1) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1)); + } } - int xy = Unsafe.Add(ref xRef, i + j) + Unsafe.Add(ref yRef, i + j); - sumX += xy; - retVal -= FastSLog2((uint)xy); - my &= my - 1; + if (Unsafe.Add(ref tmpRef, 2) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2)); + if (Unsafe.Add(ref xRef, i + 2) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2)); + } + } + + if (Unsafe.Add(ref tmpRef, 3) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3)); + if (Unsafe.Add(ref xRef, i + 3) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); + } + } + } + else + { + // X is fully 0, 
so only deal with Y. + sumXY128 = Sse2.Add(sumXY128, yVec); + + if (Unsafe.Add(ref yRef, i) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i)); + } + + if (Unsafe.Add(ref yRef, i + 1) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1)); + } + + if (Unsafe.Add(ref yRef, i + 2) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2)); + } + + if (Unsafe.Add(ref yRef, i + 3) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); + } } } + // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. + // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. + Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); + Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); + Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); + Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); + Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); + Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); + int sumX = Sse2.ConvertToInt32(tmpSumX); + int sumXY = Sse2.ConvertToInt32(tmpSumXY); + retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); return (float)retVal; From 32b97f41fc564d32110d6939f398618a7d683fc6 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 22:21:46 +0100 Subject: [PATCH 18/47] Add AVX2 version of CombinedShannonEntropy --- src/ImageSharp/Common/Helpers/Numerics.cs | 15 +++ .../Formats/Webp/Lossless/LosslessUtils.cs | 97 ++++++++++++++----- 2 files changed, 89 insertions(+), 23 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ba5c588ca5..9dc13079d6 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -820,6 +820,21 @@ namespace SixLabors.ImageSharp } } + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int ReduceSum(Vector256 accumulator) + { + Vector128 vec0 = Avx2.ExtractVector128(accumulator, 0); + Vector128 vec1 = Avx2.ExtractVector128(accumulator, 1); + Vector128 sum128 = Sse2.Add(vec0, vec1); + + return ReduceSum(sum128); + } + /// /// Reduces even elements of the vector into one sum. /// diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 0f24e8e8f3..314f26d64d 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using SixLabors.ImageSharp.Memory; @@ -760,29 +761,30 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static float CombinedShannonEntropy(Span x, Span y) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse41.IsSupported) + if (Avx2.IsSupported) { double retVal = 0.0d; - Span tmp = stackalloc int[4]; + Span tmp = stackalloc int[8]; ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); - Vector128 sumXY128 = Vector128.Zero; - Vector128 sumX128 = Vector128.Zero; + Vector256 sumXY256 = Vector256.Zero; + Vector256 sumX256 = Vector256.Zero; ref int tmpRef = ref MemoryMarshal.GetReference(tmp); - for (int i = 0; i < 256; i += 4) + for (nint i = 0; i < 256; i += 8) { - Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); - Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + Vector256 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector256 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. 
- if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) + int mask = Avx2.MoveMask(Avx2.CompareEqual(xVec, Vector256.Zero).AsByte()); + if (mask != -1) { - Vector128 xy128 = Sse2.Add(xVec, yVec); - sumXY128 = Sse2.Add(sumXY128, xy128); - sumX128 = Sse2.Add(sumX128, xVec); + Vector256 xy256 = Avx2.Add(xVec, yVec); + sumXY256 = Avx2.Add(sumXY256, xy256); + sumX256 = Avx2.Add(sumX256, xVec); // Analyze the different X + Y. - Unsafe.As>(ref tmpRef) = xy128; + Unsafe.As>(ref tmpRef) = xy256; if (tmpRef != 0) { retVal -= FastSLog2((uint)tmpRef); @@ -818,11 +820,47 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); } } + + if (Unsafe.Add(ref tmpRef, 4) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 4)); + if (Unsafe.Add(ref xRef, i + 4) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 4)); + } + } + + if (Unsafe.Add(ref tmpRef, 5) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 5)); + if (Unsafe.Add(ref xRef, i + 5) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 5)); + } + } + + if (Unsafe.Add(ref tmpRef, 6) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 6)); + if (Unsafe.Add(ref xRef, i + 6) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 6)); + } + } + + if (Unsafe.Add(ref tmpRef, 7) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 7)); + if (Unsafe.Add(ref xRef, i + 7) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 7)); + } + } } else { // X is fully 0, so only deal with Y. 
- sumXY128 = Sse2.Add(sumXY128, yVec); + sumXY256 = Avx2.Add(sumXY256, yVec); if (Unsafe.Add(ref yRef, i) != 0) { @@ -843,19 +881,32 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); } + + if (Unsafe.Add(ref yRef, i + 4) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 4)); + } + + if (Unsafe.Add(ref yRef, i + 5) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 5)); + } + + if (Unsafe.Add(ref yRef, i + 6) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 6)); + } + + if (Unsafe.Add(ref yRef, i + 7) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 7)); + } } } - // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. - // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. - Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); - Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); - Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); - Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); - Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); - Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); - int sumX = Sse2.ConvertToInt32(tmpSumX); - int sumXY = Sse2.ConvertToInt32(tmpSumXY); + // Sum up sumX256 to get sumX and sum up sumXY256 to get sumXY. 
+ int sumX = Numerics.ReduceSum(sumX256); + int sumXY = Numerics.ReduceSum(sumXY256); retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); From 0fc3ce721270c50999b715fbff6d9663a8386cc4 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 11:12:46 +0100 Subject: [PATCH 19/47] Add CombinedShannonEntropy tests --- .../Formats/WebP/LosslessUtilsTests.cs | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index 97567ba218..9c7a2f7588 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -10,6 +10,17 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class LosslessUtilsTests { + private static void RunCombinedShannonEntropyTest() + { + int[] x = { 3, 5, 2, 5, 3, 1, 2, 2, 3, 3, 1, 2, 1, 2, 1, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 0, 0, 2, 1, 1, 0, 3, 1, 2, 3, 2, 3 }; + int[] y = { 11, 12, 8, 3, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 2, 1, 1, 2, 4, 6, 4 }; + float expected = 884.7585f; + + float actual = LosslessUtils.CombinedShannonEntropy(x, y); + + Assert.Equal(expected, actual, 5); + } + private static void RunSubtractGreenTest() { uint[] pixelData = @@ -193,6 +204,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp } } + [Fact] + public void CombinedShannonEntropy_Works() => RunCombinedShannonEntropyTest(); + [Fact] public void Predictor11_Works() => RunPredictor11Test(); @@ -215,6 +229,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp public void TransformColorInverse_Works() => RunTransformColorInverseTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void CombinedShannonEntropy_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCombinedShannonEntropyTest, HwIntrinsics.AllowAll); + + [Fact] + public void CombinedShannonEntropy_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCombinedShannonEntropyTest, HwIntrinsics.DisableAVX2); + [Fact] public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll); @@ -237,19 +257,19 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll); [Fact] - public void SubtractGreen_WithoutAvx_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX); + 
public void SubtractGreen_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX2); [Fact] - public void SubtractGreen_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSSE3); + public void SubtractGreen_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSSE3); [Fact] public void AddGreenToBlueAndRed_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.AllowAll); [Fact] - public void AddGreenToBlueAndRed_WithoutAvx_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX); + public void AddGreenToBlueAndRed_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX2); [Fact] - public void AddGreenToBlueAndRed_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3); + public void AddGreenToBlueAndRed_WithoutAVX2OrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3); [Fact] public void TransformColor_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorTest, HwIntrinsics.AllowAll); From 8192e116f2b88eb65f80a366c63e1abee6415915 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 15:09:48 +0100 Subject: [PATCH 20/47] Add AVX2 version of AddVector --- .../Formats/Webp/Lossless/Vp8LHistogram.cs | 51 +++++++++++++++++-- .../Formats/Webp/Lossy/Vp8Histogram.cs | 2 +- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git 
a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs index bdb53f5c6a..ac8cc0f655 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs @@ -3,10 +3,16 @@ using System; using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossless { - internal class Vp8LHistogram : IDeepCloneable + internal sealed class Vp8LHistogram : IDeepCloneable { private const uint NonTrivialSym = 0xffffffff; @@ -505,11 +511,48 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return cost; } - private static void AddVector(uint[] a, uint[] b, uint[] output, int size) + private static void AddVector(Span a, Span b, Span output, int size) { - for (int i = 0; i < size; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) { - output[i] = a[i] + b[i]; + ref uint aRef = ref MemoryMarshal.GetReference(a); + ref uint bRef = ref MemoryMarshal.GetReference(b); + ref uint outputRef = ref MemoryMarshal.GetReference(output); + int i; + + for (i = 0; i + 32 <= size; i += 32) + { + // Load values. + Vector256 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, i)); + Vector256 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 8)); + Vector256 a2 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 16)); + Vector256 a3 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 24)); + Vector256 b0 = Unsafe.As>(ref Unsafe.Add(ref bRef, i)); + Vector256 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 8)); + Vector256 b2 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 16)); + Vector256 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 24)); + + // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But + // that's ok since the histogram values are less than 1<<28 (max picture size). 
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, i)) = Avx2.Add(a0, b0); + Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 8)) = Avx2.Add(a1, b1); + Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 16)) = Avx2.Add(a2, b2); + Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 24)) = Avx2.Add(a3, b3); + } + + for (; i < size; i++) + { + output[i] = a[i] + b[i]; + } + } + else +#endif + { + for (int i = 0; i < size; i++) + { + output[i] = a[i] + b[i]; + } } } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs index 6e724e4758..89e7baff39 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs @@ -6,7 +6,7 @@ using System.Runtime.CompilerServices; namespace SixLabors.ImageSharp.Formats.Webp.Lossy { - internal class Vp8Histogram + internal sealed class Vp8Histogram { private readonly int[] scratch = new int[16]; From a45f49517b7fb4ed7981becb58898ea175273e80 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 15:59:59 +0100 Subject: [PATCH 21/47] Avoid bounds checks in VectorMismatch --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 471c083cda..d24431600d 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -81,7 +81,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { int matchLen = 0; - while (matchLen < length && array1[matchLen] == array2[matchLen]) + ref uint array1Ref = ref MemoryMarshal.GetReference(array1); + ref uint array2Ref = ref MemoryMarshal.GetReference(array2); + while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) { matchLen++; } From 6393484e4283719575451aba7d18d91d7d86b6af Mon Sep 17 00:00:00 2001 From: Brian 
Popow Date: Tue, 23 Nov 2021 17:15:42 +0100 Subject: [PATCH 22/47] Remove duplicate FTransform method --- .../Formats/Webp/Lossy/Vp8Histogram.cs | 44 +------------------ 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs index 89e7baff39..d384302b94 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs @@ -49,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.distribution.AsSpan().Clear(); for (j = startBlock; j < endBlock; j++) { - this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output); + Vp8Encoding.FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output, this.scratch); // Convert coefficients to bin. for (int k = 0; k < 16; ++k) @@ -98,48 +98,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.lastNonZero = lastNonZero; } - private void Vp8FTransform(Span src, Span reference, Span output) - { - int i; - Span tmp = this.scratch; - tmp.Clear(); - - for (i = 0; i < 4; i++) - { - int d0 = src[0] - reference[0]; // 9bit dynamic range ([-255,255]) - int d1 = src[1] - reference[1]; - int d2 = src[2] - reference[2]; - int d3 = src[3] - reference[3]; - int a0 = d0 + d3; // 10b [-510,510] - int a1 = d1 + d2; - int a2 = d1 - d2; - int a3 = d0 - d3; - tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160] - tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542] - tmp[2 + (i * 4)] = (a0 - a1) * 8; - tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9; - - // Do not change the span in the last iteration. 
- if (i < 3) - { - src = src.Slice(WebpConstants.Bps); - reference = reference.Slice(WebpConstants.Bps); - } - } - - for (i = 0; i < 4; i++) - { - int a0 = tmp[0 + i] + tmp[12 + i]; // 15b - int a1 = tmp[4 + i] + tmp[8 + i]; - int a2 = tmp[4 + i] - tmp[8 + i]; - int a3 = tmp[0 + i] - tmp[12 + i]; - output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b - output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0)); - output[8 + i] = (short)((a0 - a1 + 7) >> 4); - output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16); - } - } - [MethodImpl(InliningOptions.ShortMethod)] private static int ClipMax(int v, int max) => v > max ? max : v; } From 491b742ae4a70dc6c5ff1e6c3a9db9cb7f00fcf3 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 18:38:19 +0100 Subject: [PATCH 23/47] Add SSE2 version of VectorMismatch --- .../Formats/Webp/Lossless/LosslessUtils.cs | 65 +++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index d24431600d..319aa8c3d6 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -80,15 +80,72 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static int VectorMismatch(ReadOnlySpan array1, ReadOnlySpan array2, int length) { int matchLen = 0; - ref uint array1Ref = ref MemoryMarshal.GetReference(array1); ref uint array2Ref = ref MemoryMarshal.GetReference(array2); - while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) + +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - matchLen++; + if (length >= 12) + { + Vector128 a0 = Unsafe.As>(ref array1Ref); + Vector128 a1 = Unsafe.As>(ref array2Ref); + + do + { + // Loop unrolling and early load both provide a speedup. 
+ Vector128 cmpA = Sse2.CompareEqual(a0, a1); + Vector128 b0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); + Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); + if (Sse2.MoveMask(cmpA.AsByte()) != 0xffff) + { + break; + } + + matchLen += 4; + + Vector128 cmpB = Sse2.CompareEqual(b0, b1); + a0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); + a1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); + if (Sse2.MoveMask(cmpB.AsByte()) != 0xffff) + { + break; + } + + matchLen += 4; + } + while (matchLen + 12 < length); + } + else + { + // Unroll the potential first two loops. + if (length >= 4 + && Sse2.MoveMask( + Sse2.CompareEqual( + Unsafe.As>(ref array1Ref), + Unsafe.As>(ref array2Ref)).AsByte()) == 0xffff) + { + matchLen = 4; + if (length >= 8 + && Sse2.MoveMask( + Sse2.CompareEqual( + Unsafe.As>(ref Unsafe.Add(ref array1Ref, 4)), + Unsafe.As>(ref Unsafe.Add(ref array2Ref, 4))).AsByte()) == 0xffff) + { + matchLen = 8; + } + } + } } +#endif + { + while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) + { + matchLen++; + } - return matchLen; + return matchLen; + } } [MethodImpl(InliningOptions.ShortMethod)] From 427a39213e2e7429667d376a8b86c09b47f6da62 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 23:11:06 +0100 Subject: [PATCH 24/47] Add VectorMismatch tests --- .../Formats/WebP/LosslessUtilsTests.cs | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index 97567ba218..b6f15e5376 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -10,6 +10,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class LosslessUtilsTests { + private static void RunVectorMismatchTest() + { + uint[] array1 = { 
4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; + uint[] array2 = { 4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; + int expected1 = 18; + + // Test unroll first two loops path also. + uint[] array3 = { 4279238656, 4278714368, 4279238656, 4279238656, 4279238656, 4279238656, 4279238896, 4279238896, 4279238884 }; + uint[] array4 = { 4279238656, 4278714368, 4279238656, 4279238656, 4278190080, 4278190080, 4278190080, 4278190080, 4278190080 }; + int expected2 = 4; + + int actual1 = LosslessUtils.VectorMismatch(array1, array2, 18); + int actual2 = LosslessUtils.VectorMismatch(array3, array4, 9); + + Assert.Equal(expected1, actual1); + Assert.Equal(expected2, actual2); + } + private static void RunSubtractGreenTest() { uint[] pixelData = @@ -193,6 +211,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp } } + [Fact] + public void VectorMismatch_Works() => RunVectorMismatchTest(); + [Fact] public void Predictor11_Works() => RunPredictor11Test(); @@ -215,6 +236,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp public void TransformColorInverse_Works() => RunTransformColorInverseTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void VectorMismatch_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.AllowAll); + + [Fact] + public void VectorMismatch_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.DisableSSE2); + [Fact] public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll); From 
fa24760a88e0669a49eaae2e0ef5ef209a86c0d1 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 12:20:28 +0100 Subject: [PATCH 25/47] Add AddVector tests --- .../Formats/WebP/Vp8LHistogramTests.cs | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs new file mode 100644 index 0000000000..f39e16bc24 --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs @@ -0,0 +1,109 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Linq; +using SixLabors.ImageSharp.Formats.Webp.Lossless; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Formats.WebP +{ + public class Vp8LHistogramTests + { + private static void RunAddVectorTest() + { + // arrange + uint[] pixelData = + { + 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4294577152, + 4294707200, 4294707200, 4294707200, 4294707200, 4294837248, 4294837248, 4293926912, 4294316544, + 4278191104, 4278191104, 4294837248, 4294837248, 4280287232, 4280350720, 4294447104, 4294707200, + 4294838272, 4278516736, 4294837248, 4294837248, 4278516736, 4294707200, 4279298048, 4294837248, + 4294837248, 4294837248, 4294837248, 4280287232, 4280287232, 4292670464, 4279633408, 4294838272, + 4294837248, 4278516736, 4278516736, 4278516736, 4278516736, 4278516736, 4278778880, 4278193152, + 4278191104, 4280287232, 4280287232, 4280287232, 4280287232, 4293971968, 4280612864, 4292802560, + 4294837760, 4278516736, 4278516736, 4294837760, 4294707712, 4278516736, 4294837248, 4278193152, + 4280287232, 4278984704, 4280287232, 4278243328, 4280287232, 4278244352, 4280287232, 4280025088, + 4280025088, 4294837760, 4278192128, 4294838784, 4294837760, 4294707712, 4278778880, 
4278324224, + 4280287232, 4280287232, 4278202368, 4279115776, 4280287232, 4278243328, 4280287232, 4280287232, + 4280025088, 4280287232, 4278192128, 4294838272, 4294838272, 4294837760, 4278190592, 4278778880, + 4280875008, 4280287232, 4279896576, 4281075712, 4281075712, 4280287232, 4280287232, 4280287232, + 4280287232, 4280287232, 4278190592, 4294709248, 4278516736, 4278516736, 4278584832, 4278909440, + 4280287232, 4280287232, 4294367744, 4294621184, 4279115776, 4280287232, 4280287232, 4280351744, + 4280287232, 4280287232, 4280287232, 4278513664, 4278516736, 4278716416, 4278584832, 4280291328, + 4293062144, 4280287232, 4280287232, 4280287232, 4294456320, 4280291328, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4278513152, 4278716416, 4278584832, 4280291328, + 4278198272, 4278198272, 4278589952, 4278198272, 4278198272, 4280287232, 4278765568, 4280287232, + 4280287232, 4280287232, 4280287232, 4294712832, 4278513152, 4278716640, 4279300608, 4278584832, + 4280156672, 4279373312, 4278589952, 4279373312, 4278328832, 4278328832, 4278328832, 4279634432, + 4280287232, 4280287232, 4280287232, 4280287232, 4278457344, 4280483328, 4278584832, 4278385664, + 4279634432, 4279373312, 4279634432, 4280287232, 4280287232, 4280156672, 4278589952, 4278328832, + 4278198272, 4280156672, 4280483328, 4294363648, 4280287232, 4278376448, 4280287232, 4278647808, + 4280287232, 4280287232, 4279373312, 4280287232, 4280287232, 4280156672, 4280287232, 4278198272, + 4278198272, 4280156672, 4280287232, 4280287232, 4293669888, 4278765568, 4278765568, 4280287232, + 4280287232, 4280287232, 4279634432, 4279634432, 4280287232, 4280287232, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4279373312, 4279764992, 4293539328, 4279896576, + 4280287232, 4280287232, 4280287232, 4279634432, 4278198272, 4279634432, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4279503872, 4279503872, 4280288256, + 4280287232, 4280287232, 
4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232 + }; + + uint[] literals = + { + 198, 0, 14, 0, 46, 0, 22, 0, 36, 0, 24, 0, 12, 0, 10, 0, 10, 0, 2, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0, + 10, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 6, 0, 2, 0, 2, 0, 2, 0, 0, 0, 8, 0, 2, 0, 38, 0, 4 + }; + + uint[] expectedLiterals = new uint[1305]; + + // All remaining values are expected to be zero. 
+ literals.AsSpan().CopyTo(expectedLiterals); + + var backwardRefs = new Vp8LBackwardRefs(pixelData.Length); + for (int i = 0; i < pixelData.Length; i++) + { + backwardRefs.Add(new PixOrCopy() + { + BgraOrDistance = pixelData[i], + Len = 1, + Mode = PixOrCopyMode.Literal + }); + } + + var histogram0 = new Vp8LHistogram(backwardRefs, 3); + var histogram1 = new Vp8LHistogram(backwardRefs, 3); + for (int i = 0; i < 5; i++) + { + histogram0.IsUsed[i] = true; + histogram1.IsUsed[i] = true; + } + + var output = new Vp8LHistogram(3); + + // act + histogram0.Add(histogram1, output); + + // assert + Assert.True(output.Literal.SequenceEqual(expectedLiterals)); + } + + [Fact] + public void AddVector_Works() => RunAddVectorTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void AddVector_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddVectorTest, HwIntrinsics.AllowAll); + + [Fact] + public void AddVector_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddVectorTest, HwIntrinsics.DisableAVX2); +#endif + } +} From 110ff3d9e8f9651d73286576e17f69f1e2ecfa31 Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Wed, 24 Nov 2021 12:38:44 +0100 Subject: [PATCH 26/47] Avoid using Span tmp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 314f26d64d..4f247c434b 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -764,12 +764,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (Avx2.IsSupported) { double retVal = 0.0d; - Span tmp = stackalloc int[8]; + Vector256 tmp = Vector256.Zero; 
// has the size of the scratch space of sizeof(int) * 8 ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); Vector256 sumXY256 = Vector256.Zero; Vector256 sumX256 = Vector256.Zero; - ref int tmpRef = ref MemoryMarshal.GetReference(tmp); + ref int tmpRef = ref Unsafe.As, int>(ref tmp); for (nint i = 0; i < 256; i += 8) { Vector256 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); From 5403fbd8b2a4f42e9a9deed923d3017d449b3ab9 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 12:34:34 +0100 Subject: [PATCH 27/47] Add better version of ReduceSum for Vector 256 --- src/ImageSharp/Common/Helpers/Numerics.cs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 9dc13079d6..fa0af823d5 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -828,11 +828,16 @@ namespace SixLabors.ImageSharp [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int ReduceSum(Vector256 accumulator) { - Vector128 vec0 = Avx2.ExtractVector128(accumulator, 0); - Vector128 vec1 = Avx2.ExtractVector128(accumulator, 1); - Vector128 sum128 = Sse2.Add(vec0, vec1); + // Add upper lane to lower lane. + Vector128 vsum = Sse2.Add(accumulator.GetLower(), accumulator.GetUpper()); - return ReduceSum(sum128); + // Add odd to even. + vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_11_01_01)); + + // Add high to low. 
+ vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10)); + + return Sse2.ConvertToInt32(vsum); } /// From cc5f7af71c2fc42f317a5244f0ea660aa3123636 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 12:50:39 +0100 Subject: [PATCH 28/47] Better version of LoadHigh --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 8e3b153891..d0a14db33d 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -720,12 +720,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Convert 32 samples of YUV444 to R/G/B private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector64 yTmp = Unsafe.As>(ref MemoryMarshal.GetReference(y)); - Vector64 uTmp = Unsafe.As>(ref MemoryMarshal.GetReference(u)); - Vector64 vTmp = Unsafe.As>(ref MemoryMarshal.GetReference(v)); - Vector128 y0 = LoadHigh(yTmp); - Vector128 u0 = LoadHigh(uTmp); - Vector128 v0 = LoadHigh(vTmp); + Vector128 y0 = LoadHigh(ref MemoryMarshal.GetReference(y)); + Vector128 u0 = LoadHigh(ref MemoryMarshal.GetReference(u)); + Vector128 v0 = LoadHigh(ref MemoryMarshal.GetReference(v)); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); @@ -751,9 +748,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
[MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadHigh(Vector64 src) + private static Vector128 LoadHigh(ref byte src) { - Vector128 tmp = Unsafe.As, Vector128>(ref src); + Vector128 tmp = Unsafe.As>(ref src); return Sse2.UnpackLow(Vector128.Zero, tmp); } #endif From 65870b96f429d03beb75173447fc56a01fd6b305 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 13:14:38 +0100 Subject: [PATCH 29/47] Avoid branching inside loop --- .../Formats/Webp/Lossy/YuvConversion.cs | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index d0a14db33d..a8286037ba 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -183,11 +183,23 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy ref byte topVRef = ref MemoryMarshal.GetReference(topV); ref byte curURef = ref MemoryMarshal.GetReference(curU); ref byte curVRef = ref MemoryMarshal.GetReference(curV); - for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + if (bottomY != null) { - UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru); - UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); - ConvertYuvToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + { + UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru); + UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); + ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + } + } + else + { + for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) + { + UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, 
uvPos), ru); + UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv); + ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep); + } } // Process last block. @@ -205,9 +217,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (bottomY != default) { bottomY.Slice(pos, len - pos).CopyTo(tmpBottom); + ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + } + else + { + ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep); } - ConvertYuvToBgrSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep)); if (bottomY != default) { @@ -588,14 +604,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS - private static void ConvertYuvToBgrSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + [MethodImpl(InliningOptions.ShortMethod)] + private static void ConvertYuvToBgrSse41(Span topY, Span topDst, Span ru, Span rv, int curX, int step) => YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); + + [MethodImpl(InliningOptions.ShortMethod)] + private static void ConvertYuvToBgrWithBottomYSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) { YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step)); - - if (bottomY != null) - { - YuvToBgrSse41(bottomY.Slice(curX), ru.Slice(64), rv.Slice(64), bottomDst.Slice(curX * step)); - } + YuvToBgrSse41(bottomY.Slice(curX), ru.Slice(64), rv.Slice(64), bottomDst.Slice(curX * step)); } private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) From 6293f72c809ab18ba844ba33ba0801474f6e81c5 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 13:57:23 +0100 Subject: [PATCH 30/47] Use ref parameters in ConvertYuv444ToBgrSse41 --- .../Formats/Webp/Lossy/YuvConversion.cs | 21 +++++++++++-------- 1 file 
changed, 12 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index a8286037ba..cf211c16ec 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -616,10 +616,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) { - ConvertYuv444ToRgbSse41(y, u, v, out Vector128 r0, out Vector128 g0, out Vector128 b0); - ConvertYuv444ToRgbSse41(y.Slice(8), u.Slice(8), v.Slice(8), out Vector128 r1, out Vector128 g1, out Vector128 b1); - ConvertYuv444ToRgbSse41(y.Slice(16), u.Slice(16), v.Slice(16), out Vector128 r2, out Vector128 g2, out Vector128 b2); - ConvertYuv444ToRgbSse41(y.Slice(24), u.Slice(24), v.Slice(24), out Vector128 r3, out Vector128 g3, out Vector128 b3); + ref byte yRef = ref MemoryMarshal.GetReference(y); + ref byte uRef = ref MemoryMarshal.GetReference(u); + ref byte vRef = ref MemoryMarshal.GetReference(v); + ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0); + ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128 r1, out Vector128 g1, out Vector128 b1); + ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2); + ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3); // Cast to 8b and store as BBBBGGGGRRRR. 
Vector128 bgr0 = Sse2.PackUnsignedSaturate(b0, b1); @@ -733,12 +736,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy output5 = Ssse3.Shuffle(input1, shuffle2); } - // Convert 32 samples of YUV444 to R/G/B - private static void ConvertYuv444ToRgbSse41(Span y, Span u, Span v, out Vector128 r, out Vector128 g, out Vector128 b) + // Convert 32 samples of YUV444 to B/G/R + private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector128 y0 = LoadHigh(ref MemoryMarshal.GetReference(y)); - Vector128 u0 = LoadHigh(ref MemoryMarshal.GetReference(u)); - Vector128 v0 = LoadHigh(ref MemoryMarshal.GetReference(v)); + Vector128 y0 = LoadHigh(ref y); + Vector128 u0 = LoadHigh(ref u); + Vector128 v0 = LoadHigh(ref v); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); From 2ca81aec3c83b36060bdd021a5f52688778fab6e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 14:07:57 +0100 Subject: [PATCH 31/47] Fill buffers with default values only in Debug mode --- src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs index d62d23e172..14bc19e8a2 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs @@ -76,10 +76,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.TmpVBuffer = memoryAllocator.Allocate((int)width); this.Pixels = memoryAllocator.Allocate((int)(width * height * 4)); +#if DEBUG + // Filling those buffers with 205, is only useful for debugging, + // so the default values are the same as the reference libwebp implementation. 
this.YuvBuffer.Memory.Span.Fill(205); this.CacheY.Memory.Span.Fill(205); this.CacheU.Memory.Span.Fill(205); this.CacheV.Memory.Span.Fill(205); +#endif this.Vp8BitReaders = new Vp8BitReader[WebpConstants.MaxNumPartitions]; } From cded607d5cd0d30bc381f08170f9e7f9dc8d91e8 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 14:27:41 +0100 Subject: [PATCH 32/47] Allocate clean buffers --- src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs index 14bc19e8a2..003bdc2682 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs @@ -66,15 +66,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int extraRows = WebpConstants.FilterExtraRows[(int)LoopFilter.Complex]; // assuming worst case: complex filter int extraY = extraRows * this.CacheYStride; int extraUv = extraRows / 2 * this.CacheUvStride; - this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY); - this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY); + this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY, AllocationOptions.Clean); + this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY, AllocationOptions.Clean); int cacheUvSize = (16 * this.CacheUvStride) + extraUv; - this.CacheU = memoryAllocator.Allocate(cacheUvSize); - this.CacheV = memoryAllocator.Allocate(cacheUvSize); - this.TmpYBuffer = memoryAllocator.Allocate((int)width); - this.TmpUBuffer = memoryAllocator.Allocate((int)width); - this.TmpVBuffer = memoryAllocator.Allocate((int)width); - this.Pixels = memoryAllocator.Allocate((int)(width * height * 4)); + this.CacheU = memoryAllocator.Allocate(cacheUvSize, AllocationOptions.Clean); + this.CacheV = 
memoryAllocator.Allocate(cacheUvSize, AllocationOptions.Clean); + this.TmpYBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); + this.TmpUBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); + this.TmpVBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); + this.Pixels = memoryAllocator.Allocate((int)(width * height * 4), AllocationOptions.Clean); #if DEBUG // Filling those buffers with 205, is only useful for debugging, From 22537b226b6c9f7517ec1cee670157f756abbecf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 15:31:40 +0100 Subject: [PATCH 33/47] Revert "Allocate clean buffers": the tmp buffers does not need to be clean, they will be overwritten anyway This reverts commit cded607d5cd0d30bc381f08170f9e7f9dc8d91e8. --- src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs index 003bdc2682..14bc19e8a2 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs @@ -66,15 +66,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int extraRows = WebpConstants.FilterExtraRows[(int)LoopFilter.Complex]; // assuming worst case: complex filter int extraY = extraRows * this.CacheYStride; int extraUv = extraRows / 2 * this.CacheUvStride; - this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY, AllocationOptions.Clean); - this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY, AllocationOptions.Clean); + this.YuvBuffer = memoryAllocator.Allocate((WebpConstants.Bps * 17) + (WebpConstants.Bps * 9) + extraY); + this.CacheY = memoryAllocator.Allocate((16 * this.CacheYStride) + extraY); int cacheUvSize = (16 * this.CacheUvStride) + extraUv; - this.CacheU = memoryAllocator.Allocate(cacheUvSize, 
AllocationOptions.Clean); - this.CacheV = memoryAllocator.Allocate(cacheUvSize, AllocationOptions.Clean); - this.TmpYBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); - this.TmpUBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); - this.TmpVBuffer = memoryAllocator.Allocate((int)width, AllocationOptions.Clean); - this.Pixels = memoryAllocator.Allocate((int)(width * height * 4), AllocationOptions.Clean); + this.CacheU = memoryAllocator.Allocate(cacheUvSize); + this.CacheV = memoryAllocator.Allocate(cacheUvSize); + this.TmpYBuffer = memoryAllocator.Allocate((int)width); + this.TmpUBuffer = memoryAllocator.Allocate((int)width); + this.TmpVBuffer = memoryAllocator.Allocate((int)width); + this.Pixels = memoryAllocator.Allocate((int)(width * height * 4)); #if DEBUG // Filling those buffers with 205, is only useful for debugging, From c174ab42bea366257a410e8335055b9d27d487ff Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 20:13:46 +0100 Subject: [PATCH 34/47] Remove SSE2 version of VectorMismatch: Profiling does not show any speedup --- .../Formats/Webp/Lossless/LosslessUtils.cs | 63 +------------------ .../Formats/WebP/LosslessUtilsTests.cs | 28 +-------- 2 files changed, 4 insertions(+), 87 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 319aa8c3d6..0ed180a184 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -83,69 +83,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless ref uint array1Ref = ref MemoryMarshal.GetReference(array1); ref uint array2Ref = ref MemoryMarshal.GetReference(array2); -#if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) { - if (length >= 12) - { - Vector128 a0 = Unsafe.As>(ref array1Ref); - Vector128 a1 
= Unsafe.As>(ref array2Ref); - - do - { - // Loop unrolling and early load both provide a speedup. - Vector128 cmpA = Sse2.CompareEqual(a0, a1); - Vector128 b0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); - Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); - if (Sse2.MoveMask(cmpA.AsByte()) != 0xffff) - { - break; - } - - matchLen += 4; - - Vector128 cmpB = Sse2.CompareEqual(b0, b1); - a0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); - a1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); - if (Sse2.MoveMask(cmpB.AsByte()) != 0xffff) - { - break; - } - - matchLen += 4; - } - while (matchLen + 12 < length); - } - else - { - // Unroll the potential first two loops. - if (length >= 4 - && Sse2.MoveMask( - Sse2.CompareEqual( - Unsafe.As>(ref array1Ref), - Unsafe.As>(ref array2Ref)).AsByte()) == 0xffff) - { - matchLen = 4; - if (length >= 8 - && Sse2.MoveMask( - Sse2.CompareEqual( - Unsafe.As>(ref Unsafe.Add(ref array1Ref, 4)), - Unsafe.As>(ref Unsafe.Add(ref array2Ref, 4))).AsByte()) == 0xffff) - { - matchLen = 8; - } - } - } + matchLen++; } -#endif - { - while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) - { - matchLen++; - } - return matchLen; - } + return matchLen; } [MethodImpl(InliningOptions.ShortMethod)] diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index b6f15e5376..62e23c1cdf 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -10,24 +10,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class LosslessUtilsTests { - private static void RunVectorMismatchTest() - { - uint[] array1 = { 4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 
4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; - uint[] array2 = { 4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; - int expected1 = 18; - - // Test unroll first two loops path also. - uint[] array3 = { 4279238656, 4278714368, 4279238656, 4279238656, 4279238656, 4279238656, 4279238896, 4279238896, 4279238884 }; - uint[] array4 = { 4279238656, 4278714368, 4279238656, 4279238656, 4278190080, 4278190080, 4278190080, 4278190080, 4278190080 }; - int expected2 = 4; - - int actual1 = LosslessUtils.VectorMismatch(array1, array2, 18); - int actual2 = LosslessUtils.VectorMismatch(array3, array4, 9); - - Assert.Equal(expected1, actual1); - Assert.Equal(expected2, actual2); - } - private static void RunSubtractGreenTest() { uint[] pixelData = @@ -211,9 +193,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp } } - [Fact] - public void VectorMismatch_Works() => RunVectorMismatchTest(); - [Fact] public void Predictor11_Works() => RunPredictor11Test(); @@ -236,12 +215,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp public void TransformColorInverse_Works() => RunTransformColorInverseTest(); #if SUPPORTS_RUNTIME_INTRINSICS - [Fact] - public void VectorMismatch_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.AllowAll); - - [Fact] - public void VectorMismatch_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.DisableSSE2); - + [Fact] public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll); From 2d60b73b140b0ab8d6a85a941b0319c86be32d14 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 25 Nov 2021 13:28:35 +0100 Subject: [PATCH 35/47] Rename size to count, add DebugGuard 
--- .../Formats/Webp/Lossless/Vp8LHistogram.cs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs index ac8cc0f655..bfb8f40d4a 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs @@ -511,8 +511,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return cost; } - private static void AddVector(Span a, Span b, Span output, int size) + private static void AddVector(Span a, Span b, Span output, int count) { + DebugGuard.MustBeGreaterThanOrEqualTo(a.Length, count, nameof(a.Length)); + DebugGuard.MustBeGreaterThanOrEqualTo(b.Length, count, nameof(b.Length)); + DebugGuard.MustBeGreaterThanOrEqualTo(output.Length, count, nameof(output.Length)); + #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { @@ -521,7 +525,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless ref uint outputRef = ref MemoryMarshal.GetReference(output); int i; - for (i = 0; i + 32 <= size; i += 32) + for (i = 0; i + 32 <= count; i += 32) { // Load values. Vector256 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, i)); @@ -534,14 +538,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vector256 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 24)); // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But - // that's ok since the histogram values are less than 1<<28 (max picture size). + // that's ok since the histogram values are less than 1<<28 (max picture count). 
Unsafe.As>(ref Unsafe.Add(ref outputRef, i)) = Avx2.Add(a0, b0); Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 8)) = Avx2.Add(a1, b1); Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 16)) = Avx2.Add(a2, b2); Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 24)) = Avx2.Add(a3, b3); } - for (; i < size; i++) + for (; i < count; i++) { output[i] = a[i] + b[i]; } @@ -549,7 +553,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless else #endif { - for (int i = 0; i < size; i++) + for (int i = 0; i < count; i++) { output[i] = a[i] + b[i]; } From 7775c343049e1640dfd699aff0d005355081f042 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 25 Nov 2021 14:18:53 +0100 Subject: [PATCH 36/47] Group loading y, u, v together --- .../Formats/Webp/Lossy/YuvConversion.cs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index cf211c16ec..16d458ed88 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -739,9 +739,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Convert 32 samples of YUV444 to B/G/R private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) { - Vector128 y0 = LoadHigh(ref y); - Vector128 u0 = LoadHigh(ref u); - Vector128 v0 = LoadHigh(ref v); + // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
+ Vector128 y0 = Unsafe.As>(ref y); + Vector128 u0 = Unsafe.As>(ref u); + Vector128 v0 = Unsafe.As>(ref v); + y0 = Sse2.UnpackLow(Vector128.Zero, y0); + u0 = Sse2.UnpackLow(Vector128.Zero, u0); + v0 = Sse2.UnpackLow(Vector128.Zero, v0); Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16()); Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16()); @@ -765,13 +769,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] } - // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadHigh(ref byte src) - { - Vector128 tmp = Unsafe.As>(ref src); - return Sse2.UnpackLow(Vector128.Zero, tmp); - } #endif [MethodImpl(InliningOptions.ShortMethod)] From 98f5a428169e9e31b903ba3252bd00223945ab7f Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 25 Nov 2021 22:49:29 +0100 Subject: [PATCH 37/47] Add SSE2 version off FTransform --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 217 +++++++++++++++--- 1 file changed, 185 insertions(+), 32 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index aa4ab5767b..143d9f17ee 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// Methods for encoding a VP8 frame. 
/// - internal static class Vp8Encoding + internal static unsafe class Vp8Encoding { private const int KC1 = 20091 + (1 << 16); @@ -382,43 +382,196 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static void FTransform(Span src, Span reference, Span output, Span scratch) { - int i; - Span tmp = scratch.Slice(0, 16); - - int srcIdx = 0; - int refIdx = 0; - for (i = 0; i < 4; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255]) - int d1 = src[srcIdx + 1] - reference[refIdx + 1]; - int d2 = src[srcIdx + 2] - reference[refIdx + 2]; - int d3 = src[srcIdx + 3] - reference[refIdx + 3]; - int a0 = d0 + d3; // 10b [-510,510] - int a1 = d1 + d2; - int a2 = d1 - d2; - int a3 = d0 - d3; - tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160] - tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542] - tmp[2 + (i * 4)] = (a0 - a1) * 8; - tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9; - - srcIdx += WebpConstants.Bps; - refIdx += WebpConstants.Bps; +#pragma warning disable SA1503 // Braces should not be omitted + fixed (byte* srcRef = src) + fixed (byte* referenceRef = reference) + { + // Load src. + Vector128 src0 = Sse2.LoadScalarVector128((ulong*)srcRef); + Vector128 src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps)); + Vector128 src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2))); + Vector128 src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3))); + + // Load ref. 
+ Vector128 ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef); + Vector128 ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps)); + Vector128 ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2))); + Vector128 ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3))); + + // 00 01 02 03 * + // 10 11 12 13 * + // 20 21 22 23 * + // 30 31 32 33 * + // Shuffle. + Vector128 srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16()); + Vector128 srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16()); + Vector128 refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); + Vector128 refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); + + // 00 01 10 11 02 03 12 13 * * ... + // 20 21 30 31 22 22 32 33 * * ... + + // Convert both to 16 bit. + Vector128 src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128.Zero); + Vector128 src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128.Zero); + Vector128 ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128.Zero); + Vector128 ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128.Zero); + + // Compute the difference. + Vector128 row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16()); + Vector128 row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16()); + + // First pass + FTransformPass1SSE2(row01, row23, out Vector128 v01, out Vector128 v32); + + // Second pass + FTransformPass2SSE2(v01, v32, output); + } +#pragma warning restore SA1503 // Braces should not be omitted } - - for (i = 0; i < 4; i++) + else +#endif { - int a0 = tmp[0 + i] + tmp[12 + i]; // 15b - int a1 = tmp[4 + i] + tmp[8 + i]; - int a2 = tmp[4 + i] - tmp[8 + i]; - int a3 = tmp[0 + i] - tmp[12 + i]; - output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b - output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 
1 : 0)); - output[8 + i] = (short)((a0 - a1 + 7) >> 4); - output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16); + int i; + Span tmp = scratch.Slice(0, 16); + + int srcIdx = 0; + int refIdx = 0; + for (i = 0; i < 4; i++) + { + int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255]) + int d1 = src[srcIdx + 1] - reference[refIdx + 1]; + int d2 = src[srcIdx + 2] - reference[refIdx + 2]; + int d3 = src[srcIdx + 3] - reference[refIdx + 3]; + int a0 = d0 + d3; // 10b [-510,510] + int a1 = d1 + d2; + int a2 = d1 - d2; + int a3 = d0 - d3; + tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160] + tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542] + tmp[2 + (i * 4)] = (a0 - a1) * 8; + tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9; + + srcIdx += WebpConstants.Bps; + refIdx += WebpConstants.Bps; + } + + for (i = 0; i < 4; i++) + { + int a0 = tmp[0 + i] + tmp[12 + i]; // 15b + int a1 = tmp[4 + i] + tmp[8 + i]; + int a2 = tmp[4 + i] - tmp[8 + i]; + int a3 = tmp[0 + i] - tmp[12 + i]; + output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b + output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 
1 : 0)); + output[8 + i] = (short)((a0 - a1 + 7) >> 4); + output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16); + } } } +#if SUPPORTS_RUNTIME_INTRINSICS + public static void FTransformPass1SSE2(Vector128 row01, Vector128 row23, out Vector128 out01, out Vector128 out32) + { + var k937 = Vector128.Create(937); + var k1812 = Vector128.Create(1812); + Vector128 k88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16(); + Vector128 k88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16(); + Vector128 k5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16(); + Vector128 k5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16(); + + // *in01 = 00 01 10 11 02 03 12 13 + // *in23 = 20 21 30 31 22 23 32 33 + Vector128 shuf01_p = Sse2.ShuffleHigh(row01.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1)); + Vector128 shuf32_p = Sse2.ShuffleHigh(row23.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1)); + + // 00 01 10 11 03 02 13 12 + // 20 21 30 31 23 22 33 32 + Vector128 s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64()); + Vector128 s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64()); + + // 00 01 10 11 20 21 30 31 + // 03 02 13 12 23 22 33 32 + Vector128 a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16()); + Vector128 a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16()); + + // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] + // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] + Vector128 tmp0 = Sse2.MultiplyAddAdjacent(a01, k88p); // [ (a0 + a1) << 3, ... ] + Vector128 tmp2 = Sse2.MultiplyAddAdjacent(a01, k88m); // [ (a0 - a1) << 3, ... 
] + Vector128 tmp11 = Sse2.MultiplyAddAdjacent(a32, k5352_2217p); + Vector128 tmp31 = Sse2.MultiplyAddAdjacent(a32, k5352_2217m); + Vector128 tmp12 = Sse2.Add(tmp11, k1812); + Vector128 tmp32 = Sse2.Add(tmp31, k937); + Vector128 tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9); + Vector128 tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9); + Vector128 s03 = Sse2.PackSignedSaturate(tmp0, tmp2); + Vector128 s12 = Sse2.PackSignedSaturate(tmp1, tmp3); + Vector128 slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1... + Vector128 shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3 + Vector128 v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32()); + out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32()); + out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2)); + } + + public static void FTransformPass2SSE2(Vector128 v01, Vector128 v32, Span output) + { + var seven = Vector128.Create((short)7); + Vector128 k5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16(); + Vector128 k2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16(); + var k12000PlusOne = Vector128.Create(12000 + (1 << 16)); + var k51000 = Vector128.Create(51000); + + // Same operations are done on the (0,3) and (1,2) pairs. 
+ // a3 = v0 - v3 + // a2 = v1 - v2 + Vector128 a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16()); + Vector128 a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64()); + + Vector128 b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16()); + Vector128 c1 = Sse2.MultiplyAddAdjacent(b23, k5352_2217); + Vector128 c3 = Sse2.MultiplyAddAdjacent(b23, k2217_5352); + Vector128 d1 = Sse2.Add(c1, k12000PlusOne); + Vector128 d3 = Sse2.Add(c3, k51000); + Vector128 e1 = Sse2.ShiftRightArithmetic(d1, 16); + Vector128 e3 = Sse2.ShiftRightArithmetic(d3, 16); + + // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) + // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) + Vector128 f1 = Sse2.PackSignedSaturate(e1, e1); + Vector128 f3 = Sse2.PackSignedSaturate(e3, e3); + + // g1 = f1 + (a3 != 0); + // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the + // desired (0, 1), we add one earlier through k12000_plus_one. + // -> g1 = f1 + 1 - (a3 == 0) + Vector128 g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128.Zero)); + + // a0 = v0 + v3 + // a1 = v1 + v2 + Vector128 a01 = Sse2.Add(v01, v32); + Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), seven); + Vector128 a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); + Vector128 c0 = Sse2.Add(a01Plus7, a11); + Vector128 c2 = Sse2.Subtract(a01Plus7, a11); + + // d0 = (a0 + a1 + 7) >> 4; + // d2 = (a0 - a1 + 7) >> 4; + Vector128 d0 = Sse2.ShiftRightArithmetic(c0, 4); + Vector128 d2 = Sse2.ShiftRightArithmetic(c2, 4); + + Vector128 d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64()); + Vector128 d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64()); + + ref short outputRef = ref MemoryMarshal.GetReference(output); + Unsafe.As>(ref outputRef) = d0g1.AsInt16(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, 8)) = d2f3.AsInt16(); + } +#endif + public static void FTransformWht(Span input, Span output, Span scratch) { Span tmp = scratch.Slice(0, 16); From 4bb56eea71a6e3e909d3fcd2255f633a1007c643 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: 
Fri, 26 Nov 2021 12:50:55 +0100 Subject: [PATCH 38/47] Define mask and shuffle vectors as static readonly --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 75 +++++++++++-------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 143d9f17ee..a3a9c924cd 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -66,11 +66,39 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; #if SUPPORTS_RUNTIME_INTRINSICS - public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); +#pragma warning disable SA1310 // Field names should not contain underscore + private static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); - public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); + private static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); - public static readonly Vector128 Four = Vector128.Create((short)4); + private static readonly Vector128 Four = Vector128.Create((short)4); + + private static readonly Vector128 Seven = Vector128.Create((short)7); + + private static readonly Vector128 K88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16(); + + private static readonly Vector128 K88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16(); + + private static readonly Vector128 K5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16(); + + private static readonly Vector128 K5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16(); + + private static readonly Vector128 K937 = Vector128.Create(937); + + private static readonly Vector128 K1812 = 
Vector128.Create(1812); + + private static readonly Vector128 K5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16(); + + private static readonly Vector128 K2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16(); + + private static readonly Vector128 K12000PlusOne = Vector128.Create(12000 + (1 << 16)); + + private static readonly Vector128 K51000 = Vector128.Create(51000); + + private static readonly byte MmShuffle2301 = SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1); + + private static readonly byte MmShuffle1032 = SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2); +#pragma warning restore SA1310 // Field names should not contain underscore #endif static Vp8Encoding() @@ -476,17 +504,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS public static void FTransformPass1SSE2(Vector128 row01, Vector128 row23, out Vector128 out01, out Vector128 out32) { - var k937 = Vector128.Create(937); - var k1812 = Vector128.Create(1812); - Vector128 k88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16(); - Vector128 k88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16(); - Vector128 k5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16(); - Vector128 k5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16(); - // *in01 = 00 01 10 11 02 03 12 13 // *in23 = 20 21 30 31 22 23 32 33 - Vector128 shuf01_p = Sse2.ShuffleHigh(row01.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1)); - Vector128 shuf32_p = Sse2.ShuffleHigh(row23.AsInt16(), SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1)); + Vector128 shuf01_p = Sse2.ShuffleHigh(row01, MmShuffle2301); + Vector128 shuf32_p = Sse2.ShuffleHigh(row23, MmShuffle2301); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 @@ -500,12 +521,12 @@ namespace 
SixLabors.ImageSharp.Formats.Webp.Lossy // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] - Vector128 tmp0 = Sse2.MultiplyAddAdjacent(a01, k88p); // [ (a0 + a1) << 3, ... ] - Vector128 tmp2 = Sse2.MultiplyAddAdjacent(a01, k88m); // [ (a0 - a1) << 3, ... ] - Vector128 tmp11 = Sse2.MultiplyAddAdjacent(a32, k5352_2217p); - Vector128 tmp31 = Sse2.MultiplyAddAdjacent(a32, k5352_2217m); - Vector128 tmp12 = Sse2.Add(tmp11, k1812); - Vector128 tmp32 = Sse2.Add(tmp31, k937); + Vector128 tmp0 = Sse2.MultiplyAddAdjacent(a01, K88p); // [ (a0 + a1) << 3, ... ] + Vector128 tmp2 = Sse2.MultiplyAddAdjacent(a01, K88m); // [ (a0 - a1) << 3, ... ] + Vector128 tmp11 = Sse2.MultiplyAddAdjacent(a32, K5352_2217p); + Vector128 tmp31 = Sse2.MultiplyAddAdjacent(a32, K5352_2217m); + Vector128 tmp12 = Sse2.Add(tmp11, K1812); + Vector128 tmp32 = Sse2.Add(tmp31, K937); Vector128 tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9); Vector128 tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9); Vector128 s03 = Sse2.PackSignedSaturate(tmp0, tmp2); @@ -514,17 +535,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3 Vector128 v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32()); out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32()); - out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2)); + out32 = Sse2.Shuffle(v23, MmShuffle1032); } public static void FTransformPass2SSE2(Vector128 v01, Vector128 v32, Span output) { - var seven = Vector128.Create((short)7); - Vector128 k5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16(); - Vector128 k2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16(); - var k12000PlusOne = Vector128.Create(12000 + (1 << 16)); - var k51000 = Vector128.Create(51000); - // Same operations are done on the (0,3) and (1,2) pairs. 
// a3 = v0 - v3 // a2 = v1 - v2 @@ -532,10 +547,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64()); Vector128 b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16()); - Vector128 c1 = Sse2.MultiplyAddAdjacent(b23, k5352_2217); - Vector128 c3 = Sse2.MultiplyAddAdjacent(b23, k2217_5352); - Vector128 d1 = Sse2.Add(c1, k12000PlusOne); - Vector128 d3 = Sse2.Add(c3, k51000); + Vector128 c1 = Sse2.MultiplyAddAdjacent(b23, K5352_2217); + Vector128 c3 = Sse2.MultiplyAddAdjacent(b23, K2217_5352); + Vector128 d1 = Sse2.Add(c1, K12000PlusOne); + Vector128 d3 = Sse2.Add(c3, K51000); Vector128 e1 = Sse2.ShiftRightArithmetic(d1, 16); Vector128 e3 = Sse2.ShiftRightArithmetic(d3, 16); @@ -553,7 +568,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a0 = v0 + v3 // a1 = v1 + v2 Vector128 a01 = Sse2.Add(v01, v32); - Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), seven); + Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), Seven); Vector128 a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); Vector128 c0 = Sse2.Add(a01Plus7, a11); Vector128 c2 = Sse2.Subtract(a01Plus7, a11); From 38fd3a84582afa8e405316cc769c8832f44a35cb Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 26 Nov 2021 12:51:25 +0100 Subject: [PATCH 39/47] Avoid bounds checks in IsFlat --- src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index 2fcea8ceea..f3b0e8e3df 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -744,19 +744,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private static bool IsFlat(Span levels, int numBlocks, int thresh) { int score = 0; + ref short levelsRef = ref MemoryMarshal.GetReference(levels); + int offset = 0; while (numBlocks-- > 0) { for (int i = 1; i < 16; i++) { // omit DC, we're only 
interested in AC - score += levels[i] != 0 ? 1 : 0; + score += Unsafe.Add(ref levelsRef, offset) != 0 ? 1 : 0; if (score > thresh) { return false; } } - levels = levels.Slice(16); + offset += 16; } return true; From 798e9c3ad6e77e3bda0770a16e2e283c7bc45ff1 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 26 Nov 2021 14:09:31 +0100 Subject: [PATCH 40/47] Add SSE2 version of FTransform2 --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 64 ++++++++++++++++++- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index a3a9c924cd..f657d32520 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -404,8 +404,66 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { - FTransform(src, reference, output, scratch); - FTransform(src.Slice(4), reference.Slice(4), output2, scratch); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { +#pragma warning disable SA1503 // Braces should not be omitted + fixed (byte* srcRef = src) + fixed (byte* referenceRef = reference) + { + // Load src. + Vector128 src0 = Sse2.LoadScalarVector128((ulong*)srcRef); + Vector128 src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps)); + Vector128 src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2))); + Vector128 src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3))); + + // Load ref. + Vector128 ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef); + Vector128 ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps)); + Vector128 ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2))); + Vector128 ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3))); + + // Convert both to 16 bit. 
+ Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero); + Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero); + Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero); + Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero); + Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero); + Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero); + Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero); + Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero); + + // Compute difference. -> 00 01 02 03 00' 01' 02' 03' + Vector128 diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16()); + Vector128 diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16()); + Vector128 diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16()); + Vector128 diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16()); + + // Unpack and shuffle. + // 00 01 02 03 0 0 0 0 + // 10 11 12 13 0 0 0 0 + // 20 21 22 23 0 0 0 0 + // 30 31 32 33 0 0 0 0 + Vector128 shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); + Vector128 shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); + + // First pass. + FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); + FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); + + // Second pass. 
+ FTransformPass2SSE2(v01l, v32l, output); + FTransformPass2SSE2(v01h, v32h, output2); + } + } + else +#endif + { + FTransform(src, reference, output, scratch); + FTransform(src.Slice(4), reference.Slice(4), output2, scratch); + } } public static void FTransform(Span src, Span reference, Span output, Span scratch) @@ -567,7 +625,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a0 = v0 + v3 // a1 = v1 + v2 - Vector128 a01 = Sse2.Add(v01, v32); + Vector128 a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16()); Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), Seven); Vector128 a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); Vector128 c0 = Sse2.Add(a01Plus7, a11); From 0880c586521f0d87616ae579df35b068c186ecad Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 26 Nov 2021 15:18:38 +0100 Subject: [PATCH 41/47] Add FTransform tests --- .../Formats/WebP/Vp8EncodingTests.cs | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index 6bcb4f21f4..245e1cdc11 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -11,6 +11,57 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class Vp8EncodingTests { + private static void RunFTransform2Test() + { + // arrange + byte[] src = { 154, 154, 151, 151, 149, 148, 151, 157, 163, 163, 154, 132, 102, 98, 104, 108, 107, 104, 104, 103, 101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, 147, 147, 146, 159, 164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, 172, 172, 172, 168, 170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, 93, 90, 102, 107, 104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, 150, 149, 152, 151, 148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 
104, 101, 100, 102, 102, 121, 117, 170, 170, 169, 171, 171, 179, 173, 175 }; + byte[] reference = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129 }; + short[] actualOutput1 = new short[16]; + short[] actualOutput2 = new short[16]; + short[] expectedOutput1 = { 182, 4, 1, 1, 6, 7, -1, -4, 5, 0, -2, 1, 2, 1, 1, 1 }; + short[] expectedOutput2 = { 192, -34, 10, 1, -11, 8, 10, -7, 6, 3, -8, 4, 5, -3, -2, 6 }; + + // act + Vp8Encoding.FTransform2(src, reference, actualOutput1, actualOutput2, new int[16]); + + // assert + Assert.True(expectedOutput1.SequenceEqual(actualOutput1)); + Assert.True(expectedOutput2.SequenceEqual(actualOutput2)); + } + + private static void RunFTransformTest() + { + // arrange + byte[] src = + { + 154, 154, 151, 151, 149, 148, 151, 157, 163, 163, 154, 132, 102, 98, 104, 108, 107, 104, 104, 103, + 101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, 147, 147, 146, 159, + 164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, 172, 172, 172, 168, + 170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, 93, 90, 102, 107, + 104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, 150, 149, 152, 151, + 148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100, 102, 102, 121, 117, + 170, 170, 169, 171, 171, 179, 173, 175 + }; + byte[] reference = + { + 
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] actualOutput = new short[16]; + short[] expectedOutput = { 182, 4, 1, 1, 6, 7, -1, -4, 5, 0, -2, 1, 2, 1, 1, 1 }; + + // act + Vp8Encoding.FTransform(src, reference, actualOutput, new int[16]); + + // assert + Assert.True(expectedOutput.SequenceEqual(actualOutput)); + } + private static void RunOneInverseTransformTest() { // arrange @@ -75,6 +126,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp Assert.True(dst.SequenceEqual(expected)); } + [Fact] + public void FTransform2_Works() => RunFTransform2Test(); + + [Fact] + public void FTransform_Works() => RunFTransformTest(); + [Fact] public void OneInverseTransform_Works() => RunOneInverseTransformTest(); @@ -82,6 +139,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp public void TwoInverseTransform_Works() => RunTwoInverseTransformTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void FTransform2_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransform2Test, HwIntrinsics.AllowAll); + + [Fact] + public void FTransform2_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransform2Test, HwIntrinsics.DisableHWIntrinsic); + + [Fact] + public void FTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransformTest, HwIntrinsics.AllowAll); + 
+ [Fact] + public void FTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunFTransformTest, HwIntrinsics.DisableHWIntrinsic); + [Fact] public void OneInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.AllowAll); From 81070c4e61060d019f043b012641ebe7dd02a388 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 26 Nov 2021 15:30:36 +0100 Subject: [PATCH 42/47] Add missing #pragma warning restore SA1503 --- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index f657d32520..d2b9704ab0 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -459,6 +459,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } else +#pragma warning restore SA1503 // Braces should not be omitted #endif { FTransform(src, reference, output, scratch); From cb084077281d30a20218ecb1e7f29009d91c191c Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 26 Nov 2021 15:58:15 +0100 Subject: [PATCH 43/47] Use nint in for loop --- src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index f3b0e8e3df..de6f807da2 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -726,7 +726,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { uint v = src[0] * 0x01010101u; Span vSpan = BitConverter.GetBytes(v).AsSpan(); - for (int i = 0; i < 16; i++) + for (nint i = 0; i < 16; i++) { if (!src.Slice(0, 4).SequenceEqual(vSpan) || !src.Slice(4, 4).SequenceEqual(vSpan) || !src.Slice(8, 4).SequenceEqual(vSpan) || !src.Slice(12, 4).SequenceEqual(vSpan)) @@ -748,7 
+748,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int offset = 0; while (numBlocks-- > 0) { - for (int i = 1; i < 16; i++) + for (nint i = 1; i < 16; i++) { // omit DC, we're only interested in AC score += Unsafe.Add(ref levelsRef, offset) != 0 ? 1 : 0; From 0215e99696d0e11295c5ce7506dbf33c5274174c Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 26 Nov 2021 16:19:12 +0100 Subject: [PATCH 44/47] Avoid pinning, avoid using LoadScalarVector128 --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 188 +++++++++--------- 1 file changed, 91 insertions(+), 97 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index d2b9704ab0..9fe526dbf5 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -407,59 +407,56 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) { -#pragma warning disable SA1503 // Braces should not be omitted - fixed (byte* srcRef = src) - fixed (byte* referenceRef = reference) - { - // Load src. - Vector128 src0 = Sse2.LoadScalarVector128((ulong*)srcRef); - Vector128 src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps)); - Vector128 src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2))); - Vector128 src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3))); - - // Load ref. - Vector128 ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef); - Vector128 ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps)); - Vector128 ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2))); - Vector128 ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3))); - - // Convert both to 16 bit. 
- Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero); - Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero); - Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero); - Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero); - Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero); - Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero); - Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero); - Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero); - - // Compute difference. -> 00 01 02 03 00' 01' 02' 03' - Vector128 diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16()); - Vector128 diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16()); - Vector128 diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16()); - Vector128 diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16()); - - // Unpack and shuffle. - // 00 01 02 03 0 0 0 0 - // 10 11 12 13 0 0 0 0 - // 20 21 22 23 0 0 0 0 - // 30 31 32 33 0 0 0 0 - Vector128 shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); - Vector128 shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); - Vector128 shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); - Vector128 shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); - - // First pass. - FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); - FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); - - // Second pass. - FTransformPass2SSE2(v01l, v32l, output); - FTransformPass2SSE2(v01h, v32h, output2); - } + ref byte srcRef = ref MemoryMarshal.GetReference(src); + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + + // Load src. 
+ var src0 = Vector128.Create(Unsafe.As(ref srcRef), 0); + var src1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0); + var src2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0); + var src3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0); + + // Load ref. + var ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0); + var ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0); + var ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0); + var ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0); + + // Convert both to 16 bit. + Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero); + Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero); + Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero); + Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero); + Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero); + Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero); + Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero); + Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero); + + // Compute difference. -> 00 01 02 03 00' 01' 02' 03' + Vector128 diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16()); + Vector128 diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16()); + Vector128 diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16()); + Vector128 diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16()); + + // Unpack and shuffle. 
+ // 00 01 02 03 0 0 0 0 + // 10 11 12 13 0 0 0 0 + // 20 21 22 23 0 0 0 0 + // 30 31 32 33 0 0 0 0 + Vector128 shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); + Vector128 shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); + + // First pass. + FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); + FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); + + // Second pass. + FTransformPass2SSE2(v01l, v32l, output); + FTransformPass2SSE2(v01h, v32h, output2); } else -#pragma warning restore SA1503 // Braces should not be omitted #endif { FTransform(src, reference, output, scratch); @@ -472,52 +469,49 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) { -#pragma warning disable SA1503 // Braces should not be omitted - fixed (byte* srcRef = src) - fixed (byte* referenceRef = reference) - { - // Load src. - Vector128 src0 = Sse2.LoadScalarVector128((ulong*)srcRef); - Vector128 src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps)); - Vector128 src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2))); - Vector128 src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3))); - - // Load ref. - Vector128 ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef); - Vector128 ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps)); - Vector128 ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2))); - Vector128 ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3))); - - // 00 01 02 03 * - // 10 11 12 13 * - // 20 21 22 23 * - // 30 31 32 33 * - // Shuffle. 
- Vector128 srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16()); - Vector128 srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16()); - Vector128 refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); - Vector128 refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); - - // 00 01 10 11 02 03 12 13 * * ... - // 20 21 30 31 22 22 32 33 * * ... - - // Convert both to 16 bit. - Vector128 src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128.Zero); - Vector128 src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128.Zero); - Vector128 ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128.Zero); - Vector128 ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128.Zero); - - // Compute the difference. - Vector128 row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16()); - Vector128 row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16()); - - // First pass - FTransformPass1SSE2(row01, row23, out Vector128 v01, out Vector128 v32); - - // Second pass - FTransformPass2SSE2(v01, v32, output); - } -#pragma warning restore SA1503 // Braces should not be omitted + ref byte srcRef = ref MemoryMarshal.GetReference(src); + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + + // Load src. + var src0 = Vector128.Create(Unsafe.As(ref srcRef), 0); + var src1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0); + var src2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0); + var src3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0); + + // Load ref. 
+ var ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0); + var ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0); + var ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0); + var ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0); + + // 00 01 02 03 * + // 10 11 12 13 * + // 20 21 22 23 * + // 30 31 32 33 * + // Shuffle. + Vector128 srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16()); + Vector128 srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16()); + Vector128 refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); + Vector128 refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); + + // 00 01 10 11 02 03 12 13 * * ... + // 20 21 30 31 22 22 32 33 * * ... + + // Convert both to 16 bit. + Vector128 src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128.Zero); + Vector128 src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128.Zero); + Vector128 ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128.Zero); + Vector128 ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128.Zero); + + // Compute the difference. + Vector128 row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16()); + Vector128 row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16()); + + // First pass. + FTransformPass1SSE2(row01, row23, out Vector128 v01, out Vector128 v32); + + // Second pass. 
+ FTransformPass2SSE2(v01, v32, output); } else #endif From 83da0e069459d716bd3df4fcd7d53282419d295d Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sat, 27 Nov 2021 15:46:13 +0100 Subject: [PATCH 45/47] Reverse array access order to avoid bounds checks --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 9fe526dbf5..ab64a8ddb7 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -523,18 +523,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int refIdx = 0; for (i = 0; i < 4; i++) { - int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255]) - int d1 = src[srcIdx + 1] - reference[refIdx + 1]; - int d2 = src[srcIdx + 2] - reference[refIdx + 2]; int d3 = src[srcIdx + 3] - reference[refIdx + 3]; + int d2 = src[srcIdx + 2] - reference[refIdx + 2]; + int d1 = src[srcIdx + 1] - reference[refIdx + 1]; + int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255]) int a0 = d0 + d3; // 10b [-510,510] int a1 = d1 + d2; int a2 = d1 - d2; int a3 = d0 - d3; - tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160] - tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542] - tmp[2 + (i * 4)] = (a0 - a1) * 8; tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9; + tmp[2 + (i * 4)] = (a0 - a1) * 8; + tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542] + tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160] srcIdx += WebpConstants.Bps; refIdx += WebpConstants.Bps; @@ -652,10 +652,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int a1 = input[inputIdx + (1 * 16)] + input[inputIdx + (3 * 16)]; int a2 = input[inputIdx + (1 * 16)] - input[inputIdx + (3 * 16)]; int a3 = input[inputIdx + (0 * 16)] - input[inputIdx + (2 * 16)]; - tmp[0 + (i * 4)] = a0 
+ a1; // 14b - tmp[1 + (i * 4)] = a3 + a2; - tmp[2 + (i * 4)] = a3 - a2; tmp[3 + (i * 4)] = a0 - a1; + tmp[2 + (i * 4)] = a3 - a2; + tmp[1 + (i * 4)] = a3 + a2; + tmp[0 + (i * 4)] = a0 + a1; // 14b inputIdx += 64; } From c0ee67b5b2b51eb51684b0c1fe3ae725331b9874 Mon Sep 17 00:00:00 2001 From: Justin Hopper Date: Sun, 28 Nov 2021 16:32:02 -0600 Subject: [PATCH 46/47] Added missing CancellationToken parameters to Image --- src/ImageSharp/Image.FromFile.cs | 15 +++++++----- src/ImageSharp/Image.FromStream.cs | 39 ++++++++++++++++++------------ 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/ImageSharp/Image.FromFile.cs b/src/ImageSharp/Image.FromFile.cs index 3a4b459c54..fce0835fba 100644 --- a/src/ImageSharp/Image.FromFile.cs +++ b/src/ImageSharp/Image.FromFile.cs @@ -255,6 +255,7 @@ namespace SixLabors.ImageSharp /// /// The file path to the image. /// The decoder. + /// The token to monitor for cancellation requests. /// The configuration is null. /// The path is null. /// The decoder is null. @@ -262,14 +263,15 @@ namespace SixLabors.ImageSharp /// Image format is not supported. /// Image contains invalid content. /// A representing the asynchronous operation. - public static Task LoadAsync(string path, IImageDecoder decoder) - => LoadAsync(Configuration.Default, path, decoder, default); + public static Task LoadAsync(string path, IImageDecoder decoder, CancellationToken cancellationToken = default) + => LoadAsync(Configuration.Default, path, decoder, cancellationToken); /// /// Create a new instance of the class from the given file. /// /// The file path to the image. /// The decoder. + /// The token to monitor for cancellation requests. /// The configuration is null. /// The path is null. /// The decoder is null. @@ -278,9 +280,9 @@ namespace SixLabors.ImageSharp /// Image contains invalid content. /// The pixel format. /// A representing the asynchronous operation. 
- public static Task> LoadAsync(string path, IImageDecoder decoder) + public static Task> LoadAsync(string path, IImageDecoder decoder, CancellationToken cancellationToken = default) where TPixel : unmanaged, IPixel - => LoadAsync(Configuration.Default, path, decoder, default); + => LoadAsync(Configuration.Default, path, decoder, cancellationToken); /// /// Create a new instance of the class from the given file. @@ -342,6 +344,7 @@ namespace SixLabors.ImageSharp /// Create a new instance of the class from the given file. /// /// The file path to the image. + /// The token to monitor for cancellation requests. /// The configuration is null. /// The path is null. /// Image format not recognised. @@ -349,9 +352,9 @@ namespace SixLabors.ImageSharp /// Image format is not supported. /// The pixel format. /// A representing the asynchronous operation. - public static Task> LoadAsync(string path) + public static Task> LoadAsync(string path, CancellationToken cancellationToken = default) where TPixel : unmanaged, IPixel - => LoadAsync(Configuration.Default, path, default(CancellationToken)); + => LoadAsync(Configuration.Default, path, cancellationToken); /// /// Create a new instance of the class from the given file. diff --git a/src/ImageSharp/Image.FromStream.cs b/src/ImageSharp/Image.FromStream.cs index 291d6f7cab..f5e32d8ce0 100644 --- a/src/ImageSharp/Image.FromStream.cs +++ b/src/ImageSharp/Image.FromStream.cs @@ -44,27 +44,29 @@ namespace SixLabors.ImageSharp /// By reading the header on the provided stream this calculates the images format type. /// /// The image stream to read the header from. + /// The token to monitor for cancellation requests. /// The stream is null. /// The stream is not readable. /// A representing the asynchronous operation or null if none is found. 
- public static Task DetectFormatAsync(Stream stream) - => DetectFormatAsync(Configuration.Default, stream); + public static Task DetectFormatAsync(Stream stream, CancellationToken cancellationToken = default) + => DetectFormatAsync(Configuration.Default, stream, cancellationToken); /// /// By reading the header on the provided stream this calculates the images format type. /// /// The configuration. /// The image stream to read the header from. + /// The token to monitor for cancellation requests. /// The configuration is null. /// The stream is null. /// The stream is not readable. /// A representing the asynchronous operation. - public static Task DetectFormatAsync(Configuration configuration, Stream stream) + public static Task DetectFormatAsync(Configuration configuration, Stream stream, CancellationToken cancellationToken = default) => WithSeekableStreamAsync( configuration, stream, (s, _) => InternalDetectFormatAsync(s, configuration), - default); + cancellationToken); /// /// Reads the raw image information from the specified stream without fully decoding it. @@ -83,6 +85,7 @@ namespace SixLabors.ImageSharp /// Reads the raw image information from the specified stream without fully decoding it. /// /// The image stream to read the header from. + /// The token to monitor for cancellation requests. /// The stream is null. /// The stream is not readable. /// Image contains invalid content. @@ -90,8 +93,8 @@ namespace SixLabors.ImageSharp /// A representing the asynchronous operation or null if /// a suitable detector is not found. /// - public static Task IdentifyAsync(Stream stream) - => IdentifyAsync(Configuration.Default, stream); + public static Task IdentifyAsync(Stream stream, CancellationToken cancellationToken = default) + => IdentifyAsync(Configuration.Default, stream, cancellationToken); /// /// Reads the raw image information from the specified stream without fully decoding it. 
@@ -227,13 +230,14 @@ namespace SixLabors.ImageSharp /// The pixel format is selected by the decoder. /// /// The stream containing image information. + /// The token to monitor for cancellation requests. /// The stream is null. /// The stream is not readable or the image format is not supported. /// Image format not recognised. /// Image contains invalid content. /// A representing the asynchronous operation. - public static Task<(Image Image, IImageFormat Format)> LoadWithFormatAsync(Stream stream) - => LoadWithFormatAsync(Configuration.Default, stream); + public static Task<(Image Image, IImageFormat Format)> LoadWithFormatAsync(Stream stream, CancellationToken cancellationToken = default) + => LoadWithFormatAsync(Configuration.Default, stream, cancellationToken); /// /// Decode a new instance of the class from the given stream. @@ -252,12 +256,14 @@ namespace SixLabors.ImageSharp /// The pixel format is selected by the decoder. /// /// The stream containing image information. + /// The token to monitor for cancellation requests. /// The stream is null. /// The stream is not readable or the image format is not supported. /// Image format not recognised. /// Image contains invalid content. /// A representing the asynchronous operation. - public static Task LoadAsync(Stream stream) => LoadAsync(Configuration.Default, stream); + public static Task LoadAsync(Stream stream, CancellationToken cancellationToken = default) + => LoadAsync(Configuration.Default, stream, cancellationToken); /// /// Decode a new instance of the class from the given stream. @@ -280,14 +286,15 @@ namespace SixLabors.ImageSharp /// /// The stream containing image information. /// The decoder. + /// The token to monitor for cancellation requests. /// The stream is null. /// The decoder is null. /// The stream is not readable or the image format is not supported. /// Image format not recognised. /// Image contains invalid content. /// A representing the asynchronous operation. 
- public static Task LoadAsync(Stream stream, IImageDecoder decoder) - => LoadAsync(Configuration.Default, stream, decoder); + public static Task LoadAsync(Stream stream, IImageDecoder decoder, CancellationToken cancellationToken = default) + => LoadAsync(Configuration.Default, stream, decoder, cancellationToken); /// /// Decode a new instance of the class from the given stream. @@ -388,15 +395,16 @@ namespace SixLabors.ImageSharp /// Create a new instance of the class from the given stream. /// /// The stream containing image information. + /// The token to monitor for cancellation requests. /// The stream is null. /// The stream is not readable or the image format is not supported. /// Image format not recognised. /// Image contains invalid content. /// The pixel format. /// A representing the asynchronous operation. - public static Task> LoadAsync(Stream stream) + public static Task> LoadAsync(Stream stream, CancellationToken cancellationToken = default) where TPixel : unmanaged, IPixel - => LoadAsync(Configuration.Default, stream); + => LoadAsync(Configuration.Default, stream, cancellationToken); /// /// Create a new instance of the class from the given stream. @@ -417,15 +425,16 @@ namespace SixLabors.ImageSharp /// Create a new instance of the class from the given stream. /// /// The stream containing image information. + /// The token to monitor for cancellation requests. /// The stream is null. /// The stream is not readable or the image format is not supported. /// Image format not recognised. /// Image contains invalid content. /// The pixel format. /// A representing the asynchronous operation. 
- public static async Task<(Image Image, IImageFormat Format)> LoadWithFormatAsync(Stream stream) + public static async Task<(Image Image, IImageFormat Format)> LoadWithFormatAsync(Stream stream, CancellationToken cancellationToken = default) where TPixel : unmanaged, IPixel - => await LoadWithFormatAsync(Configuration.Default, stream).ConfigureAwait(false); + => await LoadWithFormatAsync(Configuration.Default, stream, cancellationToken).ConfigureAwait(false); /// /// Create a new instance of the class from the given stream. From 81433c2f5254b9eb6e55b96c8898f6b036c4d99f Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 29 Nov 2021 17:52:53 +1100 Subject: [PATCH 47/47] Remove more scalar bounds checks --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index ab64a8ddb7..f12a1a7855 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -542,14 +542,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (i = 0; i < 4; i++) { - int a0 = tmp[0 + i] + tmp[12 + i]; // 15b - int a1 = tmp[4 + i] + tmp[8 + i]; - int a2 = tmp[4 + i] - tmp[8 + i]; - int a3 = tmp[0 + i] - tmp[12 + i]; - output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b - output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0)); - output[8 + i] = (short)((a0 - a1 + 7) >> 4); + int t12 = tmp[12 + i]; // 15b + int t8 = tmp[8 + i]; + + int a1 = tmp[4 + i] + t8; + int a2 = tmp[4 + i] - t8; + int a0 = tmp[0 + i] + t12; // 15b + int a3 = tmp[0 + i] - t12; + output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16); + output[8 + i] = (short)((a0 - a1 + 7) >> 4); + output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 
1 : 0)); + output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b } } } @@ -648,9 +652,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int inputIdx = 0; for (i = 0; i < 4; i++) { - int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)]; // 13b int a1 = input[inputIdx + (1 * 16)] + input[inputIdx + (3 * 16)]; int a2 = input[inputIdx + (1 * 16)] - input[inputIdx + (3 * 16)]; + int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)]; // 13b int a3 = input[inputIdx + (0 * 16)] - input[inputIdx + (2 * 16)]; tmp[3 + (i * 4)] = a0 - a1; tmp[2 + (i * 4)] = a3 - a2; @@ -662,18 +666,23 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (i = 0; i < 4; i++) { - int a0 = tmp[0 + i] + tmp[8 + i]; // 15b - int a1 = tmp[4 + i] + tmp[12 + i]; - int a2 = tmp[4 + i] - tmp[12 + i]; - int a3 = tmp[0 + i] - tmp[8 + i]; + int t12 = tmp[12 + i]; + int t8 = tmp[8 + i]; + + int a1 = tmp[4 + i] + t12; + int a2 = tmp[4 + i] - t12; + int a0 = tmp[0 + i] + t8; // 15b + int a3 = tmp[0 + i] - t8; + int b0 = a0 + a1; // 16b int b1 = a3 + a2; int b2 = a3 - a2; int b3 = a0 - a1; - output[0 + i] = (short)(b0 >> 1); // 15b - output[4 + i] = (short)(b1 >> 1); - output[8 + i] = (short)(b2 >> 1); + output[12 + i] = (short)(b3 >> 1); + output[8 + i] = (short)(b2 >> 1); + output[4 + i] = (short)(b1 >> 1); + output[0 + i] = (short)(b0 >> 1); // 15b } }