diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index ee224e0b0..3064ccc03 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -13,8 +13,12 @@ using System.Runtime.Intrinsics.X86; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Webp.Lossy { - internal static unsafe class LossyUtils + internal static class LossyUtils { +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector128 Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); +#endif + [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8Sse16X16(Span a, Span b) => GetSse(a, b, 16, 16); @@ -938,26 +942,55 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); } - [MethodImpl(InliningOptions.ShortMethod)] - public static uint LoadUv(byte u, byte v) => - (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each). - - [MethodImpl(InliningOptions.ShortMethod)] - public static void YuvToBgr(int y, int u, int v, Span bgr) + public static void Mean16x4(Span input, Span dc) { - bgr[0] = (byte)YuvToB(y, u); - bgr[1] = (byte)YuvToG(y, u, v); - bgr[2] = (byte)YuvToR(y, v); - } - - [MethodImpl(InliningOptions.ShortMethod)] - public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); - - [MethodImpl(InliningOptions.ShortMethod)] - public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Ssse3.IsSupported) + { + Vector128 a0 = Unsafe.As>(ref MemoryMarshal.GetReference(input)); + Vector128 a1 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); + Vector128 a2 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16))); + Vector128 a3 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16))); + Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte + Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); + Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); + Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); + Vector128 c0 = Sse2.And(a0, Mean16x4Mask); // lo byte + Vector128 c1 = Sse2.And(a1, Mean16x4Mask); + Vector128 c2 = Sse2.And(a2, Mean16x4Mask); + Vector128 c3 = Sse2.And(a3, Mean16x4Mask); + Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); + Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); + Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); + Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); + Vector128 e0 = Sse2.Add(d0, d1); + Vector128 e1 = Sse2.Add(d2, d3); + Vector128 f0 = Sse2.Add(e0, e1); + Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); + Vector128 wide = Sse2.UnpackLow(hadd, Vector128.Zero).AsUInt32(); + + ref uint outputRef = ref MemoryMarshal.GetReference(dc); + Unsafe.As>(ref outputRef) = wide; + } + else +#endif + { + for (int k = 0; k < 4; k++) + { + uint avg = 0; + for (int y = 0; y < 4; y++) + { + for (int x = 0; x < 4; x++) + { + avg += input[x + (y * WebpConstants.Bps)]; + } + } - [MethodImpl(InliningOptions.ShortMethod)] - public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); + dc[k] = avg; + input = input.Slice(4); // go to next 4x4 block. + } + } + } [MethodImpl(InliningOptions.ShortMethod)] public static byte Avg2(byte a, byte b) => (byte)((a + b + 1) >> 1); @@ -1163,9 +1196,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy return WebpLookupTables.Abs0(p1 - p0) > thresh || WebpLookupTables.Abs0(q1 - q0) > thresh; } - [MethodImpl(InliningOptions.ShortMethod)] - private static int MultHi(int v, int coeff) => (v * coeff) >> 8; - [MethodImpl(InliningOptions.ShortMethod)] private static void Store(Span dst, int x, int y, int v) { @@ -1188,13 +1218,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy [MethodImpl(InliningOptions.ShortMethod)] private static int Mul2(int a) => (a * 35468) >> 16; - [MethodImpl(InliningOptions.ShortMethod)] - private static byte Clip8(int v) - { - int yuvMask = (256 << 6) - 1; - return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255); - } - [MethodImpl(InliningOptions.ShortMethod)] private static void Put8x8uv(byte value, Span dst) { diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs index 79fd8d854..6279aef65 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs @@ -357,15 +357,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int q = quality; int kThreshold = 8 + ((17 - 8) * q / 100); int k; - uint[] dc = new uint[16]; + Span dc = stackalloc uint[16]; + Span tmp = stackalloc ushort[16]; uint m; uint m2; for (k = 0; k < 16; k += 4) { - this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.AsSpan(k)); + LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4)); } - for (m = 0, m2 = 0, k = 0; k < 16; ++k) + for (m = 0, m2 = 0, k = 0; k < 16; k++) { m += dc[k]; m2 += dc[k] * dc[k]; @@ -823,24 +824,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.Nz[this.nzIdx] = nz; } - private void Mean16x4(Span input, Span dc) - { - for (int k = 0; k < 4; k++) - { - uint avg = 0; - for (int y = 0; y < 4; y++) - { - for (int x = 0; x < 4; x++) - { - avg += input[x + (y * WebpConstants.Bps)]; - } - } - - dc[k] = avg; - input = input.Slice(4); // go to next 4x4 block. - } - } - private void ImportBlock(Span src, int srcStride, Span dst, int w, int h, int size) { int dstIdx = 0; diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index 4f283f9f5..2f78842c6 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -747,21 +747,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { int xStep = 3; int lastPixelPair = (len - 1) >> 1; - uint tluv = LossyUtils.LoadUv(topU[0], topV[0]); // top-left sample - uint luv = LossyUtils.LoadUv(curU[0], curV[0]); // left-sample + uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample + uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); + YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); if (bottomY != null) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); + YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); } for (int x = 1; x <= lastPixelPair; x++) { - uint tuv = LossyUtils.LoadUv(topU[x], topV[x]); // top sample - uint uv = LossyUtils.LoadUv(curU[x], curV[x]); // sample + uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample + uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample // Precompute invariant values associated with first and second diagonals. uint avg = tluv + tuv + luv + uv + 0x00080008u; @@ -770,15 +770,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uv0 = (diag12 + tluv) >> 1; uint uv1 = (diag03 + tuv) >> 1; int xMul2 = x * 2; - LossyUtils.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); - LossyUtils.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); + YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); + YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); if (bottomY != null) { uv0 = (diag03 + luv) >> 1; uv1 = (diag12 + uv) >> 1; - LossyUtils.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); - LossyUtils.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); + YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); + YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); } tluv = tuv; @@ -788,11 +788,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if ((len & 1) == 0) { uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); + YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); if (bottomY != null) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); + YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); } } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index ed03c2e71..a9cf876c8 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -299,5 +299,36 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uv = (uv + rounding + (128 << (YuvFix + 2))) >> (YuvFix + 2); return (uv & ~0xff) == 0 ? uv : uv < 0 ? 0 : 255; } + + [MethodImpl(InliningOptions.ShortMethod)] + public static uint LoadUv(byte u, byte v) => + (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each). + + [MethodImpl(InliningOptions.ShortMethod)] + public static void YuvToBgr(int y, int u, int v, Span bgr) + { + bgr[2] = (byte)YuvToR(y, v); + bgr[1] = (byte)YuvToG(y, u, v); + bgr[0] = (byte)YuvToB(y, u); + } + + [MethodImpl(InliningOptions.ShortMethod)] + public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); + + [MethodImpl(InliningOptions.ShortMethod)] + public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); + + [MethodImpl(InliningOptions.ShortMethod)] + public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); + + [MethodImpl(InliningOptions.ShortMethod)] + private static int MultHi(int v, int coeff) => (v * coeff) >> 8; + + [MethodImpl(InliningOptions.ShortMethod)] + private static byte Clip8(int v) + { + int yuvMask = (256 << 6) - 1; + return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255); + } } } diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index f8b488fde..09727293c 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System.Linq; using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; @@ -10,6 +11,29 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP [Trait("Format", "Webp")] public class LossyUtilsTests { + private static void RunMean16x4Test() + { + // arrange + byte[] input = + { + 154, 145, 102, 115, 127, 129, 126, 125, 126, 120, 133, 152, 157, 153, 119, 94, 104, 116, 111, 113, + 113, 109, 105, 124, 173, 175, 177, 170, 175, 172, 166, 164, 151, 141, 99, 114, 125, 126, 135, 150, + 133, 115, 127, 149, 141, 168, 100, 54, 110, 117, 115, 116, 119, 115, 117, 130, 174, 174, 174, 157, + 146, 171, 166, 158, 117, 140, 96, 111, 119, 119, 136, 171, 188, 134, 121, 126, 136, 119, 59, 77, + 109, 115, 113, 120, 120, 117, 128, 115, 174, 173, 173, 161, 152, 148, 153, 162, 105, 140, 96, 114, + 115, 122, 141, 173, 190, 190, 142, 106, 151, 78, 66, 141, 110, 117, 123, 136, 118, 124, 127, 114, + 173, 175, 166, 155, 155, 159, 159, 158 + }; + uint[] dc = new uint[4]; + uint[] expectedDc = { 1940, 2139, 2252, 1813 }; + + // act + LossyUtils.Mean16x4(input, dc); + + // assert + Assert.True(dc.SequenceEqual(expectedDc)); + } + private static void RunHadamardTransformTest() { byte[] a = @@ -37,16 +61,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP Assert.Equal(expected, actual); } + [Fact] + public void Mean16x4_Works() => RunMean16x4Test(); + [Fact] public void HadamardTransform_Works() => RunHadamardTransformTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); + + [Fact] + public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic); + [Fact] public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll); [Fact] public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic); #endif - } }