diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 3064ccc03..a10ec6eab 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -19,17 +19,63 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private static readonly Vector128 Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); #endif + // Note: method name in libwebp reference implementation is called VP8SSE16x16. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8Sse16X16(Span a, Span b) => GetSse(a, b, 16, 16); + public static int Vp8_Sse16X16(Span a, Span b) => Vp8_SseNxN(a, b, 16, 16); + // Note: method name in libwebp reference implementation is called VP8SSE16x8. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8Sse16X8(Span a, Span b) => GetSse(a, b, 16, 8); + public static int Vp8_Sse16X8(Span a, Span b) => Vp8_SseNxN(a, b, 16, 8); + // Sum of squared differences over the 4x4 block at the start of a and b (rows are WebpConstants.Bps bytes apart). Note: this method is called VP8SSE4x4 in the libwebp reference implementation. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8Sse4X4(Span a, Span b) => GetSse(a, b, 4, 4); + public static int Vp8_Sse4X4(Span a, Span b) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + // Load one 16-byte vector per row; only the first 4 bytes of each row are used below. NOTE(review): the loads are unchecked, so both spans must have at least 16 readable bytes at offset WebpConstants.Bps * 3 - confirm against callers. + ref byte aRef = ref MemoryMarshal.GetReference(a); + Vector128 a0 = Unsafe.As>(ref aRef); + Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps)); + Vector128 a2 = Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2)); + Vector128 a3 = Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3)); + ref byte bRef = ref MemoryMarshal.GetReference(b); + Vector128 b0 = Unsafe.As>(ref bRef); + Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps)); + Vector128 b2 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2)); + Vector128 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3)); + + // Combine pairs of rows: the first 4 bytes of two rows end up in the low 8 bytes. 
+ Vector128 a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32()); + Vector128 a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32()); + Vector128 b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32()); + Vector128 b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32()); + + // Convert to 16b by interleaving the low 8 bytes with zero bytes. + Vector128 a01s = Sse2.UnpackLow(a01.AsByte(), Vector128.Zero); + Vector128 a23s = Sse2.UnpackLow(a23.AsByte(), Vector128.Zero); + Vector128 b01s = Sse2.UnpackLow(b01.AsByte(), Vector128.Zero); + Vector128 b23s = Sse2.UnpackLow(b23.AsByte(), Vector128.Zero); + + // Subtract as signed 16-bit lanes (a byte-wise unsigned saturating subtract would clamp every negative difference to 0), then square and accumulate. + Vector128 d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); + Vector128 d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); + Vector128 e0 = Sse2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16()); + Vector128 e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16()); + Vector128 sum = Sse2.Add(e0, e1); + + return Numerics.ReduceSum(sum); + } + else +#endif + { + return Vp8_SseNxN(a, b, 4, 4); + } + } [MethodImpl(InliningOptions.ShortMethod)] - public static int GetSse(Span a, Span b, int w, int h) + public static int Vp8_SseNxN(Span a, Span b, int w, int h) { int count = 0; int aOffset = 0; @@ -88,7 +134,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS if (Sse41.IsSupported) { - int diffSum = TTransformSse41(a, b, w, scratch); + int diffSum = TTransformSse41(a, b, w); return Math.Abs(diffSum) >> 5; } else @@ -615,11 +661,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// Returns the weighted sum of the absolute value of transformed coefficients. /// w[] contains a row-major 4 by 4 symmetric matrix. /// - public static int TTransformSse41(Span inputA, Span inputB, Span w, Span scratch) + public static int TTransformSse41(Span inputA, Span inputB, Span w) { - Span sum = scratch.Slice(0, 4); - sum.Clear(); - // Load and combine inputs. 
Vector128 ina0 = Unsafe.As>(ref MemoryMarshal.GetReference(inputA)); Vector128 ina1 = Unsafe.As>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16))); @@ -724,9 +767,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // difference of weighted sums. Vector128 result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32()); - ref int outputRef = ref MemoryMarshal.GetReference(sum); - Unsafe.As>(ref outputRef) = result.AsInt32(); - return sum[3] + sum[2] + sum[1] + sum[0]; + return Numerics.ReduceSum(result); } #endif @@ -739,7 +780,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static void TransformOne(Span src, Span dst, Span scratch) { Span tmp = scratch.Slice(0, 16); - tmp.Clear(); int tmpOffset = 0; for (int srcOffset = 0; srcOffset < 4; srcOffset++) { diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index 97ef27d25..38ed80590 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode); // Measure RD-score. - rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst); + rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst); rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0; rdCur.H = WebpConstants.Vp8FixedCostsI16[mode]; rdCur.R = it.GetCostLuma16(rdCur, proba, res); @@ -160,7 +160,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode); // Compute RD-score. - rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst); + rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst); rdTmp.SD = tlambda != 0 ? 
Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0; rdTmp.H = modeCosts[mode]; @@ -251,7 +251,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode); // Compute RD-score - rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst); + rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst); rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas. rdUv.H = WebpConstants.Vp8FixedCostsUv[mode]; rdUv.R = it.GetCostUv(rdUv, proba, res); @@ -340,8 +340,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Span reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]); Span tmp = it.Scratch2.AsSpan(0, 16); Span scratch = it.Scratch3.AsSpan(0, 16); - tmp.Clear(); - scratch.Clear(); Vp8Encoding.FTransform(src, reference, tmp, scratch); int nz = QuantizeBlock(tmp, levels, ref dqm.Y1); Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch); @@ -357,8 +355,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int n; Span tmp = it.Scratch2.AsSpan(0, 8 * 16); Span scratch = it.Scratch3.AsSpan(0, 16); - tmp.Clear(); - scratch.Clear(); for (n = 0; n < 8; n += 2) { @@ -411,7 +407,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (mode = 0; mode < WebpConstants.NumPredModes; ++mode) { Span reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]); - long score = (LossyUtils.Vp8Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16); + long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16); if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit) { @@ -458,7 +454,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (mode = 0; mode < WebpConstants.NumBModes; ++mode) { Span reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]); - long score = (LossyUtils.Vp8Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * 
lambdaDi4); + long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4); if (score < bestI4Score) { bestI4Mode = mode; @@ -507,7 +503,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (mode = 0; mode < WebpConstants.NumPredModes; ++mode) { Span reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]); - long score = (LossyUtils.Vp8Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv); + long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv); if (score < bestUvScore) { bestMode = mode; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 0567a0f27..af7e8eaa3 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -81,7 +81,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { int i; Span tmp = scratch.Slice(0, 16); - tmp.Clear(); for (i = 0; i < 4; i++) { // vertical pass. 
@@ -124,7 +123,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { int i; Span tmp = scratch.Slice(0, 16); - tmp.Clear(); int srcIdx = 0; int refIdx = 0; @@ -163,7 +161,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static void FTransformWht(Span input, Span output, Span scratch) { Span tmp = scratch.Slice(0, 16); - tmp.Clear(); int i; int inputIdx = 0; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs index 7192fa2d0..6e724e475 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs @@ -49,7 +49,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.distribution.AsSpan().Clear(); for (j = startBlock; j < endBlock; j++) { - this.output.AsSpan().Clear(); this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output); // Convert coefficients to bin. diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs index 1c92a9d2d..69841b557 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs @@ -97,18 +97,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public void Clear() { - this.YDcLevels.AsSpan().Clear(); - this.YAcLevels.AsSpan().Clear(); - this.UvLevels.AsSpan().Clear(); - this.ModesI4.AsSpan().Clear(); - - for (int i = 0; i < 2; i++) - { - for (int j = 0; j < 3; j++) - { - this.Derr[i, j] = 0; - } - } + Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length); + Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length); + Array.Clear(this.UvLevels, 0, this.UvLevels.Length); + Array.Clear(this.ModesI4, 0, this.ModesI4.Length); + Array.Clear(this.Derr, 0, this.Derr.Length); } public void InitScore() diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 
09727293c..d176a5933 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -11,6 +11,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP [Trait("Format", "Webp")] public class LossyUtilsTests { + private static void RunVp8Sse4X4Test() + { + byte[] a = + { + 27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, + 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28, + 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26, + 26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, + 128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27, + 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128 + }; + + byte[] b = + { + 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, + 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204 + }; + + int expected = 27; + + int actual = LossyUtils.Vp8_Sse4X4(a, b); + + Assert.Equal(expected, actual); + } + private static void RunMean16x4Test() { // arrange @@ -61,6 +90,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP Assert.Equal(expected, actual); } + [Fact] + public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test(); + [Fact] public void Mean16x4_Works() => RunMean16x4Test(); @@ -68,6 +100,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP 
public void HadamardTransform_Works() => RunHadamardTransformTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); + + [Fact] + public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic); + [Fact] public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);