diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
index 79fd8d8543..489977cb82 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
@@ -2,6 +2,10 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
@@ -9,7 +13,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     /// <summary>
     /// Iterator structure to iterate through macroblocks, pointing to the
     /// right neighbouring data (samples, predictions, contexts, ...)
    /// </summary>
-    internal class Vp8EncIterator
+    internal unsafe class Vp8EncIterator
     {
         public const int YOffEnc = 0;
@@ -29,6 +33,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 
         private readonly int mbh;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
+#endif
+
         /// <summary>
         /// Stride of the prediction plane(=4*mbw + 1).
         /// </summary>
@@ -357,12 +365,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             int q = quality;
             int kThreshold = 8 + ((17 - 8) * q / 100);
             int k;
-            uint[] dc = new uint[16];
+            Span<uint> dc = stackalloc uint[16];
+            Span<ushort> tmp = stackalloc ushort[16];
             uint m;
             uint m2;
             for (k = 0; k < 16; k += 4)
             {
-                this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.AsSpan(k));
+                this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
             }
 
             for (m = 0, m2 = 0, k = 0; k < 16; ++k)
@@ -823,21 +832,61 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             this.Nz[this.nzIdx] = nz;
         }
 
-        private void Mean16x4(Span<byte> input, Span<uint> dc)
+        private void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
         {
-            for (int k = 0; k < 4; k++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
             {
-                uint avg = 0;
-                for (int y = 0; y < 4; y++)
+#pragma warning disable SA1503 // Braces should not be omitted
+                tmp.Clear();
+                fixed (byte* inputPtr = input)
+                fixed (ushort* tmpPtr = tmp)
                 {
-                    for (int x = 0; x < 4; x++)
+                    Vector128<byte> a0 = Sse2.LoadVector128(inputPtr);
+                    Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps);
+                    Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2));
+                    Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3));
+                    Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
+                    Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
+                    Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
+                    Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
+                    Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
+                    Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
+                    Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
+                    Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
+                    Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
+                    Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
+                    Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
+                    Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
+                    Vector128<int> e0 = Sse2.Add(d0, d1);
+                    Vector128<int> e1 = Sse2.Add(d2, d3);
+                    Vector128<int> f0 = Sse2.Add(e0, e1);
+                    Sse2.Store(tmpPtr, f0.AsUInt16());
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
+
+                dc[0] = (uint)(tmp[1] + tmp[0]);
+                dc[1] = (uint)(tmp[3] + tmp[2]);
+                dc[2] = (uint)(tmp[5] + tmp[4]);
+                dc[3] = (uint)(tmp[7] + tmp[6]);
+            }
+            else
+#endif
+            {
+                for (int k = 0; k < 4; k++)
+                {
+                    uint avg = 0;
+                    for (int y = 0; y < 4; y++)
                     {
-                        avg += input[x + (y * WebpConstants.Bps)];
+                        for (int x = 0; x < 4; x++)
+                        {
+                            avg += input[x + (y * WebpConstants.Bps)];
+                        }
                     }
-                }
 
-                dc[k] = avg;
-                input = input.Slice(4); // go to next 4x4 block.
+                    dc[k] = avg;
+                    input = input.Slice(4); // go to next 4x4 block.
+                }
             }
         }
 