From 765f5a23138ce905056a2e7f69f4a3c0feaf4842 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 7 Nov 2021 16:13:28 +0100 Subject: [PATCH 01/12] Add SSE2 version of Mean16x4 --- .../Formats/Webp/Lossy/Vp8EncIterator.cs | 73 ++++++++++++++++--- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs index 79fd8d854..489977cb8 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs @@ -2,6 +2,10 @@ // Licensed under the Apache License, Version 2.0. using System; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossy { @@ -9,7 +13,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// Iterator structure to iterate through macroblocks, pointing to the /// right neighbouring data (samples, predictions, contexts, ...) /// - internal class Vp8EncIterator + internal unsafe class Vp8EncIterator { public const int YOffEnc = 0; @@ -29,6 +33,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private readonly int mbh; +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector128 Mean16x4Mask = Vector128.Create(0x00ff).AsByte(); +#endif + /// /// Stride of the prediction plane(=4*mbw + 1). /// @@ -357,12 +365,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int q = quality; int kThreshold = 8 + ((17 - 8) * q / 100); int k; - uint[] dc = new uint[16]; + Span dc = stackalloc uint[16]; + Span tmp = stackalloc ushort[16]; uint m; uint m2; for (k = 0; k < 16; k += 4) { - this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.AsSpan(k)); + this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp); } for (m = 0, m2 = 0, k = 0; k < 16; ++k) @@ -823,21 +832,61 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.Nz[this.nzIdx] = nz; } - private void Mean16x4(Span input, Span dc) + private void Mean16x4(Span input, Span dc, Span tmp) { - for (int k = 0; k < 4; k++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - uint avg = 0; - for (int y = 0; y < 4; y++) +#pragma warning disable SA1503 // Braces should not be omitted + tmp.Clear(); + fixed (byte* inputPtr = input) + fixed (ushort* tmpPtr = tmp) { - for (int x = 0; x < 4; x++) + Vector128 a0 = Sse2.LoadVector128(inputPtr); + Vector128 a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps); + Vector128 a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2)); + Vector128 a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3)); + Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte + Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); + Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); + Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); + Vector128 c0 = Sse2.And(a0, Mean16x4Mask); // lo byte + Vector128 c1 = Sse2.And(a1, Mean16x4Mask); + Vector128 c2 = Sse2.And(a2, Mean16x4Mask); + Vector128 c3 = Sse2.And(a3, Mean16x4Mask); + Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); + Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); + Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); + Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); + Vector128 e0 = Sse2.Add(d0, d1); + Vector128 e1 = Sse2.Add(d2, d3); + Vector128 f0 = Sse2.Add(e0, e1); + Sse2.Store(tmpPtr, f0.AsUInt16()); + } +#pragma warning restore SA1503 // Braces should not be omitted + + dc[0] = (uint)(tmp[1] + tmp[0]); + dc[1] = (uint)(tmp[3] + tmp[2]); + dc[2] = (uint)(tmp[5] + tmp[4]); + dc[3] = (uint)(tmp[7] + tmp[6]); + } + else +#endif + { + for (int k = 0; k < 4; k++) + { + uint avg = 0; + for (int y = 0; y < 4; y++) { - avg += input[x + (y * WebpConstants.Bps)]; + for (int x = 0; x < 4; x++) + { + avg += input[x + (y * WebpConstants.Bps)]; + } } - } - dc[k] = avg; - input = input.Slice(4); // go to next 4x4 block. + dc[k] = avg; + input = input.Slice(4); // go to next 4x4 block. + } } } From 8b8871b3ba75581ee2ff5f3fcb294bd640743136 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 7 Nov 2021 16:39:42 +0100 Subject: [PATCH 02/12] Make Mean16x4 static and move to LossyUtils --- .../Formats/Webp/Lossy/LossyUtils.cs | 68 +++++++++++++++++- .../Formats/Webp/Lossy/Vp8EncIterator.cs | 72 +------------------ 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index d5db3dffa..c3f6e522a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -4,12 +4,20 @@ using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Webp.Lossy { - internal static class LossyUtils + internal static unsafe class LossyUtils { +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector128 Mean16x4Mask = Vector128.Create(0x00ff).AsByte(); +#endif + [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8Sse16X16(Span a, Span b) => GetSse(a, b, 16, 16); @@ -801,6 +809,64 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); } + public static void Mean16x4(Span input, Span dc, Span tmp) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { +#pragma warning disable SA1503 // Braces should not be omitted + tmp.Clear(); + fixed (byte* inputPtr = input) + fixed (ushort* tmpPtr = tmp) + { + Vector128 a0 = Sse2.LoadVector128(inputPtr); + Vector128 a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps); + Vector128 a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2)); + Vector128 a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3)); + Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte + Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); + Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); + Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); + Vector128 c0 = Sse2.And(a0, Mean16x4Mask); // lo byte + Vector128 c1 = Sse2.And(a1, Mean16x4Mask); + Vector128 c2 = Sse2.And(a2, Mean16x4Mask); + Vector128 c3 = Sse2.And(a3, Mean16x4Mask); + Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); + Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); + Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); + Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); + Vector128 e0 = Sse2.Add(d0, d1); + Vector128 e1 = Sse2.Add(d2, d3); + Vector128 f0 = Sse2.Add(e0, e1); + Sse2.Store(tmpPtr, f0.AsUInt16()); + } +#pragma warning restore SA1503 // Braces should not be omitted + + dc[0] = (uint)(tmp[1] + tmp[0]); + dc[1] = (uint)(tmp[3] + tmp[2]); + dc[2] = (uint)(tmp[5] + tmp[4]); + dc[3] = (uint)(tmp[7] + tmp[6]); + } + else +#endif + { + for (int k = 0; k < 4; k++) + { + uint avg = 0; + for (int y = 0; y < 4; y++) + { + for (int x = 0; x < 4; x++) + { + avg += input[x + (y * WebpConstants.Bps)]; + } + } + + dc[k] = avg; + input = input.Slice(4); // go to next 4x4 block. + } + } + } + [MethodImpl(InliningOptions.ShortMethod)] public static uint LoadUv(byte u, byte v) => (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each). diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs index 489977cb8..57e18832e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs @@ -2,10 +2,6 @@ // Licensed under the Apache License, Version 2.0. using System; -#if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossy { @@ -13,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// Iterator structure to iterate through macroblocks, pointing to the /// right neighbouring data (samples, predictions, contexts, ...) /// - internal unsafe class Vp8EncIterator + internal class Vp8EncIterator { public const int YOffEnc = 0; @@ -33,10 +29,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy private readonly int mbh; -#if SUPPORTS_RUNTIME_INTRINSICS - private static readonly Vector128 Mean16x4Mask = Vector128.Create(0x00ff).AsByte(); -#endif - /// /// Stride of the prediction plane(=4*mbw + 1). /// @@ -371,10 +363,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uint m2; for (k = 0; k < 16; k += 4) { - this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp); + LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp); } - for (m = 0, m2 = 0, k = 0; k < 16; ++k) + for (m = 0, m2 = 0, k = 0; k < 16; k++) { m += dc[k]; m2 += dc[k] * dc[k]; @@ -832,64 +824,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.Nz[this.nzIdx] = nz; } - private void Mean16x4(Span input, Span dc, Span tmp) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) - { -#pragma warning disable SA1503 // Braces should not be omitted - tmp.Clear(); - fixed (byte* inputPtr = input) - fixed (ushort* tmpPtr = tmp) - { - Vector128 a0 = Sse2.LoadVector128(inputPtr); - Vector128 a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps); - Vector128 a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2)); - Vector128 a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3)); - Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte - Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); - Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); - Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); - Vector128 c0 = Sse2.And(a0, Mean16x4Mask); // lo byte - Vector128 c1 = Sse2.And(a1, Mean16x4Mask); - Vector128 c2 = Sse2.And(a2, Mean16x4Mask); - Vector128 c3 = Sse2.And(a3, Mean16x4Mask); - Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); - Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); - Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); - Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); - Vector128 e0 = Sse2.Add(d0, d1); - Vector128 e1 = Sse2.Add(d2, d3); - Vector128 f0 = Sse2.Add(e0, e1); - Sse2.Store(tmpPtr, f0.AsUInt16()); - } -#pragma warning restore SA1503 // Braces should not be omitted - - dc[0] = (uint)(tmp[1] + tmp[0]); - dc[1] = (uint)(tmp[3] + tmp[2]); - dc[2] = (uint)(tmp[5] + tmp[4]); - dc[3] = (uint)(tmp[7] + tmp[6]); - } - else -#endif - { - for (int k = 0; k < 4; k++) - { - uint avg = 0; - for (int y = 0; y < 4; y++) - { - for (int x = 0; x < 4; x++) - { - avg += input[x + (y * WebpConstants.Bps)]; - } - } - - dc[k] = avg; - input = input.Slice(4); // go to next 4x4 block. - } - } - } - private void ImportBlock(Span src, int srcStride, Span dst, int w, int h, int size) { int dstIdx = 0; From 984971e1d9aca406cfd41b742da96b2d8447fa1b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 7 Nov 2021 16:48:10 +0100 Subject: [PATCH 03/12] Move yuv related methods to YuvConversion class --- .../Formats/Webp/Lossy/LossyUtils.cs | 31 ------------------- .../Formats/Webp/Lossy/WebpLossyDecoder.cs | 24 +++++++------- .../Formats/Webp/Lossy/YuvConversion.cs | 31 +++++++++++++++++++ 3 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index c3f6e522a..b2513feb5 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -867,27 +867,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } - [MethodImpl(InliningOptions.ShortMethod)] - public static uint LoadUv(byte u, byte v) => - (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each). - - [MethodImpl(InliningOptions.ShortMethod)] - public static void YuvToBgr(int y, int u, int v, Span bgr) - { - bgr[0] = (byte)YuvToB(y, u); - bgr[1] = (byte)YuvToG(y, u, v); - bgr[2] = (byte)YuvToR(y, v); - } - - [MethodImpl(InliningOptions.ShortMethod)] - public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); - - [MethodImpl(InliningOptions.ShortMethod)] - public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); - - [MethodImpl(InliningOptions.ShortMethod)] - public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); - [MethodImpl(InliningOptions.ShortMethod)] public static byte Avg2(byte a, byte b) => (byte)((a + b + 1) >> 1); @@ -1092,9 +1071,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy return WebpLookupTables.Abs0(p1 - p0) > thresh || WebpLookupTables.Abs0(q1 - q0) > thresh; } - [MethodImpl(InliningOptions.ShortMethod)] - private static int MultHi(int v, int coeff) => (v * coeff) >> 8; - [MethodImpl(InliningOptions.ShortMethod)] private static void Store(Span dst, int x, int y, int v) { @@ -1117,13 +1093,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy [MethodImpl(InliningOptions.ShortMethod)] private static int Mul2(int a) => (a * 35468) >> 16; - [MethodImpl(InliningOptions.ShortMethod)] - private static byte Clip8(int v) - { - int yuvMask = (256 << 6) - 1; - return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255); - } - [MethodImpl(InliningOptions.ShortMethod)] private static void Put8x8uv(byte value, Span dst) { diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs index 4f283f9f5..2f78842c6 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs @@ -747,21 +747,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { int xStep = 3; int lastPixelPair = (len - 1) >> 1; - uint tluv = LossyUtils.LoadUv(topU[0], topV[0]); // top-left sample - uint luv = LossyUtils.LoadUv(curU[0], curV[0]); // left-sample + uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample + uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); + YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst); if (bottomY != null) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); + YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst); } for (int x = 1; x <= lastPixelPair; x++) { - uint tuv = LossyUtils.LoadUv(topU[x], topV[x]); // top sample - uint uv = LossyUtils.LoadUv(curU[x], curV[x]); // sample + uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample + uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample // Precompute invariant values associated with first and second diagonals. uint avg = tluv + tuv + luv + uv + 0x00080008u; @@ -770,15 +770,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uv0 = (diag12 + tluv) >> 1; uint uv1 = (diag03 + tuv) >> 1; int xMul2 = x * 2; - LossyUtils.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); - LossyUtils.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); + YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep)); + YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep)); if (bottomY != null) { uv0 = (diag03 + luv) >> 1; uv1 = (diag12 + uv) >> 1; - LossyUtils.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); - LossyUtils.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); + YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep)); + YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep)); } tluv = tuv; @@ -788,11 +788,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if ((len & 1) == 0) { uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); + YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep)); if (bottomY != null) { uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2; - LossyUtils.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); + YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep)); } } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index ed03c2e71..24143785a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -299,5 +299,36 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uv = (uv + rounding + (128 << (YuvFix + 2))) >> (YuvFix + 2); return (uv & ~0xff) == 0 ? uv : uv < 0 ? 0 : 255; } + + [MethodImpl(InliningOptions.ShortMethod)] + public static uint LoadUv(byte u, byte v) => + (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each). + + [MethodImpl(InliningOptions.ShortMethod)] + public static void YuvToBgr(int y, int u, int v, Span bgr) + { + bgr[0] = (byte)YuvToB(y, u); + bgr[1] = (byte)YuvToG(y, u, v); + bgr[2] = (byte)YuvToR(y, v); + } + + [MethodImpl(InliningOptions.ShortMethod)] + public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); + + [MethodImpl(InliningOptions.ShortMethod)] + public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); + + [MethodImpl(InliningOptions.ShortMethod)] + public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); + + [MethodImpl(InliningOptions.ShortMethod)] + private static int MultHi(int v, int coeff) => (v * coeff) >> 8; + + [MethodImpl(InliningOptions.ShortMethod)] + private static byte Clip8(int v) + { + int yuvMask = (256 << 6) - 1; + return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255); + } } } From 0c96e37ba639d1d44b64840c41f01455a53eb9af Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 7 Nov 2021 17:39:50 +0100 Subject: [PATCH 04/12] Add Mean16x4 sse tests --- .../Formats/Webp/Lossy/LossyUtils.cs | 2 +- .../Formats/WebP/LossyUtilsTests.cs | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index b2513feb5..74448cf52 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy internal static unsafe class LossyUtils { #if SUPPORTS_RUNTIME_INTRINSICS - private static readonly Vector128 Mean16x4Mask = Vector128.Create(0x00ff).AsByte(); + private static readonly Vector128 Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); #endif [MethodImpl(InliningOptions.ShortMethod)] diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs new file mode 100644 index 000000000..5062f845b --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -0,0 +1,49 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System.Linq; +using SixLabors.ImageSharp.Formats.Webp.Lossy; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Formats.WebP +{ + [Trait("Format", "Webp")] + public class LossyUtilsTests + { + private static void RunMean16x4Test() + { + // arrange + byte[] input = + { + 154, 145, 102, 115, 127, 129, 126, 125, 126, 120, 133, 152, 157, 153, 119, 94, 104, 116, 111, 113, + 113, 109, 105, 124, 173, 175, 177, 170, 175, 172, 166, 164, 151, 141, 99, 114, 125, 126, 135, 150, + 133, 115, 127, 149, 141, 168, 100, 54, 110, 117, 115, 116, 119, 115, 117, 130, 174, 174, 174, 157, + 146, 171, 166, 158, 117, 140, 96, 111, 119, 119, 136, 171, 188, 134, 121, 126, 136, 119, 59, 77, + 109, 115, 113, 120, 120, 117, 128, 115, 174, 173, 173, 161, 152, 148, 153, 162, 105, 140, 96, 114, + 115, 122, 141, 173, 190, 190, 142, 106, 151, 78, 66, 141, 110, 117, 123, 136, 118, 124, 127, 114, + 173, 175, 166, 155, 155, 159, 159, 158 + }; + uint[] dc = new uint[4]; + ushort[] tmp = new ushort[8]; + uint[] expectedDc = { 1940, 2139, 2252, 1813 }; + + // act + LossyUtils.Mean16x4(input, dc, tmp); + + // assert + Assert.True(dc.SequenceEqual(expectedDc)); + } + + [Fact] + public void Mean16x4_Works() => RunMean16x4Test(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); + + [Fact] + public void Mean16x4_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableSSE2); +#endif + } +} From 3c9c1bb23eb63863fcac38ac4478f097d73e1e0f Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 9 Nov 2021 11:21:18 +0100 Subject: [PATCH 05/12] Avoid pinning --- .../Formats/Webp/Lossy/LossyUtils.cs | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 74448cf52..6de2989bd 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -4,6 +4,7 @@ using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -814,33 +815,28 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) { -#pragma warning disable SA1503 // Braces should not be omitted tmp.Clear(); - fixed (byte* inputPtr = input) - fixed (ushort* tmpPtr = tmp) - { - Vector128 a0 = Sse2.LoadVector128(inputPtr); - Vector128 a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps); - Vector128 a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2)); - Vector128 a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3)); - Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte - Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); - Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); - Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); - Vector128 c0 = Sse2.And(a0, Mean16x4Mask); // lo byte - Vector128 c1 = Sse2.And(a1, Mean16x4Mask); - Vector128 c2 = Sse2.And(a2, Mean16x4Mask); - Vector128 c3 = Sse2.And(a3, Mean16x4Mask); - Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); - Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); - Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); - Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); - Vector128 e0 = Sse2.Add(d0, d1); - Vector128 e1 = Sse2.Add(d2, d3); - Vector128 f0 = Sse2.Add(e0, e1); - Sse2.Store(tmpPtr, f0.AsUInt16()); - } -#pragma warning restore SA1503 // Braces should not be omitted + Vector128 a0 = Unsafe.As>(ref MemoryMarshal.GetReference(input)); + Vector128 a1 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); + Vector128 a2 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16))); + Vector128 a3 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16))); + Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte + Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); + Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); + Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); + Vector128 c0 = Sse2.And(a0, Mean16x4Mask); // lo byte + Vector128 c1 = Sse2.And(a1, Mean16x4Mask); + Vector128 c2 = Sse2.And(a2, Mean16x4Mask); + Vector128 c3 = Sse2.And(a3, Mean16x4Mask); + Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); + Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); + Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); + Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); + Vector128 e0 = Sse2.Add(d0, d1); + Vector128 e1 = Sse2.Add(d2, d3); + Vector128 f0 = Sse2.Add(e0, e1); + ref ushort outputRef = ref MemoryMarshal.GetReference(tmp); + Unsafe.As>(ref outputRef) = f0.AsUInt16(); dc[0] = (uint)(tmp[1] + tmp[0]); dc[1] = (uint)(tmp[3] + tmp[2]); From 1418e53bfbb719c36d57f4ac46317ca990d2fba2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 9 Nov 2021 14:58:31 +0100 Subject: [PATCH 06/12] Remove not need Clear of tmp buffer --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 408f6f066..7c262a30e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -947,7 +947,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) { - tmp.Clear(); Vector128 a0 = Unsafe.As>(ref MemoryMarshal.GetReference(input)); Vector128 a1 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); Vector128 a2 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16))); From 3cfa040b2099a5c91c8b1e15e5f2fd4c440a6f77 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 9 Nov 2021 15:38:20 +0100 Subject: [PATCH 07/12] Use Ssse3.HorizontalAdd --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 14 +++++++------- .../Formats/Webp/Lossy/Vp8EncIterator.cs | 2 +- .../Formats/WebP/LossyUtilsTests.cs | 5 ++--- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 7c262a30e..5b27af821 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -942,7 +942,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); } - public static void Mean16x4(Span input, Span dc, Span tmp) + public static void Mean16x4(Span input, Span dc) { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) @@ -966,13 +966,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 e0 = Sse2.Add(d0, d1); Vector128 e1 = Sse2.Add(d2, d3); Vector128 f0 = Sse2.Add(e0, e1); - ref ushort outputRef = ref MemoryMarshal.GetReference(tmp); - Unsafe.As>(ref outputRef) = f0.AsUInt16(); + Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); + Vector64 lower = hadd.GetLower(); - dc[0] = (uint)(tmp[1] + tmp[0]); - dc[1] = (uint)(tmp[3] + tmp[2]); - dc[2] = (uint)(tmp[5] + tmp[4]); - dc[3] = (uint)(tmp[7] + tmp[6]); + dc[0] = (uint)lower.GetElement(0); + dc[1] = (uint)lower.GetElement(1); + dc[2] = (uint)lower.GetElement(2); + dc[3] = (uint)lower.GetElement(3); } else #endif diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs index 57e18832e..6279aef65 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs @@ -363,7 +363,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy uint m2; for (k = 0; k < 16; k += 4) { - LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp); + LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4)); } for (m = 0, m2 = 0, k = 0; k < 16; k++) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 16b8e1166..09727293c 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -25,11 +25,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP 173, 175, 166, 155, 155, 159, 159, 158 }; uint[] dc = new uint[4]; - ushort[] tmp = new ushort[8]; uint[] expectedDc = { 1940, 2139, 2252, 1813 }; // act - LossyUtils.Mean16x4(input, dc, tmp); + LossyUtils.Mean16x4(input, dc); // assert Assert.True(dc.SequenceEqual(expectedDc)); @@ -73,7 +72,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); [Fact] - public void Mean16x4_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableSSE2); + public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic); [Fact] public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll); From 84732bf14722ef50e01f1fd21c6c86e61a77eae2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 9 Nov 2021 15:39:16 +0100 Subject: [PATCH 08/12] Reverse access to bgr --- src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 24143785a..a9cf876c8 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -307,9 +307,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy [MethodImpl(InliningOptions.ShortMethod)] public static void YuvToBgr(int y, int u, int v, Span bgr) { - bgr[0] = (byte)YuvToB(y, u); - bgr[1] = (byte)YuvToG(y, u, v); bgr[2] = (byte)YuvToR(y, v); + bgr[1] = (byte)YuvToG(y, u, v); + bgr[0] = (byte)YuvToB(y, u); } [MethodImpl(InliningOptions.ShortMethod)] From 50013d70f28c2d67e1a7e96e61174460e67fbc7f Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Tue, 9 Nov 2021 15:51:02 +0100 Subject: [PATCH 09/12] Update src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs Reverse access to dc Co-authored-by: James Jackson-South --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 5b27af821..e6a4e6170 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -969,10 +969,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); Vector64 lower = hadd.GetLower(); - dc[0] = (uint)lower.GetElement(0); - dc[1] = (uint)lower.GetElement(1); - dc[2] = (uint)lower.GetElement(2); dc[3] = (uint)lower.GetElement(3); + dc[2] = (uint)lower.GetElement(2); + dc[1] = (uint)lower.GetElement(1); + dc[0] = (uint)lower.GetElement(0); } else #endif From f0cb89e811be0fefc6a5a4d2f76797e7a2d8822c Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 9 Nov 2021 16:36:42 +0100 Subject: [PATCH 10/12] Change IsSupported check from SSE2 to Ssse3 --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index e6a4e6170..4ef9c5694 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -945,7 +945,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static void Mean16x4(Span input, Span dc) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Ssse3.IsSupported) { Vector128 a0 = Unsafe.As>(ref MemoryMarshal.GetReference(input)); Vector128 a1 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); From 1452ba00836cca274719844100259606750d56b7 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 9 Nov 2021 16:40:55 +0100 Subject: [PATCH 11/12] Remove not needed GetLower --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 4ef9c5694..ac3b1d380 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -967,12 +967,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 e1 = Sse2.Add(d2, d3); Vector128 f0 = Sse2.Add(e0, e1); Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); - Vector64 lower = hadd.GetLower(); - dc[3] = (uint)lower.GetElement(3); - dc[2] = (uint)lower.GetElement(2); - dc[1] = (uint)lower.GetElement(1); - dc[0] = (uint)lower.GetElement(0); + dc[3] = (uint)hadd.GetElement(3); + dc[2] = (uint)hadd.GetElement(2); + dc[1] = (uint)hadd.GetElement(1); + dc[0] = (uint)hadd.GetElement(0); } else #endif From 7d8225b59a633b08b51e74bbb960d4d52b420a84 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 9 Nov 2021 19:38:12 +0100 Subject: [PATCH 12/12] Use UnpackLow to set the dc values --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index ac3b1d380..3064ccc03 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -13,7 +13,7 @@ using System.Runtime.Intrinsics.X86; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Webp.Lossy { - internal static unsafe class LossyUtils + internal static class LossyUtils { #if SUPPORTS_RUNTIME_INTRINSICS private static readonly Vector128 Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); @@ -967,11 +967,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 e1 = Sse2.Add(d2, d3); Vector128 f0 = Sse2.Add(e0, e1); Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); + Vector128 wide = Sse2.UnpackLow(hadd, Vector128.Zero).AsUInt32(); - dc[3] = (uint)hadd.GetElement(3); - dc[2] = (uint)hadd.GetElement(2); - dc[1] = (uint)hadd.GetElement(1); - dc[0] = (uint)hadd.GetElement(0); + ref uint outputRef = ref MemoryMarshal.GetReference(dc); + Unsafe.As>(ref outputRef) = wide; } else #endif