From 56a7ebe2ae4da5e3426d52dc2746efe2d99c9c7f Mon Sep 17 00:00:00 2001
From: Brian Popow
Date: Wed, 1 Dec 2021 21:27:15 +0100
Subject: [PATCH] Add SSE2 version of VFilter8i

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 60 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 7daec2d7e..2bc1983d4 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -1148,9 +1148,47 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         [MethodImpl(InliningOptions.ShortMethod)]
         public static void VFilter8i(Span<byte> u, Span<byte> v, int offset, int stride, int thresh, int ithresh, int hevThresh)
         {
-            int offset4mulstride = offset + (4 * stride);
-            FilterLoop24(u, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh);
-            FilterLoop24(v, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh);
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+                // load uv h-edges.
+                ref byte uRef = ref MemoryMarshal.GetReference(u);
+                ref byte vRef = ref MemoryMarshal.GetReference(v);
+                Vector128<byte> t2 = LoadUvEdge(ref uRef, ref vRef, offset);
+                Vector128<byte> t1 = LoadUvEdge(ref uRef, ref vRef, offset + stride);
+                Vector128<byte> p1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2));
+                Vector128<byte> p0 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3));
+
+                Vector128<byte> mask = Abs(p1, p0);
+                mask = Sse2.Max(mask, Abs(t2, t1));
+                mask = Sse2.Max(mask, Abs(t1, p1));
+
+                offset += 4 * stride;
+
+                Vector128<byte> q0 = LoadUvEdge(ref uRef, ref vRef, offset);
+                Vector128<byte> q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride);
+                t1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2));
+                t2 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3));
+                mask = Sse2.Max(mask, Abs(q1, q0));
+                mask = Sse2.Max(mask, Abs(t2, t1));
+                mask = Sse2.Max(mask, Abs(t1, q1));
+
+                ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask);
+                DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh);
+
+                // Store.
+                StoreUv(p1, ref uRef, ref vRef, offset + (-2 * stride));
+                StoreUv(p0, ref uRef, ref vRef, offset + (-1 * stride));
+                StoreUv(q0, ref uRef, ref vRef, offset);
+                StoreUv(q1, ref uRef, ref vRef, offset + stride);
+            }
+            else
+#endif
+            {
+                int offset4mulstride = offset + (4 * stride);
+                FilterLoop24(u, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh);
+                FilterLoop24(v, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh);
+            }
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
@@ -1648,6 +1686,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             return Sse2.PackSignedSaturate(low1, high1);
         }
 
+        [MethodImpl(InliningOptions.ShortMethod)]
         private static void ComplexMask(Vector128<byte> p1, Vector128<byte> p0, Vector128<byte> q0, Vector128<byte> q1, int thresh, int ithresh, ref Vector128<byte> mask)
         {
             var it = Vector128.Create((byte)ithresh);
@@ -1658,6 +1697,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             mask = Sse2.And(threshMask, filterMask);
         }
 
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static Vector128<byte> LoadUvEdge(ref byte uRef, ref byte vRef, int offset)
+        {
+            var uVec = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref uRef, offset)), 0);
+            var vVec = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref vRef, offset)), 0);
+            return Sse2.UnpackLow(uVec, vVec).AsByte();
+        }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void StoreUv(Vector128<byte> x, ref byte uRef, ref byte vRef, int offset)
+        {
+            Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref uRef, offset)) = x.GetLower();
+            Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref vRef, offset)) = x.GetUpper();
+        }
+
         // Compute abs(p - q) = subs(p - q) OR subs(q - p)
         [MethodImpl(InliningOptions.ShortMethod)]
         private static Vector128<byte> Abs(Vector128<byte> p, Vector128<byte> q)
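
Note on the new helpers: LoadUvEdge packs an 8-byte row of the u plane and the matching 8-byte row of the v plane into one 128-bit register, so a single SSE2 filter pass covers both chroma planes; StoreUv splits the result back. The standalone C# sketch below (not part of the patch; the UvPackingSketch name and sample data are purely illustrative) round-trips that packing under the same SSE2 assumption.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class UvPackingSketch
{
    public static void Main()
    {
        if (!Sse2.IsSupported)
        {
            return; // The sketch only illustrates the SSE2 path.
        }

        byte[] u = { 1, 2, 3, 4, 5, 6, 7, 8 };
        byte[] v = { 9, 10, 11, 12, 13, 14, 15, 16 };
        ref byte uRef = ref u[0];
        ref byte vRef = ref v[0];

        // Pack: lower 8 bytes from u, upper 8 bytes from v (what LoadUvEdge does per row).
        var uVec = Vector128.Create(Unsafe.As<byte, long>(ref uRef), 0);
        var vVec = Vector128.Create(Unsafe.As<byte, long>(ref vRef), 0);
        Vector128<byte> packed = Sse2.UnpackLow(uVec, vVec).AsByte();

        // ... the deblocking filter would operate on 'packed' here ...

        // Unpack: write each half back to its plane (what StoreUv does per row).
        Unsafe.As<byte, Vector64<byte>>(ref uRef) = packed.GetLower();
        Unsafe.As<byte, Vector64<byte>>(ref vRef) = packed.GetUpper();

        Console.WriteLine(packed); // <1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16>
    }
}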