From f2e4257d6675a131d93067a14e9e0a5d36716079 Mon Sep 17 00:00:00 2001
From: James Jackson-South
Date: Tue, 3 Jun 2025 00:15:44 +1000
Subject: [PATCH] Port filters

---
 .../Common/Helpers/Vector128Utilities.cs      |  67 +++++-
 .../Formats/Webp/Lossy/LossyUtils.cs          | 202 +++++++++---------
 2 files changed, 167 insertions(+), 102 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index 9b0c1d68d8..bfd237a2d7 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -3,7 +3,6 @@
 
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.Wasm;
@@ -23,6 +22,35 @@ namespace SixLabors.ImageSharp.Common.Helpers;
 internal static class Vector128_
 #pragma warning restore SA1649 // File name should match first type name
 {
+    /// <summary>
+    /// Average packed unsigned 8-bit integers in <paramref name="left"/> and <paramref name="right"/>, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 8-bit integers to average.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 8-bit integers to average.
+    /// </param>
+    /// <returns>
+    /// A vector containing the rounded average, <c>(a + b + 1) >> 1</c>, of the packed unsigned 8-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<byte> Average(Vector128<byte> left, Vector128<byte> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.Average(left, right);
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.FusedAddRoundedHalving(left, right);
+        }
+
+        // Portable fallback: (a | b) - ((a ^ b) >> 1) equals (a + b + 1) >> 1 but cannot overflow the 8-bit lanes.
+        return (left | right) - ((left ^ right) >> 1);
+    }
+
     /// <summary>
     /// Creates a new vector by selecting values from an input vector using the control.
     /// </summary>
@@ -444,6 +472,43 @@ internal static class Vector128_
         }
     }
 
+    /// <summary>
+    /// Horizontally add adjacent pairs of 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, and
+    /// pack the signed 16-bit results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to add.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to add.
+    /// </param>
+    /// <returns>
+    /// A vector whose lower four lanes hold the pairwise sums of <paramref name="left"/> and whose upper four lanes hold those of <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<short> HorizontalAdd(Vector128<short> left, Vector128<short> right)
+    {
+        if (Ssse3.IsSupported)
+        {
+            return Ssse3.HorizontalAdd(left, right);
+        }
+
+        if (AdvSimd.Arm64.IsSupported)
+        {
+            return AdvSimd.Arm64.AddPairwise(left, right);
+        }
+
+        // Portable fallback: gather the even and odd lanes of each input and add them together.
+        // Out-of-range shuffle indices (8) produce zero, so the sums of left land in lanes 0-3
+        // and the sums of right land in lanes 4-7, matching the SSSE3 and AdvSimd results.
+        Vector128<short> leftEven = Vector128.Shuffle(left, Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8));
+        Vector128<short> leftOdd = Vector128.Shuffle(left, Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8));
+        Vector128<short> rightEven = Vector128.Shuffle(right, Vector128.Create(8, 8, 8, 8, 0, 2, 4, 6));
+        Vector128<short> rightOdd = Vector128.Shuffle(right, Vector128.Create(8, 8, 8, 8, 1, 3, 5, 7));
+
+        return leftEven + leftOdd + rightEven + rightOdd;
+    }
+
     /// <summary>
     /// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
     /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 5b85e39987..b8c4c9c312 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -1413,7 +1413,7 @@ internal static class LossyUtils
     // Simple In-loop filtering (Paragraph 15.2)
     public static void SimpleVFilter16(Span<byte> p, int offset, int stride, int thresh)
     {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
             // Load.
ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)offset); @@ -1423,7 +1423,7 @@ internal static class LossyUtils Vector128 q0 = Unsafe.As>(ref pRef); Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)stride)); - DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + DoFilter2Vector128(ref p1, ref p0, ref q0, ref q1, thresh); // Store. ref byte outputRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)offset); @@ -1446,13 +1446,13 @@ internal static class LossyUtils public static void SimpleHFilter16(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Beginning of p1 ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)(offset - 2)); Load16x4Vector128(ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1); - DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + DoFilter2Vector128(ref p1, ref p0, ref q0, ref q1, thresh); Store16x4Vector128(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride); } else @@ -1471,7 +1471,7 @@ internal static class LossyUtils public static void SimpleVFilter16i(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { for (int k = 3; k > 0; k--) { @@ -1491,7 +1491,7 @@ internal static class LossyUtils public static void SimpleHFilter16i(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { for (int k = 3; k > 0; k--) { @@ -1513,7 +1513,7 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter16(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Vector128 t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - (4 * 
stride)))); @@ -1555,21 +1555,21 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter16(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); ref byte bRef = ref Unsafe.Add(ref pRef, (uint)offset - 4); Load16x4Vector128(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(q3, q2)); - mask = Sse2.Max(mask, AbsVector128(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(q3, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); @@ -1585,7 +1585,7 @@ internal static class LossyUtils public static void VFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Vector128 p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); @@ -1600,22 +1600,22 @@ internal static class LossyUtils offset += 4 * stride; Vector128 mask = AbsVector128(p0, p1); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = 
Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 tmp1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (2 * stride)))); Vector128 tmp2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); + mask = Vector128.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, tmp1)); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); // Store. ref byte outputRef = ref MemoryMarshal.GetReference(b); @@ -1641,7 +1641,7 @@ internal static class LossyUtils public static void HFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); @@ -1657,17 +1657,17 @@ internal static class LossyUtils // Compute partial mask. 
mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); - mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); + mask = Vector128.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, tmp1)); ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); Store16x4Vector128(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); @@ -1690,7 +1690,7 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load uv h-edges. 
ref byte uRef = ref MemoryMarshal.GetReference(u); @@ -1701,17 +1701,17 @@ internal static class LossyUtils Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - stride); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(t1, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(t1, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); Vector128 q2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (2 * stride)); t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (3 * stride)); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(t1, q2)); - mask = Sse2.Max(mask, AbsVector128(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t1, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); @@ -1734,21 +1734,21 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Load16x4Vector128(ref Unsafe.Add(ref uRef, 
(uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(q3, q2)); - mask = Sse2.Max(mask, AbsVector128(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(q3, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); @@ -1766,7 +1766,7 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load uv h-edges. ref byte uRef = ref MemoryMarshal.GetReference(u); @@ -1777,8 +1777,8 @@ internal static class LossyUtils Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, p1)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, p1)); offset += 4 * stride; @@ -1787,12 +1787,12 @@ internal static class LossyUtils t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref q0, ref q1, 
mask, hevThresh); // Store. StoreUvVector128(p1, ref uRef, ref vRef, offset + (-2 * stride)); @@ -1811,27 +1811,27 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, p1)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, p1)); // Beginning of q0. offset += 4; Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Beginning of p1. 
offset -= 2; @@ -1847,7 +1847,7 @@ internal static class LossyUtils public static void Mean16x4(Span input, Span dc) { - if (Ssse3.IsSupported) + if (Vector128.IsHardwareAccelerated) { Vector128 mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); @@ -1855,23 +1855,23 @@ internal static class LossyUtils Vector128 a1 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); Vector128 a2 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16))); Vector128 a3 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16))); - Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte - Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); - Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); - Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); - Vector128 c0 = Sse2.And(a0, mean16x4Mask); // lo byte - Vector128 c1 = Sse2.And(a1, mean16x4Mask); - Vector128 c2 = Sse2.And(a2, mean16x4Mask); - Vector128 c3 = Sse2.And(a3, mean16x4Mask); - Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); - Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); - Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); - Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); - Vector128 e0 = Sse2.Add(d0, d1); - Vector128 e1 = Sse2.Add(d2, d3); - Vector128 f0 = Sse2.Add(e0, e1); - Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); - Vector128 wide = Sse2.UnpackLow(hadd, Vector128.Zero).AsUInt32(); + Vector128 b0 = Vector128.ShiftRightLogical(a0.AsInt16(), 8); // hi byte + Vector128 b1 = Vector128.ShiftRightLogical(a1.AsInt16(), 8); + Vector128 b2 = Vector128.ShiftRightLogical(a2.AsInt16(), 8); + Vector128 b3 = Vector128.ShiftRightLogical(a3.AsInt16(), 8); + Vector128 c0 = a0 & mean16x4Mask; // lo byte + Vector128 c1 = a1 & mean16x4Mask; + Vector128 c2 = a2 & mean16x4Mask; + Vector128 c3 = a3 & mean16x4Mask; + Vector128 d0 = b0.AsInt32() + c0.AsInt32(); + Vector128 d1 = b1.AsInt32() + 
c1.AsInt32(); + Vector128 d2 = b2.AsInt32() + c2.AsInt32(); + Vector128 d3 = b3.AsInt32() + c3.AsInt32(); + Vector128 e0 = d0 + d1; + Vector128 e1 = d2 + d3; + Vector128 f0 = e0 + e1; + Vector128 hadd = Vector128_.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); + Vector128 wide = Vector128_.UnpackLow(hadd, Vector128.Zero).AsUInt32(); ref uint outputRef = ref MemoryMarshal.GetReference(dc); Unsafe.As>(ref outputRef) = wide; @@ -2052,7 +2052,7 @@ internal static class LossyUtils } // Applies filter on 2 pixels (p0 and q0) - private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) + private static void DoFilter2Vector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) { Vector128 signBit = Vector128.Create((byte)0x80); @@ -2078,7 +2078,7 @@ internal static class LossyUtils } // Applies filter on 4 pixels (p1, p0, q0 and q1) - private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) + private static void DoFilter4Vector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) { // Compute hev mask. Vector128 notHev = GetNotHevVector128(ref p1, ref p0, ref q0, ref q1, tresh); @@ -2086,38 +2086,38 @@ internal static class LossyUtils Vector128 signBit = Vector128.Create((byte)0x80); // Convert to signed values. 
- p1 = Sse2.Xor(p1, signBit); - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); - q1 = Sse2.Xor(q1, signBit); - - Vector128 t1 = Sse2.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 - t1 = Sse2.AndNot(notHev, t1.AsByte()).AsSByte(); // hev(p1 - q1) - Vector128 t2 = Sse2.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0 - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) - t1 = Sse2.And(t1.AsByte(), mask).AsSByte(); // mask filter values we don't care about. - - t2 = Sse2.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 - Vector128 t3 = Sse2.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 + p1 ^= signBit; + p0 ^= signBit; + q0 ^= signBit; + q1 ^= signBit; + + Vector128 t1 = Vector128_.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 + t1 = (~notHev & t1.AsByte()).AsSByte(); // hev(p1 - q1) + Vector128 t2 = Vector128_.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0 + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) + t1 = (t1.AsByte() & mask).AsSByte(); // mask filter values we don't care about. 
+ + t2 = Vector128_.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 + Vector128 t3 = Vector128_.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 t2 = SignedShift8bVector128(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 t3 = SignedShift8bVector128(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 - p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 - q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 = Vector128_.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 + q0 = Vector128_.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 + p0 ^= signBit; + q0 ^= signBit; // This is equivalent to signed (a + 1) >> 1 calculation. - t2 = Sse2.Add(t3, signBit.AsSByte()); - t3 = Sse2.Average(t2.AsByte(), Vector128.Zero).AsSByte(); - t3 = Sse2.Subtract(t3, Vector128.Create((sbyte)64)); - - t3 = Sse2.And(notHev, t3.AsByte()).AsSByte(); // if !hev - q1 = Sse2.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3 - p1 = Sse2.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3 - p1 = Sse2.Xor(p1.AsByte(), signBit); - q1 = Sse2.Xor(q1.AsByte(), signBit); + t2 = t3 + signBit.AsSByte(); + t3 = Vector128_.Average(t2.AsByte(), Vector128.Zero).AsSByte(); + t3 -= Vector128.Create((sbyte)64); + + t3 = (notHev & t3.AsByte()).AsSByte(); // if !hev + q1 = Vector128_.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3 + p1 = Vector128_.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3 + p1 = p1.AsByte() ^ signBit; + q1 = q1.AsByte() ^ signBit; } // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)