diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 8fa4ab7a1b..04632ded87 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -932,13 +932,35 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Simple In-loop filtering (Paragraph 15.2) public static void SimpleVFilter16(Span p, int offset, int stride, int thresh) { - int thresh2 = (2 * thresh) + 1; - int end = 16 + offset; - for (int i = offset; i < end; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + // Load. + ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), offset); + + Vector128 p1 = Unsafe.As>(ref Unsafe.Subtract(ref pRef, 2 * stride)); + Vector128 p0 = Unsafe.As>(ref Unsafe.Subtract(ref pRef, stride)); + Vector128 q0 = Unsafe.As>(ref pRef); + Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, stride)); + + DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + + // Store. + ref byte outputRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), offset); + Unsafe.As>(ref Unsafe.Subtract(ref outputRef, stride)) = p0.AsSByte(); + Unsafe.As>(ref outputRef) = q0.AsSByte(); + } + else +#endif { - if (NeedsFilter(p, i, stride, thresh2)) + int thresh2 = (2 * thresh) + 1; + int end = 16 + offset; + for (int i = offset; i < end; i++) { - DoFilter2(p, i, stride); + if (NeedsFilter(p, i, stride, thresh2)) + { + DoFilter2(p, i, stride); + } } } } @@ -1185,6 +1207,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } + // Applies filter on 2 pixels (p0 and q0) private static void DoFilter2(Span p, int offset, int step) { // 4 pixels in, 2 pixels out. @@ -1199,6 +1222,47 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy p[offset] = WebpLookupTables.Clip1(q0 - a1); } +#if SUPPORTS_RUNTIME_INTRINSICS + // Applies filter on 2 pixels (p0 and q0) + private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) + { + var signBit = Vector128.Create((byte)0x80); + + // Convert p1/q1 to byte (for GetBaseDeltaSse2). + Vector128 p1s = Sse2.Xor(p1, signBit); + Vector128 q1s = Sse2.Xor(q1, signBit); + Vector128 mask = NeedsFilterSse2(p1, p0, q0, q1, thresh); + + // Flip sign. + p0 = Sse2.Xor(p0, signBit); + q0 = Sse2.Xor(q0, signBit); + + Vector128 a = GetBaseDeltaSse2(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); + + // Mask filter values we don't care about. + a = Sse2.And(a, mask); + + DoSimpleFilterSse2(ref p0, ref q0, a); + + // Flip sign. + p0 = Sse2.Xor(p0, signBit); + q0 = Sse2.Xor(q0, signBit); + } + + private static void DoSimpleFilterSse2(ref Vector128 p0, ref Vector128 q0, Vector128 fl) + { + Vector128 three = Vector128.Create((byte)3).AsSByte(); + Vector128 four = Vector128.Create((byte)4).AsSByte(); + Vector128 v3 = Sse2.AddSaturate(fl.AsSByte(), three); + Vector128 v4 = Sse2.AddSaturate(fl.AsSByte(), four); + + v4 = SignedShift8bSse2(v4.AsByte()).AsSByte(); // v4 >> 3 + v3 = SignedShift8bSse2(v3.AsByte()).AsSByte(); // v3 >> 3 + q0 = Sse2.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 + p0 = Sse2.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 + } +#endif + private static void DoFilter4(Span p, int offset, int step) { // 4 pixels in, 4 pixels out. @@ -1275,6 +1339,53 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy WebpLookupTables.Abs0(q2 - q1) <= it && WebpLookupTables.Abs0(q1 - q0) <= it; } +#if SUPPORTS_RUNTIME_INTRINSICS + private static Vector128 NeedsFilterSse2(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) + { + var mthresh = Vector128.Create((byte)thresh); + Vector128 t1 = Abs(p1, q1); // abs(p1 - q1) + var fe = Vector128.Create((byte)0xFE); + Vector128 t2 = Sse2.And(t1, fe); // set lsb of each byte to zero. + Vector128 t3 = Sse2.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 + + Vector128 t4 = Abs(p0, q0); // abs(p0 - q0) + Vector128 t5 = Sse2.AddSaturate(t4, t4); // abs(p0 - q0) * 2 + Vector128 t6 = Sse2.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 + + Vector128 t7 = Sse2.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh + + return Sse2.CompareEqual(t7, Vector128.Zero); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 GetBaseDeltaSse2(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1) + { + // Beware of addition order, for saturation! + Vector128 p1q1 = Sse2.SubtractSaturate(p1, q1); // p1 - q1 + Vector128 q0p0 = Sse2.SubtractSaturate(q0, p0); // q0 - p0 + Vector128 s1 = Sse2.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) + Vector128 s2 = Sse2.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) + Vector128 s3 = Sse2.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) + + return s3; + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 SignedShift8bSse2(Vector128 x) + { + Vector128 low0 = Sse2.UnpackLow(Vector128.Zero, x); + Vector128 high0 = Sse2.UnpackHigh(Vector128.Zero, x); + Vector128 low1 = Sse2.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); + Vector128 high1 = Sse2.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); + + return Sse2.PackSignedSaturate(low1, high1); + } + + // Compute abs(p - q) = subs(p - q) OR subs(q - p) + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 Abs(Vector128 p, Vector128 q) => Sse2.Or(Sse2.SubtractSaturate(q, p), Sse2.SubtractSaturate(p, q)); +#endif + [MethodImpl(InliningOptions.ShortMethod)] private static bool Hev(Span p, int offset, int step, int thresh) {