diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 9642d2afbf..9462db6304 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -17,6 +17,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     {
 #if SUPPORTS_RUNTIME_INTRINSICS
         private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
+
+        private static readonly Vector128<byte> SignBit = Vector128.Create((byte)0x80);
+
+        private static readonly Vector128<sbyte> Three = Vector128.Create((byte)3).AsSByte();
+
+        private static readonly Vector128<sbyte> Four = Vector128.Create((byte)4).AsSByte();
+
+        private static readonly Vector128<sbyte> SixtyFour = Vector128.Create((byte)64).AsSByte();
 #endif
 
         // Note: method name in libwebp reference implementation is called VP8SSE16x16.
@@ -1240,16 +1248,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         // Applies filter on 2 pixels (p0 and q0)
         private static void DoFilter2Sse2(ref Vector128<byte> p1, ref Vector128<byte> p0, ref Vector128<byte> q0, ref Vector128<byte> q1, int thresh)
         {
-            var signBit = Vector128.Create((byte)0x80);
-
             // Convert p1/q1 to byte (for GetBaseDeltaSse2).
-            Vector128<byte> p1s = Sse2.Xor(p1, signBit);
-            Vector128<byte> q1s = Sse2.Xor(q1, signBit);
+            Vector128<byte> p1s = Sse2.Xor(p1, SignBit);
+            Vector128<byte> q1s = Sse2.Xor(q1, SignBit);
             Vector128<byte> mask = NeedsFilterSse2(p1, p0, q0, q1, thresh);
 
             // Flip sign.
-            p0 = Sse2.Xor(p0, signBit);
-            q0 = Sse2.Xor(q0, signBit);
+            p0 = Sse2.Xor(p0, SignBit);
+            q0 = Sse2.Xor(q0, SignBit);
 
             Vector128<byte> a = GetBaseDeltaSse2(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte();
 
@@ -1259,22 +1265,79 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
             DoSimpleFilterSse2(ref p0, ref q0, a);
 
             // Flip sign.
-            p0 = Sse2.Xor(p0, signBit);
-            q0 = Sse2.Xor(q0, signBit);
+            p0 = Sse2.Xor(p0, SignBit);
+            q0 = Sse2.Xor(q0, SignBit);
+        }
+
+        // Applies filter on 4 pixels (p1, p0, q0 and q1).
+        // Lanes that are zero in "mask" are left untouched, and p1/q1 are only adjusted in lanes
+        // where GetNotHev reports no high edge variance. "tresh" is the hev threshold.
+        private static void DoFilter4Sse2(ref Vector128<byte> p1, ref Vector128<byte> p0, ref Vector128<byte> q0, ref Vector128<byte> q1, Vector128<byte> mask, int tresh)
+        {
+            // Compute hev mask.
+            Vector128<byte> notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh);
+
+            // Convert to signed values.
+            p1 = Sse2.Xor(p1, SignBit);
+            p0 = Sse2.Xor(p0, SignBit);
+            q0 = Sse2.Xor(q0, SignBit);
+            q1 = Sse2.Xor(q1, SignBit);
+
+            Vector128<sbyte> t1 = Sse2.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 (signed saturation)
+            t1 = Sse2.AndNot(notHev, t1.AsByte()).AsSByte(); // hev(p1 - q1): ~notHev & t1
+            Vector128<sbyte> t2 = Sse2.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0
+            t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0)
+            t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0)
+            t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0)
+            t1 = Sse2.And(t1.AsByte(), mask).AsSByte(); // mask filter values we don't care about.
+
+            t2 = Sse2.AddSaturate(t1, Three); // 3 * (q0 - p0) + hev(p1 - q1) + 3
+            Vector128<sbyte> t3 = Sse2.AddSaturate(t1, Four); // 3 * (q0 - p0) + hev(p1 - q1) + 4
+            t2 = SignedShift8bSse2(t2.AsByte()).AsSByte(); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+            t3 = SignedShift8bSse2(t3.AsByte()).AsSByte(); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+            p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2
+            q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3
+            p0 = Sse2.Xor(p0, SignBit);
+            q0 = Sse2.Xor(q0, SignBit);
+
+            // This is equivalent to signed (a + 1) >> 1 calculation, applied to the shifted t3.
+            t2 = Sse2.Add(t3, SignBit.AsSByte()); // bias into the unsigned range
+            t3 = Sse2.Average(t2.AsByte(), Vector128<byte>.Zero).AsSByte(); // (t2 + 1) >> 1, unsigned
+            t3 = Sse2.Subtract(t3, SixtyFour); // remove the halved bias (128 / 2)
+
+            t3 = Sse2.And(notHev, t3.AsByte()).AsSByte(); // if !hev
+            q1 = Sse2.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3
+            p1 = Sse2.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3
+            p1 = Sse2.Xor(p1, SignBit);
+            q1 = Sse2.Xor(q1, SignBit);
         }
 
         private static void DoSimpleFilterSse2(ref Vector128<byte> p0, ref Vector128<byte> q0, Vector128<byte> fl)
         {
-            Vector128<sbyte> three = Vector128.Create((byte)3).AsSByte();
-            Vector128<sbyte> four = Vector128.Create((byte)4).AsSByte();
-            Vector128<sbyte> v3 = Sse2.AddSaturate(fl.AsSByte(), three);
-            Vector128<sbyte> v4 = Sse2.AddSaturate(fl.AsSByte(), four);
+            Vector128<sbyte> v3 = Sse2.AddSaturate(fl.AsSByte(), Three);
+            Vector128<sbyte> v4 = Sse2.AddSaturate(fl.AsSByte(), Four);
 
             v4 = SignedShift8bSse2(v4.AsByte()).AsSByte(); // v4 >> 3
             v3 = SignedShift8bSse2(v3.AsByte()).AsSByte(); // v3 >> 3
             q0 = Sse2.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4
             p0 = Sse2.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3
         }
+
+        // Builds the "not hev" mask: a lane is all ones when max(Abs(p1, p0), Abs(q1, q0)) does not
+        // exceed hevThresh, and zero otherwise. hevThresh is narrowed to a byte before the comparison.
+        private static Vector128<byte> GetNotHev(ref Vector128<byte> p1, ref Vector128<byte> p0, ref Vector128<byte> q0, ref Vector128<byte> q1, int hevThresh)
+        {
+            Vector128<byte> t1 = Abs(p1, p0);
+            Vector128<byte> t2 = Abs(q1, q0);
+
+            var h = Vector128.Create((byte)hevThresh);
+            Vector128<byte> tMax = Sse2.Max(t1, t2);
+
+            Vector128<byte> tMaxH = Sse2.SubtractSaturate(tMax, h);
+
+            // The saturated difference is zero exactly when t1 <= h && t2 <= h.
+            return Sse2.CompareEqual(tMaxH, Vector128<byte>.Zero);
+        }
 #endif
 
         private static void DoFilter4(Span<byte> p, int offset, int step)