diff --git a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs
index 6a89a1122a..0553eb46a9 100644
--- a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs
+++ b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs
@@ -22,9 +22,9 @@ namespace SixLabors.ImageSharp.Formats.Png.Filters
internal static class PaethFilter
{
///
- /// Decodes the scanline
+ /// Decodes a scanline that was filtered with the Paeth filter.
///
- /// The scanline to decode
+ /// The scanline to decode.
/// The previous scanline.
/// The bytes per pixel.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -32,6 +32,86 @@ namespace SixLabors.ImageSharp.Formats.Png.Filters
{
DebugGuard.MustBeSameSized(scanline, previousScanline, nameof(scanline));
+ // Paeth tries to predict pixel d using the pixel to the left of it, a,
+ // and two pixels from the previous row, b and c:
+ // prev: c b
+ // row: a d
+ // The Paeth function predicts d to be whichever of a, b, or c is nearest to
+ // p = a + b - c.
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Sse41.IsSupported && bytesPerPixel is 4)
+ {
+ DecodeSse41(scanline, previousScanline);
+ }
+ else
+#endif
+ {
+ DecodeScalar(scanline, previousScanline, bytesPerPixel);
+ }
+ }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ // Reverses the Paeth filter in place, four bytes per iteration; Decode only takes this path when bytesPerPixel is 4.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void DecodeSse41(Span scanline, Span previousScanline)
+ {
+ ref byte scanBaseRef = ref MemoryMarshal.GetReference(scanline);
+ ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline);
+ // b and d are carried into the next iteration as c and a; both start at zero for the first pixel.
+ Vector128 b = Vector128.Zero;
+ Vector128 d = Vector128.Zero;
+ // NOTE(review): rb counts from the full length while offset starts at 1, so a length that is a multiple of 4 would touch one byte past the end - confirm callers always pass 1 + 4 * n bytes.
+ int rb = scanline.Length;
+ nint offset = 1; // Byte 0 is never read or written here (presumably the PNG filter-type byte - confirm).
+ while (rb >= 4)
+ {
+ ref byte scanRef = ref Unsafe.Add(ref scanBaseRef, offset);
+
+ // Widen each byte to a 16-bit lane: the signed differences below (pc in particular) do not fit in 8 bits.
+ Vector128 c = b; // The previous iteration's "above" pixel is now the above-left pixel.
+ Vector128 a = d; // The previous iteration's decoded pixel is now the left pixel.
+ b = Sse2.UnpackLow(
+ Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref prevBaseRef, offset))).AsByte(),
+ Vector128.Zero);
+ d = Sse2.UnpackLow(
+ Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref scanRef)).AsByte(),
+ Vector128.Zero);
+
+ // (p-a) == (a+b-c - a) == (b-c)
+ Vector128 pa = Sse2.Subtract(b.AsInt16(), c.AsInt16());
+
+ // (p-b) == (a+b-c - b) == (a-c)
+ Vector128 pb = Sse2.Subtract(a.AsInt16(), c.AsInt16());
+
+ // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)
+ Vector128 pc = Sse2.Add(pa.AsInt16(), pb.AsInt16());
+
+ pa = Ssse3.Abs(pa.AsInt16()).AsInt16(); /* |p-a| */
+ pb = Ssse3.Abs(pb.AsInt16()).AsInt16(); /* |p-b| */
+ pc = Ssse3.Abs(pc.AsInt16()).AsInt16(); /* |p-c| */
+
+ Vector128 smallest = Sse2.Min(pc, Sse2.Min(pa, pb));
+
+ // Paeth breaks ties favoring a over b over c: take b instead of c where pb is smallest, then let a override where pa is smallest.
+ Vector128 mask = Sse41.BlendVariable(c, b, Sse2.CompareEqual(smallest, pb).AsByte());
+ Vector128 nearest = Sse41.BlendVariable(mask, a, Sse2.CompareEqual(smallest, pa).AsByte());
+
+ // Note `_epi8`: we need a byte-wise addition so that the sum wraps modulo 256.
+ d = Sse2.Add(d, nearest);
+
+ // Store the result: narrow the four 16-bit lanes back to bytes and write them over the filtered input.
+ Unsafe.As(ref scanRef) = Sse2.ConvertToInt32(Sse2.PackUnsignedSaturate(d.AsInt16(), d.AsInt16()).AsInt32());
+
+ rb -= 4;
+ offset += 4;
+ }
+ }
+
+#endif
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void DecodeScalar(Span scanline, Span previousScanline, int bytesPerPixel)
+ {
ref byte scanBaseRef = ref MemoryMarshal.GetReference(scanline);
ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline);
@@ -56,13 +136,13 @@ namespace SixLabors.ImageSharp.Formats.Png.Filters
}
///
- /// Encodes the scanline
+ /// Encodes a scanline and applies the Paeth filter.
///
/// The scanline to encode
/// The previous scanline.
/// The filtered scanline result.
/// The bytes per pixel.
- /// The sum of the total variance of the filtered row
+ /// The sum of the total variance of the filtered row.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Encode(ReadOnlySpan scanline, ReadOnlySpan previousScanline, Span result, int bytesPerPixel, out int sum)
{