diff --git a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs index d1c214e3d..57416a737 100644 --- a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs @@ -5,6 +5,11 @@ using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace SixLabors.ImageSharp.Formats.Png.Filters { /// @@ -79,6 +84,89 @@ namespace SixLabors.ImageSharp.Formats.Png.Filters sum += Numerics.Abs(unchecked((sbyte)res)); } +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + Vector256 sumAccumulator = Vector256.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector256.Count <= scanline.Length; xLeft += Vector256.Count) + { + Vector256 scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector256 left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + Vector256 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + + Vector256 res = Avx2.Subtract(scan, Average(left, above)); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector256.Count; + + Vector256 absRes = Avx2.Abs(res.AsSByte()).AsSByte(); + Vector256 loRes16 = Avx2.UnpackLow(absRes, Vector256.Zero).AsInt16(); + Vector256 hiRes16 = Avx2.UnpackHigh(absRes, Vector256.Zero).AsInt16(); + + Vector256 loRes32 = Avx2.UnpackLow(loRes16, Vector256.Zero).AsInt32(); + Vector256 hiRes32 = Avx2.UnpackHigh(loRes16, Vector256.Zero).AsInt32(); + sumAccumulator = Avx2.Add(sumAccumulator, loRes32); + sumAccumulator = Avx2.Add(sumAccumulator, hiRes32); + + loRes32 = Avx2.UnpackLow(hiRes16, Vector256.Zero).AsInt32(); + hiRes32 = Avx2.UnpackHigh(hiRes16, Vector256.Zero).AsInt32(); + sumAccumulator = Avx2.Add(sumAccumulator, loRes32); + sumAccumulator = Avx2.Add(sumAccumulator, hiRes32); + } + + for (int i = 0; i < 
Vector256.Count; i++) + { + sum += sumAccumulator.GetElement(i); + } + } + else if (Sse2.IsSupported) + { + var allBitsSet = Vector128.Create((sbyte)-1); + Vector128 sumAccumulator = Vector128.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector128.Count <= scanline.Length; xLeft += Vector128.Count) + { + Vector128 scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector128 left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + Vector128 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + + Vector128 res = Sse2.Subtract(scan, Average(left, above)); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector128.Count; + + Vector128 absRes; + if (Ssse3.IsSupported) + { + absRes = Ssse3.Abs(res.AsSByte()).AsSByte(); + } + else + { + Vector128 mask = Sse2.CompareGreaterThan(res.AsSByte(), Vector128.Zero); + mask = Sse2.Xor(mask, allBitsSet); + absRes = Sse2.Xor(Sse2.Add(res.AsSByte(), mask), mask); + } + + Vector128 loRes16 = Sse2.UnpackLow(absRes, Vector128.Zero).AsInt16(); + Vector128 hiRes16 = Sse2.UnpackHigh(absRes, Vector128.Zero).AsInt16(); + + Vector128 loRes32 = Sse2.UnpackLow(loRes16, Vector128.Zero).AsInt32(); + Vector128 hiRes32 = Sse2.UnpackHigh(loRes16, Vector128.Zero).AsInt32(); + sumAccumulator = Sse2.Add(sumAccumulator, loRes32); + sumAccumulator = Sse2.Add(sumAccumulator, hiRes32); + + loRes32 = Sse2.UnpackLow(hiRes16, Vector128.Zero).AsInt32(); + hiRes32 = Sse2.UnpackHigh(hiRes16, Vector128.Zero).AsInt32(); + sumAccumulator = Sse2.Add(sumAccumulator, loRes32); + sumAccumulator = Sse2.Add(sumAccumulator, hiRes32); + } + + for (int i = 0; i < Vector128.Count; i++) + { + sum += sumAccumulator.GetElement(i); + } + } +#endif + for (int xLeft = x - bytesPerPixel; x < scanline.Length; ++xLeft /* Note: ++x happens in the body to avoid one add operation */) { byte scan = Unsafe.Add(ref scanBaseRef, x); @@ -101,5 +189,37 @@ namespace SixLabors.ImageSharp.Formats.Png.Filters /// The 
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Average(byte left, byte above) => (left + above) >> 1;

#if SUPPORTS_RUNTIME_INTRINSICS
/// <summary>
/// Calculates, for 16 bytes at once, the average of <paramref name="left"/> and
/// <paramref name="above"/> rounded down, i.e. the same <c>(left + above) >> 1</c>
/// that the scalar overload computes for a single byte.
/// </summary>
/// <param name="left">The bytes located <c>bytesPerPixel</c> positions earlier in the current scanline.</param>
/// <param name="above">The bytes at the same positions in the previous scanline.</param>
/// <returns>The <see cref="Vector128{T}"/> holding the per-byte averages.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> Average(Vector128<byte> left, Vector128<byte> above)
{
    // The sum of two bytes can reach 510 and does not fit in 8 bits, so every byte is
    // zero-extended to a 16-bit lane (interleaving with zero) before adding.
    // Sse2.Average is not used here: it rounds up, whereas the scalar overload rounds down.
    Vector128<ushort> loLeft16 = Sse2.UnpackLow(left, Vector128<byte>.Zero).AsUInt16();
    Vector128<ushort> hiLeft16 = Sse2.UnpackHigh(left, Vector128<byte>.Zero).AsUInt16();

    Vector128<ushort> loAbove16 = Sse2.UnpackLow(above, Vector128<byte>.Zero).AsUInt16();
    Vector128<ushort> hiAbove16 = Sse2.UnpackHigh(above, Vector128<byte>.Zero).AsUInt16();

    Vector128<ushort> div1 = Sse2.ShiftRightLogical(Sse2.Add(loLeft16, loAbove16), 1);
    Vector128<ushort> div2 = Sse2.ShiftRightLogical(Sse2.Add(hiLeft16, hiAbove16), 1);

    // Every halved sum is at most 255, so the saturating pack narrows without clamping
    // and puts the low and high halves back in their original byte order.
    return Sse2.PackUnsignedSaturate(div1.AsInt16(), div2.AsInt16());
}

/// <summary>
/// Calculates, for 32 bytes at once, the average of <paramref name="left"/> and
/// <paramref name="above"/> rounded down, i.e. the same <c>(left + above) >> 1</c>
/// that the scalar overload computes for a single byte.
/// </summary>
/// <param name="left">The bytes located <c>bytesPerPixel</c> positions earlier in the current scanline.</param>
/// <param name="above">The bytes at the same positions in the previous scanline.</param>
/// <returns>The <see cref="Vector256{T}"/> holding the per-byte averages.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> Average(Vector256<byte> left, Vector256<byte> above)
{
    // Same widening approach as the 128-bit overload. The AVX2 unpack and pack
    // instructions work independently inside each 128-bit lane, so unpacking and
    // then packing again restores the original byte order within every lane.
    Vector256<ushort> loLeft16 = Avx2.UnpackLow(left, Vector256<byte>.Zero).AsUInt16();
    Vector256<ushort> hiLeft16 = Avx2.UnpackHigh(left, Vector256<byte>.Zero).AsUInt16();

    Vector256<ushort> loAbove16 = Avx2.UnpackLow(above, Vector256<byte>.Zero).AsUInt16();
    Vector256<ushort> hiAbove16 = Avx2.UnpackHigh(above, Vector256<byte>.Zero).AsUInt16();

    Vector256<ushort> div1 = Avx2.ShiftRightLogical(Avx2.Add(loLeft16, loAbove16), 1);
    Vector256<ushort> div2 = Avx2.ShiftRightLogical(Avx2.Add(hiLeft16, hiAbove16), 1);

    // Halved sums never exceed 255, so saturation is never triggered.
    return Avx2.PackUnsignedSaturate(div1.AsInt16(), div2.AsInt16());
}
#endif
} // closes the enclosing type
} // closes namespace SixLabors.ImageSharp.Formats.Png.Filters