From e7cdb0aaab2d1c0c122f8e0cb7618bc5b624befa Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 20:19:47 +0100 Subject: [PATCH] Add FMA support, more SSE optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Special thanks to @tannergooding for the help 🚀 --- src/ImageSharp/Common/Helpers/Numerics.cs | 33 ++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index e7ae71210f..88b6d83ee6 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -590,12 +590,12 @@ namespace SixLabors.ImageSharp ref Vector128 vectors128Ref = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); ref Vector128 vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length); - var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF); - var v128_0x3F8000000 = Vector128.Create(0x3F800000); var v128_341 = Vector128.Create(341); - var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000)); - var v4_23rds = Vector128.Create(2 / 3f); - var v4_13rds = Vector128.Create(1 / 3f); + Vector128 v128_negativeZero = Vector128.Create(-0.0f).AsInt32(); + Vector128 v128_one = Vector128.Create(1.0f).AsInt32(); + + var v128_13rd = Vector128.Create(1 / 3f); + var v128_23rds = Vector128.Create(2 / 3f); while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End)) { @@ -607,18 +607,27 @@ namespace SixLabors.ImageSharp // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html. // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here // using a different succession (the same used below), which should be less unstable due to not having cube pow. - veax = Sse2.And(veax, v128_0x7FFFFFFF); - veax = Sse2.Subtract(veax, v128_0x3F8000000); + veax = Sse2.AndNot(v128_negativeZero, veax); + veax = Sse2.Subtract(veax, v128_one); veax = Sse2.ShiftRightArithmetic(veax, 10); veax = Sse41.MultiplyLow(veax, v128_341); - veax = Sse2.Add(veax, v128_0x3F8000000); - veax = Sse2.And(veax, v128_0x7FFFFFFF); - veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_0x80000000)); + veax = Sse2.Add(veax, v128_one); + veax = Sse2.AndNot(v128_negativeZero, veax); + veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero)); Vector128 y4 = veax.AsSingle(); - y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); - y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + if (Fma.IsSupported) + { + y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + } + else + { + y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + } + y4 = Sse41.Insert(y4, vecx, 0xF0); vectors128Ref = y4;