diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 55718e724..e7ae71210 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -587,22 +587,20 @@ namespace SixLabors.ImageSharp #if SUPPORTS_RUNTIME_INTRINSICS if (Sse41.IsSupported) { - ref Vector4 vectors4Ref = ref MemoryMarshal.GetReference(vectors); - ref Vector4 vectors4End = ref Unsafe.Add(ref vectors4Ref, vectors.Length); + ref Vector128 vectors128Ref = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); + ref Vector128 vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length); var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF); var v128_0x3F8000000 = Vector128.Create(0x3F800000); var v128_341 = Vector128.Create(341); var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000)); - var v4_23rds = new Vector4(2 / 3f); - var v4_13rds = new Vector4(1 / 3f); + var v4_23rds = Vector128.Create(2 / 3f); + var v4_13rds = Vector128.Create(1 / 3f); - while (Unsafe.IsAddressLessThan(ref vectors4Ref, ref vectors4End)) + while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End)) { - Vector4 vx = vectors4Ref; - float a = vx.W; - Vector128 veax = Unsafe.As>(ref vx); - Vector128 vecx = veax; + Vector128 vecx = vectors128Ref; + Vector128 veax = vecx.AsInt32(); // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it // directly on 32 bit floating point values. What follows is a vectorized implementation of this method: @@ -615,17 +613,16 @@ namespace SixLabors.ImageSharp veax = Sse41.MultiplyLow(veax, v128_341); veax = Sse2.Add(veax, v128_0x3F8000000); veax = Sse2.And(veax, v128_0x7FFFFFFF); - vecx = Sse2.And(vecx, v128_0x80000000); - veax = Sse2.Or(veax, vecx); + veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_0x80000000)); - Vector4 y4 = *(Vector4*)&veax; + Vector128 y4 = veax.AsSingle(); - y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4))); - y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4))); - y4.W = a; + y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Sse41.Insert(y4, vecx, 0xF0); - vectors4Ref = y4; - vectors4Ref = ref Unsafe.Add(ref vectors4Ref, 1); + vectors128Ref = y4; + vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1); } return;